In [2]:
import pandas as pd
import numpy as np
from ast import literal_eval

### Importing BASKET data and Freq Itemset

In [23]:
dataset = pd.read_csv('../Apriori/Dataset/basket.csv')

def preprocess_data(df):
    transactions = []
    for index, row in df.iterrows():
        transaction = set(row.dropna().unique())
        if transaction:
            transactions.append(list(transaction))
    return transactions

transactions = preprocess_data(dataset)
transactions[:5]

[["['Entertainment', 'Entertainment', 'Entertainment', 'Entertainment', 'Groceries', 'Entertainment', 'Motor/Travel', 'Housing', 'Motor/Travel', 'Entertainment', 'Entertainment', 'Entertainment', 'Entertainment', 'Entertainment', 'Clothing', 'Entertainment', 'Entertainment', 'Entertainment', 'Entertainment', 'Education', 'Bills and Utilities', 'Health', 'Entertainment', 'Bills and Utilities', 'Entertainment', 'Education', 'Savings', 'Entertainment', 'Entertainment', 'Entertainment', 'Motor/Travel', 'Groceries', 'Savings', 'Groceries', 'Tax', 'Entertainment', 'Motor/Travel', 'Tax', 'Groceries', 'Entertainment', 'Groceries', 'Groceries', 'Motor/Travel', 'Groceries', 'Entertainment', 'Motor/Travel', 'Entertainment', 'Entertainment', 'Housing', 'Entertainment', 'Entertainment', 'Entertainment', 'Entertainment', 'Entertainment', 'Housing', 'Housing', 'Groceries', 'Education', 'Bills and Utilities', 'Entertainment', 'Entertainment', 'Entertainment', 'Entertainment', 'Entertainment', 'Enterta

In [18]:
# freq_dataset = pd.read_csv('../Apriori/Dataset/apriori_dataset.csv')
freq_dataset = pd.read_csv('../Apriori/Frequent Itemsets/basket_0.02_5.csv', converters={"Itemset": literal_eval} )
freq_dataset = freq_dataset[freq_dataset.Itemset.map(len) == 1]
freq_dataset.to_csv("../Clustering/Dataset/basket_0.02_freq_vector_mapping.csv")
freq_dataset

Unnamed: 0,Itemset,Support
0,[whole milk],0.157923
1,[pastry],0.051728
2,[yogurt],0.085879
3,[sausage],0.060349
4,[soda],0.097106
5,[canned beer],0.046916
6,[rolls/buns],0.110005
7,[frankfurter],0.03776
8,[whipped/sour cream],0.043708
9,[curd],0.033683


### Creating the BASKET transaction Vectors using frequent itemsets as features

In [19]:
transaction_freq_vectors = []
for transaction in transactions:
    temp_vector = []
    for freq_item in freq_dataset['Itemset'].values:
        if freq_item in transaction:
            temp_vector.append(1)
        else:
            temp_vector.append(0)
    transaction_freq_vectors.append(temp_vector)

# transaction_vectors[:5]

In [20]:
result_transactions_w_vectors = pd.DataFrame({'Transaction':transactions, 'Vector':transaction_freq_vectors})
result_transactions_w_vectors.head()

Unnamed: 0,Transaction,Vector
0,"[pastry, salty snack, whole milk]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[sausage, yogurt, whole milk, semi-finished br...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[soda, pickled vegetables]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[canned beer, misc. beverages]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[sausage, hygiene articles]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [21]:
result_transactions_w_vectors.to_csv("../Clustering/Dataset/basket_0.02_freq_vector.csv")

### Vectorising BASKET Transactions Normally

In [15]:
vectorised_transaction = []

for transaction in transactions:
    for item in transaction:
        if item not in vectorised_transaction:
            vectorised_transaction.append(item)

print(len(vectorised_transaction))
vectorised_transaction[:5]

basket_itemset = pd.DataFrame({'Itemset':vectorised_transaction})
basket_itemset.head()

167


Unnamed: 0,Itemset
0,pastry
1,whole milk
2,salty snack
3,sausage
4,semi-finished bread


In [16]:
basket_itemset.to_csv("../Clustering/Dataset/basket_infreq_vector_mapping.csv")

In [10]:
transaction_vectors = []
for transaction in transactions:
    temp_vector = []
    for freq_item in vectorised_transaction:
        if freq_item in transaction:
            temp_vector.append(1)
        else:
            temp_vector.append(0)
    transaction_vectors.append(temp_vector)

In [11]:
result_freq_transactions_w_vectors = pd.DataFrame({'Transaction':transactions, 'Vector':transaction_vectors})
result_freq_transactions_w_vectors.head()

Unnamed: 0,Transaction,Vector
0,"[pastry, whole milk, salty snack]","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[sausage, whole milk, semi-finished bread, yog...","[0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[pickled vegetables, soda]","[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[canned beer, misc. beverages]","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ..."
4,"[sausage, hygiene articles]","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."


In [14]:
result_freq_transactions_w_vectors.to_csv("../Clustering/Dataset/basket_infreq_vector.csv")

### Importing Job data and Freq Itemset

In [33]:
dataset = pd.read_csv('../Apriori/Dataset/job_title.csv', converters={"Title": literal_eval})
def preprocess_data(df):
    transactions = []
    for index, row in df.iterrows():
        transaction = row["Title"]
        transactions.append(transaction)
    return transactions

jobs = preprocess_data(dataset)
jobs[:5]

[['digital', 'marketing', 'specialist'],
 ['web', 'developer'],
 ['operations', 'manager'],
 ['network', 'engineer'],
 ['event', 'manager']]

In [64]:
job_freq_dataset = pd.read_csv('../Apriori/Frequent Itemsets/job_frequent_itemsets.csv', converters={"Itemset": literal_eval})
job_freq_dataset = job_freq_dataset[job_freq_dataset.Itemset.map(len) == 1]
job_freq_dataset["Itemset"] = job_freq_dataset["Itemset"].apply(lambda x: ",".join(map(str,x)))
job_freq_dataset.head()

Unnamed: 0,Itemset,Support
0,digital,0.017312
1,specialist,0.08394
2,marketing,0.060434
3,web,0.010695
4,developer,0.038824


### Creating the Job transaction Vectors using frequent itemsets as features

In [66]:
title_freq_vectors = []
for job in jobs:
    temp_vector = []
    for freq_item in job_freq_dataset['Itemset'].values:
        if freq_item in job:
            temp_vector.append(1)
        else:
            temp_vector.append(0)
    title_freq_vectors.append(temp_vector)

# title_freq_vectors[:5]

In [67]:
result_title_w_vectors = pd.DataFrame({'Title':jobs, 'Vector':title_freq_vectors})
result_title_w_vectors.head()

Unnamed: 0,Title,Vector
0,"[digital, marketing, specialist]","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[web, developer]","[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[operations, manager]","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[network, engineer]","[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[event, manager]","[0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."


In [68]:
result_title_w_vectors.to_csv("../Clustering/Dataset/job_freq_vector.csv")

### Vectorising Job Transactions Normally

In [72]:
vectorised_title = []

for job in jobs:
    for word in job:
        if word not in vectorised_title:
            vectorised_title.append(word)

print("Dimension:",len(vectorised_title))

job_infreq_itemset = pd.DataFrame({'Itemset':vectorised_title})
job_infreq_itemset.head()

Dimension: 132


Unnamed: 0,Itemset
0,digital
1,marketing
2,specialist
3,web
4,developer


In [71]:
job_infreq_itemset.to_csv("../Clustering/Dataset/job_infreq_vector_mapping.csv")

In [73]:
title_vectors = []
for title in jobs:
    temp_vector = []
    for freq_item in vectorised_title:
        if freq_item in title:
            temp_vector.append(1)
        else:
            temp_vector.append(0)
    title_vectors.append(temp_vector)

result_freq_title_w_vectors = pd.DataFrame({'Title':jobs, 'Vector':title_vectors})
result_freq_title_w_vectors.head()

Unnamed: 0,Title,Vector
0,"[digital, marketing, specialist]","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[web, developer]","[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[operations, manager]","[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[network, engineer]","[0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, ..."
4,"[event, manager]","[0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, ..."


In [74]:
result_freq_title_w_vectors.to_csv("../Clustering/Dataset/job_infreq_vector.csv")

### Book Review

In [10]:
import pandas as pd
from ast import literal_eval

dataset = pd.read_csv('../Apriori/Dataset/book_review.csv', names=['description'], converters={"description": literal_eval})

def preprocess_data(df):
    transactions = []
    for index, row in df.iterrows():
        transaction = row["description"]
        transactions.append(transaction)
    return transactions

descriptions = preprocess_data(dataset)
descriptions[:5]

[['rug',
  'sackett',
  'famili',
  'stood',
  'breed',
  'apart',
  'fellow',
  'pioneer',
  'time',
  'tame',
  'santa',
  'fe',
  'trail',
  'tyrel',
  'orrin',
  'sackettth',
  'fastest',
  'gun',
  'alivewer',
  'match'],
 ['allinclus',
  'guid',
  'design',
  'vacat',
  'retreat',
  'hospit',
  'build',
  'today',
  'fastestgrow',
  'construct',
  'categori',
  'resort',
  'design',
  'plan',
  'architectur',
  'interior',
  'intern',
  'expert',
  'margaret',
  'huffadin',
  'give',
  'foundat',
  'work',
  'success',
  'wide',
  'rang',
  'resort',
  'project',
  'casinobas',
  'spa',
  'sport',
  'beach',
  'ecotour',
  'urban',
  'theme',
  'resort',
  'anywher',
  'world',
  'fromthegroundup',
  'guid',
  'take',
  'feasibl',
  'studi',
  'planningfinanc',
  'design',
  'stage',
  'render',
  'photograph',
  'architectur',
  'interior',
  'design',
  'detail',
  'public',
  'area',
  'guest',
  'room',
  'renown',
  'resort',
  'rich',
  'illustr',
  'profitdetermin',
  'con

In [13]:
book_freq_dataset = pd.read_csv('../Apriori/Frequent Itemsets/books_0.09_1.csv', converters={"Itemset": literal_eval})
book_freq_dataset = book_freq_dataset[book_freq_dataset.Itemset.map(len) == 1]
book_freq_dataset["Itemset"] = book_freq_dataset["Itemset"].apply(lambda x: ",".join(map(str,x)))
book_freq_dataset.head()

Unnamed: 0,Itemset,Support
0,time,0.2298
1,famili,0.139
2,work,0.1498
3,world,0.212
4,help,0.099


In [16]:
book_freq_dataset.to_csv("../Clustering/Dataset/book_0.09_freq_vector_mapping.csv")

In [14]:
book_freq_vectors = []
for description in descriptions:
    temp_vector = []
    for freq_item in book_freq_dataset['Itemset'].values:
        if freq_item in description:
            temp_vector.append(description.count(freq_item))
            # temp_vector.append(1)
        else:
            temp_vector.append(0)
    book_freq_vectors.append(temp_vector)

book_freq_vectors[:5]

[[1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  2,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  2,
  0,
  0,
  1,
  3,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  1,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  2,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]]

In [15]:
result_freq_transactions_w_vectors = pd.DataFrame({'Description':descriptions, 'Vector':book_freq_vectors})
result_freq_transactions_w_vectors.head()

Unnamed: 0,Description,Vector
0,"[rug, sackett, famili, stood, breed, apart, fe...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[allinclus, guid, design, vacat, retreat, hosp...","[0, 0, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[world, war, two, luftwaff, aircraft, pilot, s...","[0, 0, 0, 2, 0, 0, 1, 3, 1, 0, 0, 0, 0, 0, 0, ..."
3,"[fiction, histori, fourthcenturi, irish, monk,...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
4,"[1941, killakeet, island, windswept, outer, ba...","[0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 2, 2, 1, 1, 1, ..."


In [17]:
result_freq_transactions_w_vectors.to_csv("../Clustering/Dataset/Book/book_0.09_freq_vector.csv")

In [18]:
import pandas as pd
from ast import literal_eval

dataset = pd.read_csv('../Apriori/Dataset/book_review.csv', names=['description'], converters={"description": literal_eval})

def preprocess_data(df):
    transactions = []
    for index, row in df.iterrows():
        transaction = row["description"]
        transactions.append(transaction)
    return transactions

descriptions = preprocess_data(dataset)

for i in range(5,9,1):
    book_freq_dataset = pd.read_csv('../Apriori/Frequent Itemsets/books_0.0{}_1.csv'.format(i), converters={"Itemset": literal_eval})
    book_freq_dataset = book_freq_dataset[book_freq_dataset.Itemset.map(len) == 1]
    book_freq_dataset["Itemset"] = book_freq_dataset["Itemset"].apply(lambda x: ",".join(map(str,x)))

    book_freq_dataset.to_csv("../Clustering/Dataset/Book/book_0.0{}_freq_vector_mapping.csv".format(i))

    book_freq_vectors = []
    for description in descriptions:
        temp_vector = []
        for freq_item in book_freq_dataset['Itemset'].values:
            if freq_item in description:
                temp_vector.append(description.count(freq_item))
                # temp_vector.append(1)
            else:
                temp_vector.append(0)
        book_freq_vectors.append(temp_vector)

    result_freq_transactions_w_vectors = pd.DataFrame({'Description':descriptions, 'Vector':book_freq_vectors})

    result_freq_transactions_w_vectors.to_csv("../Clustering/Dataset/Book/book_0.0{}_freq_vector.csv".format(i))