In [25]:
import pandas as pd
import numpy as np
from ast import literal_eval

### Importing BASKET data and frequent itemsets

In [3]:
# Load the raw basket CSV from the sibling Apriori folder (path is relative to
# the notebook's working directory). Each row is later treated as one transaction.
dataset = pd.read_csv('../Apriori/Dataset/basket.csv')

def preprocess_data(df):
    """Convert a basket DataFrame into a list of transactions.

    Every row of ``df`` is treated as one transaction: missing cells are
    dropped and repeated items are collapsed to a single occurrence. Rows
    with no items left are skipped.

    Items keep the left-to-right order in which they first appear in the row.
    De-duplicating through a ``set`` would make that order depend on Python's
    per-process string hashing, so any feature list derived from the
    transactions could change between kernel sessions.

    Parameters
    ----------
    df : pandas.DataFrame
        One transaction per row; empty cells are NaN/None.

    Returns
    -------
    list of list
        One list of distinct items per non-empty row, in row order.
    """
    transactions = []
    for _, row in df.iterrows():
        # dict.fromkeys removes duplicates while preserving first-seen order.
        transaction = list(dict.fromkeys(row.dropna().tolist()))
        if transaction:
            transactions.append(transaction)
    return transactions

# Build the basket transaction list once; the vectorising cells below reuse `transactions`.
transactions = preprocess_data(dataset)
# Preview the first five transactions.
transactions[:5]

[['pastry', 'whole milk', 'salty snack'],
 ['sausage', 'whole milk', 'semi-finished bread', 'yogurt'],
 ['pickled vegetables', 'soda'],
 ['canned beer', 'misc. beverages'],
 ['sausage', 'hygiene articles']]

In [72]:
# Load the mined itemsets and keep only the rows whose `Size` is 1; these
# single items are the features of the "frequent" vector space.
freq_dataset = (
    pd.read_csv('../Apriori/Dataset/apriori_dataset.csv')
    .loc[lambda itemsets: itemsets['Size'] == 1]
)
# Save the feature list so a vector position can be mapped back to its item.
freq_dataset.to_csv("../Clustering/Dataset/basket_freq_vector_mapping.csv")
freq_dataset

Unnamed: 0,Size,Itemset
0,1,salty snack
1,1,whole milk
2,1,pastry
3,1,yogurt
4,1,sausage
...,...,...
59,1,pork
60,1,ice cream
61,1,waffles
62,1,cat food


### Creating the BASKET transaction Vectors using frequent itemsets as features

In [69]:
# Feature order follows the row order of `freq_dataset`.
frequent_items = freq_dataset['Itemset'].values

# One 0/1 indicator per frequent item for every transaction:
# 1 when the transaction contains the item, 0 otherwise.
transaction_freq_vectors = [
    [int(freq_item in transaction) for freq_item in frequent_items]
    for transaction in transactions
]

# transaction_vectors[:5]


In [70]:
# Pair every transaction with its frequent-item indicator vector (one row per transaction).
result_transactions_w_vectors = pd.DataFrame({'Transaction':transactions, 'Vector':transaction_freq_vectors})
result_transactions_w_vectors.head()

Unnamed: 0,Transaction,Vector
0,"[salty snack, pastry, whole milk]","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[yogurt, semi-finished bread, whole milk, saus...","[0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[soda, pickled vegetables]","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[misc. beverages, canned beer]","[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[hygiene articles, sausage]","[0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."


In [71]:
# Save the frequent-item vectors. Default `to_csv` settings are used, so the row
# index is written as well and the list-valued cells are stored as text.
result_transactions_w_vectors.to_csv("../Clustering/Dataset/basket_freq_vector.csv")

### Vectorising BASKET transactions using every distinct item as a feature

In [15]:
# Every distinct item across all transactions, in first-seen order.
# dict.fromkeys keeps the first occurrence of each key and preserves insertion order.
vectorised_transaction = list(
    dict.fromkeys(item for transaction in transactions for item in transaction)
)

# Dimension of the all-item vector space.
print(len(vectorised_transaction))

# Position -> item mapping for the all-item vectors.
basket_itemset = pd.DataFrame({'Itemset':vectorised_transaction})
basket_itemset.head()

167


Unnamed: 0,Itemset
0,pastry
1,whole milk
2,salty snack
3,sausage
4,semi-finished bread


In [16]:
# Save the position -> item mapping for the all-item ("infreq") vectors.
basket_itemset.to_csv("../Clustering/Dataset/basket_infreq_vector_mapping.csv")

In [10]:
# Encode each transaction against the full item list: position i is 1 when the
# transaction contains vectorised_transaction[i], otherwise 0.
transaction_vectors = [
    [int(item in transaction) for item in vectorised_transaction]
    for transaction in transactions
]

In [11]:
# Pair every transaction with its all-item indicator vector.
# NOTE(review): despite the `freq` in its name, this frame holds the all-item
# vectors (`transaction_vectors`) and is saved as basket_infreq_vector.csv; the
# frequent-item vectors live in `result_transactions_w_vectors`.
result_freq_transactions_w_vectors = pd.DataFrame({'Transaction':transactions, 'Vector':transaction_vectors})
result_freq_transactions_w_vectors.head()

Unnamed: 0,Transaction,Vector
0,"[pastry, whole milk, salty snack]","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[sausage, whole milk, semi-finished bread, yog...","[0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[pickled vegetables, soda]","[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[canned beer, misc. beverages]","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ..."
4,"[sausage, hygiene articles]","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."


In [14]:
# Save the all-item ("infreq") basket vectors; the row index is written too (to_csv default).
result_freq_transactions_w_vectors.to_csv("../Clustering/Dataset/basket_infreq_vector.csv")

### Importing Job data and frequent itemsets

In [33]:
# Re-binds `dataset` (previously the basket frame) to the job-title data.
# The `Title` column is parsed with `literal_eval`, so each cell becomes a
# Python object rather than a string (the preview below shows lists of words).
dataset = pd.read_csv('../Apriori/Dataset/job_title.csv', converters={"Title": literal_eval})
def preprocess_data(df):
    """Return the ``Title`` value of every row of ``df`` as a list, in row order.

    NOTE(review): this reuses the name ``preprocess_data`` that the basket
    section defines earlier in the notebook. Once this cell has run, the
    basket version is shadowed, so re-running the basket cell that calls it
    would pick up this implementation instead.
    """
    return [record["Title"] for _, record in df.iterrows()]

# One entry per job title, taken from the parsed `Title` column.
jobs = preprocess_data(dataset)
# Preview the first five titles.
jobs[:5]

[['digital', 'marketing', 'specialist'],
 ['web', 'developer'],
 ['operations', 'manager'],
 ['network', 'engineer'],
 ['event', 'manager']]

In [64]:
# Load the mined job itemsets; `Itemset` cells are parsed back into Python
# containers by `literal_eval`.
job_itemsets_raw = pd.read_csv('../Apriori/Frequent Itemsets/job_frequent_itemsets.csv', converters={"Itemset": literal_eval})

# Keep only the itemsets that contain exactly one word.
is_single_word = job_itemsets_raw["Itemset"].map(len) == 1

# Flatten each one-element itemset into a plain string so it can be compared
# with the words of a title. `.assign` builds a new frame instead of writing
# into a filtered slice, which avoids pandas' SettingWithCopyWarning.
job_freq_dataset = job_itemsets_raw.loc[is_single_word].assign(
    Itemset=lambda frame: frame["Itemset"].map(lambda itemset: ",".join(map(str, itemset)))
)
job_freq_dataset.head()

Unnamed: 0,Itemset,Support
0,digital,0.017312
1,specialist,0.08394
2,marketing,0.060434
3,web,0.010695
4,developer,0.038824


### Creating the Job transaction Vectors using frequent itemsets as features

In [66]:
# Feature order follows the row order of `job_freq_dataset`.
frequent_words = job_freq_dataset['Itemset'].values

# One 0/1 indicator per frequent word for every job title:
# 1 when the title contains the word, 0 otherwise.
title_freq_vectors = [
    [int(freq_item in job) for freq_item in frequent_words]
    for job in jobs
]

# title_freq_vectors[:5]

In [67]:
# Pair every job title with its frequent-word indicator vector (one row per title).
result_title_w_vectors = pd.DataFrame({'Title':jobs, 'Vector':title_freq_vectors})
result_title_w_vectors.head()

Unnamed: 0,Title,Vector
0,"[digital, marketing, specialist]","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[web, developer]","[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[operations, manager]","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[network, engineer]","[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[event, manager]","[0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."


In [68]:
# Save the frequent-word job vectors; the row index is written too (to_csv default).
result_title_w_vectors.to_csv("../Clustering/Dataset/job_freq_vector.csv")

### Vectorising Job titles using every distinct word as a feature

In [72]:
# Every distinct word across all job titles, in first-seen order.
# dict.fromkeys keeps the first occurrence of each key and preserves insertion order.
vectorised_title = list(dict.fromkeys(word for job in jobs for word in job))

print("Dimension:",len(vectorised_title))

# Position -> word mapping for the all-word vectors.
job_infreq_itemset = pd.DataFrame({'Itemset':vectorised_title})
job_infreq_itemset.head()

Dimension: 132


Unnamed: 0,Itemset
0,digital
1,marketing
2,specialist
3,web
4,developer


In [71]:
# Save the position -> word mapping for the all-word ("infreq") job vectors.
job_infreq_itemset.to_csv("../Clustering/Dataset/job_infreq_vector_mapping.csv")

In [73]:
# Encode each title against the full vocabulary: position i is 1 when the
# title contains vectorised_title[i], otherwise 0.
title_vectors = [
    [int(word in title) for word in vectorised_title]
    for title in jobs
]

# NOTE(review): despite the `freq` in its name, this frame holds the all-word
# vectors and is what gets saved as job_infreq_vector.csv.
result_freq_title_w_vectors = pd.DataFrame({'Title':jobs, 'Vector':title_vectors})
result_freq_title_w_vectors.head()

Unnamed: 0,Title,Vector
0,"[digital, marketing, specialist]","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[web, developer]","[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[operations, manager]","[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[network, engineer]","[0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, ..."
4,"[event, manager]","[0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, ..."


In [74]:
# Save the all-word ("infreq") job vectors; the row index is written too (to_csv default).
result_freq_title_w_vectors.to_csv("../Clustering/Dataset/job_infreq_vector.csv")