In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval

### Importing BASKET data and Freq Itemset

In [2]:
dataset = pd.read_csv('../Apriori/Dataset/basket.csv')

def preprocess_data(df):
    transactions = []
    for index, row in df.iterrows():
        transaction = set(row.dropna().unique())
        if transaction:
            transactions.append(list(transaction))
    return transactions

transactions = preprocess_data(dataset)
# transactions[:5]

In [6]:
freq_dataset = pd.read_csv('../Apriori/Frequent Itemsets/Basket/basket_0.02_1.csv', converters={"Itemset": literal_eval} )
freq_dataset = freq_dataset[freq_dataset.Itemset.map(len) == 1]
freq_dataset.to_csv("../Clustering/Dataset/basket_0.02_freq_vector_mapping.csv")
freq_dataset.head()

Unnamed: 0,Itemset,Support
0,[whole milk],0.157867
1,[sausage],0.060353
2,[yogurt],0.085884
3,[soda],0.097113
4,[canned beer],0.046919


### Creating the BASKET transaction Vectors using frequent itemsets as features

In [7]:
transaction_freq_vectors = []
for transaction in transactions:
    temp_vector = []
    for freq_item in freq_dataset['Itemset'].values:
        if freq_item in transaction:
            temp_vector.append(1)
        else:
            temp_vector.append(0)
    transaction_freq_vectors.append(temp_vector)

# transaction_vectors[:5]

In [8]:
result_transactions_w_vectors = pd.DataFrame({'Transaction':transactions, 'Vector':transaction_freq_vectors})
result_transactions_w_vectors.head()

Unnamed: 0,Transaction,Vector
0,"[salty snack, whole milk, pastry]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[yogurt, semi-finished bread, whole milk, saus...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[soda, pickled vegetables]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[misc. beverages, canned beer]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[hygiene articles, sausage]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [9]:
result_transactions_w_vectors.to_csv("../Clustering/Dataset/basket_0.02_freq_vector.csv")

### Combined Code to iterate through all available Minimum Support of the Basket Dataset

In [None]:
import pandas as pd
from ast import literal_eval

dataset = pd.read_csv('../Apriori/Dataset/basket.csv')

def preprocess_data(df):
    transactions = []
    for index, row in df.iterrows():
        transaction = set(row.dropna().unique())
        if transaction:
            transactions.append(list(transaction))
    return transactions

transactions = preprocess_data(dataset)

for i in range(1,6,1): 
    freq_dataset = pd.read_csv('../Apriori/Frequent Itemsets/Basket/basket_0.0{}_1.csv'.format(i), converters={"Itemset": literal_eval} )
    freq_dataset = freq_dataset[freq_dataset.Itemset.map(len) == 1]

    freq_dataset.to_csv("../Clustering/Dataset/Basket/basket_0.0{}_freq_vector_mapping.csv".format(i))

    transaction_freq_vectors = []
    for transaction in transactions:
        temp_vector = []
        for freq_item in freq_dataset['Itemset'].values:
            if freq_item[0] in transaction:
                # temp_vector.append(1)
                temp_vector.append(transaction.count(freq_item[0]))
            else:
                temp_vector.append(0)
        transaction_freq_vectors.append(temp_vector)

    result_transactions_w_vectors = pd.DataFrame({'Transaction':transactions, 'Vector':transaction_freq_vectors})

    result_transactions_w_vectors.to_csv("../Clustering/Dataset/Basket/basket_0.0{}_freq_vector.csv".format(i))

### Vectorising BASKET Transactions with its full dimension

In [10]:
vectorised_transaction = []

for transaction in transactions:
    for item in transaction:
        if item not in vectorised_transaction:
            vectorised_transaction.append(item)

basket_itemset = pd.DataFrame({'Itemset':vectorised_transaction})
basket_itemset.head()

Unnamed: 0,Itemset
0,salty snack
1,whole milk
2,pastry
3,yogurt
4,semi-finished bread


In [11]:
basket_itemset.to_csv("../Clustering/Dataset/basket_infreq_vector_mapping.csv")

In [12]:
transaction_vectors = []
for transaction in transactions:
    temp_vector = []
    for freq_item in vectorised_transaction:
        if freq_item in transaction:
            temp_vector.append(1)
        else:
            temp_vector.append(0)
    transaction_vectors.append(temp_vector)

In [13]:
result_freq_transactions_w_vectors = pd.DataFrame({'Transaction':transactions, 'Vector':transaction_vectors})
result_freq_transactions_w_vectors.head()

Unnamed: 0,Transaction,Vector
0,"[salty snack, whole milk, pastry]","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[yogurt, semi-finished bread, whole milk, saus...","[0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[soda, pickled vegetables]","[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[misc. beverages, canned beer]","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ..."
4,"[hygiene articles, sausage]","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."


In [18]:
len(result_freq_transactions_w_vectors["Vector"][0])

167

In [19]:
result_freq_transactions_w_vectors.to_csv("../Clustering/Dataset/basket_infreq_vector.csv")

### Book Review

In [20]:
import pandas as pd
from ast import literal_eval

dataset = pd.read_csv('../Apriori/Dataset/book_review.csv', names=['description'], converters={"description": literal_eval})

def preprocess_data(df):
    transactions = []
    for index, row in df.iterrows():
        transaction = row["description"]
        transactions.append(transaction)
    return transactions

descriptions = preprocess_data(dataset)
# descriptions[:5]

In [23]:
# book_freq_dataset = pd.read_csv('../Apriori/Frequent Itemsets/Books/books_1500_tfidf.csv')
book_freq_dataset = pd.read_csv('../Apriori/Frequent Itemsets/Books/books_0.09_1.csv', converters={"Itemset": literal_eval})
book_freq_dataset = book_freq_dataset[book_freq_dataset.Itemset.map(len) == 1]
book_freq_dataset["Itemset"] = book_freq_dataset["Itemset"].apply(lambda x: ",".join(map(str,x)))
book_freq_dataset.head()

Unnamed: 0,Itemset,Support
0,time,0.2298
1,famili,0.139
2,help,0.099
3,world,0.212
4,take,0.1168


In [24]:
book_freq_dataset.to_csv("../Clustering/Dataset/book_0.09_freq_vector_mapping.csv")

In [25]:
book_freq_vectors = []
for description in descriptions:
    temp_vector = []
    for freq_item in book_freq_dataset['Itemset'].values:
        if freq_item in description:
            temp_vector.append(description.count(freq_item))
            # temp_vector.append(1)
        else:
            temp_vector.append(0)
    book_freq_vectors.append(temp_vector)

# book_freq_vectors[:5]

In [26]:
result_freq_transactions_w_vectors = pd.DataFrame({'Description':descriptions, 'Vector':book_freq_vectors})
result_freq_transactions_w_vectors.head()

Unnamed: 0,Description,Vector
0,"[rug, sackett, famili, stood, breed, apart, fe...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[allinclus, guid, design, vacat, retreat, hosp...","[0, 0, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[world, war, two, luftwaff, aircraft, pilot, s...","[0, 0, 0, 2, 0, 0, 1, 1, 3, 0, 0, 0, 0, 0, 0, ..."
3,"[fiction, histori, fourthcenturi, irish, monk,...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
4,"[1941, killakeet, island, windswept, outer, ba...","[0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 2, 1, 1, 1, 1, ..."


In [27]:
result_freq_transactions_w_vectors.to_csv("../Clustering/Dataset/Book/book_1500_tfidf_vector.csv")

### Combined Code to iterate through all available Minimum Support of the Book Review Dataset

In [28]:
import pandas as pd
from ast import literal_eval

dataset = pd.read_csv('../Apriori/Dataset/book_review.csv', names=['description'], converters={"description": literal_eval})

def preprocess_data(df):
    transactions = []
    for index, row in df.iterrows():
        transaction = row["description"]
        transactions.append(transaction)
    return transactions

descriptions = preprocess_data(dataset)

for i in range(5,16,1): 
    if i < 10:
        i = "0{}".format(i)
    elif i == 10:
        i = 1
    book_freq_dataset = pd.read_csv('../Apriori/Frequent Itemsets/Books/books_0.{}_1.csv'.format(i), converters={"Itemset": literal_eval})
    book_freq_dataset = book_freq_dataset[book_freq_dataset.Itemset.map(len) == 1]
    book_freq_dataset["Itemset"] = book_freq_dataset["Itemset"].apply(lambda x: ",".join(map(str,x)))

    book_freq_dataset.to_csv("../Clustering/Dataset/Book/book_0.{}_freq_vector_mapping.csv".format(i))

    book_freq_vectors = []
    for description in descriptions:
        temp_vector = []
        for freq_item in book_freq_dataset['Itemset'].values:
            if freq_item in description:
                temp_vector.append(description.count(freq_item))
                # temp_vector.append(1)
            else:
                temp_vector.append(0)
        book_freq_vectors.append(temp_vector)

    result_freq_transactions_w_vectors = pd.DataFrame({'Description':descriptions, 'Vector':book_freq_vectors})

    result_freq_transactions_w_vectors.to_csv("../Clustering/Dataset/Book/book_0.{}_freq_vector.csv".format(i))

### Vectorise Description with its full dimension

In [29]:
vectorised_description = []

for description in descriptions:
    for word in description:
        if word not in vectorised_description:
            vectorised_description.append(word)

print("Dimension:",len(vectorised_description))

book_itemset = pd.DataFrame({'Itemset':vectorised_description})

book_itemset.to_csv("../Clustering/Dataset/Book/book_vector_mapping.csv")

description_vectors = []
for description in descriptions:
    temp_vector = []
    for freq_item in vectorised_description:
        if freq_item in description:
            temp_vector.append(description.count(freq_item))
            # temp_vector.append(1)
        else:
            temp_vector.append(0)
    description_vectors.append(temp_vector)

result_book_w_vectors = pd.DataFrame({'Description':descriptions, 'Vector':description_vectors})

result_book_w_vectors.to_csv("../Clustering/Dataset/Book/book_vector.csv")

Dimension: 30198
