In [2]:
import bz2
import csv
import io
import re
import random
from collections.abc import Generator
import os
import math
from lxml import etree
from tqdm import tqdm

import numpy as np
import pandas as pd
from nltk.corpus import words
import nltk
import wiki_dump


## PART 1
## Apriori Demo with the Subsetted Bread Basket Dataset, "Shortened Bread Basket.csv". 

Begin by choosing a short subset of the Bread Basket's rows and derive as transactions

In [3]:
# Truncate the file to just the first few rows to allow proof that
# the algo was run correctly

rows = 0
filepath = os.getcwd()
output = "bread basket_subset.csv"
maxrows = 10 # allow file to have a max of only `maxrows` rows

# Alternatively, if you want to run with full dataset, set maxrows = 20507


with open(f'{filepath}/bread basket.csv', 'r') as input_file:
  # Open the output file for writing
  with open(output, 'w', newline='') as output_file:

    # Create CSV writer
    csv_writer = csv.writer(output_file)
    # Write the headers to the output file
    header = next(input_file)
    csv_writer.writerow(header.strip().split(','))

    # Iterate through each row in the input file
    for row in input_file:
      if rows == maxrows:
        break
      rows += 1
      # Split the row into a list of values
      values = row.strip().split(',')
      # write values to the new csv file
      csv_writer.writerow(values)

# examine file size
file_size_bytes = os.stat(f'{filepath}/bread basket_subset.csv').st_size
print(f"subsetted data has been written. Size: {file_size_bytes} bytes.")

# Note that the headers are conserved too

subsetted data has been written. Size: 500 bytes.


In [6]:
# Further preprocessing such that all items bought with the same Transaction number
# appear within the same "transaction" (by its definition in ARM / Apriori algorithm).

# Within this dataset, each row corresponds to the purchase of a single item
import pandas as pd
breaddf = pd.read_csv(f'{filepath}/bread basket_subset.csv')

# Create binary columns for each unique item (dummies being "dummy variable" or "indicator variable")
df_dummies = pd.get_dummies(breaddf['Item'])

# Concatenate the dummy columns with the original DataFrame
df_wide = pd.concat([breaddf, df_dummies], axis=1)

# convert the above to wide form - this also creates a set of columns for each existing column
wide_table = pd.pivot_table(df_wide, values=df_dummies.columns, index='Transaction',
                            columns='Item', aggfunc='count', fill_value=0)

# eliminate duplicated columns from pivot table creations
tempdf = wide_table.iloc[:,:int(np.sqrt(wide_table.shape[1]))]
# drop the multilevel index on first column
tempdf.columns = tempdf.columns.droplevel(0)
tempdf

Item,Bread,Coffee,Cookies,Hot chocolate,Jam,Muffin,Pastry,Scandinavian
Transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,2
3,0,0,1,1,1,0,0,0
4,0,0,0,0,0,1,0,0
5,1,1,0,0,0,0,1,0


In [7]:
# list of transaction numbers 
transaction_no = pd.unique(tempdf.reset_index()["Transaction"])
transactions = dict()
# let the keys be the unique transaction_no and the value be a list of all items
# bought, occurring in the numbers

for i in transaction_no:
  items_list = []
  testdf = tempdf.reset_index().iloc[i-1].T
  # print(testdf)
  for idx in list(testdf.index):
    if idx != "Transaction":
      # number of the specified item existing in the transaction
      items_existing = testdf[idx]
      if items_existing > 0:
        items_list.append(list([idx] * items_existing))
  # flatten the lists
  flat_list = []
  for elemt in items_list:
    flat_list.extend(elemt)
  transactions[f"{i}"] = flat_list

print(transactions)

{'1': ['Bread'], '2': ['Scandinavian', 'Scandinavian'], '3': ['Cookies', 'Hot chocolate', 'Jam'], '4': ['Muffin'], '5': ['Bread', 'Coffee', 'Pastry']}


In [8]:
maxlen = 0 # we want this as length of the transaction with the most instances of any item
for i in transaction_no:
  unmodifiedlist = transactions[str(i)]
  if maxlen < len(unmodifiedlist):
    maxlen = len(unmodifiedlist)
  transactions[str(i)] = list(unmodifiedlist)


# After identifying maxlen, pad the lists shorter than maxlen with empty strings ''
for i in transaction_no:
  unmodifiedlist = transactions[str(i)]
  # Run these lines to use list (this pads them all to the same length
    # by filling the difference in array length with identical '')
  if len(unmodifiedlist) < maxlen:
    unmodifiedlist.extend(['']*(maxlen - len(unmodifiedlist)))


# directly write to CSV from a list of lists instead... (colnames not needed)
transactions_lists = [v for v in transactions.values()]


csv_file = "Shortened Bread Basket.csv"
# write this to a file in current directory
with open(csv_file, 'w', newline='') as file:
    writer = csv.writer(file)
    for row in transactions_lists:
        writer.writerow(set(row))
        print(row)



['Bread', '', '']
['Scandinavian', 'Scandinavian', '']
['Cookies', 'Hot chocolate', 'Jam']
['Muffin', '', '']
['Bread', 'Coffee', 'Pastry']


In [9]:
# investigate what we're supposed to look for in a frequent itemset. 
filepath = os.getcwd()
csv_file = "Shortened Bread Basket.csv"

# open csv file which has been structured to have no headers.
# Each Row corresponds to a distinct transaction.
transaction=0
with open(f"{filepath}/{csv_file}", 'r', newline='') as table1:
    for row in table1:
        transaction+=1
        print("Transaction",transaction,":", row)

# Let's say our minsup needs to be higher than or equal to 0.4.
# then we expect {"Bread"} to be the only frequent itemset, showing up in 2/5 transactions. 
# We will prove this in the next cells.

Transaction 1 : ,Bread

Transaction 2 : ,Scandinavian

Transaction 3 : Hot chocolate,Jam,Cookies

Transaction 4 : ,Muffin

Transaction 5 : Coffee,Bread,Pastry



In [10]:
from dask.distributed import Client
client = Client(n_workers=6, threads_per_worker=2)  # Adjust based on your CPU
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 6
Total threads: 12,Total memory: 7.85 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:54156,Workers: 6
Dashboard: http://127.0.0.1:8787/status,Total threads: 12
Started: Just now,Total memory: 7.85 GiB

0,1
Comm: tcp://127.0.0.1:54198,Total threads: 2
Dashboard: http://127.0.0.1:54199/status,Memory: 1.31 GiB
Nanny: tcp://127.0.0.1:54159,
Local directory: C:\Users\SEBAST~1\AppData\Local\Temp\dask-scratch-space\worker-gdzr2117,Local directory: C:\Users\SEBAST~1\AppData\Local\Temp\dask-scratch-space\worker-gdzr2117

0,1
Comm: tcp://127.0.0.1:54183,Total threads: 2
Dashboard: http://127.0.0.1:54191/status,Memory: 1.31 GiB
Nanny: tcp://127.0.0.1:54161,
Local directory: C:\Users\SEBAST~1\AppData\Local\Temp\dask-scratch-space\worker-5vuucnm3,Local directory: C:\Users\SEBAST~1\AppData\Local\Temp\dask-scratch-space\worker-5vuucnm3

0,1
Comm: tcp://127.0.0.1:54185,Total threads: 2
Dashboard: http://127.0.0.1:54187/status,Memory: 1.31 GiB
Nanny: tcp://127.0.0.1:54163,
Local directory: C:\Users\SEBAST~1\AppData\Local\Temp\dask-scratch-space\worker-98jhfg9v,Local directory: C:\Users\SEBAST~1\AppData\Local\Temp\dask-scratch-space\worker-98jhfg9v

0,1
Comm: tcp://127.0.0.1:54193,Total threads: 2
Dashboard: http://127.0.0.1:54196/status,Memory: 1.31 GiB
Nanny: tcp://127.0.0.1:54165,
Local directory: C:\Users\SEBAST~1\AppData\Local\Temp\dask-scratch-space\worker-8yc6maa3,Local directory: C:\Users\SEBAST~1\AppData\Local\Temp\dask-scratch-space\worker-8yc6maa3

0,1
Comm: tcp://127.0.0.1:54186,Total threads: 2
Dashboard: http://127.0.0.1:54194/status,Memory: 1.31 GiB
Nanny: tcp://127.0.0.1:54167,
Local directory: C:\Users\SEBAST~1\AppData\Local\Temp\dask-scratch-space\worker-cz7orc57,Local directory: C:\Users\SEBAST~1\AppData\Local\Temp\dask-scratch-space\worker-cz7orc57

0,1
Comm: tcp://127.0.0.1:54184,Total threads: 2
Dashboard: http://127.0.0.1:54188/status,Memory: 1.31 GiB
Nanny: tcp://127.0.0.1:54169,
Local directory: C:\Users\SEBAST~1\AppData\Local\Temp\dask-scratch-space\worker-xlv9xvm9,Local directory: C:\Users\SEBAST~1\AppData\Local\Temp\dask-scratch-space\worker-xlv9xvm9


In [None]:
# Running on Dataset 1: The (processed) Bread Basket CSV dataset...
# with updated apriori_general_v2

filepath = os.getcwd()
csv_file = "Shortened Bread Basket.csv"

import apriori_general_v2 as apriori2
import time

# begin timer
start = time.time()

# return frequent itemsets and their contents in separate variables
freq_itemsets, string_to_integer = apriori2.apriori_disk(f'{filepath}/{csv_file}',
                             [{}, None, {'\n'}, {''}], client = client,
                             min_support_percent = 0.39, # from 0-1
                             blocksize = 10)

end = time.time()

print("Time taken:", (end - start))

In [24]:
for j in string_to_integer:
    print(j)
# Indeed, {"Bread"} is the only frequent itemset!

Bread

