In [3]:
from collections import defaultdict
import dask.distributed
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from apriori_general_v2 import apriori_disk, check_itemsets
import numpy as np
import os
import pickle
import datetime

In [4]:
# Counter with an implicit zero default (not used anywhere in this cell).
count_dict = defaultdict(int)

# --- Dask cluster -----------------------------------------------------------
# Six single-threaded workers; tune n_workers to the CPU of the host machine.
client = dask.distributed.Client(n_workers=6, threads_per_worker=1)

# --- Stopword vocabulary ----------------------------------------------------
# Resulting order of extra_stopwords: words read from additional_stopwords.txt
# (one per line, surrounding whitespace stripped), then NLTK's English list,
# then a handful of hand-picked tokens.
nltk_stopwords = stopwords.words('english')
with open('additional_stopwords.txt', 'r') as stopword_file:
    extra_stopwords = [raw_line.strip() for raw_line in stopword_file]
extra_stopwords += nltk_stopwords
extra_stopwords += ['links', 'external', 'see', 'may', 'refer', 'link', 'wa']

# Stem every stopword with the Porter stemmer and collapse the result into a
# set, which is later passed to apriori_disk as `exclude`.
# NOTE(review): presumably the transaction tokens are stemmed the same way,
# which is why the stopwords are stemmed before comparison -- confirm.
porter_stemmer = PorterStemmer()
extra_stopwords_stemmed = list(map(porter_stemmer.stem, extra_stopwords))
stopwords_set = set(extra_stopwords_stemmed)

In [7]:
# Input selection. The alternative (larger) input is kept here for reference:
# data = 'data/combined_stemmed.csv'
# block_size = "100MB"

data = 'data/software_reviews_transactions.txt'
block_size = "200KB"  # forwarded to apriori_disk (blocksize) and check_itemsets

# To sweep several thresholds from 0.5 downwards, use a single minus sign;
# "--0.05" is +0.05 and yields an empty range for start=0.5, stop=0.15.
# min_supports_list = np.arange(start=0.5, stop=0.15, step=-0.05)

min_supports_list = [0.06]

for min_support in min_supports_list:

    print(f"Testing min_support = {min_support}")
    directory = f"data/softwarereviews/minsupp_{min_support}"
    # makedirs creates missing parent folders too (os.mkdir raised
    # FileNotFoundError when data/softwarereviews did not exist yet), and
    # exist_ok=True makes re-running the cell safe.
    os.makedirs(directory, exist_ok=True)

    start_time = datetime.datetime.now()

    # run the apriori disk function
    frequent_itemsets, string_mapping = apriori_disk(data_file=data,
                                                     exclude=stopwords_set,
                                                     min_support_percent=min_support,
                                                     blocksize=block_size,
                                                     client=client)

    # Captured right after mining: the logged "finished at" time therefore
    # covers apriori_disk only, not check_itemsets or the pickling below.
    end_time = datetime.datetime.now()

    # if function returns nothing, don't need to continue
    if not frequent_itemsets or not string_mapping:
        continue

    # Invert the string -> item id mapping once instead of rescanning the whole
    # mapping for every itemset. Each id maps to a *set* of strings so the
    # result stays identical even if several strings share one id.
    strings_by_id = {}
    for original_string, item_id in string_mapping.items():
        strings_by_id.setdefault(item_id, set()).add(original_string)

    # Human-readable dump: one line per itemset, rebuilt with the original
    # strings, followed by its support value.
    with open(f"{directory}/itemsets_and_supports.txt", 'w') as f:
        for freq_itemset, support in frequent_itemsets.items():
            reconstructed_strings = set()
            for item_id in freq_itemset:
                reconstructed_strings |= strings_by_id.get(item_id, set())
            f.write(f"{reconstructed_strings}: {support}\n")

    # the frequent itemsets are the keys of the frequent_itemsets dictionary
    freq_itemsets_list = list(frequent_itemsets.keys())

    # for clustering purposes, find if each itemset is present in each line
    itemset_features = check_itemsets(data, freq_itemsets_list, string_mapping, block_size, client=client)

    # save all variables so don't need to re-run script + for clustering
    with open(f'{directory}/frequent_itemsets.pkl', 'wb') as f:
        pickle.dump(frequent_itemsets, f)

    with open(f'{directory}/itemset_features.pkl', 'wb') as f:
        pickle.dump(itemset_features, f)

    with open(f'{directory}/itemset_list.pkl', 'wb') as f:
        pickle.dump(freq_itemsets_list, f)

    with open(f'{directory}/string_mapping.pkl', 'wb') as f:
        pickle.dump(string_mapping, f)

    # One entry per line; write() adds no newline itself, so without the
    # explicit "\n" both timestamps ran together on a single line.
    with open(f'{directory}/log.txt', 'w') as f:
        f.write("started at " + start_time.strftime("%Y-%m-%d %H:%M:%S") + "\n")
        f.write("finished at " + end_time.strftime("%Y-%m-%d %H:%M:%S") + "\n")


Testing min_support = 0.06
no rows:  12703
First pass completed
Frequent Itemsets Level 1 completed
127 itemsets found
generating candidates
size of total pairs: 65.6328125KB
scattered sublists
7996 candidates generated
checking frequency of itemsets...
Frequent Itemsets Level 2 completed
338 itemsets found

generating candidates
size of total pairs: 488.2265625KB
scattered sublists
1508 candidates generated
checking frequency of itemsets...
Frequent Itemsets Level 3 completed
100 itemsets found

generating candidates
size of total pairs: 40.8828125KB
scattered sublists
62 candidates generated
checking frequency of itemsets...
No more frequent itemsets to be found at level 4!




In [4]:
# Show the itemset-presence features that the mining cell obtained from
# check_itemsets (one of the objects it also pickles to itemset_features.pkl).
# NOTE(review): depends on the mining cell having run in this kernel session;
# confirm the output is reproduced by a fresh Restart & Run All.
print(itemset_features)

[[ True  True  True ...  True  True  True]
 [ True  True  True ... False  True False]
 [ True  True  True ... False  True False]
 ...
 [ True False False ...  True  True  True]
 [False  True  True ... False False False]
 [ True False  True ... False False False]]


In [5]:
# Print the frequent itemsets collected in the mining cell (the keys of
# frequent_itemsets; the recorded output shows frozensets of integer ids).
# NOTE(review): this prints the entire list in one go -- consider a slice such
# as freq_itemsets_list[:20] plus len(...) to keep the cell output short.
print(freq_itemsets_list)

[frozenset({0}), frozenset({1}), frozenset({2}), frozenset({3}), frozenset({4}), frozenset({5}), frozenset({6}), frozenset({7}), frozenset({8}), frozenset({9}), frozenset({10}), frozenset({11}), frozenset({12}), frozenset({13}), frozenset({14}), frozenset({15}), frozenset({16}), frozenset({17}), frozenset({18}), frozenset({19}), frozenset({20}), frozenset({21}), frozenset({22}), frozenset({23}), frozenset({24}), frozenset({25}), frozenset({10, 13}), frozenset({5, 13}), frozenset({9, 2}), frozenset({0, 13}), frozenset({3, 4}), frozenset({9, 13}), frozenset({0, 2}), frozenset({2, 4}), frozenset({16, 13}), frozenset({11, 14}), frozenset({12, 5}), frozenset({3, 14}), frozenset({8, 3}), frozenset({10, 12}), frozenset({2, 12}), frozenset({12, 13}), frozenset({7, 15}), frozenset({10, 4}), frozenset({6, 15}), frozenset({1, 15}), frozenset({4, 5}), frozenset({9, 7}), frozenset({16, 5}), frozenset({16, 7}), frozenset({9, 5}), frozenset({8, 10}), frozenset({3, 12}), frozenset({9, 11}), frozenset(

