In [1]:
from collections import defaultdict
import dask.distributed
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from apriori_general_v2 import apriori_disk, check_itemsets
import numpy as np
import os
import pickle
import datetime

In [2]:
# Integer-valued counter; missing keys start at 0.
count_dict = defaultdict(int)

# --- Dask client ------------------------------------------------------------
# 6 workers with 1 thread each; adjust n_workers to the CPU of the machine.
client = dask.distributed.Client(threads_per_worker=1, n_workers=6)

# --- Stopword list ----------------------------------------------------------
# Start from NLTK's English stopwords ...
nltk_stopwords = stopwords.words('english')

# ... read the additional stopwords (one per line, surrounding whitespace removed) ...
with open('additional_stopwords.txt', 'r') as file:
    extra_stopwords = [line.strip() for line in file]

# ... and merge everything into one list, followed by a few hand-picked terms.
# ('wa' is presumably what the stemmer produces for "was" -- confirm.)
extra_stopwords += nltk_stopwords
extra_stopwords += ['links', 'external', 'see', 'may', 'refer', 'link', 'wa']

# Stem every stopword (presumably so they match the already-stemmed input
# data -- confirm) and collapse the result into a set.
porter_stemmer = PorterStemmer()
extra_stopwords_stemmed = list(map(porter_stemmer.stem, extra_stopwords))
stopwords_set = set(extra_stopwords_stemmed)

In [3]:
# ---- Run configuration -------------------------------------------------------
# Input CSV, and the block size handed to apriori_disk / check_itemsets.
data = 'data/combined_stemmed.csv'
block_size = "100MB"

# Alternative (smaller) input:
# data = 'data/pruned_stemmed.csv'
# block_size = "10MB"

# Alternative: evenly spaced descending sweep (the step must be negative).
# min_supports_list = np.arange(start=0.5, stop=0.15, step=-0.05)

# Support thresholds to test; each one gets its own output directory.
min_supports_list = [0.25, 0.15, 0.075]

for min_support in min_supports_list:

    print(f"Testing min_support = {min_support}")
    directory = f"data/wiki/minsupp_{min_support}"
    # makedirs creates missing parent directories too (e.g. data/wiki) and
    # does nothing if the directory already exists.
    os.makedirs(directory, exist_ok=True)

    # start_time / end_time bracket only the apriori_disk call, so the log
    # reports mining time, not the feature extraction or pickling below.
    start_time = datetime.datetime.now()

    # run the apriori disk function
    frequent_itemsets, string_mapping = apriori_disk(data_file=data,
                                                     exclude=stopwords_set,
                                                     min_support_percent=min_support,
                                                     blocksize=block_size,
                                                     client=client)

    end_time = datetime.datetime.now()

    # if function returns nothing, don't need to continue
    if not frequent_itemsets or not string_mapping:
        continue

    # Invert string_mapping once (item id -> original strings) so each itemset
    # is translated by direct lookup instead of rescanning the whole mapping.
    # A set per id keeps the result identical even if several strings share an id.
    strings_by_id = {}
    for original_string, item_id in string_mapping.items():
        strings_by_id.setdefault(item_id, set()).add(original_string)

    # Human-readable dump: one "{strings}: support" line per frequent itemset.
    # UTF-8 is set explicitly so non-ASCII tokens do not depend on the
    # platform's default encoding.
    with open(f"{directory}/itemsets_and_supports.txt", 'w', encoding='utf-8') as f:
        for freq_itemset, support in frequent_itemsets.items():
            reconstructed_strings = {
                original_string
                for item_id in freq_itemset
                for original_string in strings_by_id.get(item_id, ())
            }
            f.write(f"{reconstructed_strings}: {support}\n")

    # the frequent itemsets are the keys of the frequent_itemsets dictionary
    freq_itemsets_list = list(frequent_itemsets.keys())

    # for clustering purposes, find if each itemset is present in each line
    itemset_features = check_itemsets(data, freq_itemsets_list, string_mapping, block_size, client=client)

    # save all variables so don't need to re-run script + for clustering
    artifacts = {
        'frequent_itemsets': frequent_itemsets,
        'itemset_features': itemset_features,
        'itemset_list': freq_itemsets_list,
        'string_mapping': string_mapping,
    }
    for artifact_name, artifact in artifacts.items():
        with open(f'{directory}/{artifact_name}.pkl', 'wb') as f:
            pickle.dump(artifact, f)

    # Run summary. The counts are ints, so they are formatted with f-strings
    # (concatenating str + int raises TypeError). The longest itemset is found
    # with max() rather than assumed to be the last key; the list is non-empty
    # here because empty results were skipped above.
    longest_itemset_length = max(len(itemset) for itemset in freq_itemsets_list)
    with open(f'{directory}/log.txt', 'w', encoding='utf-8') as f:
        f.write("started at " + start_time.strftime("%Y-%m-%d %H:%M:%S") + '\n')
        f.write("finished at " + end_time.strftime("%Y-%m-%d %H:%M:%S") + '\n')
        f.write(f"number of freq itemsets: {len(freq_itemsets_list)}\n")
        f.write(f"length of longest itemset: {longest_itemset_length}\n")
        
        


Testing min_support = 0.25
no rows:  6642741
First pass completed
Frequent Itemsets Level 1 completed
8 itemsets found
generating candidates
size of total pairs: 0.2890625KB
28 candidates generated
checking frequency of itemsets...
No more frequent itemsets to be found at level 2!


KeyboardInterrupt: 