In [1]:
from collections import defaultdict
import dask.distributed
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from apriori_general_v2 import apriori_disk, check_itemsets

In [2]:
# Integer counter whose missing keys default to 0.
count_dict = defaultdict(int)

# --- setup ---
# Local Dask cluster: six workers with one thread each (tune n_workers to your CPU).
client = dask.distributed.Client(n_workers=6, threads_per_worker=1)

# Stopwords come from three sources, concatenated in this order:
# the project's own file, NLTK's English list, and a few hand-picked extras.
nltk_stopwords = stopwords.words('english')
with open('additional_stopwords.txt', 'r') as file:
    # One stopword per line; strip the newline and surrounding whitespace.
    extra_stopwords = [line.strip() for line in file]

extra_stopwords += nltk_stopwords
extra_stopwords += ['links', 'external', 'see', 'may', 'refer', 'link', 'wa']

# Reduce every stopword to its Porter stem and de-duplicate the result.
# NOTE(review): presumably the input tokens are stemmed the same way - confirm.
porter_stemmer = PorterStemmer()
extra_stopwords_stemmed = list(map(porter_stemmer.stem, extra_stopwords))
stopwords_set = set(extra_stopwords_stemmed)
print(stopwords_set)

{'d', 'inc', 'verier', 'hath', 'towardest', 'rathest', 'done', 'x', 'gotten', 'nathless', 'whensoev', 'mostli', 'in', 'on', 'yourself', 'herein', 'onc', 'appropriatest', 'now', 'and', 'r', 'musth', 'someon', 'avail', 'viz', 'better', 'all', 'which', 'whither', 'whena', 'ma', 'notwithstand', "should'v", 'hae', 'somewher', 'neath', 'much', 'anent', 'sith', 'within', 'anoth', 'see', 'must', 'variousest', 'concerning', 'ourself', 'byandbi', 'certainest', 'whosoev', 'ex', "she'", 'himself', 'fourscor', 'our', 'us', 'he', 'abov', 'cum', 'different', 'inform', 'sinc', 'might', 'info', 'p', 'midst', 'few', 'save', 'ain', 'furthermor', "needn't", 'down', 'herebi', 'they', 'make', 'therebi', "shan't", 'such', 'g', 'these', 'got', 'whithersoev', 'saidest', 'until', 'onli', 'astrid', 'describ', 'beneath', 'themselv', 'shan', "isn't", 'hasn', 'upon', 'furtherest', 'differ', 'unlik', 'severalest', 'outsid', 'provid', "you'v", 'whoso', 'ourselv', 'gone', 'wherebi', 'vs', 'howbeit', 'thee', 'v', 'thor

In [None]:

# data = 'data/combined_stemmed.csv'
data = 'data/pruned_stemmed.csv'
block_size = "100MB"

# Run the disk-based apriori pass over the data file, excluding the stemmed
# stopwords and using the shared Dask client.
apriori_result = apriori_disk(data_file=data,
                              exclude=stopwords_set,
                              min_support_percent=.57,
                              blocksize=block_size,
                              client=client)

# apriori_disk's return contract is not visible from this notebook. Normalise a
# bare None/empty return so the unpacking below cannot raise before the guard.
frequent_itemsets, string_mapping = apriori_result or (None, None)

if not frequent_itemsets or not string_mapping:
    # Nothing to post-process. Deliberately NOT calling exit() here: inside a
    # Jupyter kernel that shuts the kernel down, discarding the Dask client and
    # every variable in the session.
    print("apriori_disk returned no frequent itemsets or no string mapping; "
          "skipping reconstruction and feature extraction.")
else:
    # Invert string_mapping (string -> id) once, instead of rescanning the whole
    # mapping for every itemset. Each id keeps a set of strings so that the
    # result is unchanged even if several strings share one id.
    # NOTE(review): assumes the mapping's values are hashable ids - confirm.
    strings_by_id = {}
    for item_string, item_id in string_mapping.items():
        strings_by_id.setdefault(item_id, set()).add(item_string)

    # Print each frequent itemset as its original strings, with its support.
    for freq_itemset, support in frequent_itemsets.items():
        reconstructed_strings = set()
        for item_id in freq_itemset:
            # Ids missing from the mapping contribute nothing.
            reconstructed_strings |= strings_by_id.get(item_id, set())
        print(reconstructed_strings, ":", str(support))

    # The frequent itemsets are the keys of the frequent_itemsets dictionary.
    freq_itemsets_list = list(frequent_itemsets.keys())

    # For clustering purposes, find whether each itemset is present in each line.
    itemset_features = check_itemsets(data, freq_itemsets_list, string_mapping, block_size, client=client)

    print(itemset_features)

# # save all variables so don't need to re-run script + for clustering
# with open('data/frequent_itemsets.pkl', 'wb') as f:
#     pickle.dump(frequent_itemsets, f)

# with open('data/itemset_features.pkl', 'wb') as f:
#     pickle.dump(itemset_features, f)

# with open('data/itemset_list.pkl', 'wb') as f:
#     pickle.dump(freq_itemsets_list, f)

# with open('data/string_mapping.pkl', 'wb') as f:
#     pickle.dump(string_mapping, f)

# with open('log.txt', 'w') as f:
#     now = datetime.datetime.now()
#     f.write(now.strftime("%Y-%m-%d %H:%M:%S"))
