# In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

import spacy



# Load the "en_core_web_sm" spaCy pipeline once at module level;
# unique_words_from_text() below reuses this object for every text it processes.
nlp = spacy.load("en_core_web_sm")

def unique_words_from_text(text):
    """Return the distinct lower-cased content words found in ``text``.

    The text is tokenized with the module-level spaCy pipeline ``nlp``. Only
    purely alphabetic tokens that spaCy does not flag as stop words are kept.

    Args:
        text: Text to analyse. ``None`` and float NaN (the value pandas uses
            for an empty CSV cell) are treated as "no text".

    Returns:
        A list of unique lower-cased words, in no particular order; an empty
        list when there is no text.
    """
    # NaN is the only float that is not equal to itself, so ``text != text``
    # detects it without needing an extra import.
    if text is None or (isinstance(text, float) and text != text):
        return []

    return list({
        token.text.lower()
        for token in nlp(text)
        if token.is_alpha and not token.is_stop
    })

# Minimum conditional probability of the consequent given the antecedent for
# an association rule to be kept.
confidenceThreshold = 0.5
# Minimum fraction of transactions an itemset has to occur in to be considered
# when generating association rules.
supportThreshold = 0.015
# Itemsets occurring in more than this fraction of transactions are treated as
# too common to be interesting.
maxSupport = 0.5


rows = pd.read_csv("../data/raw/futurice/blogs.csv")

# One transaction per blog row: the distinct content words of its headings and
# teaser text. pandas reads empty cells as NaN (a float), which is not valid
# text input for the tokenizer, so they are replaced by empty strings first.
text_columns = rows[['headings', 'teaser text']].fillna('')
transactions = [
    unique_words_from_text(headings) + unique_words_from_text(teaser)
    for headings, teaser in zip(text_columns['headings'], text_columns['teaser text'])
]

# One-hot encode the transactions: one boolean column per word.
te = TransactionEncoder()
te_ary = te.fit_transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_) # type: ignore

# Apply the Apriori algorithm to find frequent itemsets
frequent_itemsets = apriori(df, min_support=supportThreshold, use_colnames=True)
# Kept for inspection only. It must NOT be fed to association_rules(): that
# function looks up the support of every antecedent and consequent in the
# frame it receives, and raises KeyError when a subset was filtered out.
filtered_itemsets = frequent_itemsets[frequent_itemsets['support'] <= maxSupport]

# Generate association rules from the complete itemset table, then drop the
# rules whose antecedent or consequent is too common. Support can only shrink
# as an itemset grows, so the combined itemset of every remaining rule is
# below maxSupport as well.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=confidenceThreshold)
rules = rules[
    (rules['antecedent support'] <= maxSupport)
    & (rules['consequent support'] <= maxSupport)
]

# Build a directed graph with one edge per rule: antecedent -> consequent,
# weighted by the rule's confidence.
G = nx.DiGraph()
for antecedents, consequents, confidence in zip(
    rules['antecedents'], rules['consequents'], rules['confidence']
):
    # Sort the items so that a given itemset always maps to the same node
    # label; frozenset iteration order is not guaranteed to be stable.
    source = ', '.join(sorted(map(str, antecedents)))
    target = ', '.join(sorted(map(str, consequents)))
    # add_edge() creates any missing endpoint nodes itself.
    G.add_edge(source, target, weight=confidence, source=source, target=target)

pos = nx.shell_layout(G)

plt.figure(figsize=(20, 20))
# node_size has to match the size used for the nodes below, otherwise the
# arrowheads are positioned for the (larger) default node size.
nx.draw_networkx_edges(G, pos, edge_color='gray', arrowsize=15, node_size=100)
node_labels = {node: node for node in G.nodes()}
nx.draw_networkx_labels(G, pos, labels=node_labels, font_size=13, font_color='black')
nx.draw_networkx_nodes(G, pos, node_size=100, node_color='yellow')

plt.show()



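# Recorded cell output: KeyError: 'grid paragraph'
# NOTE(review): nothing in the code above reads a 'grid paragraph' key, so this
# error presumably comes from an earlier revision of the cell - confirm.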