## **Extract all the association rules with confidence greater than or equal to 0.8 involving just itemsets in the frontier**
For each rule:
- Measure the p-value
- Calculate the lift
- Display the rules translated back into readable attribute intervals,
to gain insight into the real values of the attributes

In [6]:
# input 
import itertools
def get_rules_from_itemset(freq_i):
	"""Enumerate every candidate rule (antecedent -> consequent) of each itemset.

	Each itemset is a dict ``{attribute: bounds}``. For every non-empty,
	proper subset of its attributes a rule is produced whose antecedent
	holds the subset and whose consequent holds the remaining attributes.

	Args:
		freq_i: iterable of itemset dicts.

	Returns:
		A list of ``(lhs_dict, rhs_dict)`` tuples. Both dicts keep the
		attribute order of the originating itemset, so the output is
		deterministic from run to run.
	"""
	all_rules = []
	for itemset in freq_i:
		keys = list(itemset)

		# Antecedent sizes 1 .. len(keys) - 1: neither side may be empty.
		for size in range(1, len(keys)):
			for lhs_keys in itertools.combinations(keys, size):
				lhs_dict = {key: itemset[key] for key in lhs_keys}
				# Filter `keys` instead of using a set difference: set
				# iteration order depends on string hashing and would make
				# the consequent's attribute order vary between runs.
				rhs_dict = {key: itemset[key] for key in keys if key not in lhs_dict}
				all_rules.append((lhs_dict, rhs_dict))
	return all_rules

def calc_support(df, itemset, weight_col):
	"""Return the weighted support of ``itemset`` in ``df``.

	A row matches when, for every ``column: (low, high)`` entry of the
	itemset, its value lies in the half-open interval ``low <= value < high``.
	The support is the summed weight of the matching rows divided by the
	summed weight of all rows.

	Args:
		df: DataFrame holding the attribute columns and the weight column.
		itemset: dict ``{column: (min_val, max_val), ...}``.
		weight_col: name of the column holding the row weights.

	Returns:
		The support ratio, or ``0`` when the total weight is not positive.
		An empty itemset matches every row.
	"""
	weights = df[weight_col]
	total_weight = weights.sum()  # sum of all row weights

	# Combine one boolean mask per constraint instead of looping over rows.
	mask = None
	for col, (low, high) in itemset.items():
		values = df[col]
		in_bounds = ((values >= low) & (values < high)).to_numpy()
		mask = in_bounds if mask is None else mask & in_bounds

	# No constraints at all: every row satisfies the itemset.
	matched_weight = total_weight if mask is None else weights[mask].sum()

	return matched_weight / total_weight if total_weight > 0 else 0

def print_rules(all_rules):
	"""Print a header followed by one ``antecedent -> consequent`` line per rule.

	Args:
		all_rules: iterable of indexable rules whose first element is the
			antecedent and whose second element is the consequent.
	"""
	print("Generated Rules:")
	for candidate in all_rules:
		antecedent, consequent = candidate[0], candidate[1]
		print(antecedent, "->", consequent)

def extract_association_rules(df, frequent_itemset, min_confidence, weight_col='weight'):
	"""Build the association rules whose confidence reaches ``min_confidence``.

	Every candidate rule produced by ``get_rules_from_itemset`` is printed
	through ``print_rules`` and then scored:

	- support    = weighted support of antecedent and consequent together
	- confidence = support / support(antecedent)

	Args:
		df: DataFrame the supports are measured on.
		frequent_itemset: list of itemset dicts ``{column: (low, high)}``.
		min_confidence: minimum confidence a rule needs to be kept.
		weight_col: name of the row-weight column in ``df``.

	Returns:
		A list of dicts with the keys ``'antecedent'``, ``'consequent'``,
		``'support'`` and ``'confidence'``, one per retained rule.
	"""
	all_rules = get_rules_from_itemset(frequent_itemset)
	print_rules(all_rules)

	final_association_rules = []
	for antecedent, consequent in all_rules:
		# The full itemset of the rule is antecedent plus consequent.
		support = calc_support(df, {**antecedent, **consequent}, weight_col)
		support_antecedent = calc_support(df, antecedent, weight_col)

		# With a zero-support antecedent the ratio is undefined; score the
		# rule with confidence 0 so it goes through the same threshold
		# check as any other rule instead of being kept unconditionally.
		if support_antecedent == 0:
			confidence = 0
		else:
			confidence = support / support_antecedent

		if confidence >= min_confidence:
			final_association_rules.append({
				'antecedent': antecedent,
				'consequent': consequent,
				'support': support,
				'confidence': confidence
			})

	return final_association_rules

def calculate_lift(df, association_rules, weight_col):
	"""Add a ``'lift'`` entry to every rule and return the same list.

	Lift is the rule's confidence divided by the support of its consequent;
	it indicates how much the antecedent raises (or lowers) the probability
	of the consequent. When the consequent support is not positive the lift
	is set to 0.

	Args:
		df: DataFrame the consequent support is measured on.
		association_rules: list of rule dicts with ``'confidence'`` and
			``'consequent'`` keys; each dict is updated in place.
		weight_col: name of the row-weight column in ``df``.

	Returns:
		``association_rules``, with ``'lift'`` set on every rule.
	"""
	for entry in association_rules:
		rule_confidence = entry['confidence']
		consequent_support = calc_support(df, entry['consequent'], weight_col=weight_col)

		if consequent_support > 0:
			entry['lift'] = rule_confidence / consequent_support
		else:
			entry['lift'] = 0
	return association_rules

from scipy.stats import fisher_exact
def control_sat(itemset, row) -> bool:
	"""Tell whether ``row`` satisfies every constraint of ``itemset``.

	Uses the same half-open interval as ``calc_support``
	(``low <= value < high``), so that a row counted towards a rule's
	support is also counted in the contingency table built from this check.

	Args:
		itemset: dict ``{attribute: (low, high)}``.
		row: mapping-like object indexable by attribute name.

	Returns:
		True when all attributes are within bounds (also for an empty
		itemset), False otherwise.
	"""
	return all(low <= row[attr] < high for attr, (low, high) in itemset.items())
	
def calculate_p_value(df, association_rules, weight_col):
	"""Add a ``'p_value'`` entry to every rule and return the same list.

	For each rule a 2x2 contingency table of row counts is built
	(antecedent satisfied / not satisfied versus consequent satisfied / not
	satisfied, as decided by ``control_sat``) and a one-sided Fisher exact
	test (``alternative='greater'``) is run on it. The p-value is stored
	rounded to 3 decimals; smaller values mean the co-occurrence is less
	likely to be due to chance.

	Args:
		df: DataFrame whose rows are tested against the rules.
		association_rules: list of rule dicts with ``'antecedent'`` and
			``'consequent'`` keys; each dict is updated in place.
		weight_col: unused; kept so the signature matches the other scoring
			functions. The contingency table counts rows, not weights.

	Returns:
		``association_rules``, with ``'p_value'`` set on every rule.
	"""
	# Materialise the rows once: they are reused for every rule, and plain
	# lists keep the counting below well-defined for an empty DataFrame.
	rows = [row for _, row in df.iterrows()]

	for rule in association_rules:
		# Flag, per row, whether it satisfies the antecedent / the consequent.
		in_antecedent = [bool(control_sat(rule['antecedent'], row)) for row in rows]
		in_consequent = [bool(control_sat(rule['consequent'], row)) for row in rows]
		pairs = list(zip(in_antecedent, in_consequent))

		a = sum(x and y for x, y in pairs)          # antecedent and consequent
		b = sum(x and not y for x, y in pairs)      # antecedent but NOT consequent
		c = sum(not x and y for x, y in pairs)      # consequent but NOT antecedent
		d = sum(not x and not y for x, y in pairs)  # neither of the two

		# Contingency table for the test.
		contingency = [[a, b], [c, d]]

		# Fall back to the least significant value only when SciPy rejects
		# the table; any other exception is a real bug and must surface.
		try:
			_, p = fisher_exact(contingency, alternative='greater')
		except ValueError:
			p = 1.0

		rule['p_value'] = round(p, 3)
	return association_rules

# re-translate association rules
def format_itemset(itemset):
	"""Render an itemset as ``"<attr> in [<lo>, <hi>]"`` clauses joined by ``" AND "``.

	Args:
		itemset: dict ``{attribute: (lo, hi)}``.

	Returns:
		The joined string; an empty string for an empty itemset.
	"""
	return " AND ".join(
		f"{attr} in [{lo}, {hi}]" for attr, (lo, hi) in itemset.items()
	)

def format_rule(rule):
	"""Render a rule as ``"IF <antecedent> THEN <consequent>"``.

	Args:
		rule: pair ``(antecedent_itemset, consequent_itemset)``; both sides
			are rendered with ``format_itemset``.

	Returns:
		The formatted rule string.
	"""
	antecedent, consequent = rule
	condition = format_itemset(antecedent)
	outcome = format_itemset(consequent)
	return "IF " + condition + " THEN " + outcome


# ------------ #


# test extract_association_rules
import pandas as pd

def main():
	"""Run the rule-extraction pipeline on a small hard-coded example.

	Builds a five-row DataFrame with a unit weight column, extracts the
	rules with confidence >= 0.8 from three test itemsets, adds lift and
	p-value to each rule, then prints the rules both as raw dicts and in
	formatted form.
	"""
	print("\n------------------------------\n")

	weight_col = 'C'
	df = pd.DataFrame({
		'A1': [1.2, 2.5, 3.1, 2.9, 1.5],
		'A2': [5.0, 5.5, 6.0, 5.2, 5.1],
		'C': [1, 1, 1, 1, 1]
	})

	# Test frequent itemsets (dicts with no duplicate keys, consistent with df).
	frequent_itemset = [
		{'A1': (0, 3), 'A2': (0, 6)},
		{'A1': (0, 2), 'A2': (0, 6)},
		{'A1': (0, 4), 'A2': (0, 6)}
	]

	# Pipeline: extract the rules, then add lift, then add the p-value
	# (the smaller the p-value, the stronger the evidence of association).
	scored_rules = extract_association_rules(
		df, frequent_itemset, min_confidence=0.8, weight_col=weight_col
	)
	scored_rules = calculate_lift(df, scored_rules, weight_col=weight_col)
	scored_rules = calculate_p_value(df, scored_rules, weight_col=weight_col)

	print("\nFinal Association Rules:")
	for rule in scored_rules:
		print(rule)

	print("\nFormatted Rules:")
	for rule in scored_rules:
		formatted_rule = format_rule((rule['antecedent'], rule['consequent']))
		print(f"{formatted_rule} | Support: {rule['support']:.3f}, Confidence: {rule['confidence']:.3f}, Lift: {rule['lift']:.3f}, p-value: {rule['p_value']}")
	
# Cell driver: announce the start, then run the demo pipeline defined in main().
print("Start extracting association rules...")
main()

Start extracting association rules...

------------------------------

Generated Rules:
{'A1': (0, 3)} -> {'A2': (0, 6)}
{'A2': (0, 6)} -> {'A1': (0, 3)}
{'A1': (0, 2)} -> {'A2': (0, 6)}
{'A2': (0, 6)} -> {'A1': (0, 2)}
{'A1': (0, 4)} -> {'A2': (0, 6)}
{'A2': (0, 6)} -> {'A1': (0, 4)}

Final Association Rules:
{'antecedent': {'A1': (0, 3)}, 'consequent': {'A2': (0, 6)}, 'support': 0.8, 'confidence': 1.0, 'lift': 1.25, 'p_value': 0.2}
{'antecedent': {'A2': (0, 6)}, 'consequent': {'A1': (0, 3)}, 'support': 0.8, 'confidence': 1.0, 'lift': 1.25, 'p_value': 0.2}
{'antecedent': {'A1': (0, 2)}, 'consequent': {'A2': (0, 6)}, 'support': 0.4, 'confidence': 1.0, 'lift': 1.25, 'p_value': 0.6}
{'antecedent': {'A1': (0, 4)}, 'consequent': {'A2': (0, 6)}, 'support': 0.8, 'confidence': 0.8, 'lift': 1.0, 'p_value': 1.0}
{'antecedent': {'A2': (0, 6)}, 'consequent': {'A1': (0, 4)}, 'support': 0.8, 'confidence': 1.0, 'lift': 1.0, 'p_value': 1.0}

Formatted Rules:
IF A1 in [0, 3] THEN A2 in [0, 6] | Suppor