In [1]:
import random
import pandas as pd
import numpy as np
from multiprocessing import Manager, Process
from IPython.display import display

# ======================
# Utility Functions
# ======================
def point_in_interval(point, interval):
	return interval[0] < point < interval[1]

def satisfies(itemset, row):
	return all(point_in_interval(row[attr], itemset[attr]) for attr in itemset)

def support(itemset, data, class_col):
	total_c = data[class_col].sum()
	matched = data[data.apply(lambda row: satisfies(itemset, row), axis=1)]
	return matched[class_col].sum() / total_c if total_c > 0 else 0

def shrink_difference(inner, outer):
	return (inner[0] - outer[0]) + (outer[1] - inner[1])

def delta(itemset, max_vals):
	return sum(shrink_difference(itemset[attr], (0, max_vals[attr])) for attr in itemset)

# ======================
# Standard Apriori
# ======================
def apriori_standard(data, epsilon, class_col):
	attributes = [col for col in data.columns if col != class_col]
	max_vals = {a: int(np.ceil(data[a].max())) for a in attributes}
	I0 = {a: (0, max_vals[a]) for a in attributes}
	R, SWk = {}, [I0]

	while SWk:
		Wk = []
		for I in SWk:
			for a in attributes:
				b, e = I[a]
				for mid in range(b + 1, e):
					new_I = I.copy()
					new_I[a] = (b, mid)
					if delta(new_I, max_vals) == delta(I, max_vals) + 1:
						Wk.append(new_I)
		SWk = []
		for I in Wk:
			s = support(I, data, class_col)
			if s >= epsilon:
				R[str(I)] = s
				SWk.append(I)
	return R

# ======================
# Randomic Apriori
# ======================
def apriori_randomic(data, epsilon, class_col):
	attributes = [c for c in data.columns if c != class_col]
	max_vals = {a: int(np.ceil(data[a].max())) for a in attributes}
	I0 = {a: (0, max_vals[a]) for a in attributes}
	R, LP, LS, LNS = {}, [I0], {}, {}

	while LP:
		I = LP.pop(random.randint(0, len(LP) - 1))
		key = str(I)
		if key in LS or key in LNS:
			continue
		s = support(I, data, class_col)
		if s >= epsilon:
			R[key] = s
			LS[key] = s
			for a in attributes:
				b, e = I[a]
				for mid in range(b + 1, e):
					new_I = I.copy()
					new_I[a] = (b, mid)
					LP.append(new_I)
		else:
			LNS[key] = s
	return R

# ======================
# Distributed Worker
# ======================
def distributed_worker(GP, GS, GNS, data, epsilon, class_col, max_vals, attributes):
	while True:
		if GP.empty():
			break
		try:
			I = GP.get_nowait()
		except:
			break

		key = str(I)
		if key in GS or key in GNS:
			continue

		s = support(I, data, class_col)
		if s >= epsilon:
			GS[key] = s
			for a in attributes:
				b, e = I[a]
				for mid in range(b + 1, e):
					new_I = I.copy()
					new_I[a] = (b, mid)
					GP.put(new_I)
		else:
			GNS[key] = s


# ======================
# Distributed Apriori
# ======================
def apriori_randomic_distributed(data, epsilon, class_col, num_workers=2):
	attributes = [c for c in data.columns if c != class_col]
	max_vals = {a: int(np.ceil(data[a].max())) for a in attributes}
	I0 = {a: (0, max_vals[a]) for a in attributes}

	manager = Manager()
	GP, GS, GNS = manager.Queue(), manager.dict(), manager.dict()

	# seed della ricerca
	GP.put(I0)

	processes = [Process(target=distributed_worker,
						 args=(GP, GS, GNS, data, epsilon, class_col, max_vals, attributes))
				 for _ in range(num_workers)]

	for p in processes:
		p.start()
	for p in processes:
		p.join()

	return dict(GS)

# ======================
# Run All Algorithms
# ======================
print("\n======================")
print("RUN APRIORI ALGORITHMS")
print("======================")

data1 = pd.DataFrame({
	'A1': [1.2, 2.5, 3.1, 2.9, 1.5],
	'A2': [5.0, 5.5, 6.0, 5.2, 5.1],
	'C':  [1, 1, 1, 1, 1]
})

data2 = pd.DataFrame({
	'A1': [10.2, 12.5, 13.1, 12.9, 11.5],
	'A2': [50.0, 55.5, 60.0, 52.2, 51.1],
	'C':  [1, 8, 1, 1, 1]
})

data3 = pd.DataFrame({
	'A1': [5, 8, 10, 7, 6],
	'A2': [2.2, 3.5, 4.1, 3.9, 2.5],
	'A3': [100, 150, 200, 120, 110],
	'A4': [0.1, 0.5, 0.2, 0.4, 0.3],
	'C':  [2, 2, 3, 2, 5]
})

datasets = {"data1": data1, "data2": data2, "data3": data3}
epsilon = 0.4

# ======================
# Print Results
# ======================

for name, data in datasets.items():
	std_result = apriori_standard(data, epsilon, 'C')
	rnd_result = apriori_randomic(data, epsilon, 'C')
	dist_result = apriori_randomic_distributed(data, epsilon, 'C')

	print("\n=== Dataset:", name, "===")

	print("Standard Apriori:")
	for k, v in std_result.items():
		print("  itemset:", k, " support:", v)

	print("Randomic Apriori:")
	for k, v in rnd_result.items():
		print("  itemset:", k, " support:", v)

	print("Distributed Randomic Apriori:")
	for k, v in dist_result.items():
		print("  itemset:", k, " support:", v)


RUN APRIORI ALGORITHMS

=== Dataset: data1 ===
Standard Apriori:
  itemset: {'A1': (0, 3), 'A2': (0, 6)}  support: 0.8
  itemset: {'A1': (0, 2), 'A2': (0, 6)}  support: 0.4
Randomic Apriori:
  itemset: {'A1': (0, 4), 'A2': (0, 6)}  support: 0.8
  itemset: {'A1': (0, 3), 'A2': (0, 6)}  support: 0.8
  itemset: {'A1': (0, 2), 'A2': (0, 6)}  support: 0.4
Distributed Randomic Apriori:

=== Dataset: data2 ===
Standard Apriori:
  itemset: {'A1': (0, 13), 'A2': (0, 60)}  support: 0.9166666666666666
  itemset: {'A1': (0, 14), 'A2': (0, 59)}  support: 0.9166666666666666
  itemset: {'A1': (0, 13), 'A2': (0, 59)}  support: 0.9166666666666666
  itemset: {'A1': (0, 14), 'A2': (0, 58)}  support: 0.9166666666666666
  itemset: {'A1': (0, 13), 'A2': (0, 58)}  support: 0.9166666666666666
  itemset: {'A1': (0, 14), 'A2': (0, 57)}  support: 0.9166666666666666
  itemset: {'A1': (0, 13), 'A2': (0, 57)}  support: 0.9166666666666666
  itemset: {'A1': (0, 14), 'A2': (0, 56)}  support: 0.9166666666666666
  item

KeyboardInterrupt: 