## **Assignment - Algorithm Implementation and Testing**
- Implement the Apriori, Randomic Apriori, and Randomic Distributed Apriori algorithms for quantitative itemsets in your programming language of choice.
- Create a set of test cases to verify the correctness of your implementation.
- Ensure your implementation can handle various input sizes and support thresholds.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Define inputs
# df = pd.read_csv(
#     "AirQualityUCI.csv",
#     sep=";",
#     decimal=","
# )

df = pd.DataFrame({'A1': [1.2, 2.5, 3.1, 2.9, 1.5], 'A2': [5.0, 5.5, 6.0, 5.2, 5.1], 'weight': [1, 1, 1, 1, 1]})

epsilon = 0.4

##### **Study of dataset**

In [3]:
df.shape

(5, 3)

In [4]:
df.head(5)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A1      5 non-null      float64
 1   A2      5 non-null      float64
 2   weight  5 non-null      int64  
dtypes: float64(2), int64(1)
memory usage: 248.0 bytes


In [5]:
# Reduce dataframe size for test
df = df.head(20)

##### Need to encode *Date* and *Time*

In [6]:
# # --- Conversione robusta della colonna Date ---
# df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y', errors='coerce')

# # --- Conversione robusta della colonna Time ---
# df['Time'] = pd.to_datetime(df['Time'], format='%H.%M.%S', errors='coerce').dt.time
# # A questo punto Time è un oggetto datetime.time, convertiamolo in timedelta
# df['Time'] = pd.to_timedelta(df['Time'].astype(str))

# df['year'] = df['Date'].dt.year
# df['month'] = df['Date'].dt.month
# df['day'] = df['Date'].dt.day
# df['hour'] = df['Time'].dt.seconds // 3600
# df['minute'] = (df['Time'].dt.seconds % 3600) // 60

# df.drop(columns=['Date', 'Time'], inplace=True)

In [7]:
df.head(5)

Unnamed: 0,A1,A2,weight
0,1.2,5.0,1
1,2.5,5.5,1
2,3.1,6.0,1
3,2.9,5.2,1
4,1.5,5.1,1


##### Create weight column

In [8]:
# df['weight'] = (df['CO(GT)'] - df['CO(GT)'].min()) / (df['CO(GT)'].max() - df['CO(GT)'].min()) # per ogni riga, creo la colonna 'weight' che normalizza il valore di CO(GT) tra 0 e 1
# df.head(5)

In [9]:
import numpy as np

# Define utilities
def bottom_itemset(dataframe, weught_col):
	bottom_its = {}
	lower_bound = 0
	
	for column in dataframe.columns:
		if (weught_col != column):
			max_val = int(np.ceil(dataframe[column].max()))
			bottom_its[column] = (lower_bound, max_val)
	
	return bottom_its


def calc_support(df, itemset, weight_col):
	"""
	itemset è un dict: {colonna: (min_val, max_val), ...}
	"""
	total_weight = df[weight_col].sum()  # somma di tutti i pesi
	num_sum = 0

	# iteriamo sulle righe del dataframe
	for index, row in df.iterrows():
		match = True  # assumiamo che la riga soddisfi le condizioni

		for col, (low, high) in itemset.items():
			value = row[col]

			# controllo se il valore è dentro i bounds
			if not (low <= value < high):
				match = False
				break

		# se la riga soddisfa tutti i vincoli dell'itemset
		if match:
			num_sum += row[weight_col]

	support = num_sum / total_weight if total_weight > 0 else 0
	return support

def calc_shrink_difference(item):
	"""
	item è un dict: {colonna: (min_val, max_val)}
	"""
	for col, (min_val, max_val) in item.items():
		return max_val - min_val

In [10]:
def a_priori_quant_itemsets(df, epsilon, weight_col='weight'):
	relations = []
	base = bottom_itemset(df, weight_col) # bottom_itemset restituisce un dict: {col: (low, high)}
	print(f"Base itemset (I0): {base}")
	# s_w parte come lista di itemset a 1 colonna
	s_w = [{col: bounds} for col, bounds in base.items()]
	print(f"Initial candidate itemsets (s_w): {s_w}")
	
	k = 1
	while s_w:  # finché ho candidati
		#print(f"\n--- Iterations k={k}, numbers of candidates: {len(s_w)} ---")
		w_k = []
		
		# genera i candidati
		for item in s_w:
			print(f"Generating candidates from itemset: {item}")
			for col, (low, high) in item.items():
				# print(f"  Processing column: {col}, bounds: ({low}, {high})")
				for mid in range(low + 1, high):
					cand1 = {col: (mid, high)}
					cand2 = {col: (low, high - mid)}
					# print(f"  Candidate 1: {cand1}, Candidate 2: {cand2}")
					
					# i candidati vengono aggiunti a w_k solo se vale l'uguaglianza della shrink difference
					candidates = [cand1, cand2]
					shink_difference_itemset = calc_shrink_difference(item)
					for cand in candidates:
						shrink_difference_cand = calc_shrink_difference(cand)
						print(f"  Evaluating candidate: {cand}")
						print(f"    Shrink difference itemset: {shink_difference_itemset}, Shrink difference candidate: {shrink_difference_cand}")
						if shrink_difference_cand == shink_difference_itemset - 1:
							w_k.append(cand)
							print(f"    --> {cand} added to w_k")
						else:
							print(f"    --> {cand} NOT added to w_k")
					
		# valuta i candidati
		print("Evaluating candidates...")
		s_w = []
		print("Itemsets to evaluate:", w_k)
		for itemset in w_k:
			support = calc_support(df, itemset, weight_col)
			# print(f"Itemset: {itemset}, Support: {support}")
			if support >= epsilon:
				s_w.append(itemset)
				relations.append((itemset, support))
		
		k += 1
	
	return relations

results = a_priori_quant_itemsets(df=df, epsilon=epsilon, weight_col='weight')
print("\n--- Frequent itemsets found: ---\n")
for itemset, support in results:
	print(f"Itemset: {itemset}, Support: {support}")

Base itemset (I0): {'A1': (0, 4), 'A2': (0, 6)}
Initial candidate itemsets (s_w): [{'A1': (0, 4)}, {'A2': (0, 6)}]
Generating candidates from itemset: {'A1': (0, 4)}
  Evaluating candidate: {'A1': (1, 4)}
    Shrink difference itemset: 4, Shrink difference candidate: 3
    --> {'A1': (1, 4)} added to w_k
  Evaluating candidate: {'A1': (0, 3)}
    Shrink difference itemset: 4, Shrink difference candidate: 3
    --> {'A1': (0, 3)} added to w_k
  Evaluating candidate: {'A1': (2, 4)}
    Shrink difference itemset: 4, Shrink difference candidate: 2
    --> {'A1': (2, 4)} NOT added to w_k
  Evaluating candidate: {'A1': (0, 2)}
    Shrink difference itemset: 4, Shrink difference candidate: 2
    --> {'A1': (0, 2)} NOT added to w_k
  Evaluating candidate: {'A1': (3, 4)}
    Shrink difference itemset: 4, Shrink difference candidate: 1
    --> {'A1': (3, 4)} NOT added to w_k
  Evaluating candidate: {'A1': (0, 1)}
    Shrink difference itemset: 4, Shrink difference candidate: 1
    --> {'A1': (0,

In [11]:
#############################
# RANDOMIC
#############################

def a_priori_quant_itemsets_randomic(df, epsilon):
	relations = []
	b_itemset = bottom_itemset(df) # bottom_itemset restituisce un dict: {col: (low, high)}

	lp = [{col: bounds} for col, bounds in b_itemset.items()] # candidati da esplorare
	ls = []
	lns = []
	
	while lp:
		print(f"\n--- Numbers of candidates: {len(lp)} ---\n")
		random_itemset = random.choice(lp)
		print(f"Random itemset selected: {random_itemset}\n")
		# k = 0
		for itemset in random_itemset:
			print(f"Processing itemset: {itemset}")
			# print(k)
			if itemset in ls or itemset in lns:
				break
			else:
				support = calc_support(df, random_itemset)
				if support >= epsilon:
					relations.append((random_itemset, support))
					ls.append(random_itemset)
					
					# Per ogni itemset I frequente, genera tutti i candidati di dimensione successiva (k+1)
					for col in df.columns:
						if col not in itemset:
							new_itemset = random_itemset.copy()
							new_itemset[col] = new_itemset.pop(itemset)
							lp.append(new_itemset)
				else:
					lns.append(random_itemset)
			# k += 1
		lp.remove(random_itemset)
		
	return relations

results = a_priori_quant_itemsets_randomic(df, epsilon)
print(results)

TypeError: bottom_itemset() missing 1 required positional argument: 'weught_col'