# Exercise 8

In [10]:
!pip install pyECLAT



In [11]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn")
warnings.filterwarnings("ignore", category=Warning)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

from pyECLAT import ECLAT

In [12]:
url = "https://github.com/robitussin/CCADMACL_EXERCISES/blob/8fdec5cd3f29586f9934e035187c7f16da382361/pc_games_market_basket.csv?raw=true"

df = pd.read_csv(url, header=None)
# df = pd.read_csv(url, header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5
0,No Man's Sky,Left 4 Dead 2,Elden Ring,GTA V,Monster Hunter: World,
1,Red Dead Redemption 2,Counter-Strike 2,Left 4 Dead 2,,,
2,No Man's Sky,Monster Hunter: World,,,,
3,Elden Ring,Divinity: Original Sin 2,Monster Hunter: World,Factorio,Sekiro: Shadows Die Twice,Wolfenstein II
4,Rainbow Six Siege,Doom Eternal,Dark Souls 3,Cyberpunk 2077,BioShock Infinite,


### 1. Generate transaction lists from the dataset

In [13]:

txns = df.fillna("").values.tolist()
txns = [[item for item in txn if item != ''] for txn in txns]
txns = [[item.strip() for item in txn] for txn in txns]

# Create a list of unique ids for the transactions
ids = [i + 1 for i in range(len(txns))]

# Initialize an empty list
data =[]
# Iterate through transactions and add them to the DataFrame with IDs
for i, txn in enumerate(txns):
    data.extend([{'TID': ids[i], 'Item': item} for item in txn])

df_txn = pd.DataFrame(data)
df_txn.head(25)


Unnamed: 0,TID,Item
0,1,No Man's Sky
1,1,Left 4 Dead 2
2,1,Elden Ring
3,1,GTA V
4,1,Monster Hunter: World
5,2,Red Dead Redemption 2
6,2,Counter-Strike 2
7,2,Left 4 Dead 2
8,3,No Man's Sky
9,3,Monster Hunter: World


### 2. Find the most frequent items

In [14]:
# Find the top 30 most frequent items
top_items = df_txn['Item'].value_counts().reset_index()

# Convert the top 30 items into DataFrame and sort by item count in descending order
df_top_items = pd.DataFrame(top_items)
df_top_items.columns = ['Item', 'Count']

# Calculate the percentage of transactions for each item
total_transactions = len(df)
df_top_items['% Count'] = (df_top_items['Count']*100 / total_transactions).round(2)

# Display the results
df_top_items.style.background_gradient(cmap='Blues')

Unnamed: 0,Item,Count,% Count
0,Celeste,585,11.7
1,Counter-Strike 2,581,11.62
2,Hades,576,11.52
3,GTA V,575,11.5
4,Monster Hunter: World,572,11.44
5,Sekiro: Shadows Die Twice,566,11.32
6,L.A. Noire,558,11.16
7,The Witcher 3,556,11.12
8,The Forest,556,11.12
9,Cyberpunk 2077,554,11.08


3. Generate frequent itemsets using ECLAT

In [15]:

# Initiate an Eclat instance and load transactions DataFrame to the instance
eclat = ECLAT(data=df, verbose=True)

# Generate a binary dataframe
eclat.df_bin.head()

100%|██████████| 74/74 [00:00<00:00, 182.42it/s]
100%|██████████| 74/74 [00:00<00:00, 3340.10it/s]
100%|██████████| 74/74 [00:00<00:00, 2163.11it/s]


Unnamed: 0,Slay the Spire,The Forest,Dead Cells,Rainbow Six Siege,Portal 2,Elden Ring,Stardew Valley,Monster Hunter: World,Subnautica,Dark Souls 3,Monster Hunter: World.1,Divinity: Original Sin 2,RimWorld,RimWorld.1,Hollow Knight,Skyrim,Celeste,Counter-Strike 2,Baldur's Gate 3,Resident Evil 4,Fallout 4,Quake Champions,GTA V,The Witcher 3,No Man's Sky,Slay the Spire.1,Hades,Hades.1,Left 4 Dead 2,Terraria,Rainbow Six Siege.1,Cities: Skylines,Doom Eternal,L.A. Noire,Stardew Valley.1,Portal 2.1,Red Dead Redemption 2,L.A. Noire.1,Skyrim.1,Hollow Knight.1,Cyberpunk 2077,Divinity: Original Sin 2.1,The Witcher 3.1,Subnautica.1,Left 4 Dead 2.1,Wolfenstein II,Half-Life 2,Baldur's Gate 3.1,Doom Eternal.1,Terraria.1,GTA V.1,The Forest.1,Cyberpunk 2077.1,Celeste.1,Fallout 4.1,Call of Duty: Modern Warfare 2,Counter-Strike 2.1,Factorio,Dead Cells.1,Quake Champions.1,Elden Ring.1,Sekiro: Shadows Die Twice,Dark Souls 3.1,Sekiro: Shadows Die Twice.1,Cities: Skylines.1,BioShock Infinite,Resident Evil 4.1,Wolfenstein II.1,BioShock Infinite.1,Red Dead Redemption 2.1,Factorio.1,Half-Life 2.1,Call of Duty: Modern Warfare 2.1,No Man's Sky.1
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


4. Show itemsets and support values

In [16]:
unique_item_list = eclat.uniq_
print(unique_item_list)

[nan, 'Slay the Spire', ' The Forest', ' Dead Cells', 'Rainbow Six Siege', 'Portal 2', 'Elden Ring', 'Stardew Valley', ' Monster Hunter: World', 'Subnautica', ' Dark Souls 3', 'Monster Hunter: World', ' Divinity: Original Sin 2', ' RimWorld', 'RimWorld', 'Hollow Knight', 'Skyrim', 'Celeste', ' Counter-Strike 2', " Baldur's Gate 3", ' Resident Evil 4', 'Fallout 4', 'Quake Champions', ' GTA V', ' The Witcher 3', " No Man's Sky", ' Slay the Spire', 'Hades', ' Hades', 'Left 4 Dead 2', ' Terraria', ' Rainbow Six Siege', ' Cities: Skylines', ' Doom Eternal', ' L.A. Noire', ' Stardew Valley', ' Portal 2', ' Red Dead Redemption 2', 'L.A. Noire', ' Skyrim', ' Hollow Knight', 'Cyberpunk 2077', 'Divinity: Original Sin 2', 'The Witcher 3', ' Subnautica', ' Left 4 Dead 2', 'Wolfenstein II', ' Half-Life 2', "Baldur's Gate 3", 'Doom Eternal', 'Terraria', 'GTA V', 'The Forest', ' Cyberpunk 2077', ' Celeste', ' Fallout 4', ' Call of Duty: Modern Warfare 2', 'Counter-Strike 2', ' Factorio', 'Dead Cells'

In [17]:
# Adjust the parameters
min_support_threshold = 0.02  # Lowered to 2% to get more frequent itemsets
min_combination = 1  # Minimum size of itemsets
max_combination = 3  # Maximum size of itemsets


In [18]:
get_ECLAT_indexes, get_ECLAT_supports = eclat.fit(min_support = min_support_threshold, min_combination = min_combination, max_combination = max_combination, separator=' & ', verbose=True)

# Display results in a dataframe
result = pd.DataFrame(get_ECLAT_supports.items(),columns=['Item', 'Support'])
result = result.sort_values(by=['Support'], ascending=False).reset_index(drop=True)
result

Combination 1 by 1


74it [00:02, 29.18it/s]


Combination 2 by 2


2701it [00:45, 59.95it/s]


Combination 3 by 3


64824it [16:29, 65.50it/s]


Unnamed: 0,Item,Support
0,Hades,0.0892
1,L.A. Noire,0.0888
2,Celeste,0.0872
3,Monster Hunter: World,0.0866
4,Cyberpunk 2077,0.0856
...,...,...
69,Fallout 4,0.0232
70,L.A. Noire,0.0228
71,Half-Life 2,0.0224
72,Slay the Spire,0.0218
