In [2]:
import pandas as pd
from itertools import combinations

In [3]:
def charger_dataset(file_name):
    """Load a comma-separated file into a pandas DataFrame.

    Parameters
    ----------
    file_name : str or path-like
        Path of the CSV file; it is opened in text mode for reading.

    Returns
    -------
    pandas.DataFrame
        The parsed content of the file.
    """
    # The context manager closes the handle once parsing is finished.
    with open(file_name, 'r') as csv_file:
        return pd.read_csv(csv_file, sep=',')
    
# Sanity check: load the CSV and print the resulting DataFrame as plain text.
print(charger_dataset('Dataset-Exos2.csv'))

    Watcher  videoCategoryId     videoCategoryLabel definition
0     Billy             29.0  Nonprofits & Activism         hd
1     Leila             22.0         People & Blogs         sd
2     Billy             22.0         People & Blogs         sd
3      Mark             24.0          Entertainment         hd
4     Billy             24.0          Entertainment         hd
..      ...              ...                    ...        ...
994     NaN              NaN                    NaN        NaN
995     NaN              NaN                    NaN        NaN
996     NaN              NaN                    NaN        NaN
997     NaN              NaN                    NaN        NaN
998     NaN              NaN                    NaN        NaN

[999 rows x 4 columns]


In [4]:
# Load the raw CSV and report its size before any cleaning is applied.
dataset2 = charger_dataset('Dataset-Exos2.csv')

# Every row is counted as a transaction here; the items are the distinct
# non-null values of the 'videoCategoryId' column.
nombre_de_transactions = dataset2.shape[0]
nombre_d_items = len(dataset2['videoCategoryId'].dropna().unique())

print("Nombre de transactions : ", nombre_de_transactions)
print("Nombre d'items : ", nombre_d_items)

Nombre de transactions :  999
Nombre d'items :  13


In [5]:
def pretraitement(dataset):
    """Return ``dataset`` without the rows that contain missing values.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to clean. It is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows with no missing value in any
        column; the index labels of the kept rows are preserved.
    """
    # dropna() without inplace=True builds a new frame, so the caller's object
    # (and anything already displayed from it) is not silently modified.
    return dataset.dropna()

# Drop the incomplete rows, then preview the first ten remaining rows.
# (A bare `len(dataset2)` used to sit between these two lines; its value was
# discarded because only the last expression of a cell is displayed.)
dataset2 = pretraitement(dataset2)
dataset2.head(10)

Unnamed: 0,Watcher,videoCategoryId,videoCategoryLabel,definition
0,Billy,29.0,Nonprofits & Activism,hd
1,Leila,22.0,People & Blogs,sd
2,Billy,22.0,People & Blogs,sd
3,Mark,24.0,Entertainment,hd
4,Billy,24.0,Entertainment,hd
5,Jane,24.0,Entertainment,hd
6,Babs,22.0,People & Blogs,hd
7,Jeff,25.0,News & Politics,hd
9,Leila,28.0,Science & Technology,hd
10,Jane,27.0,Education,hd


In [6]:
# One row per watcher: the set of category ids that watcher appears with.
# The watcher becomes the transaction id and the set becomes its items.
dataset2_bis = (
    dataset2
    .groupby('Watcher')['videoCategoryId']
    .apply(set)
    .reset_index()
    .rename(columns={'Watcher': 'Transaction', 'videoCategoryId': 'Items'})
)
dataset2_bis.head(10)

Unnamed: 0,Transaction,Items
0,Adam,{22.0}
1,Alex,{22.0}
2,Amy,{22.0}
3,Babs,{22.0}
4,Ben,{22.0}
5,Billy,"{22.0, 24.0, 26.0, 28.0, 29.0}"
6,Bob,"{27.0, 28.0, 22.0}"
7,Brad,"{10.0, 28.0, 29.0, 22.0}"
8,Chandler,{28.0}
9,Clark,"{28.0, 22.0}"


In [7]:
# n: number of grouped transactions (rows of dataset2_bis).
n = dataset2_bis.shape[0]
# len_items: array of the distinct items found across all transactions
# (despite its name it holds the items themselves, not a length).
len_items = dataset2_bis['Items'].explode().unique()

print('le nombre de transaction est :', n)
print("le nombre d'item :", len(len_items))
len_items

le nombre de transaction est : 39
le nombre d'item : 12


array([22.0, 24.0, 26.0, 28.0, 29.0, 27.0, 10.0, 20.0, 25.0, 1.0, 23.0,
       17.0], dtype=object)

In [8]:
def generate_candidates(k, dataset=None):
    """Build every candidate itemset of size ``k`` from the transaction items.

    Parameters
    ----------
    k : int
        Number of items in each candidate.
    dataset : pandas.DataFrame, optional
        Frame whose 'Items' column holds one collection of items per row.
        When omitted, the notebook-level ``dataset2_bis`` is used.

    Returns
    -------
    list of tuple
        All k-item combinations of the distinct items. Items inside a tuple
        are in ascending order and the tuples follow that same ordering, so
        the result is the same on every run.
    """
    if dataset is None:
        dataset = dataset2_bis

    # Union of the items of every transaction.
    items = set()
    for transaction_items in dataset['Items']:
        items.update(transaction_items)

    # Set iteration order is arbitrary; sorting gives each candidate a single
    # canonical form and makes the candidate list reproducible.
    return list(combinations(sorted(items), k))

In [9]:
def calculate_support(dataset, candidates):
    """Count, for each candidate, the rows of ``dataset`` that contain it.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with an 'Items' column holding one collection of items per row.
    candidates : iterable
        Candidate itemsets, each an iterable of items.

    Returns
    -------
    dict
        Maps ``tuple(candidate)`` to the number of rows whose 'Items' value
        is a superset of the candidate. Candidates contained in no row are
        absent from the dictionary.
    """
    counts = {}

    for candidate in candidates:
        wanted = set(candidate)
        hits = sum(
            1 for _, row in dataset.iterrows() if wanted.issubset(row['Items'])
        )
        if hits:
            # The key is only built for candidates that matched at least once;
            # a candidate listed several times accumulates its counts.
            key = tuple(candidate)
            counts[key] = counts.get(key, 0) + hits

    return counts

In [10]:
def generate_frequent_itemsets(support_counts, min_support):
    """Keep the itemsets whose support reaches ``min_support``.

    Parameters
    ----------
    support_counts : dict
        Maps each itemset to its support count.
    min_support : int or float
        Inclusive threshold an itemset must reach to be kept.

    Returns
    -------
    list
        The retained itemsets, in the iteration order of ``support_counts``.
    """
    return [
        itemset
        for itemset, count in support_counts.items()
        if count >= min_support
    ]

In [14]:

# Minimum number of transactions that must contain an itemset for it to be
# kept as frequent (previously an unnamed literal in the call below).
MIN_SUPPORT = 5

# Candidate pairs, their support counts, then the pairs reaching MIN_SUPPORT.
C2 = generate_candidates(2)
print("C2 :", C2)
support_counts = calculate_support(dataset2_bis, C2)
print("support_counts :", support_counts)
L2 = generate_frequent_itemsets(support_counts, MIN_SUPPORT)
print("L2 :", L2)

C2 : [(1.0, 10.0), (1.0, 17.0), (1.0, 20.0), (1.0, 22.0), (1.0, 23.0), (1.0, 24.0), (1.0, 25.0), (1.0, 26.0), (1.0, 27.0), (1.0, 28.0), (1.0, 29.0), (10.0, 17.0), (10.0, 20.0), (10.0, 22.0), (10.0, 23.0), (10.0, 24.0), (10.0, 25.0), (10.0, 26.0), (10.0, 27.0), (10.0, 28.0), (10.0, 29.0), (17.0, 20.0), (17.0, 22.0), (17.0, 23.0), (17.0, 24.0), (17.0, 25.0), (17.0, 26.0), (17.0, 27.0), (17.0, 28.0), (17.0, 29.0), (20.0, 22.0), (20.0, 23.0), (20.0, 24.0), (20.0, 25.0), (20.0, 26.0), (20.0, 27.0), (20.0, 28.0), (20.0, 29.0), (22.0, 23.0), (22.0, 24.0), (22.0, 25.0), (22.0, 26.0), (22.0, 27.0), (22.0, 28.0), (22.0, 29.0), (23.0, 24.0), (23.0, 25.0), (23.0, 26.0), (23.0, 27.0), (23.0, 28.0), (23.0, 29.0), (24.0, 25.0), (24.0, 26.0), (24.0, 27.0), (24.0, 28.0), (24.0, 29.0), (25.0, 26.0), (25.0, 27.0), (25.0, 28.0), (25.0, 29.0), (26.0, 27.0), (26.0, 28.0), (26.0, 29.0), (27.0, 28.0), (27.0, 29.0), (28.0, 29.0)]
support_counts : {(1.0, 22.0): 1, (1.0, 23.0): 1, (1.0, 28.0): 1, (1.0, 29.0): 1,