In [33]:
import pandas as pd

In [34]:
# 1. Read the raw groceries CSV into a DataFrame and show it as the cell output.
data = pd.read_csv(filepath_or_buffer="Groceries_dataset.csv")
data

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk
...,...,...,...
38760,4471,08-10-2014,sliced cheese
38761,2022,23-02-2014,candy
38762,1097,16-04-2014,cake bar
38763,1510,03-12-2014,fruit/vegetable juice


In [35]:
# 2. Data cleaning
# Normalise the item text and parse the dates *before* de-duplicating, so that
# rows differing only by letter case or surrounding whitespace are treated as
# the same row by drop_duplicates().
data_cleaned = data.assign(
    itemDescription=data['itemDescription'].str.lower().str.strip(),
    Date=pd.to_datetime(data['Date'], dayfirst=True),
).drop_duplicates()

# Report how many rows remain once fully identical rows are removed.
print(f"Data after removing duplicates: {data_cleaned.shape[0]} rows")

# The following cells read `data`; rebind it so they work on the cleaned,
# de-duplicated frame instead of silently ignoring the de-duplication result.
data = data_cleaned

Data after removing duplicates: 38006 rows


In [36]:
# 3. Count the missing values in each column (isna is the alias of isnull).
print(data.isna().sum())

Member_number      0
Date               0
itemDescription    0
dtype: int64


In [37]:
# 4. Build the transaction table: one transaction = one (Member_number, Date)
#    pair, with all items of that pair collected into a list.
transactions = (
    data
    .groupby(['Member_number', 'Date'])['itemDescription']
    .apply(list)
    .reset_index(name='Transactions')
)

In [38]:
# Preview the transaction table. `transactions` is already built by the groupby
# in the previous cell, and the item text / dates are already normalised in the
# data-cleaning cell, so those steps are not repeated (and `data` is not
# mutated again) here.
print(transactions.head())

   Member_number       Date                                       Transactions
0           1000 2014-06-24                  [whole milk, pastry, salty snack]
1           1000 2015-03-15  [sausage, whole milk, semi-finished bread, yog...
2           1000 2015-05-27                         [soda, pickled vegetables]
3           1000 2015-07-24                     [canned beer, misc. beverages]
4           1000 2015-11-25                        [sausage, hygiene articles]


In [39]:
# 5. Split the transactions into train and test subsets (80% / 20%).
from sklearn.model_selection import train_test_split

# A fixed random_state keeps the shuffle, and therefore the split, reproducible.
train, test = train_test_split(
    transactions,
    random_state=42,
    test_size=0.2,
)

print(f"Jumlah transaksi train: {len(train)}")
print(f"Jumlah transaksi test: {len(test)}")

Jumlah transaksi train: 11970
Jumlah transaksi test: 2993


In [40]:
# Penerapan Algoritma Apriori
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [42]:
# 1. Keep only the item lists: one inner list per transaction (list of lists).
# NOTE(review): this takes every row of `transactions`, not the `train` subset
# produced by the split cell -- confirm that mining on the full data is intended.
trans_list = transactions.loc[:, 'Transactions'].to_list()

In [43]:
# 2. One-hot encode the transactions: one row per transaction, one boolean
#    column per distinct item.
te = TransactionEncoder()
te_ary = te.fit_transform(trans_list)
df_apriori = pd.DataFrame(data=te_ary, columns=te.columns_)

In [48]:
# 3. Mine the frequent itemsets (support of at least 0.005, i.e. 0.5% of the
#    transactions) and print the ten with the highest support.
frequent_itemsets = apriori(
    df_apriori,
    min_support=0.005,
    use_colnames=True,  # report item names instead of column indices
)
print("Frequent Itemset (min_support=0.5%):")
print(
    frequent_itemsets
    .sort_values(by="support", ascending=False)
    .head(10)
)

Frequent Itemset (min_support=0.5%):
     support            itemsets
87  0.157923        (whole milk)
52  0.122101  (other vegetables)
65  0.110005        (rolls/buns)
74  0.097106              (soda)
88  0.085879            (yogurt)
66  0.069572   (root vegetables)
80  0.067767    (tropical fruit)
5   0.060683     (bottled water)
69  0.060349           (sausage)
18  0.053131      (citrus fruit)


In [49]:
# 4. Derive association rules from the frequent itemsets, filtered on the
#    confidence metric with a threshold of 0.1, and print the first ten.
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.1,
)
print("\nAssociation Rules (confidence > 0.1):")
print(
    rules.loc[:, ['antecedents', 'consequents', 'support', 'confidence', 'lift']]
    .head(10)
)


Association Rules (confidence > 0.1):
          antecedents         consequents   support  confidence      lift
0      (bottled beer)        (whole milk)  0.007151    0.157817  0.999330
1     (bottled water)        (whole milk)  0.007151    0.117841  0.746196
2       (canned beer)        (whole milk)  0.006015    0.128205  0.811821
3      (citrus fruit)        (whole milk)  0.007151    0.134591  0.852259
4     (domestic eggs)        (whole milk)  0.005280    0.142342  0.901341
5       (frankfurter)  (other vegetables)  0.005146    0.136283  1.116150
6       (frankfurter)        (whole milk)  0.005280    0.139823  0.885388
7        (newspapers)        (whole milk)  0.005614    0.144330  0.913926
8  (other vegetables)        (whole milk)  0.014837    0.121511  0.769430
9            (pastry)        (whole milk)  0.006483    0.125323  0.793571


In [50]:
# Report how many association rules were generated (one row per rule).
print(f"Jumlah aturan ditemukan: {rules.shape[0]}")

Jumlah aturan ditemukan: 19
