In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# IMPORTANT: apyori and mlxtend libraries both have methods which are called apriori. 
# Be sure to import only one of the libraries in order to use apriori.

from apyori import apriori
# from mlxtend.frequent_patterns import apriori
# from mlxtend.frequent_patterns import association_rules

# Display numbers not in scientific notation:
pd.options.display.float_format = '{:.6f}'.format

# Load data

In [2]:
df_trans = pd.read_csv('../data/transactions_train.csv', dtype={'article_id':'string'})


# 1.0 Model on wardrobes without 'None'

https://www.analyticsvidhya.com/blog/2021/10/a-comprehensive-guide-on-market-basket-analysis/

Create association rules based on wardrobes (not on baskets). 

Use apyori library.

In [4]:
# Generate wardrobe

df_trans_red = df_trans.drop(columns=['t_dat', 'price', 'sales_channel_id']).copy()
df_wardrobe = df_trans_red.groupby('customer_id')['article_id'].aggregate(lambda x: list(x)).reset_index()


In [5]:
# Clear memory
del [[df_trans,df_trans_red]]

In [6]:
df_wardrobe.head()

Unnamed: 0,customer_id,article_id
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,"[0625548001, 0176209023, 0627759010, 069713800..."
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,"[0583558001, 0639677008, 0640244003, 052126900..."
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"[0663713001, 0541518023, 0663713001, 057802000..."
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,"[0742079001, 0732413001]"
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,"[0634249005, 0677049001, 0698286003, 070770400..."


In [7]:
# Add 'number of articles' column (needed to generate smaller dataset):

df_wardrobe['no_articles'] = df_wardrobe.article_id.apply(lambda x: len(x))


In [8]:
df_wardrobe.sort_values('no_articles', ascending=False)

Unnamed: 0,customer_id,article_id,no_articles
1011710,be1981ab818cf4ef6765b2ecaea7a2cbf14ccd6e8a7ee9...,"[0658506001, 0662980002, 0667709001, 068568700...",1895
962395,b4db5e5259234574edfff958e170fe3a5e13b6f146752c...,"[0673186001, 0717205001, 0669713004, 051413400...",1441
391840,49beaacac0c7801c2ce2d189efe525fe80b5d37e46ed05...,"[0568597012, 0588689005, 0573716033, 065578400...",1364
885174,a65f77281a528bf5c1e9f270141d601d116e1df33bf9df...,"[0543729003, 0610016001, 0639199001, 057365000...",1361
1090960,cd04ec2726dd58a8c753e0d6423e57716fd9ebcf2f14ed...,"[0671783004, 0711547001, 0631270001, 071187100...",1237
...,...,...,...
832233,9c7945de9cd5a5d7f833476ee2d7739e82b180a2026b5c...,[0753692004],1
234335,2c0f5a51c0b29188a119bdd358897f279b1dda4ba0194b...,[0400342037],1
234337,2c0f6e18e2e825f006126fc8be1117928fa24d5810857a...,[0790006003],1
832215,9c784d9b315f5c761a2f6c267f88d7f9bb4d8597b7983f...,[0842952001],1


In [9]:
# Select only wardrobes with 2 - 39 articles:

df_wardrobe_small = df_wardrobe[(df_wardrobe['no_articles'] > 1) & (df_wardrobe['no_articles'] < 40)]
df_wardrobe_small.head()

Unnamed: 0,customer_id,article_id,no_articles
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,"[0625548001, 0176209023, 0627759010, 069713800...",21
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"[0663713001, 0541518023, 0663713001, 057802000...",18
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,"[0742079001, 0732413001]",2
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,"[0634249005, 0677049001, 0698286003, 070770400...",13
5,000064249685c11552da43ef22a5030f35a147f723d5b0...,"[0738133005, 0680265002, 0740962001]",3


In [10]:
df_wardrobe_small.sort_values('no_articles', ascending=False)

Unnamed: 0,customer_id,article_id,no_articles
1305935,f56a97b3ec18c8831e61b41383e8b80b413d1467218d2d...,"[0543054010, 0738921001, 0663612001, 071512600...",39
1306259,f57920bd32c9723fe415f23a82f8f9ed4f5bea5d0a76e6...,"[0673901012, 0708345010, 0710729008, 057308503...",39
447716,54437c554e2d18c1480630d217ae829e43bcd05b5c5926...,"[0685687001, 0625939025, 0710899003, 066259300...",39
914946,abeeac0902543d341478e1e62760c788e63957c4309e05...,"[0472755002, 0669391001, 0547937014, 062581800...",39
1120435,d27feeed8dd77ae2c6ef8027673888397f7b97dbff7882...,"[0656763001, 0733074001, 0741215001, 045854300...",39
...,...,...,...
278982,346f6ad56a5afd91a808edbbb655848ec2941817bd784a...,"[0743734001, 0507910001]",2
1131933,d4ac9b9744cdb2ab9c3a91df15e007b062b9fbd69f2feb...,"[0153115040, 0153115043]",2
634110,772ab79eb6880e713c275a07c922502969d9b2956aedbd...,"[0802087001, 0802087001]",2
634112,772adae42629ca0b24cefbc10a6fb06976f375050053b5...,"[0481696002, 0699755010]",2


Resource:
https://stackoverflow.com/questions/35491274/split-a-pandas-column-of-lists-into-multiple-columns


In [11]:
# Generate columns where every article_id is in one column:
df_only_articles = pd.DataFrame(df_wardrobe_small['article_id'].to_list(), index=df_wardrobe_small.index)

# Join dataframe with only articles to wardrobe on index:

df_wardrobe_small = df_wardrobe_small.join(df_only_articles)

# Drop not needed columns:

df_wardrobe_small_red = df_wardrobe_small.drop(columns=['customer_id', 'article_id', 'no_articles'])


In [12]:
# Converting dataframe into list of lists:

l=[]

for i in range (0, len(df_wardrobe_small_red)):
    l.append([str(df_wardrobe_small_red.values[i,j]) for j in range(0,df_wardrobe_small_red.shape[1])])

In [13]:
# Delete 'None'-strings from all lists:

l_new = []
for i in l:
    l_new.append([ ele for ele in i if ele != 'None' ])

In [14]:
l_new

[['0625548001',
  '0176209023',
  '0627759010',
  '0697138006',
  '0568601006',
  '0568601006',
  '0607642008',
  '0745232001',
  '0656719005',
  '0797065001',
  '0797065001',
  '0785186005',
  '0694736004',
  '0785710001',
  '0812683013',
  '0841260003',
  '0887593002',
  '0890498002',
  '0795440001',
  '0859416011',
  '0568601043'],
 ['0663713001',
  '0541518023',
  '0663713001',
  '0578020002',
  '0723529001',
  '0351484002',
  '0351484002',
  '0727808001',
  '0727808007',
  '0858883002',
  '0851400006',
  '0750424014',
  '0750424014',
  '0870304002',
  '0870304002',
  '0852643001',
  '0852643003',
  '0794321007'],
 ['0742079001', '0732413001'],
 ['0634249005',
  '0677049001',
  '0698286003',
  '0707704003',
  '0399061015',
  '0399061015',
  '0589440005',
  '0827971001',
  '0818320001',
  '0896152002',
  '0730683050',
  '0927530004',
  '0791587015'],
 ['0738133005', '0680265002', '0740962001'],
 ['0735843004',
  '0726925001',
  '0715624008',
  '0783388001',
  '0719530003',
  '044850

In [25]:
# Applying apriori algorithm

association_rules = apriori(l_new, min_support=0.0001, min_confidence=0.2, min_lift=1, min_length=2)
association_results = list(association_rules)

In [16]:
# Print association rules:

for i in range(0, len(association_results)):
    print(association_results[i][0])

frozenset({'0108775044', '0108775015'})
frozenset({'0111593001', '0111586001'})
frozenset({'0158340001', '0111586001'})
frozenset({'0436261001', '0111586001'})
frozenset({'0111593001', '0240561001'})
frozenset({'0160442010', '0160442007'})
frozenset({'0160442043', '0160442007'})
frozenset({'0458543001', '0351484002'})
frozenset({'0351484002', '0609719001'})
frozenset({'0699080001', '0351484002'})
frozenset({'0723529001', '0351484002'})
frozenset({'0351484002', '0838055001'})
frozenset({'0351484026', '0723529003'})
frozenset({'0458543004', '0351484027'})
frozenset({'0723529004', '0351484027'})
frozenset({'0715303006', '0351484028'})
frozenset({'0351484033', '0458543005'})
frozenset({'0351484033', '0728473001'})
frozenset({'0351484039', '0823505001'})
frozenset({'0838055003', '0351484041'})
frozenset({'0372860001', '0372860002'})
frozenset({'0399087010', '0399087014'})
frozenset({'0399223033', '0399223001'})
frozenset({'0399223035', '0399223001'})
frozenset({'0399223035', '0399223033'})


In [17]:
len(association_results)

209

In [19]:
# association_results_df = pd.DataFrame(association_results)
# association_results_df.to_csv('../data/20220428_association_results_wardrobesize_2-40.csv')

In [22]:
association_results_dict = {'antecedants': [], 'consequents':[], 'support': [], 'confidence': [], 'lift':[]}

for item in association_results:
    # first index of the inner list
    # Contains base item and add item
    pair = item[0]
    items = [x for x in pair]
    print("Rule: " + items[0] + " -> " + items[1])
    association_results_dict['antecedants'].append(items[0])
    association_results_dict['consequents'].append(items[1])
    # second index of the inner list
    print("Support: " + str(item[1]))
    association_results_dict['support'].append(item[1])
    # third index of the list located at 0th position
    # of the third index of the inner list
    print("Confidence: " + str(item[2][0][2]))
    association_results_dict['confidence'].append(item[2][0][2])
    print("Lift: " + str(item[2][0][3]))
    association_results_dict['lift'].append(item[2][0][3])
    print("-----------------------------------------------------")

Rule: 0108775044 -> 0108775015
Support: 0.00073007300730073
Confidence: 0.2537365311087939
Lift: 115.95573923934325
-----------------------------------------------------
Rule: 0111593001 -> 0111586001
Support: 0.0007830783078307831
Confidence: 0.2283464566929134
Lift: 70.21021588168638
-----------------------------------------------------
Rule: 0158340001 -> 0111586001
Support: 0.0008250825082508251
Confidence: 0.24059492563429571
Lift: 66.49277671136879
-----------------------------------------------------
Rule: 0436261001 -> 0111586001
Support: 0.0006130613061306131
Confidence: 0.22266618234653104
Lift: 64.92969254251864
-----------------------------------------------------
Rule: 0111593001 -> 0240561001
Support: 0.00045404540454045407
Confidence: 0.2821628340584214
Lift: 86.75726253844267
-----------------------------------------------------
Rule: 0160442010 -> 0160442007
Support: 0.0013511351135113512
Confidence: 0.3090118938700824
Lift: 80.15071146062137
--------------------------

In [23]:
# Store association rules dict in dataframe:
association_results_df_2 = pd.DataFrame.from_dict(association_results_dict)
association_results_df_2.head()

Unnamed: 0,antecedants,consequents,support,confidence,lift
0,0108775044,0108775015,0.000730,0.253737,115.955739
1,0111593001,0111586001,0.000783,0.228346,70.210216
2,0158340001,0111586001,0.000825,0.240595,66.492777
3,0436261001,0111586001,0.000613,0.222666,64.929693
4,0111593001,0240561001,0.000454,0.282163,86.757263
...,...,...,...,...,...
204,0854677002,0854683002,0.000559,0.667064,609.130349
205,0854683003,0854677003,0.000621,0.655755,555.198510
206,0860819001,0860820001,0.000963,0.624109,422.795710
207,0706016003,0706016001,0.000620,0.324098,26.406932


In [24]:
# Store association results in csv.
# Change name befor executing cell:

association_results_df_2.to_csv('../data/20220429_association_results_wardrobesize_2-40_wo_none.csv')