In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import colors as mcolors
from tqdm import tqdm
import gc
import warnings
warnings.filterwarnings('ignore')

# 1.0 Load data from association rules

In [2]:
# Load association results:

association_results_df = pd.read_csv('../data/20220504_table_association_results_wardrobesize_2-20_wo_none.csv', \
    index_col=0,  dtype={'antecedants':'string', 'consequents':'string'})

association_results_df.sort_values('lift', ascending=False, inplace=True)
association_results_df

Unnamed: 0,antecedants,consequents,support,confidence,lift
720,0882759003,0882757003,0.000116,0.808696,6260.392977
719,0874078003,0912418001,0.000114,0.844037,6177.581318
377,0698260005,0698255005,0.000108,0.813084,6117.888025
657,0833490002,0833486002,0.000101,0.743119,5982.853211
646,0845699001,0832298002,0.000119,0.727273,5740.463458
...,...,...,...,...,...
396,0706016003,0706016001,0.001151,0.248459,23.447922
395,0706016002,0706016001,0.002006,0.235870,22.259842
401,0706016025,0706016001,0.000127,0.235023,22.179938
399,0706016015,0706016001,0.000761,0.223315,21.075021


# 2.0 Generate wardrobe table

In [3]:
# Load transaction dataset:
df_trans = pd.read_csv('../data/transactions_train.csv', dtype={'article_id':'string'})
# Train test split:
df_trans_train = df_trans.query('t_dat < "2020-09-16"').copy()
# Drop not necessary columns:
df_trans_red = df_trans_train.drop(columns=['t_dat', 'price', 'sales_channel_id']).copy()
# Generate wardrobe:
df_wardrobe = df_trans_red.groupby('customer_id')['article_id'].aggregate(lambda x: list(x)).reset_index()

df_wardrobe.head()

Unnamed: 0,customer_id,article_id
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,"[0625548001, 0176209023, 0627759010, 069713800..."
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,"[0583558001, 0639677008, 0640244003, 052126900..."
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"[0663713001, 0541518023, 0663713001, 057802000..."
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,"[0742079001, 0732413001]"
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,"[0634249005, 0677049001, 0698286003, 070770400..."


In [4]:
# Reverse wardrobe article_ids to focus on the last bought items:
df_wardrobe['wardrobe_reverse'] = df_wardrobe['article_id'].apply(lambda x: list(reversed(x)))
df_wardrobe.head()

Unnamed: 0,customer_id,article_id,wardrobe_reverse
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,"[0625548001, 0176209023, 0627759010, 069713800...","[0568601043, 0859416011, 0795440001, 089049800..."
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,"[0583558001, 0639677008, 0640244003, 052126900...","[0826211002, 0351484002, 0811925005, 081192500..."
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"[0663713001, 0541518023, 0663713001, 057802000...","[0794321007, 0852643003, 0852643001, 087030400..."
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,"[0742079001, 0732413001]","[0732413001, 0742079001]"
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,"[0634249005, 0677049001, 0698286003, 070770400...","[0791587015, 0927530004, 0730683050, 089615200..."


In [5]:
# Add 'number of articles' column (propably needed to generate smaller dataset) and sort descending:

df_wardrobe['no_articles'] = df_wardrobe.article_id.apply(lambda x: len(x))
df_wardrobe.sort_values('no_articles', ascending=False, inplace=True)

In [6]:
# # Select a sample for testing code
# df_sample = df_wardrobe.sample(n=10000, random_state=42).copy()
# df_sample.sort_values('no_articles', ascending=False)

# Since no sample is necessary df_sample will be df_wardrobe:
df_sample = df_wardrobe.copy()

In [7]:
# Clear memory
del [[df_trans,df_trans_red, df_trans_train, df_wardrobe]]

gc.collect()

48

# 3.0 Assign product recommendations based on current wardrobes

In [8]:
# Generate empty column for recommendation
df_sample['reco'] = df_sample.apply(lambda x: [], axis=1)

# Generate empty column for recommendation including lift:
df_sample['reco_lift'] = df_sample.apply(lambda x: [], axis=1)

df_sample.head()

Unnamed: 0,customer_id,article_id,wardrobe_reverse,no_articles,reco,reco_lift
1007560,be1981ab818cf4ef6765b2ecaea7a2cbf14ccd6e8a7ee9...,"[0658506001, 0662980002, 0667709001, 068568700...","[0879291001, 0926502001, 0904965002, 088656600...",1895,[],[]
958450,b4db5e5259234574edfff958e170fe3a5e13b6f146752c...,"[0673186001, 0717205001, 0669713004, 051413400...","[0935689001, 0898692003, 0809238001, 092194400...",1427,[],[]
390238,49beaacac0c7801c2ce2d189efe525fe80b5d37e46ed05...,"[0568597012, 0588689005, 0573716033, 065578400...","[0895836001, 0892555002, 0803757008, 091854700...",1356,[],[]
881514,a65f77281a528bf5c1e9f270141d601d116e1df33bf9df...,"[0543729003, 0610016001, 0639199001, 057365000...","[0903090001, 0909920001, 0787884003, 091220800...",1355,[],[]
1086478,cd04ec2726dd58a8c753e0d6423e57716fd9ebcf2f14ed...,"[0671783004, 0711547001, 0631270001, 071187100...","[0952267001, 0893432003, 0873274002, 091733400...",1223,[],[]


In [9]:
# Append consequents as list
# Info: needed 186 minutes for wardrobes between 2-20 products.

# ON REVERSED WARDROBES
# Higher focus on last purchased articles

for i in tqdm(range(len(df_sample))):
    for j in range(len(association_results_df)):
        if association_results_df.antecedants.iloc[j] in df_sample.wardrobe_reverse.iloc[i]:
            if association_results_df.consequents.iloc[j] not in df_sample.wardrobe_reverse.iloc[i]:
                df_sample['reco'].iloc[i].append(association_results_df.consequents.iloc[j])
                # Append consequents and corresponding lift as tuples if needed:
                df_sample['reco_lift'].iloc[i].append((association_results_df.consequents.iloc[j], association_results_df.lift.iloc[j]))

# ON NOT REVERSED WARDROBES:

# for i in tqdm(range(len(df_sample))):
#     for j in range(len(association_results_df)):
#         if association_results_df.antecedants.iloc[j] in df_sample.article_id.iloc[i]:
#             if association_results_df.consequents.iloc[j] not in df_sample.article_id.iloc[i]:
#                 df_sample['reco'].iloc[i].append(association_results_df.consequents.iloc[j])
#                 # Append consequents and corresponding lift as tuples if needed:
#                 df_sample['reco_lift'].iloc[i].append((association_results_df.consequents.iloc[j], association_results_df.lift.iloc[j]))
            



100%|██████████| 1356709/1356709 [3:18:45<00:00, 113.76it/s] 


In [10]:
df_sample.sort_values('no_articles', ascending=False).head()

Unnamed: 0,customer_id,article_id,wardrobe_reverse,no_articles,reco,reco_lift
1007560,be1981ab818cf4ef6765b2ecaea7a2cbf14ccd6e8a7ee9...,"[0658506001, 0662980002, 0667709001, 068568700...","[0879291001, 0926502001, 0904965002, 088656600...",1895,"[0758060001, 0492244014, 0799410003, 078338000...","[(0758060001, 5556.76862745098), (0492244014, ..."
958450,b4db5e5259234574edfff958e170fe3a5e13b6f146752c...,"[0673186001, 0717205001, 0669713004, 051413400...","[0935689001, 0898692003, 0809238001, 092194400...",1427,"[0772771001, 0736582001, 0806528004, 072243700...","[(0772771001, 2340.774791764891), (0736582001,..."
390238,49beaacac0c7801c2ce2d189efe525fe80b5d37e46ed05...,"[0568597012, 0588689005, 0573716033, 065578400...","[0895836001, 0892555002, 0803757008, 091854700...",1356,"[0811907005, 0568601008, 0795675001, 076786900...","[(0811907005, 1333.1780109752576), (0568601008..."
881514,a65f77281a528bf5c1e9f270141d601d116e1df33bf9df...,"[0543729003, 0610016001, 0639199001, 057365000...","[0903090001, 0909920001, 0787884003, 091220800...",1355,"[0757995009, 0620945002, 0629420011, 083247300...","[(0757995009, 594.6467565831726), (0620945002,..."
1086478,cd04ec2726dd58a8c753e0d6423e57716fd9ebcf2f14ed...,"[0671783004, 0711547001, 0631270001, 071187100...","[0952267001, 0893432003, 0873274002, 091733400...",1223,"[0355072001, 0758050001, 0572797041, 057279704...","[(0355072001, 270.47522244333123), (0758050001..."


In [None]:
# Check results for one customer:

# idn = 476607
# customer_id = df_sample.customer_id.loc[idn]
# test_list_wr = df_sample.article_id.loc[idn]
# test_list = df_sample.reco.loc[idn]
# test_list_lift = df_sample.reco_lift.loc[idn]
# print(customer_id)
# print('------')
# print(test_list_wr)
# print('------')
# print(test_list)
# print('------')
# print(test_list_lift)

In [11]:
df_sample['no_reco'] = df_sample.reco.apply(lambda x: len(x))
df_sample.sort_values('no_reco', ascending=False).head()


Unnamed: 0,customer_id,article_id,wardrobe_reverse,no_articles,reco,reco_lift,no_reco
1007560,be1981ab818cf4ef6765b2ecaea7a2cbf14ccd6e8a7ee9...,"[0658506001, 0662980002, 0667709001, 068568700...","[0879291001, 0926502001, 0904965002, 088656600...",1895,"[0758060001, 0492244014, 0799410003, 078338000...","[(0758060001, 5556.76862745098), (0492244014, ...",60
64883,0c44b5697c84f49c39712df9b5452ec131cf6cf4090e35...,"[0399136060, 0399136061, 0668390001, 042619900...","[0748566027, 0730683062, 0730683001, 088275700...",510,"[0802980001, 0698283002, 0832284003, 054246400...","[(0802980001, 2851.271864991377), (0698283002,...",53
238923,2d17eb6bc25d969da89b83e3d446041c90bc99789998a3...,"[0680810001, 0537346022, 0550309001, 063967700...","[0811907005, 0890140001, 0866468001, 082316800...",158,"[0758060001, 0843873003, 0740720001, 075800201...","[(0758060001, 5556.76862745098), (0843873003, ...",45
561018,69dcbfa8db2b3b9832657dd3f693cdb9bae009b5e3c289...,"[0658298001, 0657291001, 0657291001, 063967700...","[0861173003, 0919273002, 0903824006, 076891200...",341,"[0833486002, 0772032001, 0758064002, 060004301...","[(0833486002, 5982.853211009175), (0772032001,...",45
19815,03c3771e117f921c472552497e243607243fd7de0c57fd...,"[0688873005, 0618899009, 0670687001, 067068700...","[0872901005, 0852584001, 0809238001, 088194200...",411,"[0871468001, 0861477001, 0767438001, 081183500...","[(0871468001, 3232.2615101289134), (0861477001...",44


In [12]:
# Number of customer_ids with at least X recommendations:
print("The dataset includes...")
print(f"- {df_sample[df_sample['no_reco']>0].shape[0]} customers with at least 1 recommendation")
print(f"- {df_sample[df_sample['no_reco']>5].shape[0]} customers with at least 6 recommendations")
print(f"- {df_sample[df_sample['no_reco']>12].shape[0]} customers with more than 12 recommendations")


The dataset includes...
- 541901 customers with at least 1 recommendation
- 111490 customers with at least 6 recommendations
- 23616 customers with more than 12 recommendations


In [13]:
# Create reco-column ("prediction") which stores recos in one string:
# df_sample['prediction'] = ' '.join(df_sample['reco'])
df_sample['prediction'] = df_sample.reco.apply(lambda x: ' '.join(x))
df_sample.head()

Unnamed: 0,customer_id,article_id,wardrobe_reverse,no_articles,reco,reco_lift,no_reco,prediction
1007560,be1981ab818cf4ef6765b2ecaea7a2cbf14ccd6e8a7ee9...,"[0658506001, 0662980002, 0667709001, 068568700...","[0879291001, 0926502001, 0904965002, 088656600...",1895,"[0758060001, 0492244014, 0799410003, 078338000...","[(0758060001, 5556.76862745098), (0492244014, ...",60,0758060001 0492244014 0799410003 0783380002 06...
958450,b4db5e5259234574edfff958e170fe3a5e13b6f146752c...,"[0673186001, 0717205001, 0669713004, 051413400...","[0935689001, 0898692003, 0809238001, 092194400...",1427,"[0772771001, 0736582001, 0806528004, 072243700...","[(0772771001, 2340.774791764891), (0736582001,...",14,0772771001 0736582001 0806528004 0722437003 07...
390238,49beaacac0c7801c2ce2d189efe525fe80b5d37e46ed05...,"[0568597012, 0588689005, 0573716033, 065578400...","[0895836001, 0892555002, 0803757008, 091854700...",1356,"[0811907005, 0568601008, 0795675001, 076786900...","[(0811907005, 1333.1780109752576), (0568601008...",24,0811907005 0568601008 0795675001 0767869001 05...
881514,a65f77281a528bf5c1e9f270141d601d116e1df33bf9df...,"[0543729003, 0610016001, 0639199001, 057365000...","[0903090001, 0909920001, 0787884003, 091220800...",1355,"[0757995009, 0620945002, 0629420011, 083247300...","[(0757995009, 594.6467565831726), (0620945002,...",6,0757995009 0620945002 0629420011 0832473003 06...
1086478,cd04ec2726dd58a8c753e0d6423e57716fd9ebcf2f14ed...,"[0671783004, 0711547001, 0631270001, 071187100...","[0952267001, 0893432003, 0873274002, 091733400...",1223,"[0355072001, 0758050001, 0572797041, 057279704...","[(0355072001, 270.47522244333123), (0758050001...",5,0355072001 0758050001 0572797041 0572797041 07...


In [14]:
# Store recommendations as csv:
df_sample.to_csv('../data/20220506_train_all_cust_recommendations_wardrobesize_2-20_wo_none_reverse_wr.csv')

# 4.0 Generate submission csv

In [19]:
# Generate submission train file (only customer_ids which exist in train dataset):
df_submission_train = df_sample.drop(columns=['article_id', 'reco', 'no_articles', 'no_reco', 'wardrobe_reverse', 'reco_lift'])
df_submission_train.head()


Unnamed: 0,customer_id,prediction
1007560,be1981ab818cf4ef6765b2ecaea7a2cbf14ccd6e8a7ee9...,0758060001 0492244014 0799410003 0783380002 06...
958450,b4db5e5259234574edfff958e170fe3a5e13b6f146752c...,0772771001 0736582001 0806528004 0722437003 07...
390238,49beaacac0c7801c2ce2d189efe525fe80b5d37e46ed05...,0811907005 0568601008 0795675001 0767869001 05...
881514,a65f77281a528bf5c1e9f270141d601d116e1df33bf9df...,0757995009 0620945002 0629420011 0832473003 06...
1086478,cd04ec2726dd58a8c753e0d6423e57716fd9ebcf2f14ed...,0355072001 0758050001 0572797041 0572797041 07...


In [20]:
# Load sample_submission to get all customer_ids and drop sample predictions:
df_sample_sub = pd.read_csv('../data/sample_submission.csv')
df_sample_sub.drop(columns=['prediction'], inplace=True)
df_sample_sub.shape

(1371980, 1)

In [21]:
# Generate full submission file with all customer_ids:
df_submission_test = pd.merge(df_sample_sub, df_submission_train, how= 'left', on="customer_id")


In [22]:
df_submission_test.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568597007
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0776237020 0688537005 0589599036 0838357002 04...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0458543001 0458543009 0838055001 0823505002
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0692721005


In [23]:
# Fill up with baseline-model recommendations:

df_submission_test_baseline = df_submission_test.copy()

In [24]:
baseline = " 0706016001 0706016002 0372860001 0610776002 0759871002 0464297007 0372860002 0610776001 0399223001 0706016003 0720125001 0156231001"
df_submission_test_baseline['prediction'] = df_submission_test_baseline['prediction'] + baseline
df_submission_test_baseline['prediction'].fillna(value=baseline, inplace=True)
df_submission_test_baseline['prediction'] = df_submission_test_baseline['prediction'].str.strip()
df_submission_test_baseline

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568597007 0706016001 0706016002 0372860001 06...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0776237020 0688537005 0589599036 0838357002 04...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0458543001 0458543009 0838055001 0823505002 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0706016001 0706016002 0372860001 0610776002 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0692721005 0706016001 0706016002 0372860001 06...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0706016001 0706016002 0372860001 0610776002 07...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0854677001 0776237020 0838357002 0624486064 05...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0573716002 0573716050 0706016006 0706016006 07...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0706016001 0706016002 0372860001 0610776002 07...


In [25]:
df_submission_test_baseline.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1371980 entries, 0 to 1371979
Data columns (total 2 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   customer_id  1371980 non-null  object
 1   prediction   1371980 non-null  object
dtypes: object(2)
memory usage: 31.4+ MB


In [26]:
# MBA with baseline: Store submission file as csv:
df_submission_test_baseline.to_csv('../data/MBA-Baseline-WR-Reverse_20220506_submission_wardrobesize_2-20.csv', index = False)

In [27]:
# MBA without baseline: Store submission file as csv:
df_submission_test.to_csv('../data/MBA-wo-Baseline-WR-Reverse_20220506_submission_wardrobesize_2-20.csv', index = False)

In [28]:
df_submission_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1371980 entries, 0 to 1371979
Data columns (total 2 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   customer_id  1371980 non-null  object
 1   prediction   1356709 non-null  object
dtypes: object(2)
memory usage: 31.4+ MB
