# Build recommendations and submission file (for Marketbasket Analysis)

This notebook generates recommendations for customer_ids based on the association rules from the market basket analysis and stores them in a submission file (CSV).
* Customer_ids for which the association rules yield no recommendation are filled with the top-12 recommendations (the 12 most sold articles).
* All customer_ids with fewer than 12 recommendations are completed with the top-12 recommendations.

In [None]:
# Import necessary libraries:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import colors as mcolors
from tqdm import tqdm
import gc
import warnings
warnings.filterwarnings('ignore')

# 1.0 Load data from association rules

In [None]:
# Read the association rules (CSV) into a DataFrame.
# The first CSV column becomes the index; the 'antecedants' and 'consequents'
# columns are read with the pandas 'string' dtype.
rules_path = '../data/20220510_table_association_results_wardrobesize_2-20_wo_none.csv'
rule_dtypes = {'antecedants': 'string', 'consequents': 'string'}
association_results_df = pd.read_csv(rules_path, index_col=0, dtype=rule_dtypes)

# Order the rules by lift, highest first.
association_results_df = association_results_df.sort_values('lift', ascending=False)
association_results_df

# 2.0 Generate wardrobe table

In [None]:
# Read the transactions; article_id is read with the pandas 'string' dtype.
df_trans = pd.read_csv('../data/transactions_train.csv', dtype={'article_id': 'string'})

# Train/test split: the training part holds every transaction with t_dat before 2020-09-16.
is_train = df_trans['t_dat'] < '2020-09-16'
df_trans_train = df_trans[is_train].copy()

# Remove the columns that are not used below.
unused_columns = ['t_dat', 'price', 'sales_channel_id']
df_trans_red = df_trans_train.drop(columns=unused_columns).copy()

# Wardrobe table: one row per customer_id holding the list of that customer's article_ids.
df_wardrobe = df_trans_red.groupby('customer_id')['article_id'].agg(list).reset_index()

df_wardrobe.head()

In [None]:
# Keep a reversed copy of every wardrobe list so that the articles listed last come first.
df_wardrobe['wardrobe_reverse'] = df_wardrobe['article_id'].map(lambda articles: articles[::-1])
df_wardrobe.head()

In [None]:
# Add the wardrobe size per customer ('no_articles'; probably needed to generate
# a smaller dataset) and order the table by it, largest wardrobes first.

df_wardrobe['no_articles'] = df_wardrobe['article_id'].map(len)
df_wardrobe = df_wardrobe.sort_values('no_articles', ascending=False)

In [None]:
# Optional: work on a random sample of 10,000 wardrobes while testing the code.
# Uncomment the two lines below (and remove the full copy further down) to use it.
# df_sample = df_wardrobe.sample(n=10000, random_state=42).copy()
# df_sample.sort_values('no_articles', ascending=False)

# No sampling is needed for the full run, so df_sample is an independent copy
# of the complete wardrobe table (df_wardrobe is deleted in a later cell).
df_sample = df_wardrobe.copy()

In [None]:
# Free memory: drop the names of the intermediate frames that are no longer
# needed (df_sample holds its own copy of the wardrobe table) ...
del df_trans, df_trans_red, df_trans_train, df_wardrobe

# ... and trigger a garbage-collection run.
gc.collect()

# 3.0 Assign product recommendations based on current wardrobes

In [None]:
# Add two object columns that hold one fresh, independent empty list per row:
#   'reco'      - recommended consequents
#   'reco_lift' - (consequent, lift) tuples for the same recommendations
#
# The lists are built with a comprehension instead of a row-wise
# DataFrame.apply(axis=1): apply constructs a Series for every row only to
# return [], and on an empty frame it returns a DataFrame, which cannot be
# assigned to a single column.
n_rows = len(df_sample)

# Generate empty column for recommendation:
df_sample['reco'] = pd.Series(
    [[] for _ in range(n_rows)], index=df_sample.index, dtype=object
)

# Generate empty column for recommendation including lift:
df_sample['reco_lift'] = pd.Series(
    [[] for _ in range(n_rows)], index=df_sample.index, dtype=object
)

df_sample.head()

In [None]:
# Collect the recommendations for every customer.
#
# A rule contributes its consequent to a customer when
#   * the rule's antecedent is in the customer's wardrobe, and
#   * the rule's consequent is NOT already in the wardrobe.
# Matching rules are emitted in the row order of association_results_df
# (sorted by descending lift after loading), and a consequent is listed once
# per matching rule, so it can appear several times.
#
# Info: the previous implementation (a nested loop over all customers and all
# rules with .iloc lookups) needed 186 minutes for wardrobes between 2-20
# products. This version indexes the rules by antecedent once and uses set
# lookups per customer; the resulting lists are the same.
#
# NOTE: only membership in the wardrobe is tested, so the order of the
# wardrobe list does not influence the result. Using 'article_id' instead of
# 'wardrobe_reverse' yields identical recommendations.
#
# The 'reco' / 'reco_lift' columns are (re)assigned here, so re-running this
# cell does not append the same recommendations twice.

# antecedent -> [(rule position, consequent, lift), ...] in rule order.
rules_by_antecedent = {}
rule_rows = zip(
    association_results_df['antecedants'].tolist(),
    association_results_df['consequents'].tolist(),
    association_results_df['lift'].to_numpy(),
)
for rule_pos, (antecedent, consequent, lift) in enumerate(rule_rows):
    rules_by_antecedent.setdefault(antecedent, []).append((rule_pos, consequent, lift))

reco_per_customer = []
reco_lift_per_customer = []

for wardrobe in tqdm(df_sample['wardrobe_reverse'].tolist()):
    owned = set(wardrobe)

    # All rules triggered by any owned article whose consequent is not owned yet.
    matches = [
        rule
        for article in owned
        for rule in rules_by_antecedent.get(article, ())
        if rule[1] not in owned
    ]
    # Restore the global rule order (rule positions are unique).
    matches.sort(key=lambda rule: rule[0])

    reco_per_customer.append([consequent for _, consequent, _ in matches])
    # Consequents and corresponding lift as tuples if needed:
    reco_lift_per_customer.append([(consequent, lift) for _, consequent, lift in matches])

df_sample['reco'] = pd.Series(reco_per_customer, index=df_sample.index, dtype=object)
df_sample['reco_lift'] = pd.Series(reco_lift_per_customer, index=df_sample.index, dtype=object)
            



In [None]:
# Sanity check: show the five customers with the largest wardrobes to verify
# that the recommendation columns contain reasonable values.
df_sample.sort_values('no_articles', ascending=False).head()

In [None]:
# Count the recommendations collected for each customer_id ('no_reco') and
# show the customers with the most recommendations.
df_sample['no_reco'] = df_sample['reco'].map(len)
df_sample.sort_values('no_reco', ascending=False).head()


In [None]:
# Report how many customer_ids exceed 0, 5 and 12 recommendations.
reco_counts = df_sample['no_reco']
n_at_least_1 = int((reco_counts > 0).sum())
n_at_least_6 = int((reco_counts > 5).sum())
n_more_than_12 = int((reco_counts > 12).sum())

print("The dataset includes...")
print(f"- {n_at_least_1} customers with at least 1 recommendation")
print(f"- {n_at_least_6} customers with at least 6 recommendations")
print(f"- {n_more_than_12} customers with more than 12 recommendations")


In [None]:
# Build the "prediction" column: the recommendations of each customer joined
# into a single space-separated string (empty string when there are none).
df_sample['prediction'] = df_sample['reco'].map(' '.join)
df_sample.head()

In [None]:
# Store the full recommendation table (all columns of df_sample, including
# the DataFrame index) as CSV:
df_sample.to_csv('../data/20220510_train_all_cust_recommendations_wardrobesize_2-20_wo_none_reverse_wr.csv')

# 4.0 Generate submission csv

In [None]:
# Submission frame for the customer_ids that exist in the train dataset:
# remove the helper columns that are not part of the submission.
helper_columns = ['article_id', 'reco', 'no_articles', 'no_reco', 'wardrobe_reverse', 'reco_lift']
df_submission_train = df_sample.drop(columns=helper_columns)
df_submission_train.head()


In [None]:
# Read the sample submission to obtain all customer_ids; its placeholder
# 'prediction' column is discarded.
df_sample_sub = pd.read_csv('../data/baseline_sample_submission.csv').drop(columns=['prediction'])
df_sample_sub.shape

In [None]:
# Full submission frame: keep every customer_id of the sample submission and
# attach the rule-based prediction where one exists (left join; customers
# without a match get a missing prediction).
df_submission_test = df_sample_sub.merge(df_submission_train, how='left', on='customer_id')
df_submission_test.head()

In [None]:
# Fill up with baseline-model recommendations (top12 most sold articles).
# Work on an independent copy so that df_submission_test keeps the pure
# rule-based predictions (it is written to its own CSV further below).

df_submission_test_baseline = df_submission_test.copy()

In [None]:
# Baseline recommendations as one space-separated string. The leading space
# separates it from an existing prediction when it is appended.
baseline = " 0706016001 0706016002 0372860001 0610776002 0759871002 0464297007 0372860002 0610776001 0399223001 0706016003 0720125001 0156231001"

# Append the baseline to every prediction. Rows without a prediction (missing
# after the left merge) are treated as empty, so they end up with the baseline
# only; strip() then removes the leading space for those rows and for rows
# whose prediction was an empty string.
# NOTE: the complete baseline is appended to every row, so rows that already
# have recommendations contain more than 12 article ids.
#
# The result is assigned back explicitly instead of calling
# df[col].fillna(..., inplace=True): that form modifies a column selection and
# does not update the DataFrame when pandas Copy-on-Write is active.
df_submission_test_baseline['prediction'] = (
    df_submission_test_baseline['prediction'].fillna('') + baseline
).str.strip()
df_submission_test_baseline

In [None]:
# Print column dtypes and non-null counts of the baseline-filled submission.
df_submission_test_baseline.info()

In [None]:
# MBA with baseline: Store submission file as csv (without the DataFrame index):
df_submission_test_baseline.to_csv('../data/MBA-Baseline-WR-Reverse_20220510_submission_wardrobesize_2-20.csv', index = False)

In [None]:
# MBA without baseline: Store submission file as csv (without the DataFrame index).
# Customers without a rule-based recommendation keep a missing prediction here.
df_submission_test.to_csv('../data/MBA-wo-Baseline-WR-Reverse_20220510_submission_wardrobesize_2-20.csv', index = False)

In [None]:
# Print column dtypes and non-null counts of the submission without baseline.
df_submission_test.info()