# Market Basket Analysis on Wardrobes

This notebook generates association rules based on the wardrobe of each selected customer_id. Customers are selected by the number of articles in their wardrobe. The allowed wardrobe sizes can be defined with the variables lower_bound and upper_bound (see section 1.0).

Be aware:
The larger the wardrobe sizes, the longer the estimation will take.

In [None]:
# Import necessary libraries:
import pandas as pd
import numpy as np
from tqdm import tqdm
import gc
import warnings
warnings.filterwarnings('ignore')

# IMPORTANT: apyori and mlxtend libraries both have methods which are called apriori. 
# Be sure to import only one of the libraries in order to use apriori.

from apyori import apriori
# from mlxtend.frequent_patterns import apriori
# from mlxtend.frequent_patterns import association_rules

# Render floats with six fixed decimals instead of scientific notation:
pd.set_option('display.float_format', '{:.6f}'.format)

# Load data

In [None]:
# Read the transactions file; article_id is read with pandas' 'string' dtype.
column_dtypes = {'article_id': 'string'}
df_trans = pd.read_csv('../data/transactions_train.csv', dtype=column_dtypes)
df_trans.tail()


In [None]:
# Train test split:
# the training set is every transaction whose t_dat lies before the cutoff in the query.
train_query = 't_dat < "2020-08-26"'
df_trans_train = df_trans.query(train_query).copy()
df_trans_train.tail()

# 1.0 Model on wardrobes without 'None'

https://www.analyticsvidhya.com/blog/2021/10/a-comprehensive-guide-on-market-basket-analysis/

Create association rules based on wardrobes (not on baskets) without Nones in list-variable "wardrobes_as_list".

Apyori library is used here.

In [None]:
# Generate wardrobe:
# one row per customer_id holding the list of all article_ids from that
# customer's training transactions.

columns_to_drop = ['t_dat', 'price', 'sales_channel_id']
df_trans_red = df_trans_train.drop(columns=columns_to_drop).copy()

df_wardrobe = (
    df_trans_red
    .groupby('customer_id')['article_id']
    .agg(list)
    .reset_index()
    .copy()
)
df_wardrobe.head()


In [None]:
# Store the length of every wardrobe list in 'no_articles'
# (needed further down to select a smaller dataset):

df_wardrobe['no_articles'] = df_wardrobe['article_id'].map(len)
df_wardrobe.sort_values(by='no_articles', ascending=False)


In [None]:
# Select the customers that take part in the apriori association rule mining,
# based on the number of articles in their wardrobe.
# BE AWARE: the more articles a wardrobe holds, the longer the estimation will take.

# Lower bound (inclusive). Keep this at 2: a wardrobe with fewer than 2 articles
# cannot contain any association.
lower_bound = 2
# Upper bound (inclusive): the maximum number of articles a wardrobe may hold.
upper_bound = 20

wardrobe_sizes = df_wardrobe['no_articles']
size_in_range = wardrobe_sizes.gt(lower_bound - 1) & wardrobe_sizes.lt(upper_bound + 1)
df_wardrobe_small = df_wardrobe[size_in_range]
df_wardrobe_small.shape

In [None]:
# Preview the selected wardrobes, ordered from most to fewest articles:
df_wardrobe_small.sort_values(by='no_articles', ascending=False).head(5)

In [None]:
# Collect all selected wardrobes in one list of lists
# (wardrobe sizes are limited by lower_bound and upper_bound):
wardrobes_as_list = df_wardrobe_small.article_id.tolist()

In [None]:
# Free memory: drop the names of the dataframes defined in the cells above,
# then trigger a garbage collection run.

del df_trans, df_trans_red, df_trans_train
del df_wardrobe, df_wardrobe_small
gc.collect()

In [None]:
# Check if list is filled:
# show only the number of wardrobes and a small sample. Displaying the complete
# list would write every selected wardrobe into the notebook output.
print(f'Number of wardrobes: {len(wardrobes_as_list)}')
wardrobes_as_list[:5]

In [None]:
# Applying apriori algorithm
#
# apyori's apriori() reads only the keywords min_support, min_confidence, min_lift
# and max_length. A `min_length` keyword is silently ignored, so the minimum
# itemset size is enforced explicitly on the returned records instead.
min_rule_length = 2

association_rules = apriori(wardrobes_as_list, min_support=0.0001, min_confidence=0.2, min_lift=1)

association_results = []
for record in association_rules:
    # record is a RelationRecord namedtuple: (items, support, ordered_statistics).
    if len(record.items) < min_rule_length:
        continue
    # apyori also emits a statistic with an empty antecedent (confidence == support,
    # lift == 1). It is not a rule "A -> B", so it is removed here; otherwise it
    # could end up as the first statistic of a record.
    directed_statistics = [stat for stat in record.ordered_statistics if stat.items_base]
    if directed_statistics:
        association_results.append(record._replace(ordered_statistics=directed_statistics))

In [None]:
# Show the itemset of every association rule together with its size:

for rule in association_results:
    itemset = rule[0]
    print(f"Rule = {itemset} , len of rule= {len(itemset)}")

# Report how many association rules were found:

rule_count = len(association_results)
print(f'Number of association rules found: {rule_count}')

In [None]:
# Show a single association rule as an example.
# example_index may be any position from 0 to len(association_results) - 1:
example_index = 5
print(association_results[example_index])

In [None]:
# Write the unprocessed association rules to a csv file:
plain_results_path = '../data/20220510_plain_association_results_wardrobesize_2-20_wo_none.csv'
association_results_df_plain = pd.DataFrame(association_results)
association_results_df_plain.to_csv(plain_results_path)

In [None]:
# Fill dictionary with results in order to store a result table later.
#
# Every element of association_results is indexed as
#   item[0] -> itemset (a frozenset, so its iteration order is arbitrary)
#   item[1] -> support of the itemset
#   item[2] -> list of ordered statistics, each one being
#              (items_base, items_add, confidence, lift)
# One ordered statistic describes ONE directed rule items_base -> items_add.
# Antecedent and consequent are therefore read from the statistic itself, so they
# always belong to the confidence and lift stored in the same row. Reading them
# from the itemset's iteration order can swap the direction of the rule.
association_results_dict = {'antecedants': [], 'consequents':[], 'support': [], 'confidence': [], 'lift':[]}

for item in association_results:
    support = item[1]
    # A record can carry several rules (e.g. A -> B and B -> A); keep all of them.
    for items_base, items_add, confidence, lift in item[2]:
        if not items_base:
            # A statistic with an empty antecedent is no rule "A -> B"; skip it.
            continue
        # A one-article side stays the plain article id; several articles are
        # joined with ',' in sorted order so the output is deterministic.
        association_results_dict['antecedants'].append(','.join(sorted(map(str, items_base))))
        association_results_dict['consequents'].append(','.join(sorted(map(str, items_add))))
        association_results_dict['support'].append(support)
        association_results_dict['confidence'].append(confidence)
        association_results_dict['lift'].append(lift)

# A single summary line instead of one printed block per rule keeps the output small.
print(f"Number of directed rules stored: {len(association_results_dict['support'])}")

In [None]:
# Turn the result dictionary into a dataframe (one column per dictionary key):
association_results_df_table = pd.DataFrame(association_results_dict)
association_results_df_table.head()

In [None]:
# Check shape of result-table dataframe, i.e. (number of rows, number of columns):
association_results_df_table.shape

In [None]:
# Write the association result table to a csv file.
# Adjust the file name before executing this cell:

table_results_path = '../data/20220510_table_association_results_wardrobesize_2-20_wo_none.csv'
association_results_df_table.to_csv(table_results_path)

# 2.0 Code-Backup

Resource:
https://stackoverflow.com/questions/35491274/split-a-pandas-column-of-lists-into-multiple-columns


In [None]:
# NOT NEEDED ANYMORE
# # Generate columns where every article_id is in one column:
# df_only_articles = pd.DataFrame(df_wardrobe_small['article_id'].to_list(), index=df_wardrobe_small.index)

# # Join dataframe with only articles to wardrobe on index:

# df_wardrobe_small = df_wardrobe_small.join(df_only_articles)

# # Drop not needed columns:

# df_wardrobe_small_red = df_wardrobe_small.drop(columns=['customer_id', 'article_id', 'no_articles'])
# df_wardrobe_small_red


In [None]:
# NOT NEEDED ANYMORE
# Converting dataframe into list of lists:

# l=[]

# for i in range (0, len(df_wardrobe_small_red)):
#     l.append([str(df_wardrobe_small_red.values[i,j]) for j in range(0,df_wardrobe_small_red.shape[1])])

# l

In [None]:
# NOT NEEDED ANYMORE
# # Delete 'None'-strings from all lists:

# l_new = []
# for i in l:
#     l_new.append([ ele for ele in i if ele != 'None' ])