In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Display numbers not in scientific notation:
pd.options.display.float_format = '{:.2f}'.format

import Functions_EDA_CMM as fe

# 1.0 Load and prepare data

In [2]:
# Read in the transaction data csv:

df_trans = pd.read_csv('../data/transactions_train.csv')

In [3]:
# Execute function to calculate number of orders for each customer (Assumption: one order is the sum of all purchases of a customer on one day)
number_orders = fe.calc_orders_cust(df_trans)

In [5]:
number_orders.head()

Unnamed: 0,customer_id,number_orders
0,a65f77281a528bf5c1e9f270141d601d116e1df33bf9df...,427
1,f137c16fd175271922dad4006565503952f24750a57388...,391
2,8df45859ccd71ef1e48e2ee9d1c65d5728c31c46ae957d...,390
3,788785852eddb5874f924603105f315d69571b3e5180f3...,322
4,e34f8aa5e7c8c258523ea3e5f5f13168b6c21a9e8bffcc...,318


In [6]:
# Add number of orders to every row in transaction file:
df_trans_orders = pd.merge(df_trans, number_orders, on="customer_id")

# Sort by customer id and number of orders to start wardrobe generation with customer ids who purchased a lot:
df_trans_sort = df_trans_orders.sort_values(['number_orders', 'customer_id'], ascending=[False, False])

In [10]:
df_trans_sort.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,datetime,number_orders
3386028,2018-09-24,a65f77281a528bf5c1e9f270141d601d116e1df33bf9df...,543729003,0.05,2,2018-09-24,427
3386029,2018-09-24,a65f77281a528bf5c1e9f270141d601d116e1df33bf9df...,610016001,0.04,2,2018-09-24,427
3386030,2018-09-24,a65f77281a528bf5c1e9f270141d601d116e1df33bf9df...,639199001,0.08,2,2018-09-24,427
3386031,2018-09-24,a65f77281a528bf5c1e9f270141d601d116e1df33bf9df...,573650001,0.02,2,2018-09-24,427
3386032,2018-09-26,a65f77281a528bf5c1e9f270141d601d116e1df33bf9df...,607712001,0.03,2,2018-09-26,427


In [7]:
# Store all customer ids in a list for using it in a for loop:
cust_list_all = list(df_trans_sort.customer_id.unique())

# 2.0 Create wardrobe dataframe

#### New version

In [16]:
from tqdm import tqdm

In [18]:
wardrobe_list = []
wardrobe_dict = {}

# Initialize counter, if dataframe should not be executed completely (exit-option)
i=0
# Loop through transaction dataframe for each customer
for cust in tqdm(cust_list_all):
    # print(f"customer id: {cust} and i: {i}")
    # Count one up (exit-option)
    i += 1
    # Query transaction dataframe for row with the current customer id and store purchased article_ids in a dict
    query = df_trans_sort.query(f'customer_id == "{cust}"')
    wardrobe_dict[cust] = query['article_id'].tolist()
    if i == 10000:
        # Exit-option: Break for loop after 1000 customer ids
        break

# Transfrom dict into dataframe
df_wardrobe3 = pd.DataFrame(list(wardrobe_dict.items()), columns=['customer_id', 'articles'])

df_wardrobe3.head()

  1%|          | 9999/1362281 [4:21:34<589:35:20,  1.57s/it]  


Unnamed: 0,customer_id,articles
0,a65f77281a528bf5c1e9f270141d601d116e1df33bf9df...,"[543729003, 610016001, 639199001, 573650001, 6..."
1,f137c16fd175271922dad4006565503952f24750a57388...,"[671505001, 675319001, 575141004, 673643001, 6..."
2,8df45859ccd71ef1e48e2ee9d1c65d5728c31c46ae957d...,"[660830005, 665535003, 675281002, 675281002, 6..."
3,788785852eddb5874f924603105f315d69571b3e5180f3...,"[189616006, 554598012, 554598028, 637005002, 6..."
4,e34f8aa5e7c8c258523ea3e5f5f13168b6c21a9e8bffcc...,"[478751007, 572517002, 572517002, 626366004, 5..."


In [19]:
# Save wardrobe as csv
df_wardrobe3.to_csv('../data/wardrobe_10000.csv')


In [21]:
df_wardrobe3.describe()

Unnamed: 0,customer_id,articles
count,10000,10000
unique,10000,10000
top,a65f77281a528bf5c1e9f270141d601d116e1df33bf9df...,"[543729003, 610016001, 639199001, 573650001, 6..."
freq,1,1


#### Old Version (uses samples of transaction dataset)

In [153]:
# df_trans_sample = df_trans.sample(n=60000, random_state=42)

In [154]:
# cust_list = list(df_trans_sample.customer_id.unique())

In [155]:
# wardrobe_list = []
# wardrobe_dict = {}
# # cust_list = list(df_trans.customer_id.unique())

# for cust in cust_list:
#     query = df_trans_sample.query(f'customer_id == "{cust}"')
   
#     wardrobe_dict[cust] = query['article_id'].tolist()
#     # wardrobe_list = []

# df_wardrobe = pd.DataFrame(list(wardrobe_dict.items()), columns=['customer_id', 'articles'])
# # df_wardrobe = pd.DataFrame.from_dict(wardrobe_dict, orient='index', columns=['customer_id', 'articles']).reset_index()
# # df_wardrobe = pd.DataFrame.from_dict(wardrobe_dict)

# # df_wardrobe.rename(columns={'index': 'customer_id', 'customer_id': 'articles', 'articles': 'ho'}, inplace=True)
# # df_wardrobe.drop(columns='ho', inplace=True)


# print(df_wardrobe.head())

                                         customer_id     articles
0  215895f90002eb3d1a04bd603513c8e85e6002ef08f136...  [786586001]
1  7b183268e3a4623b80d5325ec4a20a0af0edff7bcb1748...  [658911001]
2  2eb7412239a90c0570cd3d1bf0492856ae5b59058b1ea6...  [759326005]
3  74f162e5a170fd57207aa2a7d5c58479ee9de903b2a277...  [737137004]
4  aab9306ee28c4db494003955f80355e540b01480ab35cf...  [785931001]
