In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas deprecation/dtype warnings — consider narrowing it.
warnings.filterwarnings('ignore')

# Render floats with two fixed decimals instead of scientific notation:
pd.options.display.float_format = '{:.2f}'.format

import Functions_EDA_CMM as fe

# 1.0 Load and prepare data

In [2]:
# Read in the transaction data csv (the path is relative to the notebook's working directory).
# NOTE(review): no dtype is passed, so pandas infers the column types; the preview of this
# frame shows article_id as an integer — confirm that this is intended.

df_trans = pd.read_csv('../data/transactions_train.csv')

In [3]:
# Preview the first rows of the raw transactions:
df_trans.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.05,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.03,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.02,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.02,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.02,2


In [None]:
# NOT NEEDED FOR VERSION 20220425
# Calculate the number of orders of each customer with the project helper
# (assumption: one order is the sum of all purchases of a customer on one day).
# NOTE(review): the result is presumably a frame with `customer_id` and `number_orders`
# columns, since later cells merge on and sort by them — confirm in Functions_EDA_CMM.
number_orders = fe.calc_orders_cust(df_trans)

In [None]:
# Inspect the first 100 rows of the order counts:
number_orders.head(100)

In [None]:
# NOT NEEDED FOR VERSION 20220425

# Attach each customer's order count to every one of their transaction rows:
df_trans_orders = df_trans.merge(number_orders, on="customer_id")

# Put the customers with the most orders first (ties: customer id descending), so that
# wardrobe generation starts with the customers who purchased a lot:
df_trans_sort = df_trans_orders.sort_values(by=['number_orders', 'customer_id'], ascending=False)

In [None]:
# Preview the sorted transactions:
df_trans_sort.head()

In [None]:
# NOT NEEDED FOR VERSION 20220425

# Collect all customer ids in a plain list so they can be iterated in a for loop.
# Alternative kept for reference (ids taken from the sorted transactions instead):
# cust_list_all = list(df_trans_sort.customer_id.unique())
# NOTE(review): the list follows the row order of number_orders, not of df_trans_sort —
# verify that this is the order the wardrobe generation should start with.

cust_list_all = list(number_orders['customer_id'])

# 2.0 Create wardrobe dataframe

## 2.1 Version 20220425

In [4]:
# Discard the date, price and sales-channel columns, which the wardrobe does not need:

df_trans_red = df_trans.drop(['t_dat', 'price', 'sales_channel_id'], axis='columns').copy()

In [6]:
# Preview the reduced transactions:
df_trans_red.head()

Unnamed: 0,customer_id,article_id
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001
1,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023
2,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004
3,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003
4,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004


In [7]:
# One row per customer: article_id becomes the list of all article ids of that customer
# (repeated purchases stay in the list).
wardrobe_df = (
    df_trans_red
    .groupby('customer_id')['article_id']
    .agg(list)
    .reset_index()
)


In [8]:
# Show the wardrobe frame (pandas truncates the display of long frames):
wardrobe_df

Unnamed: 0,customer_id,article_id
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,"[625548001, 176209023, 627759010, 697138006, 5..."
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,"[583558001, 639677008, 640244003, 521269001, 6..."
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"[663713001, 541518023, 663713001, 578020002, 7..."
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,"[742079001, 732413001]"
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,"[634249005, 677049001, 698286003, 707704003, 3..."
...,...,...
1362276,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,"[698276003, 699075005, 694182002, 722436003, 7..."
1362277,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,"[671695001, 562245015, 562245018, 562245004, 6..."
1362278,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,"[568597019, 484398001, 484398001, 701083001, 5..."
1362279,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,"[821395003, 806241002, 714790020, 866755002, 8..."


Test whether both versions produce the same results:

In [25]:
# Number of articles in the wardrobe of one sample customer:
sample_rows = wardrobe_df['customer_id'] == '089cb17f70586ad3dacb6e0ced2352c5464ab575b49d0680a463f9218efdd566'
wardrobe_df.loc[sample_rows, 'article_id'].str.len()

45722    92
Name: article_id, dtype: int64

In [33]:
# Wardrobe of the sample customer as produced by the groupby one-liner:
test_1 = wardrobe_df[wardrobe_df['customer_id'] == '089cb17f70586ad3dacb6e0ced2352c5464ab575b49d0680a463f9218efdd566']
test_1['article_id'].values

array([list([688873002, 661162001, 399136004, 584992003, 584992003, 580301001, 580301001, 673677004, 706108001, 640735007, 737735002, 685689001, 685689001, 685689001, 685689001, 654100005, 682848004, 693584002, 673396002, 693956001, 697686002, 708588001, 514937001, 587229001, 720346005, 713200003, 636902001, 600886011, 648769001, 648769001, 399136027, 399201005, 788178001, 705955002, 726228001, 705955004, 705955004, 705955004, 399201022, 788178001, 705955002, 705955002, 705955002, 669360001, 740215005, 797988002, 736963001, 714032005, 797710001, 756318002, 819139001, 836258001, 790635001, 834412001, 809961002, 834412001, 802974003, 802974003, 862970001, 862970001, 805000004, 573390001, 833548001, 795777001, 819547002, 716670007, 842952003, 796137001, 716670007, 796137001, 814766001, 795440001, 796137001, 831211002, 716670007, 801512004, 822186001, 720125041, 881497001, 881497001, 881497001, 883021003, 909080002, 909080002, 862970002, 864339003, 864339003, 836699001, 843373001, 84337300

In [19]:
# Load the wardrobe csv written by the old loop-based version (first csv column is used as index).
# NOTE(review): after the csv round trip the `articles` column holds strings rather than
# lists (the value printed for test_2 is quoted) — parse it before comparing element-wise.
test_wardrobe = pd.read_csv('../data/wardrobe_10000.csv', index_col=0)

In [32]:
# Wardrobe of the same sample customer as stored by the old loop-based version:
test_2 = test_wardrobe[test_wardrobe['customer_id'] == '089cb17f70586ad3dacb6e0ced2352c5464ab575b49d0680a463f9218efdd566']
test_2['articles'].values

array(['[688873002, 661162001, 399136004, 584992003, 584992003, 580301001, 580301001, 673677004, 706108001, 640735007, 737735002, 685689001, 685689001, 685689001, 685689001, 654100005, 682848004, 693584002, 673396002, 693956001, 697686002, 708588001, 514937001, 587229001, 720346005, 713200003, 636902001, 600886011, 648769001, 648769001, 399136027, 399201005, 788178001, 705955002, 726228001, 705955004, 705955004, 705955004, 399201022, 788178001, 705955002, 705955002, 705955002, 669360001, 740215005, 797988002, 736963001, 714032005, 797710001, 756318002, 819139001, 836258001, 790635001, 834412001, 809961002, 834412001, 802974003, 802974003, 862970001, 862970001, 805000004, 573390001, 833548001, 795777001, 819547002, 716670007, 842952003, 796137001, 716670007, 796137001, 814766001, 795440001, 796137001, 831211002, 716670007, 801512004, 822186001, 720125041, 881497001, 881497001, 881497001, 883021003, 909080002, 909080002, 862970002, 864339003, 864339003, 836699001, 843373001, 843373001, 8

RESULT: Both variants of the dataframe creation deliver the same number of articles for the selected customer_id. This means that the one-liner of Version 20220425 is working correctly and can be applied.

## 2.2 Version 20220424 (OLD)

Can be deleted later.

In [None]:
from tqdm import tqdm

In [None]:
# Drop all columns that are not needed for the wardrobe.
# Rebinding the name (instead of inplace=True) together with errors='ignore' keeps this cell
# re-runnable: a second execution no longer raises KeyError for columns that are already gone.

df_trans_sort = df_trans_sort.drop(
    columns=['t_dat', 'price', 'sales_channel_id', 'datetime', 'number_orders'],
    errors='ignore',
)

In [None]:
# Show the reduced, sorted transactions:
df_trans_sort

In [None]:
# Build one wardrobe (list of purchased article ids) per customer.
# A single groupby pass replaces the former loop that ran DataFrame.query over the whole
# transaction frame once per customer.

# Exit-option: only the first MAX_CUSTOMERS ids of cust_list_all are processed
MAX_CUSTOMERS = 10000
selected_customers = cust_list_all[:MAX_CUSTOMERS]

# Restrict the transactions to the selected customers and group their article ids;
# the rows inside each group keep the order they have in df_trans_sort
selected_articles = (
    df_trans_sort[df_trans_sort['customer_id'].isin(selected_customers)]
    .groupby('customer_id', sort=False)['article_id']
)
articles_by_customer = {cust: articles.tolist() for cust, articles in selected_articles}

# Keep the order of cust_list_all; a customer id without any transaction row gets an empty list
wardrobe_dict = {cust: articles_by_customer.get(cust, []) for cust in selected_customers}

# Transform dict into dataframe
df_wardrobe3 = pd.DataFrame(list(wardrobe_dict.items()), columns=['customer_id', 'articles'])

df_wardrobe3.head()

In [None]:
# Save wardrobe as csv. The DataFrame index is written as the first csv column, which is
# why the file is read back with index_col=0 elsewhere in this notebook.
df_wardrobe3.to_csv('../data/wardrobe_10000.csv')


In [None]:
# Summary statistics of the wardrobe frame:
df_wardrobe3.describe()

## 2.3 Version 20220421 (OLD - uses samples of transaction dataset)

Can be deleted later.

In [None]:
# df_trans_sample = df_trans.sample(n=60000, random_state=42)

In [None]:
# cust_list = list(df_trans_sample.customer_id.unique())

In [None]:
# wardrobe_list = []
# wardrobe_dict = {}
# # cust_list = list(df_trans.customer_id.unique())

# for cust in cust_list:
#     query = df_trans_sample.query(f'customer_id == "{cust}"')
   
#     wardrobe_dict[cust] = query['article_id'].tolist()
#     # wardrobe_list = []

# df_wardrobe = pd.DataFrame(list(wardrobe_dict.items()), columns=['customer_id', 'articles'])
# # df_wardrobe = pd.DataFrame.from_dict(wardrobe_dict, orient='index', columns=['customer_id', 'articles']).reset_index()
# # df_wardrobe = pd.DataFrame.from_dict(wardrobe_dict)

# # df_wardrobe.rename(columns={'index': 'customer_id', 'customer_id': 'articles', 'articles': 'ho'}, inplace=True)
# # df_wardrobe.drop(columns='ho', inplace=True)


# print(df_wardrobe.head())