In [1]:
import numpy as np
import pandas as pd
import os
import glob
from tqdm import tqdm
import datetime
import gc
import random
# Show up to 500 columns and 500 rows when a frame is rendered in the notebook.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, 500)

In [2]:
# Load the transaction log; article_id is read as a string rather than parsed as a number.
transaction_dtypes = {'article_id': str}
data = pd.read_csv("transactions_train.csv", dtype=transaction_dtypes)

In [3]:
# Report the covered period (t_dat is still a string column here), then parse it to datetime.
first_day = data['t_dat'].min()
last_day = data['t_dat'].max()
print("All Transactions Date Range: {} to {}".format(first_day, last_day))

data["t_dat"] = pd.to_datetime(data["t_dat"])

All Transactions Date Range: 2018-09-20 to 2020-09-22


In [4]:
# Article and customer master data; article_id is kept as a string so it joins with the transactions.
articles_df = pd.read_csv('articles.csv', dtype=dict(article_id=str))
customers_df = pd.read_csv('customers.csv')

In [5]:
# Fold the upper-case 'NONE' spelling into 'None' so both count as the same category.
news_frequency = customers_df['fashion_news_frequency']
customers_df['fashion_news_frequency'] = news_frequency.replace('NONE', 'None')

# Age buckets use right-closed intervals: (15, 24], (24, 35], (35, 50], (50, 100].
# Missing ages are filled with 36, which places them in the '36-50' bucket.
age_bin_edges = [15, 24, 35, 50, 100]
age_bin_labels = ['16-24', '25-35', '36-50', '51-100']
age_filled = customers_df['age'].fillna(36)
customers_df['age_bucket'] = pd.cut(age_filled, age_bin_edges, labels=age_bin_labels)

In [6]:
# Quick sanity check: frame shape, number of distinct articles, garment groups and index groups.
(
    articles_df.shape,
    articles_df['article_id'].nunique(),
    articles_df['garment_group_name'].nunique(),
    articles_df['index_group_name'].nunique(),
)

((105542, 25), 105542, 21, 5)

In [7]:
# Keep the original string id in its own column before customer_id is converted to an integer.
customers_df = customers_df.assign(customer_id_old=customers_df['customer_id'])

In [8]:
def _hex_tail_to_int(customer_hash):
    """Interpret the last 16 hexadecimal characters of an id string as an integer."""
    return int(customer_hash[-16:], 16)


# Replace the long string ids with 64-bit integers in both frames so they stay joinable.
# NOTE(review): 16 hex digits cover the unsigned 64-bit range; the negative ids in later
# outputs suggest large values wrap when cast to int64 - confirm on the pandas version in use.
data['customer_id'] = data['customer_id'].apply(_hex_tail_to_int).astype('int64')
customers_df['customer_id'] = customers_df['customer_id'].apply(_hex_tail_to_int).astype('int64')

In [9]:
# Keep only transactions dated 2020-06-22 or later and renumber the rows.
is_recent = data['t_dat'] >= '2020-06-22'
data = data.loc[is_recent].reset_index(drop=True)

In [10]:
# Build an integer YYYYWW code from the ISO calendar.
# - `Series.dt.week` is deprecated (it emits a FutureWarning) and is removed in pandas 2.
# - Using the ISO year together with the ISO week keeps the pair consistent for dates
#   around New Year, where the ISO week can belong to the neighbouring year.
# - year * 100 + week is always six digits, so weeks 1-9 come out zero-padded (e.g. 202001)
#   and the `fix_weeks` replacement applied afterwards simply finds nothing to change.
iso_calendar = data["t_dat"].dt.isocalendar()
data['YearWeek'] = (iso_calendar['year'].astype(int) * 100 + iso_calendar['week'].astype(int)).astype(int)

  """Entry point for launching an IPython kernel.


In [11]:
# Map the unpadded 5-digit codes (year followed by a single-digit week) to zero-padded
# 6-digit codes: 20181 for 2018, and weeks 1-9 for 2019 and 2020.
fix_weeks = {20181: 201801}
for padded_year in (2019, 2020):
    for single_digit_week in range(1, 10):
        fix_weeks[padded_year * 10 + single_digit_week] = padded_year * 100 + single_digit_week

data['YearWeek'] = data['YearWeek'].replace(fix_weeks)

In [12]:
new_df = pd.DataFrame()

cust_list = data.customer_id.unique().tolist()

# Every integer between the first and last YearWeek code. A plain integer range only
# yields valid week codes while the span stays inside a single year.
all_weeks = list(range(data.YearWeek.min(), data.YearWeek.max() + 1))

# Cartesian product customer x week, stored as two parallel lists:
# each customer repeated once per week, and the week sequence repeated once per customer.
cust_week_list = [cust for cust in cust_list for _ in all_weeks]
week_list = all_weeks * len(cust_list)

len(week_list), len(cust_week_list)

(7495838, 7495838)

In [13]:
# Turn the two parallel lists into the customer x week grid, then release the lists.
new_df = new_df.assign(customer_id=cust_week_list, YearWeek=week_list)

del week_list, cust_week_list

In [14]:
# Descriptive text columns, joined left to right with single spaces into one field.
# A missing value in any of them makes the concatenated text missing as well.
text_columns = [
    'prod_name',
    'product_type_name',
    'product_group_name',
    'graphical_appearance_name',
    'colour_group_name',
    'perceived_colour_value_name',
    'perceived_colour_master_name',
    'department_name',
    'index_name',
    'index_group_name',
    'section_name',
    'garment_group_name',
    'detail_desc',
]

concat_text = articles_df[text_columns[0]]
for text_column in text_columns[1:]:
    concat_text = concat_text + ' ' + articles_df[text_column]

articles_df['Concat_text'] = concat_text

In [15]:
# Free-text article columns that are removed from articles_df in the following cell.
drop_cols = [
    'prod_name',
    'product_type_name',
    'product_group_name',
    'graphical_appearance_name',
    'colour_group_name',
    'perceived_colour_value_name',
    'perceived_colour_master_name',
    'department_name',
    'index_name',
    'index_group_name',
    'section_name',
    'garment_group_name',
    'detail_desc',
]

In [16]:
# Remove the listed text columns from the article table.
articles_df = articles_df.drop(columns=drop_cols)

In [17]:
# One-hot encode the article's index_code and attach the indicator columns to every
# transaction. The prefix 'index_code_' plus the default separator '_' yields column
# names such as 'index_code__A'.
#
# The dtype is pinned to uint8: pandas 2 changed the get_dummies default to bool, and
# boolean indicators stop being plain numeric columns once the later left merge onto
# the customer x week grid introduces missing values. uint8 is the pre-2.0 default,
# so results on older pandas are unchanged.
index_code_dummies = pd.get_dummies(articles_df['index_code'], prefix='index_code_', dtype=np.uint8)
article_index_codes = pd.concat([articles_df[['article_id']], index_code_dummies], axis=1)

data = data.merge(article_index_codes, on='article_id', how='left')

In [18]:
# Keep only the join keys and the index_code indicators from the transactions.
purchases = data.drop(columns=['t_dat', 'article_id', 'price', 'sales_channel_id'])

# Left-join onto the customer x week grid; weeks without a purchase get 0 in every
# indicator. A customer-week with several transactions yields several rows.
data = new_df.merge(purchases, on=['customer_id', 'YearWeek'], how='left').replace(np.nan, 0)

data = data.merge(customers_df[['age_bucket', 'customer_id']], on='customer_id', how='left')
del new_df, purchases

In [22]:
# Inspect the merged frame. A customer-week with several transactions still appears
# as several rows at this point (see the first two rows of the rendered output).
data

Unnamed: 0,customer_id,YearWeek,index_code__A,index_code__B,index_code__C,index_code__D,index_code__F,index_code__G,index_code__H,index_code__I,index_code__J,index_code__S,age_bucket
0,4180559723895677828,202026,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,25-35
1,4180559723895677828,202026,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,25-35
2,4180559723895677828,202027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25-35
3,4180559723895677828,202028,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25-35
4,4180559723895677828,202029,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25-35
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10483009,-551574007161385996,202036,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36-50
10483010,-551574007161385996,202037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36-50
10483011,-551574007161385996,202038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36-50
10483012,-551574007161385996,202039,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,36-50


In [None]:
# Collapse to one row per customer and week by summing the index_code indicators.
# age_bucket is removed first so that only numeric columns are summed.
data = (
    data
    .drop(columns='age_bucket')
    .groupby(['customer_id', 'YearWeek'])
    .sum()
    .reset_index()
)

In [100]:
# Remove a FiveSplit column left over from an earlier run of the following cell, so
# that re-running the merge there does not create suffixed duplicate columns.
# errors='ignore' makes this a no-op on a fresh kernel, where the column does not
# exist yet; without it the cell raises KeyError under Restart & Run All.
data = data.drop(columns=['FiveSplit'], errors='ignore')

In [101]:
# Assign each distinct week to a recency bucket (1 = oldest ... 6 = most recent).
# The bucket list has 14 entries, so `data` must contain exactly 14 distinct YearWeek
# values; they are paired in the order `unique()` returns them.
distinct_weeks = data['YearWeek'].unique().tolist()
week_split = pd.DataFrame({
    'YearWeek': distinct_weeks,
    'FiveSplit': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6],
})

data = data.merge(week_split, on='YearWeek', how='left')

del week_split

In [102]:
# Sum the indicator counts per customer and recency bucket.
week_split = (
    data
    .drop(columns='YearWeek')
    .groupby(['customer_id', 'FiveSplit'])
    .sum()
    .reset_index()
)

In [103]:
# Spot-check a single customer's per-bucket counts before weighting.
week_split.loc[week_split['customer_id'] == 9223148401910457466]

Unnamed: 0,customer_id,FiveSplit,index_code__A,index_code__B,index_code__C,index_code__D,index_code__F,index_code__G,index_code__H,index_code__I,index_code__J,index_code__S
3212496,9223148401910457466,1,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3212497,9223148401910457466,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3212498,9223148401910457466,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3212499,9223148401910457466,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3212500,9223148401910457466,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3212501,9223148401910457466,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [104]:
target_list = [col for col in week_split.columns if 'index_code__' in col]

# Recency weighting: counts in bucket k (k = 1..5) are multiplied by 0.2 * k.
# Rows of bucket 6 are left unscaled.
for bucket in range(1, 6):
    in_bucket = week_split.FiveSplit == bucket
    week_split.loc[in_bucket, target_list] = 0.2 * bucket * week_split.loc[in_bucket, target_list]

# Per-customer total over all buckets (weighted 1-5 plus unscaled 6), labelled as bucket 6.
six_split = (
    week_split
    .drop(columns='FiveSplit')
    .groupby('customer_id')
    .sum()
    .reset_index()
)
six_split['FiveSplit'] = 6

In [108]:
# Swap the raw bucket-6 rows for the per-customer totals computed in six_split.
week_split = week_split.loc[week_split['FiveSplit'] != 6].reset_index(drop=True)

# Align the column order of the totals with week_split before stacking.
six_split = six_split[list(week_split.columns)]

week_split = (
    pd.concat([week_split, six_split], axis=0)
    .sort_values(['customer_id', 'FiveSplit'])
    .reset_index(drop=True)
)

del six_split

In [109]:
# Spot-check the same customer after weighting; bucket 6 now holds the weighted total.
week_split.loc[week_split['customer_id'] == 9223148401910457466]

Unnamed: 0,customer_id,FiveSplit,index_code__A,index_code__B,index_code__C,index_code__D,index_code__F,index_code__G,index_code__H,index_code__I,index_code__J,index_code__S
3212496,9223148401910457466,1,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3212497,9223148401910457466,2,0.0,0.0,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0
3212498,9223148401910457466,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3212499,9223148401910457466,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3212500,9223148401910457466,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3212501,9223148401910457466,6,0.6,0.0,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0


In [128]:
# One row per customer: the bucket-6 totals. `df` holds only the score columns.
new_df = week_split.loc[week_split['FiveSplit'] == 6].reset_index(drop=True)
df = new_df.drop(columns=['customer_id', 'FiveSplit'])

In [131]:
# First preference: the column(s) holding the row maximum; ties are joined with ', '.
score_cols = df.columns
row_best = df.max(axis=1)
is_best = df.eq(row_best, axis=0)

new_df['P1Column'] = is_best.apply(lambda flags: ', '.join(score_cols[flags]), axis=1)

In [133]:
# Second preference: the column(s) whose score equals the second-largest value in the row.
# Duplicates count, so when the top score is shared the second-largest value equals the
# maximum and P2 repeats P1 (a later cell blanks that case).
# NOTE(review): when the second-largest value is 0, every zero-score column is listed,
# as visible in the rendered output further down.
#
# The value is taken from one vectorised sort instead of a per-row `nlargest` call;
# the resulting values are the same, without a Python-level call for every row.
score_cols = df.columns
second_best = pd.Series(np.sort(df.values, axis=1)[:, -2], index=df.index)
is_second_best = df.eq(second_best, axis=0)

new_df['P2Column'] = is_second_best.apply(lambda flags: ', '.join(score_cols[flags]), axis=1)

In [134]:
# Third preference: the column(s) whose score equals the third-largest value in the row
# (duplicates count, so ties higher up make this repeat P1 or P2).
# NOTE(review): when the third-largest value is 0, every zero-score column is listed,
# as visible in the rendered output further down.
#
# The value is taken from one vectorised sort instead of a per-row `nlargest` call;
# the resulting values are the same, without a Python-level call for every row.
score_cols = df.columns
third_best = pd.Series(np.sort(df.values, axis=1)[:, -3], index=df.index)
is_third_best = df.eq(third_best, axis=0)

new_df['P3Column'] = is_third_best.apply(lambda flags: ', '.join(score_cols[flags]), axis=1)

del df

In [137]:
# Blank out a preference that merely repeats the previous one (this happens on ties).
# Both comparisons are evaluated BEFORE either column is overwritten: comparing P3 with
# an already-blanked P2 never matches (NaN equals nothing), which left P3 duplicating P1
# whenever three or more columns tied for the top score.
p2_repeats_p1 = new_df['P1Column'] == new_df['P2Column']
p3_repeats_p2 = new_df['P2Column'] == new_df['P3Column']

new_df['P2Column'] = np.where(p2_repeats_p1, np.nan, new_df['P2Column'])
new_df['P3Column'] = np.where(p3_repeats_p2, np.nan, new_df['P3Column'])

Unnamed: 0,customer_id,FiveSplit,index_code__A,index_code__B,index_code__C,index_code__D,index_code__F,index_code__G,index_code__H,index_code__I,index_code__J,index_code__S,P1Column,P2Column,P3Column
0,-9223352921020755230,6,0.8,0.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"index_code__A, index_code__C",,"index_code__B, index_code__D, index_code__F, i..."
1,-9223343869995384291,6,9.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.4,index_code__A,index_code__S,"index_code__B, index_code__C, index_code__D, i..."
2,-9223293121067732640,6,0.4,1.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,index_code__B,index_code__A,"index_code__C, index_code__D, index_code__F, i..."
3,-9223290575350349271,6,0.6,0.6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,index_code__S,index_code__C,"index_code__A, index_code__B"
4,-9223279922255655589,6,3.6,0.0,0.0,1.2,0.6,0.0,0.0,0.0,0.0,0.0,index_code__A,index_code__D,index_code__F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
535412,9222940818992675193,6,2.0,1.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.4,index_code__S,index_code__A,index_code__B
535413,9223099843213569889,6,3.6,0.0,1.6,2.0,0.0,0.0,0.0,0.0,0.0,0.0,index_code__A,index_code__D,index_code__C
535414,9223141695752178477,6,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,index_code__D,"index_code__A, index_code__B, index_code__C, i...",
535415,9223144259947635662,6,1.0,0.0,0.0,0.0,3.6,0.0,0.0,0.0,0.0,0.0,index_code__F,index_code__A,"index_code__B, index_code__C, index_code__D, i..."


In [139]:
# Keep the id and the three preference columns, then bring back the original string id.
id_lookup = customers_df[['customer_id_old', 'customer_id']]
preference_cols = ['customer_id', 'P1Column', 'P2Column', 'P3Column']

new_df = new_df.loc[:, preference_cols].merge(id_lookup, on='customer_id', how='left')
new_df

Unnamed: 0,customer_id,P1Column,P2Column,P3Column,customer_id_old
0,-9223352921020755230,"index_code__A, index_code__C",,"index_code__B, index_code__D, index_code__F, i...",e7df12e603a24cdaa788472825ee8ed928c1a05ea2c0bd...
1,-9223343869995384291,index_code__A,index_code__S,"index_code__B, index_code__C, index_code__D, i...",2d4e2425458bf97676503ce907ef5ee22bd84fb1f7999f...
2,-9223293121067732640,index_code__B,index_code__A,"index_code__C, index_code__D, index_code__F, i...",894fa1e94d6aec5c82ba0cdb363e773180b9814a7138dc...
3,-9223290575350349271,index_code__S,index_code__C,"index_code__A, index_code__B",2f41fce7971687a9bdd279e2a1ea8f0c576531a6337eaa...
4,-9223279922255655589,index_code__A,index_code__D,index_code__F,c1a47f2b5a187d30f0df97a272ac6af156e162c34bdfd8...
...,...,...,...,...,...
535412,9222940818992675193,index_code__S,index_code__A,index_code__B,a82700ee07c11cc67440f231cb3945f1464b17fc61e64c...
535413,9223099843213569889,index_code__A,index_code__D,index_code__C,a50d0faab1d3f60b464e91d4e5c7d1e99d4f2cd2c54dd9...
535414,9223141695752178477,index_code__D,"index_code__A, index_code__B, index_code__C, i...",,2f9cd196629e31761d1faf1b42e159e597f0a5cb225656...
535415,9223144259947635662,index_code__F,index_code__A,"index_code__B, index_code__C, index_code__D, i...",de9eea5cb3a575ed4e19e00379026eaad6f27785a1d5bf...


In [140]:
# Persist the per-customer preference columns (with both id forms) without the row index.
new_df.to_csv('candidate_groups.csv', index=False)