In [1]:
import numpy as np
import pandas as pd
import os
import glob
from tqdm import tqdm
import datetime
import gc
import random

### Forming Train Set

In [2]:
# Load the raw transaction log. article_id is forced to str so ids keep their
# leading zeros (e.g. "0929511001") instead of being parsed as integers.
data = pd.read_csv("transactions_train.csv", dtype={"article_id": str})

In [3]:
# Report the covered date span while t_dat is still the raw string column
# (so the dates print without a time component), then parse it to datetime64.
print(
    "All Transactions Date Range: {} to {}".format(
        data["t_dat"].min(),
        data["t_dat"].max(),
    )
)
data["t_dat"] = pd.to_datetime(data["t_dat"])

All Transactions Date Range: 2018-09-20 to 2020-09-22


In [4]:
# Load the two lookup tables. As with the transactions, article_id is read as
# str so that leading zeros survive and the later merge keys match.
customers_df = pd.read_csv('customers.csv')
articles_df = pd.read_csv('articles.csv', dtype={'article_id': str})

In [5]:
# Collapse the upper-case 'NONE' spelling into 'None' so both count as one level.
customers_df['fashion_news_frequency'] = (
    customers_df['fashion_news_frequency'].replace({'NONE': 'None'})
)

# Bucket customers by age. Missing ages are imputed with 36 before binning,
# which places them in the '36-50' bucket. Bin edges are right-inclusive
# (pd.cut default), so any age outside (15, 100] yields a NaN bucket.
customers_df['age_bucket'] = pd.cut(
    customers_df['age'].fillna(36),
    bins=[15, 24, 35, 50, 100],
    labels=['16-24', '25-35', '36-50', '51-100'],
)

In [6]:
# Sanity check: frame shape, then the number of distinct article ids,
# garment groups and index groups.
(
    articles_df.shape,
    articles_df['article_id'].nunique(),
    articles_df['garment_group_name'].nunique(),
    articles_df['index_group_name'].nunique(),
)

((105542, 25), 105542, 21, 5)

In [7]:
# Peek at the first ten articles, transposed so each column reads as a row.
articles_df.iloc[:10].T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
article_id,0108775015,0108775044,0108775051,0110065001,0110065002,0110065011,0111565001,0111565003,0111586001,0111593001
product_code,108775,108775,108775,110065,110065,110065,111565,111565,111586,111593
prod_name,Strap top,Strap top,Strap top (1),OP T-shirt (Idro),OP T-shirt (Idro),OP T-shirt (Idro),20 den 1p Stockings,20 den 1p Stockings,Shape Up 30 den 1p Tights,Support 40 den 1p Tights
product_type_no,253,253,253,306,306,306,304,302,273,304
product_type_name,Vest top,Vest top,Vest top,Bra,Bra,Bra,Underwear Tights,Socks,Leggings/Tights,Underwear Tights
product_group_name,Garment Upper body,Garment Upper body,Garment Upper body,Underwear,Underwear,Underwear,Socks & Tights,Socks & Tights,Garment Lower body,Socks & Tights
graphical_appearance_no,1010016,1010016,1010017,1010016,1010016,1010016,1010016,1010016,1010016,1010016
graphical_appearance_name,Solid,Solid,Stripe,Solid,Solid,Solid,Solid,Solid,Solid,Solid
colour_group_code,9,10,11,9,10,12,9,13,9,9
colour_group_name,Black,White,Off White,Black,White,Light Beige,Black,Beige,Black,Black


In [8]:
# Number of distinct values in each descriptive/grouping column of the
# articles table, to compare how fine-grained each one is.
articles_df.loc[
    :,
    [
        'index_group_name',
        'index_name',
        'product_group_name',
        'product_type_name',
        'garment_group_name',
        'section_name',
        'department_name',
    ],
].nunique()

index_group_name        5
index_name             10
product_group_name     19
product_type_name     131
garment_group_name     21
section_name           56
department_name       250
dtype: int64

In [9]:
# Distinct (index_group_name, index_name, product_group_name) combinations
# that actually occur in the articles table.
articles_df.loc[
    :, ['index_group_name', 'index_name', 'product_group_name']
].drop_duplicates()

Unnamed: 0,index_group_name,index_name,product_group_name
0,Ladieswear,Ladieswear,Garment Upper body
3,Ladieswear,Lingeries/Tights,Underwear
6,Ladieswear,Lingeries/Tights,Socks & Tights
8,Ladieswear,Lingeries/Tights,Garment Lower body
11,Baby/Children,Baby Sizes 50-98,Garment Upper body
...,...,...,...
95238,Baby/Children,Baby Sizes 50-98,Unknown
95767,Baby/Children,"Children Accessories, Swimwear",Fun
97586,Ladieswear,Ladieswear,Stationery
98242,Baby/Children,"Children Accessories, Swimwear",Bags


In [10]:
# Which index_name values contain 'Garment Upper body' articles?
articles_df.loc[
    articles_df['product_group_name'].eq('Garment Upper body'),
    'index_name',
].unique()

array(['Ladieswear', 'Baby Sizes 50-98', 'Sport',
       'Children Sizes 134-170', 'Menswear', 'Divided',
       'Children Sizes 92-140', 'Children Accessories, Swimwear',
       'Ladies Accessories', 'Lingeries/Tights'], dtype=object)

In [11]:
# Peek at the first five customers, transposed so each column reads as a row.
customers_df.head(5).T

Unnamed: 0,0,1,2,3,4
customer_id,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...
FN,,,,,1
Active,,,,,1
club_member_status,ACTIVE,ACTIVE,ACTIVE,ACTIVE,ACTIVE
fashion_news_frequency,,,,,Regularly
age,49,25,24,54,52
postal_code,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...
age_bucket,36-50,25-35,16-24,51-100,51-100


In [12]:
# Attach each customer's age bucket to every transaction row. The left join
# keeps transactions whose customer_id has no match (age_bucket is NaN there).
data = data.merge(customers_df[['age_bucket', 'customer_id']], on='customer_id', how='left')

# One-hot encode each article's index_name and join the indicator columns onto
# the transactions, so that summing them later counts purchases per index_name.
data = data.merge(
    pd.concat(
        [articles_df[['article_id']], pd.get_dummies(articles_df['index_name'])],
        axis=1,
    ),
    on='article_id',
    how='left',
)

# Integer week key: ISO year * 100 + ISO week (e.g. 202038).
#
# This replaces str(dt.year) + str(dt.week), which had three problems:
#   * Series.dt.week is deprecated and removed in pandas 2.0.
#   * It mixed the calendar year with the ISO week number, so e.g. 2019-12-30
#     (ISO week 1 of 2020) became 20191 and collided with early January 2019.
#   * The week was not zero-padded, so week 5 of 2020 became 20205, which
#     sorts before 201840.
# NOTE: keys for weeks 1-9 and for days around New Year differ from the old
# format; CSVs exported with the old key must be regenerated before joining.
data['YearWeek'] = (
    data['t_dat'].dt.isocalendar()
    .pipe(lambda iso: iso['year'] * 100 + iso['week'])
    .astype('int64')
)

  """


In [13]:
# Inspect the last five enriched transactions, transposed for readability.
data.tail(5).T

Unnamed: 0,31788319,31788320,31788321,31788322,31788323
t_dat,2020-09-22 00:00:00,2020-09-22 00:00:00,2020-09-22 00:00:00,2020-09-22 00:00:00,2020-09-22 00:00:00
customer_id,fff2282977442e327b45d8c89afde25617d00124d0f999...,fff2282977442e327b45d8c89afde25617d00124d0f999...,fff380805474b287b05cb2a7507b9a013482f7dd0bce0e...,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20...
article_id,0929511001,0891322004,0918325001,0833459002,0898573003
price,0.0593051,0.0423559,0.0432034,0.00676271,0.0338814
sales_channel_id,2,2,1,1,2
age_bucket,25-35,25-35,51-100,16-24,25-35
Baby Sizes 50-98,0,0,0,0,0
"Children Accessories, Swimwear",0,0,0,0,0
Children Sizes 134-170,0,0,0,0,0
Children Sizes 92-140,0,0,0,0,0


In [15]:
# One-hot index_name columns that are summed in the aggregation cells below.
# NOTE(review): 'Baby Sizes 50-98' is also produced by get_dummies but is not
# listed here - confirm whether that omission is intentional.
article_types = [
    'Children Accessories, Swimwear',
    'Children Sizes 134-170',
    'Children Sizes 92-140',
    'Divided',
    'Ladies Accessories',
    'Ladieswear',
    'Lingeries/Tights',
    'Menswear',
    'Sport',
]

In [16]:
# grouping_cols = ['customer_id']
# trn_agg = data.loc[(data["t_dat"] < datetime.datetime(2020,9,16))][article_types+grouping_cols].groupby(grouping_cols).sum().reset_index()
# trn_agg.to_csv('trn_art_agg_cust.csv', index=False)

# del trn_agg

In [None]:
# gc.collect()

In [17]:
# trn_agg = data.loc[(data["t_dat"] < datetime.datetime(2020,9,23))][article_types+grouping_cols].groupby(grouping_cols).sum().reset_index()
# trn_agg.to_csv('train_art_agg_cust.csv', index=False)

# del trn_agg

In [None]:
# gc.collect()

In [18]:
# Per (customer, week) sums of the one-hot index_name columns listed in
# article_types, using only transactions strictly before 2020-09-16.
# The result is written to CSV and then released to free memory.
grouping_cols = ['customer_id', 'YearWeek']

trn_agg = (
    data.loc[
        data["t_dat"] < datetime.datetime(2020, 9, 16),
        article_types + grouping_cols,
    ]
    .groupby(grouping_cols, as_index=False)
    .sum()
)
trn_agg.to_csv('trn_art_agg_cust_ywk.csv', index=False)

del trn_agg

In [20]:
# Force a garbage-collection pass after deleting the aggregate above; the
# value shown is the number of unreachable objects the collector found.
gc.collect()

11

In [19]:
# trn_agg = data.loc[(data["t_dat"] < datetime.datetime(2020,9,23))][article_types+grouping_cols].groupby(grouping_cols).sum().reset_index()
# trn_agg.to_csv('train_art_agg_cust_ywk.csv', index=False)

# del trn_agg

In [None]:
# gc.collect()

In [22]:
# Per (customer, week, age bucket) sums of the one-hot index_name columns
# listed in article_types, using only transactions strictly before 2020-09-16.
grouping_cols = ['customer_id', 'YearWeek', 'age_bucket']

# age_bucket is a categorical column (it comes from pd.cut). With the default
# observed=False, groupby also builds groups for key combinations that never
# occur, which inflates the result far beyond the observed rows - the likely
# cause of the interrupted run of this cell. observed=True keeps only the
# combinations actually present in the data.
# Selecting rows and columns in a single .loc call also avoids materialising an
# intermediate copy of every column of the filtered frame.
trn_agg = (
    data.loc[
        data["t_dat"] < datetime.datetime(2020, 9, 16),
        article_types + grouping_cols,
    ]
    .groupby(grouping_cols, observed=True)
    .sum()
    .reset_index()
)
trn_agg.to_csv('trn_agg_art_cust_ywk_ab.csv', index=False)

# Release the aggregate; only the CSV on disk is needed from here on.
del trn_agg

KeyboardInterrupt: 

In [None]:
# gc.collect()

In [20]:
# trn_agg = data.loc[(data["t_dat"] < datetime.datetime(2020,9,23))][article_types+grouping_cols].groupby(grouping_cols).sum().reset_index()
# trn_agg.to_csv('train_agg_art_cust_ywk_ab.csv', index=False)

# del trn_agg

In [None]:
# Force a garbage-collection pass to reclaim memory held by objects deleted
# in the preceding aggregation cells.
gc.collect()

In [21]:
# data = data.drop(article_types, axis=1)
# data = data.merge(pd.concat([articles_df[['article_id']], pd.get_dummies(articles_df['product_group_name'])], axis=1), on='article_id', how='left')

In [22]:
# article_types = ['Accessories', 'Bags', 'Cosmetic', 'Fun', 'Furniture','Garment Full body', 'Garment Lower body', 'Garment Upper body',
#                 'Garment and Shoe care', 'Interior textile', 'Items', 'Nightwear','Shoes', 'Socks & Tights', 'Stationery', 'Swimwear', 'Underwear',
#                 'Underwear/nightwear', 'Unknown']

In [23]:
# grouping_cols = ['customer_id']
# trn_agg = data.loc[(data["t_dat"] < datetime.datetime(2020,9,16))][article_types+grouping_cols].groupby(grouping_cols).sum().reset_index()
# trn_agg.to_csv('trn_prd_agg_cust.csv', index=False)

# del trn_agg

In [24]:
# trn_agg = data.loc[(data["t_dat"] < datetime.datetime(2020,9,23))][article_types+grouping_cols].groupby(grouping_cols).sum().reset_index()
# trn_agg.to_csv('train_prd_agg_cust.csv', index=False)

# del trn_agg

In [25]:
# grouping_cols = ['customer_id', 'YearWeek']

# trn_agg = data.loc[(data["t_dat"] < datetime.datetime(2020,9,16))][article_types+grouping_cols].groupby(grouping_cols).sum().reset_index()
# trn_agg.to_csv('trn_prd_agg_cust_ywk.csv', index=False)

# del trn_agg

In [26]:
# trn_agg = data.loc[(data["t_dat"] < datetime.datetime(2020,9,23))][article_types+grouping_cols].groupby(grouping_cols).sum().reset_index()
# trn_agg.to_csv('train_prd_agg_cust_ywk.csv', index=False)

# del trn_agg

In [27]:
# grouping_cols = ['customer_id', 'YearWeek', 'age_bucket']

# trn_agg = data.loc[(data["t_dat"] < datetime.datetime(2020,9,16))][article_types+grouping_cols].groupby(grouping_cols).sum().reset_index()
# trn_agg.to_csv('trn_prd_art_cust_ywk_ab.csv', index=False)

# del trn_agg

In [28]:
# trn_agg = data.loc[(data["t_dat"] < datetime.datetime(2020,9,23))][article_types+grouping_cols].groupby(grouping_cols).sum().reset_index()
# trn_agg.to_csv('train_prd_art_cust_ywk_ab.csv', index=False)

# del trn_agg

In [29]:
# def apk(actual, predicted, k=12):
#     if len(predicted)>k:
#         predicted = predicted[:k]

#     score = 0.0
#     num_hits = 0.0

#     for i,p in enumerate(predicted):
#         if p in actual and p not in predicted[:i]:
#             num_hits += 1.0
#             score += num_hits / (i+1.0)

#     if not actual:
#         return 0.0

#     return score / min(len(actual), k)

# def mapk(actual, predicted, k=12):
#     return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [30]:
# train0 = data.loc[(data["t_dat"] < datetime.datetime(2020,9,16))]
# train1 = data.loc[(data["t_dat"] < datetime.datetime(2020,9,8))]
# train2 = data.loc[(data["t_dat"] < datetime.datetime(2020,8,31))]
# train3 = data.loc[(data["t_dat"] < datetime.datetime(2020,8,23))]
# train4 = data.loc[(data["t_dat"] < datetime.datetime(2020,8,15))]

In [31]:
# val0 = data.loc[(data["t_dat"] >= datetime.datetime(2020,9,16)) & (data['t_dat'] < datetime.datetime(2020,9,23))]
# val1 = data.loc[(data["t_dat"] >= datetime.datetime(2020,9,8)) & (data['t_dat'] < datetime.datetime(2020,9,16))]
# val2 = data.loc[(data["t_dat"] >= datetime.datetime(2020,8,31)) & (data['t_dat'] < datetime.datetime(2020,9,8))]
# val3 = data.loc[(data["t_dat"] >= datetime.datetime(2020,8,23)) & (data['t_dat'] < datetime.datetime(2020,8,31))]
# val4 = data.loc[(data["t_dat"] >= datetime.datetime(2020,8,15)) & (data['t_dat'] < datetime.datetime(2020,8,23))]