In [1]:
import numpy as np
import pandas as pd
import os
import glob
from tqdm import tqdm
import datetime
import gc
import random
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)

In [2]:
new_df = pd.read_csv("candidate_groups.csv")

In [3]:
data = pd.read_csv("transactions_train.csv", 
                   dtype={'article_id':str}
                  )

# data = data[data.t_dat>='2020-06-22'].reset_index(drop=True)  # for submission
data = data[data.t_dat>='2020-06-15'].reset_index(drop=True)  # for validation
data['customer_id'] = data['customer_id'].apply(lambda x: int(x[-16:],16) ).astype('int64')

In [4]:
articles_df = pd.read_csv('articles.csv', dtype={'article_id':str})
customers_df = pd.read_csv('customers.csv')

In [5]:
customers_df['customer_id_old'] = customers_df['customer_id']
customers_df['customer_id'] = customers_df['customer_id'].apply(lambda x: int(x[-16:],16) ).astype('int64')

In [6]:
customers_df['fashion_news_frequency'] = customers_df['fashion_news_frequency'].replace('NONE', 'None')
customers_df['age_bucket'] = pd.cut(customers_df['age'].fillna(36), [15, 24, 35, 50, 100], 
                                    labels=['16-24', '25-35', '36-50', '51-100'])

In [7]:
data = data.merge(articles_df[['article_id','index_code','prod_name']], on='article_id', how='left')

In [8]:
data = data.merge(customers_df[['age_bucket', 'customer_id']], on='customer_id', how='left')

In [9]:
new_df = new_df.merge(customers_df[['age_bucket', 'customer_id']], on='customer_id', how='left')

In [10]:
# last_bought = data[['customer_id','index_code',
#                     'article_id']].groupby(['customer_id','index_code'])['article_id'].last().reset_index()

last_bought = data[data.t_dat<='2020-09-15'][['customer_id','index_code',
                    'article_id']].groupby(['customer_id','index_code'])['article_id'].last().reset_index()

In [11]:
last_bought

Unnamed: 0,customer_id,index_code,article_id
0,-9223352921020755230,A,0812167004
1,-9223352921020755230,C,0673396002
2,-9223343869995384291,A,0910601003
3,-9223343869995384291,S,0849886010
4,-9223293121067732640,A,0835008005
...,...,...,...
1217155,9223099843213569889,D,0903062001
1217156,9223144259947635662,A,0839464002
1217157,9223144259947635662,F,0720504008
1217158,9223148401910457466,A,0851110002


In [12]:
index_dict = {}

for indx_cd in data.index_code.unique():
    for age_bkt in data.age_bucket.unique():

        index_dict[indx_cd + '_' + age_bkt] = data[(data.t_dat>'2020-08-22') 
                                   & (data.index_code==indx_cd) 
                                   & (data.age_bucket==age_bkt)]['article_id'].value_counts().head(12).index.tolist()
        
for age_bkt in data.age_bucket.unique():
    index_dict['_' + age_bkt] = data[(data.t_dat>'2020-08-22') 
                               & (data.age_bucket==age_bkt)]['article_id'].value_counts().head(12).index.tolist()

In [13]:
last_bought = last_bought.set_index('customer_id')

In [14]:
n_list = []
articles = []

for cust in tqdm(last_bought.index.tolist()):
    
    k = last_bought.loc[cust]['article_id']    
    if type(k) is str:
        n_list.append(1)
        articles.append(' '.join(['0'+str(int(k))]))
    else:
        n_list.append(len(k.tolist()))
        articles.append(' '.join(k.tolist()))

100%|██████████| 1217160/1217160 [02:53<00:00, 7024.80it/s]


In [15]:
b = last_bought.reset_index()
b['number_of_articles_last_purchased'] = n_list
b['article_list'] = articles

In [16]:
last_bought = b[['customer_id','number_of_articles_last_purchased',
                 'article_list']].drop_duplicates().reset_index(drop=True)

In [17]:
del b

In [18]:
last_bought = last_bought.merge(customers_df[['customer_id', 'customer_id_old']], 
                 left_on='customer_id', right_on='customer_id', 
                 how='left')

last_bought = last_bought.drop(['customer_id'], axis=1)
last_bought = last_bought.rename(columns={'customer_id_old':'customer_id'})

In [None]:
# last_bought.to_csv('last_bought.csv', index=False)

In [20]:
last_bought.to_csv('last_bought_valid.csv', index=False)

In [53]:
def check_dict(indx_cd, age_bkt, dict_index_article):
    return dict_index_article[indx_cd + '_' + age_bkt]

<!-- if l1==10:
	take age based

if l1!=10 and l2==0 and l3==(10-l1):
	
	take everything from p1

if l1!=10 and l2==0 and l3!=(10-l1):
	
	take 85% from p1 and 15% from p3

if l1!=10 and l2==(10-l1):

	take everything from p1

if l1!=10 and l2!=(10-l1) and l3==(10-l1-l2):

	take 70% from p1 and 30% from p2

if l1!=10 and l2!=(10-l1) and l3!=(10-l1-l2):
	
	take 67 % from p1 and 22% from p2 and 11 % from p3 -->

In [83]:
sample_sub = pd.read_csv('sample_submission.csv')

sample_sub = sample_sub.merge(customers_df[['age_bucket', 'customer_id_old']], 
                 left_on='customer_id', right_on='customer_id_old', 
                 how='left')

sample_sub = sample_sub.drop(['customer_id','prediction'], axis=1)

In [111]:
a = 'index_code__A, index_code__C, index_code__B, index_code__D, index_code__F, index_code__G, \
index_code__H, index_code__I, index_code__J, index_code__S'

all_df = sample_sub.merge(new_df.drop(['customer_id','age_bucket'], axis=1), 
                 on='customer_id_old', how='left')

all_df['P1Column'] = all_df['P1Column'].fillna(a)
all_df = all_df.rename(columns={'customer_id_old':'customer_id'})

In [133]:
age_bkt = ['16-24', '25-35', '36-50', '51-100']
age_preds = []

for ab in age_bkt:
    ab = '_'+ab
    age_preds.append(' '.join(pd.DataFrame.from_dict(index_dict)[ab].tolist()))

age_based = pd.DataFrame({'age_bucket':age_bkt, 'prediction':age_preds})

In [138]:
no_shop = all_df[all_df.P1Column==a].copy()
no_shop = no_shop.merge(age_based, on='age_bucket')[['customer_id', 'prediction']]

Unnamed: 0,customer_id,prediction
0,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0751471043 0751471001 0896152002 0896169002 09...
1,0000b7a134c3ec0d8842fad1fd4ca28517424c14fc4848...,0751471043 0751471001 0896152002 0896169002 09...
2,00018385675844f7a6babbed41b5655b5727fb16483b6e...,0751471043 0751471001 0896152002 0896169002 09...
3,00019d6c20e0fbb551af18c57149af4707ec016bb0decd...,0751471043 0751471001 0896152002 0896169002 09...
4,0001bc8e81a72aaf228a103a85d2ac530c09defffbc6ab...,0751471043 0751471001 0896152002 0896169002 09...
...,...,...
836558,ffff01710b4f0d558ff62d7dc00f0641065b37e840bb43...,0915526001 0448509014 0706016001 0918292001 07...
836559,ffff25c78688e1c34e48a4e34b9a953bde663cf937e715...,0915526001 0448509014 0706016001 0918292001 07...
836560,ffff7d65748db4d52e48b74c8f83ccb0029fc3bbafa511...,0915526001 0448509014 0706016001 0918292001 07...
836561,ffffa28cd7ab5d1cbbbfe7b582b1c419270cc0539f3dae...,0915526001 0448509014 0706016001 0918292001 07...


In [247]:
shoppers = all_df[all_df.P1Column!=a].copy()

In [248]:
# if l1==10:
#     take age based

# if l1!=10 and l2==0 and l3==(10-l1):

#     take everything from p1

# if l1!=10 and l2==0 and l3!=(10-l1):

#     take 85% from p1 and 15% from p3

# if l1!=10 and l2==(10-l1):

#     take everything from p1

# if l1!=10 and l2!=(10-l1) and l3==(10-l1-l2):

#     take 70% from p1 and 30% from p2

# if l1!=10 and l2!=(10-l1) and l3!=(10-l1-l2):

#     take 67 % from p1 and 22% from p2 and 11 % from p3

In [244]:
b = last_bought.set_index('customer_id')

In [249]:
shoppers.set_index('customer_id', inplace=True)

In [256]:
prediction_store = []

for cust in tqdm(shoppers.index.tolist()):
    
    candidate_list = []

    filter_df = shoppers.loc[cust].copy()
    cust_age = filter_df['age_bucket']
    
    filter_2 = b.loc[cust].copy()
    n_items = filter_2['number_of_articles_last_purchased']
    prev_purchase = filter_2['article_list']
    
    if filter_df['P1Column'] is not np.nan:
        p1 = filter_df['P1Column'].replace('index_code__','').replace(' ','').split(',')
    else: p1 = filter_df['P1Column']

    if filter_df['P1Column'] is not np.nan:
        p1 = filter_df['P1Column'].replace('index_code__','').replace(' ','').split(',')
    else: p1 = filter_df['P1Column']

    if filter_df['P1Column'] is not np.nan:
        p1 = filter_df['P1Column'].replace('index_code__','').replace(' ','').split(',')
    else: p1 = filter_df['P1Column']
        
    if p1 is not np.nan: l1 = len(p1)
    else: l1=0
   
    if p2 is not np.nan: l2 = len(p2)
    else: l2=0
        
    if p3 is not np.nan: l3 = len(p3)
    else: l3=0    
    
    search_n = 12 - n_items
    
    if l2==0: 
        if l3==(10-l1):
            fact = int(search_n/l1)
            for indx in p1:
                candidate_list.extend(check_dict(indx, cust_age, index_dict)[:fact])
        elif l3!=(10-l1):
            pr1 = 0.85
            fact_1 = int(search_n*pr1/l1)
            fact_3 = int(search_n*(1-pr1)/l3)

            for indx in p1:
                candidate_list.extend(check_dict(indx, cust_age, index_dict)[:fact_1])
            for indx in p3:
                candidate_list.extend(check_dict(indx, cust_age, index_dict)[:fact_3])

    if l2==(10-l1):

        fact = int(search_n/l1)
        for indx in p1:
            candidate_list.extend(check_dict(indx, cust_age, index_dict)[:fact])

    if l2!=(10-l1) and l2!=0:
        if l3==(10-l1-l2):
            pr1 = 0.70
            fact_1 = int(search_n*pr1/l1)
            fact_2 = int(search_n*(1-pr1)/l2)

            for indx in p1:
                candidate_list.extend(check_dict(indx, cust_age, index_dict)[:fact_1])
            for indx in p2:
                candidate_list.extend(check_dict(indx, cust_age, index_dict)[:fact_2])

        elif l3!=(10-l1-l2) and l3!=0:
            pr1 = 0.67
            pr2 = 0.22
            fact_1 = int(search_n*pr1/l1)
            fact_2 = int(search_n*(1-pr1)/l2)
            fact_3 = int(search_n*(1-pr1-pr2)/l3)

            for indx in p1:
                candidate_list.extend(check_dict(indx, cust_age, index_dict)[:fact_1])
            for indx in p2:
                candidate_list.extend(check_dict(indx, cust_age, index_dict)[:fact_2])
            for indx in p3:
                candidate_list.extend(check_dict(indx, cust_age, index_dict)[:fact_3])

    candidate_list.extend(check_dict('', cust_age, index_dict))
        
    prediction_store.append(prev_purchase+' '+' '.join(candidate_list[:search_n]))

100%|██████████| 535417/535417 [06:33<00:00, 1362.38it/s]


In [258]:
# prediction_store = []

# for cust in tqdm(shoppers.index.tolist()):
    
#     candidate_list = []

#     filter_df = shoppers.loc[cust].copy()
#     cust_age = filter_df['age_bucket']
    
#     filter_2 = b.loc[cust].copy()
#     n_items = filter_2['number_of_articles_last_purchased']
#     prev_purchase = filter_2['article_list']
    
#     if filter_df['P1Column'] is not np.nan:
#         p1 = filter_df['P1Column'].replace('index_code__','').replace(' ','').split(',')
#     else: p1 = filter_df['P1Column']

#     if filter_df['P1Column'] is not np.nan:
#         p1 = filter_df['P1Column'].replace('index_code__','').replace(' ','').split(',')
#     else: p1 = filter_df['P1Column']

#     if filter_df['P1Column'] is not np.nan:
#         p1 = filter_df['P1Column'].replace('index_code__','').replace(' ','').split(',')
#     else: p1 = filter_df['P1Column']
        
#     if p1 is not np.nan: l1 = len(p1)
#     else: l1=0
   
#     if p2 is not np.nan: l2 = len(p2)
#     else: l2=0
        
#     if p3 is not np.nan: l3 = len(p3)
#     else: l3=0    
    
#     search_n = 12 - n_items
    
#     if l2==0: 
#         if l3==(10-l1):
#             fact = int(search_n/l1)
#             for indx in p1:
#                 candidate_list.extend(check_dict(indx, cust_age, index_dict)[:fact])
#         elif l3!=(10-l1):
#             pr1 = 0.85
#             fact_1 = int(search_n*pr1/l1)
#             fact_3 = int(search_n*(1-pr1)/l3)

#             for indx in p1:
#                 candidate_list.extend(check_dict(indx, cust_age, index_dict)[:fact_1])
#             for indx in p3:
#                 candidate_list.extend(check_dict(indx, cust_age, index_dict)[:fact_3])

#     if l2==(10-l1):

#         fact = int(search_n/l1)
#         for indx in p1:
#             candidate_list.extend(check_dict(indx, cust_age, index_dict)[:fact])

#     if l2!=(10-l1) and l2!=0:
#         if l3==(10-l1-l2):
#             pr1 = 0.70
#             fact_1 = int(search_n*pr1/l1)
#             fact_2 = int(search_n*(1-pr1)/l2)

#             for indx in p1:
#                 candidate_list.extend(check_dict(indx, cust_age, index_dict)[:fact_1])
#             for indx in p2:
#                 candidate_list.extend(check_dict(indx, cust_age, index_dict)[:fact_2])

#         elif l3!=(10-l1-l2) and l3!=0:
#             pr1 = 0.67
#             pr2 = 0.22
#             fact_1 = int(search_n*pr1/l1)
#             fact_2 = int(search_n*(1-pr1)/l2)
#             fact_3 = int(search_n*(1-pr1-pr2)/l3)

#             for indx in p1:
#                 candidate_list.extend(check_dict(indx, cust_age, index_dict)[:fact_1])
#             for indx in p2:
#                 candidate_list.extend(check_dict(indx, cust_age, index_dict)[:fact_2])
#             for indx in p3:
#                 candidate_list.extend(check_dict(indx, cust_age, index_dict)[:fact_3])

#     candidate_list.extend(check_dict('', cust_age, index_dict))
        
#     prediction_store.append(prev_purchase+' '.join(candidate_list[:search_n]))

In [259]:
shoppers['prediction'] = prediction_store
shoppers_pred = shoppers[['prediction']].reset_index().copy()

In [260]:
sub = pd.concat([no_shop, shoppers_pred], axis=0).reset_index(drop=True)
sub.shape

(1371980, 2)

In [261]:
sub

Unnamed: 0,customer_id,prediction
0,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0751471043 0751471001 0896152002 0896169002 09...
1,0000b7a134c3ec0d8842fad1fd4ca28517424c14fc4848...,0751471043 0751471001 0896152002 0896169002 09...
2,00018385675844f7a6babbed41b5655b5727fb16483b6e...,0751471043 0751471001 0896152002 0896169002 09...
3,00019d6c20e0fbb551af18c57149af4707ec016bb0decd...,0751471043 0751471001 0896152002 0896169002 09...
4,0001bc8e81a72aaf228a103a85d2ac530c09defffbc6ab...,0751471043 0751471001 0896152002 0896169002 09...
...,...,...
1371975,ffff61677073258d461e043cc9ed4ed97be5617a920640...,0810746001 0846782002 0158340001 0885951001 07...
1371976,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0557599022 0713997002 0804992033 0372860002 03...
1371977,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0762846031 0759871030 0685816044 0668012013 04...
1371978,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0762846027 0689365050 0570004009 0915526001 07...


In [262]:
sub.to_csv('sub_pp_idxcd_age.csv', index = False)

In [263]:
!kaggle competitions submit -c h-and-m-personalized-fashion-recommendations -f \
"sub_pp_idxcd_age.csv" -m "Last purchased with index code and age based grouping from last 3 months"

100%|########################################| 258M/258M [00:06<00:00, 44.4MB/s]
Successfully submitted to H&M Personalized Fashion Recommendations

In [267]:
sub[sub.customer_id=='00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657']['prediction'].iloc[0]

'0568601043 0751471001 0915529003 0751471043 0863595006 0884319001 0865929003 0803757001 0850917001 0768912001 0783346001 0673677002'

In [None]:
'0568601043 0568601006 0751471001 0924243002 0924243001 0448509014 0745232001 0656719005 0923758001'