In [1]:
!wget https://raw.githubusercontent.com/benhamner/Metrics/master/Python/ml_metrics/average_precision.py

--2022-04-27 23:36:21--  https://raw.githubusercontent.com/benhamner/Metrics/master/Python/ml_metrics/average_precision.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1654 (1.6K) [text/plain]
Saving to: 'average_precision.py.1'


2022-04-27 23:36:21 (17.0 MB/s) - 'average_precision.py.1' saved [1654/1654]



In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
from average_precision import apk

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

def article_id_str_to_int(series):
    return series.astype('int32')

def article_id_int_to_str(series):
    return '0' + series.astype('str')

class Categorize(BaseEstimator, TransformerMixin):
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []
        
    def fit(self, X):
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            self.categories.append(vc[vc > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)

from average_precision import apk

def calculate_apk(list_of_preds, list_of_gts):
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    apks = []
    for preds, gt in zip(list_of_preds, list_of_gts):
        apks.append(apk(gt, preds, k=12))
    return np.mean(apks)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    sub=pd.read_csv(sub_csv)
    validation_set=pd.read_parquet('data/validation_ground_truth.parquet')

    apks = []
    no_purchases_pattern = []
    
    for pred, gt in zip(sub.prediction.str.split(), validation_set.prediction.str.split()):
        if skip_cust_with_no_purchases and (gt == no_purchases_pattern): continue
        apks.append(apk(gt, pred, k=12))
    return np.mean(apks)

In [3]:
import pandas as pd

In [4]:
%%time

transactions = pd.read_parquet('transactions_train.parquet')
customers = pd.read_parquet('customers.parquet')
articles = pd.read_parquet('articles.parquet')

CPU times: user 2.8 s, sys: 1.83 s, total: 4.63 s
Wall time: 1.66 s


In [5]:
test_week = transactions.week.max() + 1
transactions = transactions[transactions.week > transactions.week.max() - 10]

# Generating candidates

### Last purchase candidates

In [6]:
%%time

c2weeks = transactions.groupby('customer_id')['week'].unique()

CPU times: user 32.8 s, sys: 3.04 s, total: 35.8 s
Wall time: 32.2 s


In [7]:
c2weeks

customer_id
28847241659200          [95, 96, 101, 102]
41318098387474                        [98]
116809474287335                 [101, 103]
200292573348128          [95, 96, 99, 102]
248294615847351                       [96]
                               ...        
18446624797007271432                  [95]
18446630855572834764                 [103]
18446662237889060501                 [100]
18446705133201055310                 [102]
18446737527580148316                 [104]
Name: week, Length: 437365, dtype: object

In [8]:
%%time

c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    c2weeks2shifted_weeks[c_id] = {}
    for i in range(weeks.shape[0]-1):
        c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week

CPU times: user 678 ms, sys: 74.3 ms, total: 752 ms
Wall time: 751 ms


In [9]:
c2weeks2shifted_weeks

{28847241659200: {95: 96, 96: 101, 101: 102, 102: 105},
 41318098387474: {98: 105},
 116809474287335: {101: 103, 103: 105},
 200292573348128: {95: 96, 96: 99, 99: 102, 102: 105},
 248294615847351: {96: 105},
 272412481300040: {95: 96, 96: 103, 103: 105},
 329094189075899: {100: 105},
 330092272649261: {98: 105},
 366493139417506: {96: 99, 99: 105},
 375055163245029: {96: 98, 98: 105},
 519262836338427: {102: 105},
 649760207043851: {96: 105},
 690285180337957: {96: 97, 97: 103, 103: 105},
 736218475114453: {96: 105},
 745180086074610: {96: 99, 99: 100, 100: 102, 102: 105},
 762483386043116: {100: 105},
 805095543045062: {102: 105},
 857913002275398: {95: 96, 96: 98, 98: 99, 99: 105},
 879819981624203: {97: 105},
 964326548579219: {99: 102, 102: 105},
 1037449031262554: {97: 98, 98: 99, 99: 101, 101: 102, 102: 105},
 1083424902212452: {99: 105},
 1112426114951896: {98: 105},
 1134266496627188: {97: 98, 98: 99, 99: 105},
 1152192358796555: {97: 98, 98: 105},
 1195818762005827: {100: 105}

In [10]:
transactions[transactions.customer_id==28847241659200]

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
29202087,2020-07-18,28847241659200,762846001,0.025407,1,95
29202088,2020-07-18,28847241659200,829308001,0.033881,1,95
29527049,2020-07-26,28847241659200,887770001,0.016932,1,96
30989557,2020-08-31,28847241659200,760084003,0.025407,1,101
31100629,2020-09-03,28847241659200,925246001,0.128797,2,102


In [11]:
candidates_last_purchase = transactions.copy()

In [12]:
%%time

weeks = []
for i, (c_id, week) in enumerate(zip(transactions['customer_id'], transactions['week'])):
    weeks.append(c2weeks2shifted_weeks[c_id][week])
    
candidates_last_purchase.week=weeks

CPU times: user 7.12 s, sys: 12.2 ms, total: 7.13 s
Wall time: 7.13 s


In [13]:
candidates_last_purchase[candidates_last_purchase.customer_id==28847241659200]

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
29202087,2020-07-18,28847241659200,762846001,0.025407,1,96
29202088,2020-07-18,28847241659200,829308001,0.033881,1,96
29527049,2020-07-26,28847241659200,887770001,0.016932,1,101
30989557,2020-08-31,28847241659200,760084003,0.025407,1,102
31100629,2020-09-03,28847241659200,925246001,0.128797,2,105


### Bestsellers candidates

In [14]:
mean_price = transactions.groupby(['week', 'article_id'])['price'].mean()

In [15]:
sales = transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank').astype('int8')

In [16]:
bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week.week += 1

In [17]:
bestsellers_previous_week

Unnamed: 0,week,article_id,bestseller_rank,price
0,96,760084003,1,0.025094
1,96,866731001,2,0.024919
2,96,600886001,3,0.022980
3,96,706016001,4,0.033197
4,96,372860002,5,0.013193
...,...,...,...,...
115,105,915529003,8,0.033439
116,105,915529005,9,0.033417
117,105,448509014,10,0.041630
118,105,762846027,11,0.025005


In [18]:
unique_transactions = transactions \
    .groupby(['week', 'customer_id']) \
    .head(1) \
    .drop(columns=['article_id', 'price']) \
    .copy()

In [19]:
candidates_bestsellers = pd.merge(
    unique_transactions,
    bestsellers_previous_week,
    on='week',
)

In [20]:
candidates_bestsellers

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,bestseller_rank,price
0,2020-07-22,200292573348128,2,96,760084003,1,0.025094
1,2020-07-22,200292573348128,2,96,866731001,2,0.024919
2,2020-07-22,200292573348128,2,96,600886001,3,0.022980
3,2020-07-22,200292573348128,2,96,706016001,4,0.033197
4,2020-07-22,200292573348128,2,96,372860002,5,0.013193
...,...,...,...,...,...,...,...
8141191,2020-09-22,18440902715633436014,1,104,918292001,8,0.041424
8141192,2020-09-22,18440902715633436014,1,104,762846027,9,0.025104
8141193,2020-09-22,18440902715633436014,1,104,809238005,10,0.041656
8141194,2020-09-22,18440902715633436014,1,104,673677002,11,0.024925


In [21]:
test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions.week = test_week

In [22]:
candidates_bestsellers_test_week = pd.merge(
    test_set_transactions,
    bestsellers_previous_week,
    on='week'
)

In [23]:
candidates_bestsellers_test_week

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,bestseller_rank,price
0,2020-07-15,272412481300040,1,105,924243001,1,0.041535
1,2020-07-15,272412481300040,1,105,924243002,2,0.041877
2,2020-07-15,272412481300040,1,105,918522001,3,0.041435
3,2020-07-15,272412481300040,1,105,923758001,4,0.033462
4,2020-07-15,272412481300040,1,105,866731001,5,0.025015
...,...,...,...,...,...,...,...
5248375,2020-09-22,18438270306572912089,1,105,915529003,8,0.033439
5248376,2020-09-22,18438270306572912089,1,105,915529005,9,0.033417
5248377,2020-09-22,18438270306572912089,1,105,448509014,10,0.041630
5248378,2020-09-22,18438270306572912089,1,105,762846027,11,0.025005


In [24]:
candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)

# Combining transactions and candidates / negative examples

In [25]:
transactions['purchased'] = 1

In [26]:
transactions[transactions.customer_id==272412481300040]

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased
29030503,2020-07-15,272412481300040,778064028,0.008458,1,95,1
29030504,2020-07-15,272412481300040,816592008,0.016932,1,95,1
29030505,2020-07-15,272412481300040,621381021,0.033881,1,95,1
29030506,2020-07-15,272412481300040,817477003,0.025407,1,95,1
29030507,2020-07-15,272412481300040,899088002,0.025407,1,95,1
29319533,2020-07-22,272412481300040,885077001,0.008458,1,96,1
29410772,2020-07-24,272412481300040,850176003,0.029034,2,96,1
29410773,2020-07-24,272412481300040,875803001,0.064559,2,96,1
29410774,2020-07-24,272412481300040,892970003,0.020966,2,96,1
29410775,2020-07-24,272412481300040,854619003,0.020966,2,96,1


In [27]:
candidates_last_purchase[candidates_last_purchase.customer_id==272412481300040]

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
29030503,2020-07-15,272412481300040,778064028,0.008458,1,96
29030504,2020-07-15,272412481300040,816592008,0.016932,1,96
29030505,2020-07-15,272412481300040,621381021,0.033881,1,96
29030506,2020-07-15,272412481300040,817477003,0.025407,1,96
29030507,2020-07-15,272412481300040,899088002,0.025407,1,96
29319533,2020-07-22,272412481300040,885077001,0.008458,1,103
29410772,2020-07-24,272412481300040,850176003,0.029034,2,103
29410773,2020-07-24,272412481300040,875803001,0.064559,2,103
29410774,2020-07-24,272412481300040,892970003,0.020966,2,103
29410775,2020-07-24,272412481300040,854619003,0.020966,2,103


In [28]:
candidates_bestsellers[candidates_bestsellers.customer_id==272412481300040]

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,price
24,2020-07-22,272412481300040,1,96,760084003,0.025094
25,2020-07-22,272412481300040,1,96,866731001,0.024919
26,2020-07-22,272412481300040,1,96,600886001,0.02298
27,2020-07-22,272412481300040,1,96,706016001,0.033197
28,2020-07-22,272412481300040,1,96,372860002,0.013193
29,2020-07-22,272412481300040,1,96,610776002,0.008318
30,2020-07-22,272412481300040,1,96,877278002,0.025036
31,2020-07-22,272412481300040,1,96,547780003,0.024814
32,2020-07-22,272412481300040,1,96,817354001,0.021913
33,2020-07-22,272412481300040,1,96,827968001,0.016436


In [29]:
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
data.purchased.fillna(0, inplace=True)

In [30]:
data[data.customer_id==272412481300040].head(60)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased
29030503,2020-07-15,272412481300040,778064028,0.008458,1,95,1.0
29030504,2020-07-15,272412481300040,816592008,0.016932,1,95,1.0
29030505,2020-07-15,272412481300040,621381021,0.033881,1,95,1.0
29030506,2020-07-15,272412481300040,817477003,0.025407,1,95,1.0
29030507,2020-07-15,272412481300040,899088002,0.025407,1,95,1.0
29319533,2020-07-22,272412481300040,885077001,0.008458,1,96,1.0
29410772,2020-07-24,272412481300040,850176003,0.029034,2,96,1.0
29410773,2020-07-24,272412481300040,875803001,0.064559,2,96,1.0
29410774,2020-07-24,272412481300040,892970003,0.020966,2,96,1.0
29410775,2020-07-24,272412481300040,854619003,0.020966,2,96,1.0


In [31]:
data.shape

(18915320, 7)

In [32]:
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

In [33]:
data.shape

(18253749, 7)

In [34]:
data.purchased.mean()

0.13607582749165664

### Add bestseller information

In [35]:
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)

In [36]:
data[data.customer_id==272412481300040].head(60)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,bestseller_rank
0,2020-07-15,272412481300040,778064028,0.008458,1,95,1.0,
1,2020-07-15,272412481300040,816592008,0.016932,1,95,1.0,
2,2020-07-15,272412481300040,621381021,0.033881,1,95,1.0,
3,2020-07-15,272412481300040,817477003,0.025407,1,95,1.0,
4,2020-07-15,272412481300040,899088002,0.025407,1,95,1.0,
261995,2020-07-22,272412481300040,885077001,0.008458,1,96,1.0,
345952,2020-07-24,272412481300040,850176003,0.029034,2,96,1.0,
345953,2020-07-24,272412481300040,875803001,0.064559,2,96,1.0,
345954,2020-07-24,272412481300040,892970003,0.020966,2,96,1.0,
345955,2020-07-24,272412481300040,854619003,0.020966,2,96,1.0,


In [37]:
data.week.min()

95

In [38]:
data = data[data.week != data.week.min()]
data.bestseller_rank.fillna(999, inplace=True)

In [39]:
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')

In [40]:
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [41]:
data[data.customer_id==272412481300040]

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,bestseller_rank,product_code,prod_name,...,section_name,garment_group_no,garment_group_name,detail_desc,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
54,2020-07-22,272412481300040,885077001,0.008458,1,96,1.0,999.0,885077,9473,...,9,1019,1,10343,1,1,0,1,48,338849
55,2020-07-24,272412481300040,850176003,0.029034,2,96,1.0,999.0,850176,16852,...,22,1018,12,19097,1,1,0,1,48,338849
56,2020-07-24,272412481300040,875803001,0.064559,2,96,1.0,999.0,875803,37399,...,0,1012,18,38738,1,1,0,1,48,338849
57,2020-07-24,272412481300040,892970003,0.020966,2,96,1.0,999.0,892970,3357,...,13,1005,0,12787,1,1,0,1,48,338849
58,2020-07-24,272412481300040,854619003,0.020966,2,96,1.0,999.0,854619,1527,...,22,1018,12,15033,1,1,0,1,48,338849
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11381690,2020-07-15,272412481300040,915529003,0.033439,1,105,0.0,8.0,915529,10789,...,0,1003,3,10386,1,1,0,1,48,338849
11381691,2020-07-15,272412481300040,915529005,0.033417,1,105,0.0,9.0,915529,10789,...,0,1003,3,10386,1,1,0,1,48,338849
11381692,2020-07-15,272412481300040,448509014,0.041630,1,105,0.0,10.0,448509,248,...,1,1009,5,248,1,1,0,1,48,338849
11381693,2020-07-15,272412481300040,762846027,0.025005,1,105,0.0,11.0,762846,542,...,7,1010,6,506,1,1,0,1,48,338849


In [57]:
user_features = pd.read_parquet('user_features.parquet').reset_index()
item_features = pd.read_parquet('item_features.parquet').reset_index()

user_features['customer_id'] = customer_hex_id_to_int(user_features['customer_id'])
item_features['article_id'] = article_id_str_to_int(item_features['article_id'])

In [62]:
drop_cols = ['product_type_name', 'product_group_name',
       'graphical_appearance_name', 'colour_group_name',
       'perceived_colour_value_name', 'perceived_colour_master_name',
       'department_name', 'index_name', 'index_group_name', 'section_name',
       'garment_group_name']

item_features = item_features.drop(drop_cols, axis=1)

In [64]:
drop_cols = ['FN', 'Active', 'club_member_status', 'fashion_news_frequency', 'age']
user_features = user_features.drop(drop_cols, axis=1)

In [65]:
data.shape

(17991762, 38)

In [66]:
data = pd.merge(data, item_features, on='article_id', how='left')
data = pd.merge(data, user_features, on='customer_id', how='left')

In [67]:
data.shape

(17991762, 105)

In [68]:
train = data[data.week != test_week]
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [69]:
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values
## can use other columns to group with week and cust id - age_bucket for e.g

In [70]:
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'bestseller_rank']

In [76]:
columns_to_use = columns_to_use + [col for col in item_features.columns if col !='article_id'] + \
[col for col in user_features.columns if col !='customer_id']

In [77]:
%%time

train_X = train[columns_to_use]
train_y = train['purchased']
test_X = test[columns_to_use]

CPU times: user 1.11 s, sys: 704 ms, total: 1.81 s
Wall time: 1.8 s


# Model training

In [78]:
from lightgbm.sklearn import LGBMRanker

In [79]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1000,
    importance_type='gain',
    verbose=100
)

In [80]:
%%time

ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.872533
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.443987
[LightGBM] [Debug] init for col-wise cost 1.944481 seconds, init for row-wise cost 8.804254 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 6086
[LightGBM] [Info] Number of data points in the train set: 11381612, number of used features: 85
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and 

[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Tr

[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 15
[LightGBM] [Debug] 

[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 13
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 14
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[LightGBM] [Debug

[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 14
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 12
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 14
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 15
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 12
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
[LightGBM] [Debu

[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 15
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debu

[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 12
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 12
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 13
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 14
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[LightGBM] [De

[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 12
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 15
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 12
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 13
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [De

[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 12
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 13
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 12
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
CPU times: user 9h 19min 32s, sys: 43.6 s, total: 9h 20min 16s
Wall time: 14min 14s


In [81]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

bestseller_rank 0.8248618449802321
sum_transactions 0.027423705260087736
mean_price 0.021279722611331542
min_price 0.016230830180293223
article_id 0.015776853971111567
top_graphical_appearance_name_0 0.015223227641171322
max_minus_min_transactions 0.009904678436368141
product_type_no 0.008710038762420692
department_no 0.0061039866958390145
garment_group_no 0.006044049405673814
max_price 0.005710596282131059
count 0.004735347631365529
sum_price 0.004151382214450787
min_transactions 0.004062754450066001
max_transactions 0.0030502273460702714
section_no 0.0030313630414606223
colour_group_code 0.002597831770244888
top_product_group_name_0 0.0024937483484444405
perceived_colour_value_id 0.0022023790790141194
top_section_name_0 0.001692473376626485
department_name_3 0.0016884369635902484
top_perceived_colour_value_name_0 0.0016262649660159314
median_transactions 0.0016244597240601631
perceived_colour_master_id 0.0012775711826133057
graphical_appearance_no 0.0012691144096083958
age 0.00106585

In [49]:
# for i in ranker.feature_importances_.argsort()[::-1]:
#     print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())
    
# # bestseller_rank 0.9084779803115295
# # article_id 0.028151283853994704
# # product_type_no 0.01703869095478825
# # department_no 0.012842365564634884
# # garment_group_no 0.008488918871462595
# # colour_group_code 0.00508518511811137
# # graphical_appearance_no 0.0043176729897553825
# # section_no 0.004305431560331087
# # perceived_colour_master_id 0.0034837779550319953
# # perceived_colour_value_id 0.0030256368553717135
# # age 0.0018456475582899136
# # index_group_no 0.0011573662182938502
# # index_code 0.0010249484379311765
# # club_member_status 0.0003302095247203395
# # postal_code 0.00031080881628281037
# # fashion_news_frequency 6.535356783770413e-05
# # Active 4.503368971922028e-05
# # FN 3.688151913462962e-06

bestseller_rank 0.9084779803115295
article_id 0.028151283853994704
product_type_no 0.01703869095478825
department_no 0.012842365564634884
garment_group_no 0.008488918871462595
colour_group_code 0.00508518511811137
graphical_appearance_no 0.0043176729897553825
section_no 0.004305431560331087
perceived_colour_master_id 0.0034837779550319953
perceived_colour_value_id 0.0030256368553717135
age 0.0018456475582899136
index_group_no 0.0011573662182938502
index_code 0.0010249484379311765
club_member_status 0.0003302095247203395
postal_code 0.00031080881628281037
fashion_news_frequency 6.535356783770413e-05
Active 4.503368971922028e-05
FN 3.688151913462962e-06


# Calculate predictions

In [82]:
%time

test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

CPU times: user 12 µs, sys: 0 ns, total: 12 µs
Wall time: 24.1 µs


# Create submission

In [83]:
sub = pd.read_csv('sample_submission.csv')

In [84]:
%%time
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, [])
    pred = pred + bestsellers_last_week
    preds.append(pred[:12])

CPU times: user 5.06 s, sys: 192 ms, total: 5.25 s
Wall time: 5.24 s


In [85]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds

In [86]:
sub_name = 'lgbm_1000_useritem_feat'
sub.to_csv(f'{sub_name}.csv.gz', index=False)

In [87]:
# !kaggle competitions submit -c h-and-m-personalized-fashion-recommendations -f "lgbm_1000_useritem_feat.csv.gz" -m \
# "LGBM Ranker with 1000 Trees More Features"

100%|######################################| 61.6M/61.6M [00:03<00:00, 21.2MB/s]
Successfully submitted to H&M Personalized Fashion Recommendations