In [1]:
# !wget https://raw.githubusercontent.com/benhamner/Metrics/master/Python/ml_metrics/average_precision.py

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
from average_precision import apk

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

def article_id_str_to_int(series):
    return series.astype('int32')

def article_id_int_to_str(series):
    return '0' + series.astype('str')

class Categorize(BaseEstimator, TransformerMixin):
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []
        
    def fit(self, X):
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            self.categories.append(vc[vc > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)

from average_precision import apk

def calculate_apk(list_of_preds, list_of_gts):
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    apks = []
    for preds, gt in zip(list_of_preds, list_of_gts):
        apks.append(apk(gt, preds, k=12))
    return np.mean(apks)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    sub=pd.read_csv(sub_csv)
    validation_set=pd.read_parquet('data/validation_ground_truth.parquet')

    apks = []
    no_purchases_pattern = []
    
    for pred, gt in zip(sub.prediction.str.split(), validation_set.prediction.str.split()):
        if skip_cust_with_no_purchases and (gt == no_purchases_pattern): continue
        apks.append(apk(gt, pred, k=12))
    return np.mean(apks)

In [3]:
import pandas as pd

In [4]:
%%time

transactions = pd.read_parquet('transactions_train.parquet')
customers = pd.read_parquet('customers.parquet')
articles = pd.read_parquet('articles.parquet')

CPU times: user 2.89 s, sys: 1.77 s, total: 4.66 s
Wall time: 1.64 s


In [5]:
VALID = True

if VALID:
    transactions = transactions[transactions.t_dat<'2020-09-16']

In [6]:
test_week = transactions.week.max() + 1
transactions = transactions[transactions.week > transactions.week.max() - 10]

# Generating candidates

### Last purchase candidates

In [7]:
%%time
c2weeks = transactions.groupby('customer_id')['week'].unique()

CPU times: user 30.8 s, sys: 2.32 s, total: 33.1 s
Wall time: 30.6 s


In [8]:
%%time

c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    c2weeks2shifted_weeks[c_id] = {}
    for i in range(weeks.shape[0]-1):
        c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week

CPU times: user 717 ms, sys: 77.7 ms, total: 795 ms
Wall time: 793 ms


In [9]:
mean_price = transactions.groupby(['week', 'article_id'])['price'].mean()

In [10]:
SALES_ONLY = False

if SALES_ONLY:

    sales = transactions \
        .groupby('week')['article_id'].value_counts() \
        .groupby('week').rank(method='dense', ascending=False) \
        .groupby('week').head(12).rename('bestseller_rank').astype('int8')

    bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
    
else:
    
    vbf = pd.read_parquet('valid_best_sell_all.parquet')
#     vbf = pd.read_parquet('valid_best_sell_with_channel.parquet')
#     vbf = vbf[vbf.RankType.isin(['MostBought'])].drop('RankType', axis=1)
#     vbf = vbf[vbf.RankType.isin(['MostBought','SecondMost'])].drop('RankType', axis=1)

    rank_map = {'MostBought':1, 'SecondMost':2, 'LowerPrice':3, 'AnotherColor':4,'AnotherAppearance':5}
    vbf['RankType'] = vbf['RankType'].map(rank_map)
    vbf['bestseller_rank'] = vbf['bestseller_rank'] * vbf['RankType']
    vbf = vbf.drop('RankType', axis=1)
    bestsellers_previous_week = pd.merge(vbf, 
                                     mean_price, on=['week', 'article_id']).reset_index(drop=True)
#     del vbf
    
bestsellers_previous_week.week += 1
bestsellers_previous_week = bestsellers_previous_week.sort_values(['week','bestseller_rank']).reset_index(drop=True)

In [11]:
bestsellers_previous_week[bestsellers_previous_week.week==95]

Unnamed: 0,week,article_id,bestseller_rank,price
0,95,806388001,1,0.013301
1,95,806388003,2,0.013223
2,95,730683021,2,0.025643
3,95,806388002,3,0.013238
4,95,610776002,3,0.008303
5,95,806388017,4,0.013284
6,95,730683050,4,0.041446
7,95,805308002,4,0.013609
8,95,806388009,5,0.013236
9,95,866383006,5,0.024971


In [12]:
DEBUG = False
if DEBUG:
        transactions = transactions[transactions.customer_id==28847241659200]

In [13]:
transactions[transactions.article_id==806388001]

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
28785735,2020-07-08,50946259841479423,806388001,0.013542,2,94
28760322,2020-07-08,153677097157202940,806388001,0.012949,2,94
28774032,2020-07-08,372327345626205245,806388001,0.012576,2,94
28743446,2020-07-08,470987419378131801,806388001,0.013542,2,94
28743447,2020-07-08,470987419378131801,806388001,0.013542,2,94
...,...,...,...,...,...,...
31546637,2020-09-15,17119065277524020867,806388001,0.012153,2,103
31541846,2020-09-15,17545169252291287947,806388001,0.013542,2,103
31524933,2020-09-15,17933101227612189071,806388001,0.013542,2,103
31543673,2020-09-15,18224046061640218639,806388001,0.013542,1,103


In [14]:
# transactions = transactions.merge(bestsellers_previous_week[['article_id', 'RankType', 'week']], 
#                    on=['article_id','week'], how='left')

In [15]:
candidates_last_purchase = transactions.copy()

In [16]:
%%time

weeks = []
for i, (c_id, week) in enumerate(zip(transactions['customer_id'], transactions['week'])):
    weeks.append(c2weeks2shifted_weeks[c_id][week])
    
candidates_last_purchase.week=weeks

CPU times: user 6.97 s, sys: 21.9 ms, total: 7 s
Wall time: 6.99 s


In [17]:
candidates_last_purchase[candidates_last_purchase.customer_id==28847241659200]

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
29024918,2020-07-14,28847241659200,730683036,0.042356,1,95
29024919,2020-07-14,28847241659200,851094001,0.011,1,95
29024920,2020-07-14,28847241659200,757303012,0.030492,1,95
29202087,2020-07-18,28847241659200,762846001,0.025407,1,96
29202088,2020-07-18,28847241659200,829308001,0.033881,1,96
29527049,2020-07-26,28847241659200,887770001,0.016932,1,101
30989557,2020-08-31,28847241659200,760084003,0.025407,1,102
31100629,2020-09-03,28847241659200,925246001,0.128797,2,104


In [18]:
unique_transactions = transactions \
    .groupby(['week', 'customer_id']) \
    .head(1) \
    .drop(columns=['article_id', 'price']) \
    .copy()

In [20]:
candidates_bestsellers = pd.merge(
    unique_transactions,
    bestsellers_previous_week,
    on='week',
)

# candidates_bestsellers = pd.merge(
#     unique_transactions,
#     bestsellers_previous_week,
#     on = ['week','sales_channel_id']
# )

In [21]:
candidates_bestsellers

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,bestseller_rank,price
0,2020-07-15,272412481300040,1,95,806388001,1,0.013301
1,2020-07-15,272412481300040,1,95,806388003,2,0.013223
2,2020-07-15,272412481300040,1,95,730683021,2,0.025643
3,2020-07-15,272412481300040,1,95,806388002,3,0.013238
4,2020-07-15,272412481300040,1,95,610776002,3,0.008303
...,...,...,...,...,...,...,...
29863840,2020-09-15,18446630855572834764,2,103,832311002,36,0.047176
29863841,2020-09-15,18446630855572834764,2,103,762846027,36,0.024895
29863842,2020-09-15,18446630855572834764,2,103,448509018,40,0.041628
29863843,2020-09-15,18446630855572834764,2,103,832311001,45,0.040096


In [22]:
test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions.week = test_week

In [23]:
candidates_bestsellers_test_week = pd.merge(
    test_set_transactions,
    bestsellers_previous_week,
    on='week'
)

# candidates_bestsellers_test_week = pd.merge(
#     test_set_transactions,
#     bestsellers_previous_week,
#     on = ['week','sales_channel_id']
# )

In [24]:
candidates_bestsellers.groupby('week').article_id.nunique()

week
95     40
96     42
97     40
98     34
99     47
100    37
101    29
102    22
103    37
Name: article_id, dtype: int64

In [25]:
candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)

candidates_bestsellers = candidates_bestsellers[candidates_last_purchase.columns.tolist()]

# Combining transactions and candidates / negative examples

In [26]:
transactions['purchased'] = 1

In [27]:
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
data.purchased.fillna(0, inplace=True)

In [28]:
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

In [29]:
data.shape

(42221170, 7)

In [30]:
data.purchased.mean()

0.059917856373947004

### Add bestseller Rank

In [31]:
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)

In [32]:
data.week.min()

94

In [33]:
data = data[data.week != data.week.min()].copy()
data.bestseller_rank = data.bestseller_rank.fillna(999)

In [34]:
data[(data.customer_id==28847241659200) & (data.week==101)]

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,bestseller_rank
2005457,2020-08-31,28847241659200,760084003,0.025407,1,101,1.0,999.0
3221303,2020-07-26,28847241659200,887770001,0.016932,1,101,0.0,999.0
28557182,2020-08-31,28847241659200,916468003,0.032983,1,101,0.0,1.0
28557183,2020-08-31,28847241659200,916468002,0.032668,1,101,0.0,2.0
28557184,2020-08-31,28847241659200,896152003,0.033229,1,101,0.0,2.0
28557185,2020-08-31,28847241659200,896152003,0.033229,1,101,0.0,6.0
28557186,2020-08-31,28847241659200,916468001,0.03244,1,101,0.0,3.0
28557187,2020-08-31,28847241659200,896152002,0.033338,1,101,0.0,3.0
28557188,2020-08-31,28847241659200,896152002,0.033338,1,101,0.0,4.0
28557189,2020-08-31,28847241659200,751471001,0.033391,1,101,0.0,4.0


In [35]:
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')

In [36]:
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [37]:
train = data[data.week != test_week]
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [70]:
# columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
# 'perceived_colour_master_id', 'department_no', 'index_code',
# 'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
# 'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'bestseller_rank']

columns_to_use = ['article_id', 'bestseller_rank', 'sales_channel_id']


In [71]:
%%time

train_X = train[columns_to_use]
train_y = train['purchased']
test_X = test[columns_to_use]

CPU times: user 158 ms, sys: 120 ms, total: 278 ms
Wall time: 275 ms


# Model training

In [72]:
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

## can use other columns to group with week and cust id - age_bucket for e.g

In [73]:
len(train_baskets)

686726

In [74]:
from lightgbm.sklearn import LGBMRanker

In [75]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type = "dart",
    n_estimators = 300,
    importance_type='gain',
    verbose=100,
    num_leaves = 255,
    max_depth = 8,
)

In [76]:
%%time

ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.000000
[LightGBM] [Debug] init for col-wise cost 0.000199 seconds, init for row-wise cost 0.131503 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 294
[LightGBM] [Info] Number of data points in the train set: 33147844, number of used features: 3
[LightGBM] [Debug] Trained a tree with leaves = 167 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 188 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 179 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 180 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 186 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 181 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 176 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 187 and depth = 8


[LightGBM] [Debug] Trained a tree with leaves = 180 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 170 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 185 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 182 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 196 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 169 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 194 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 191 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 189 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 199 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 191 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 199 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 193 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 192 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 161 and depth = 8
[LightGBM]

[LightGBM] [Debug] Trained a tree with leaves = 181 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 183 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 196 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 191 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 190 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 196 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 186 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 183 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 188 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 194 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 200 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 186 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 198 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 195 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 184 and depth = 8
[LightGBM]

[LightGBM] [Debug] Trained a tree with leaves = 206 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 172 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 181 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 204 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 212 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 193 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 199 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 202 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 185 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 195 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 182 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 161 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 203 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 191 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 178 and depth = 8
[LightGBM]

[LightGBM] [Debug] Trained a tree with leaves = 187 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 197 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 171 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 185 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 158 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 176 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 162 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 172 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 172 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 202 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 187 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 201 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 188 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 176 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 175 and depth = 8
[LightGBM]

[LightGBM] [Debug] Trained a tree with leaves = 156 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 181 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 181 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 170 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 183 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 194 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 155 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 200 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 150 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 200 and depth = 8
CPU times: user 6h 23min 48s, sys: 20.8 s, total: 6h 24min 9s
Wall time: 9min 47s


In [77]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())
    
# # bestseller_rank 0.9196792705997064
# # article_id 0.026285480731471945
# # product_type_no 0.013504447845095592
# # department_no 0.0091643043478062
# # colour_group_code 0.006066296941045826
# # garment_group_no 0.006018620705832099
# # graphical_appearance_no 0.0032828171393240045
# # age 0.003127522463078104
# # section_no 0.0029876189004976114
# # perceived_colour_value_id 0.002548262435245271
# # perceived_colour_master_id 0.0023937136361779562
# # postal_code 0.0021755290232670283
# # index_group_no 0.0008668293782069826
# # index_code 0.0008546754890484832
# # club_member_status 0.0006268607336245004
# # fashion_news_frequency 0.00018769383289481714
# # Active 0.00016006989051367204
# # FN 6.998590716348927e-05

bestseller_rank 0.9739716958055894
article_id 0.023322567919769437
sales_channel_id 0.0027057362746410215


# Calculate predictions

In [78]:
%time

test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()


CPU times: user 16 µs, sys: 0 ns, total: 16 µs
Wall time: 32.9 µs


# Create submission

In [79]:
sub = pd.read_csv('sample_submission.csv')

In [80]:
%%time
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, [])
    pred = pred + bestsellers_last_week
    preds.append(pred[:12])

CPU times: user 6.11 s, sys: 156 ms, total: 6.26 s
Wall time: 6.26 s


In [81]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds
sub['cust_id'] = customer_hex_id_to_int(sub.customer_id)

In [82]:
valid_data = pd.read_parquet('transactions_train.parquet')
valid_data = valid_data[valid_data.t_dat>='2020-09-16'].copy()

In [83]:
valid_unq = valid_data.groupby('customer_id')['article_id'].apply(list).reset_index()
valid_unq['valid_true'] = valid_unq['article_id'].map(lambda x: '0'+' 0'.join(str(x)[1:-1].split(', ')))

In [84]:
valid_unq = valid_unq.drop('article_id', axis=1).merge(sub[['cust_id','prediction']], left_on='customer_id',
                                          right_on='cust_id', how='inner')

In [85]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    # remove this case in advance
    # if not actual:
    #     return 0.0

    return score / min(len(actual), k)


def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [86]:
mapk(
    valid_unq['valid_true'].map(lambda x: x.split()), 
    valid_unq['prediction'].map(lambda x: x.split()), 
    k=12
)

## 0.02368 with 300 trees
## 0.02370 with 255 n leaves and depth 8 and 300 trees
## 0.02299 with most bought n second most bought and with 255 n leaves and depth 8 and 300 trees
## 0.02264 with all 5 and with 255 n leaves and depth 8 and 300 trees
## 0.02312 with multiplied best seller rank and with 255 n leaves and depth 8 and 300 trees

0.02218991367351084

In [95]:
def score(actual, predict):
    act_tot = len(actual)
    pre_tot = len(predict)
    correct = actual.merge(predict, on=['customer_id', 'article_id'], how='inner').shape[0]
    print(f"[+] Recall = {correct/act_tot*100:.1f}% ({correct}/{act_tot})")
    print(f"[+] Multiple Factor = {pre_tot//correct} ({pre_tot}/{correct})")

In [100]:
score(transactions[transactions['week'] == 103][['customer_id', 'article_id']], 
      data[data.week==103][['customer_id', 'article_id']])

[+] Recall = 100.6% (256764/255241)
[+] Multiple Factor = 13 (3393429/256764)


In [55]:
t = valid_unq['valid_true'].apply(lambda x: (x.split(' '))).tolist()
flat_list = [item for sublist in t for item in sublist]
flat_pred = [y for x in preds for y in x.split(' ')]

In [56]:
apr = len(list(set(flat_pred).intersection(set(flat_list))))
ap = len(flat_list)
num_candidates_on_val = valid_unq['prediction'].apply(lambda x: len(x.split(' '))).sum()
num_candidates_on_all = sub['prediction'].apply(lambda x: len(x.split(' '))).sum()

print('actual purchase captured:', apr)
print('actual purchase happened:', ap)
print('recall:', apr/ap)
print('multiple factor all customers:', num_candidates_on_all/apr)
print('multiple factor val customers:', num_candidates_on_val/apr)

# actual purchase captured: 16766
# actual purchase happened: 240311
# recall: 0.06976792572957542
# multiple factor all customers: 981.9730406775617
# multiple factor val customers: 49.37420971012764

actual purchase captured: 16764
actual purchase happened: 240311
recall: 0.06975960318087811
multiple factor all customers: 982.0901932712957
multiple factor val customers: 49.38010021474589
