In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
from average_precision import apk
import pandas as pd
from tqdm import tqdm

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    """Map a Series of hex customer-id strings to integers.

    Only the last 16 characters of each string are used; each one is
    converted with ``hex_id_to_int``.
    """
    hex_tail = series.str[-16:]
    return hex_tail.apply(hex_id_to_int)

def hex_id_to_int(str):
    """Parse the last 16 characters of ``str`` as a base-16 integer."""
    # NOTE: the parameter name shadows the builtin `str`; it is kept so that
    # existing keyword callers keep working.
    hex_tail = str[-16:]
    return int(hex_tail, base=16)

def article_id_str_to_int(series):
    """Cast a Series of article ids to the ``int32`` dtype."""
    target_dtype = 'int32'
    as_int = series.astype(target_dtype)
    return as_int

def article_id_int_to_str(series):
    """Render article ids as strings with a single ``'0'`` prepended."""
    as_text = series.astype('str')
    prefixed = '0' + as_text
    return prefixed

class Categorize(BaseEstimator, TransformerMixin):
    """Encode every column of a DataFrame as integer category codes.

    During ``fit`` the values of each column that occur more than
    ``min_examples`` times are recorded (most frequent first, as returned by
    ``value_counts``).  ``transform`` replaces each value by its position in
    that list; values that were not recorded get the code ``-1`` (pandas'
    code for "not in categories").

    Parameters
    ----------
    min_examples : int, default 0
        A value is kept as a category only if its count is strictly greater
        than this threshold.
    """

    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        # Kept for backward compatibility: one list of categories per column,
        # filled by fit().
        self.categories = []

    def fit(self, X, y=None):
        """Learn the per-column category lists from ``X``.

        ``y`` is accepted (and ignored) so the transformer can be used through
        ``fit_transform(X, y)`` and inside a scikit-learn ``Pipeline``.
        """
        # Build a fresh list on every call: appending to the list created in
        # __init__ made a second fit() stack new columns after the old ones,
        # so transform() silently kept using the categories of the first fit.
        learned = []
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            learned.append(vc[vc > self.min_examples].index.tolist())
        self.categories = learned
        return self

    def transform(self, X):
        """Return a new DataFrame (default integer index) of category codes."""
        data = {
            X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes
            for i in range(X.shape[1])
        }
        return pd.DataFrame(data=data)

from average_precision import apk

def calculate_apk(list_of_preds, list_of_gts):
    """Mean of ``apk(gt, preds, k=12)`` over paired prediction / ground-truth lists."""
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    per_customer_scores = [
        apk(truth, ranked, k=12)
        for ranked, truth in zip(list_of_preds, list_of_gts)
    ]
    return np.mean(per_customer_scores)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    """Score a submission CSV against the stored validation ground truth.

    Both files are read, their ``prediction`` columns are split on whitespace
    and paired row by row; the mean ``apk(..., k=12)`` is returned.  When
    ``skip_cust_with_no_purchases`` is true, rows whose ground truth splits to
    an empty list are left out of the mean.
    """
    # NOTE(review): rows are paired purely by position — this assumes both
    # files list customers in the same order; confirm.
    submission = pd.read_csv(sub_csv)
    ground_truth = pd.read_parquet('data/validation_ground_truth.parquet')

    empty_basket = []
    predicted_lists = submission.prediction.str.split()
    actual_lists = ground_truth.prediction.str.split()

    scores = [
        apk(actual, predicted, k=12)
        for predicted, actual in zip(predicted_lists, actual_lists)
        if not (skip_cust_with_no_purchases and (actual == empty_basket))
    ]
    return np.mean(scores)

In [117]:
%%time

# Load the three input tables from parquet files in the working directory.
transactions = pd.read_parquet('transactions_train.parquet')
customers = pd.read_parquet('customers.parquet')
articles = pd.read_parquet('articles.parquet')

CPU times: user 3.32 s, sys: 2.3 s, total: 5.62 s
Wall time: 1.82 s


In [118]:
# Latest week index present in the raw transactions (used below as TEST_WEEK).
transactions.week.max()

104

In [119]:
# Number of most recent training weeks to keep, and the week held out for testing.
USE_WEEKS = 10
TEST_WEEK = 104

VALIDATION = True

if VALIDATION:
    # Hold out TEST_WEEK as the validation set and remove it from training data.
    # NOTE(review): later cells use `valid` unconditionally, so they only run
    # when VALIDATION is True.
    valid = transactions[transactions['week'] == TEST_WEEK]
    transactions = transactions[transactions.week != TEST_WEEK]

# Keep only the last USE_WEEKS weeks (the constant was previously defined but
# the literal 10 was used here, so changing USE_WEEKS had no effect).
# NOTE: this cell reassigns `transactions`; re-running it without reloading
# the data filters an already-filtered frame.
transactions = transactions[transactions.week > transactions.week.max() - USE_WEEKS]

In [121]:
# Sanity check: first and last week left in the training window.
transactions.week.min(), transactions.week.max()

(94, 103)

In [122]:
%%time

# For every customer, the distinct weeks in which they bought something
# (in order of first appearance in `transactions`).
c2weeks = transactions.groupby('customer_id')['week'].unique()

# customer_id -> {purchase week -> that customer's next purchase week};
# the customer's last purchase week maps to TEST_WEEK.
c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    next_week_of = dict(zip(weeks[:-1], weeks[1:]))
    next_week_of[weeks[-1]] = TEST_WEEK
    c2weeks2shifted_weeks[c_id] = next_week_of

CPU times: user 33.2 s, sys: 1.75 s, total: 34.9 s
Wall time: 33.1 s


In [147]:
# Average price of each article within each week of the training window.
mean_price = transactions.groupby(['week', 'article_id'])['price'].mean()

In [123]:
# Per week: count purchases of each article, dense-rank the counts in
# descending order (ties share a rank), keep the first 120 rows of every week
# and store the rank as int8 under the name 'bestseller_rank'.
sales = transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(120).rename('bestseller_rank').astype('int8')

In [124]:
# Pair every distinct (week, customer) that has a purchase with the ranked
# top sellers of that same week from `sales`.
previous_week_top = transactions[['week','customer_id']].drop_duplicates().merge(sales.reset_index(), 
                                                                                 on='week')

In [126]:
# Relabel each row with the customer's *next* purchase week (TEST_WEEK after
# their last one), looked up in c2weeks2shifted_weeks.
weeks = [
    c2weeks2shifted_weeks[c_id][week]
    for c_id, week in zip(previous_week_top['customer_id'], previous_week_top['week'])
]

previous_week_top['week'] = weeks

# Example for the validation week (104): the top products of week 103 end up
# in previous_week_top labelled with week 104.

In [127]:
# Inspect the shifted candidates.
previous_week_top

Unnamed: 0,week,customer_id,article_id,bestseller_rank
0,95,857913002275398,806388001,1
1,95,857913002275398,730683021,2
2,95,857913002275398,610776002,3
3,95,857913002275398,805308002,4
4,95,857913002275398,866383006,5
...,...,...,...,...
91526635,104,18446630855572834764,929275001,93
91526636,104,18446630855572834764,762846008,94
91526637,104,18446630855572834764,865929007,94
91526638,104,18446630855572834764,871710012,95


In [129]:
def return_best_each_week(actual, predict):
    """Rank, per week, the recommended articles that were actually bought.

    For every week present in ``predict`` the function intersects
    ``actual`` and ``predict`` on (customer_id, article_id), counts the
    matching customers per article, orders the articles from most to least
    matched and keeps the first ``n // 4`` of them, where ``n`` is the number
    of distinct articles recommended that week.

    Parameters
    ----------
    actual : DataFrame with columns ``week``, ``customer_id``, ``article_id``
    predict : DataFrame with columns ``week``, ``customer_id``, ``article_id``

    Returns
    -------
    DataFrame with columns ``week``, ``article_id`` and ``bestseller_rank``
    (1 = most matched article of that week).
    """
    wk_list = []
    art_list = []
    bs_rank = []

    for wk in predict.week.unique():
        # Filter each frame once per week instead of re-filtering for every use.
        actual_wk = actual[actual.week == wk]
        predict_wk = predict[predict.week == wk]

        hits = actual_wk.merge(predict_wk, on=['customer_id', 'article_id'], how='inner')
        n_keep = predict_wk['article_id'].nunique() // 4

        # Dense-rank the match counts, then sort so the most matched article
        # comes first.  Articles with equal counts keep whatever order
        # sort_values leaves them in.
        ranked = hits.groupby('article_id')['customer_id'].count().rank(
            method='dense').sort_values(ascending=False).reset_index(
            name='BestSellerRank').iloc[:n_keep]

        top_articles = ranked['article_id'].tolist()
        art_list.extend(top_articles)
        bs_rank.extend(range(1, len(top_articles) + 1))
        wk_list.extend([wk] * len(top_articles))

    return pd.DataFrame({'week': wk_list, 'article_id': art_list, 'bestseller_rank': bs_rank})

In [130]:
# Rank, per (shifted) week, the recommended top sellers that were really bought.
# NOTE(review): `valid` (the TEST_WEEK purchases) is concatenated into
# `actual`, and previous_week_top contains rows labelled TEST_WEEK, so the
# TEST_WEEK ranking is computed from the validation purchases themselves —
# confirm this is not label leakage.
bestsellers_previous_week = return_best_each_week(pd.concat([transactions,
                                                             valid], 
                                                            axis=0).sort_values(
    ['week','customer_id']), previous_week_top)

In [151]:
# Attach the weekly mean price and order by week and rank.
# NOTE(review): this is an inner merge and `mean_price` is built from
# `transactions`, which no longer contains TEST_WEEK; rows for TEST_WEEK are
# therefore dropped (the displayed result ends at week 103).  Downstream, the
# merge of test_set_transactions (week = TEST_WEEK) on 'week' would then find
# no bestseller rows — confirm whether that is intended.
# NOTE: the cell reassigns its own input; running it twice merges `price` again.
bestsellers_previous_week = bestsellers_previous_week.merge(mean_price, on=['week', 'article_id'])
bestsellers_previous_week = bestsellers_previous_week.sort_values(['week', 'bestseller_rank'])

In [150]:
# Inspect the ranked bestsellers with their mean price.
bestsellers_previous_week

Unnamed: 0,week,article_id,bestseller_rank,price
0,95,817354001,1,0.021913
1,95,806388001,2,0.013307
2,95,759871002,3,0.006181
3,95,895002002,4,0.013114
4,95,824490001,5,0.030348
...,...,...,...,...
723,103,924605001,127,0.016465
724,103,871997002,128,0.033063
725,103,933989001,129,0.041626
726,103,762796013,130,0.033657


In [135]:
# Inspect the training-window transactions.
transactions

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
28777300,2020-07-08,857913002275398,599580068,0.008458,1,94
28777301,2020-07-08,857913002275398,776237011,0.025407,1,94
28777302,2020-07-08,857913002275398,844294001,0.011847,1,94
28787123,2020-07-08,1658289241058394,877773001,0.007610,1,94
28788562,2020-07-08,3828854365940846,507883009,0.013542,1,94
...,...,...,...,...,...,...
31536744,2020-09-15,18446630855572834764,568601045,0.050831,2,103
31536745,2020-09-15,18446630855572834764,568601045,0.050831,2,103
31536746,2020-09-15,18446630855572834764,898713001,0.067780,2,103
31536747,2020-09-15,18446630855572834764,898713001,0.067780,2,103


In [136]:
# Independent copy of the transactions; its `week` column is overwritten in the next cell.
candidates_last_purchase = transactions.copy()

In [137]:
%%time

# Move every past purchase to the customer's next purchase week (TEST_WEEK
# after the last one) so it becomes a "bought last time" candidate.
weeks = [
    c2weeks2shifted_weeks[c_id][week]
    for c_id, week in zip(transactions['customer_id'], transactions['week'])
]

candidates_last_purchase['week'] = weeks

CPU times: user 7.1 s, sys: 267 ms, total: 7.37 s
Wall time: 7.36 s


In [138]:
# Inspect the shifted last-purchase candidates.
candidates_last_purchase

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
28777300,2020-07-08,857913002275398,599580068,0.008458,1,95
28777301,2020-07-08,857913002275398,776237011,0.025407,1,95
28777302,2020-07-08,857913002275398,844294001,0.011847,1,95
28787123,2020-07-08,1658289241058394,877773001,0.007610,1,104
28788562,2020-07-08,3828854365940846,507883009,0.013542,1,100
...,...,...,...,...,...,...
31536744,2020-09-15,18446630855572834764,568601045,0.050831,2,104
31536745,2020-09-15,18446630855572834764,568601045,0.050831,2,104
31536746,2020-09-15,18446630855572834764,898713001,0.067780,2,104
31536747,2020-09-15,18446630855572834764,898713001,0.067780,2,104


In [139]:
# One row per (week, customer): the first transaction of each group, without
# the article-specific columns.
unique_transactions = transactions \
    .groupby(['week', 'customer_id']) \
    .head(1) \
    .drop(columns=['article_id', 'price']) \
    .copy()

In [141]:
# Compare the first week available on each side of the upcoming merge.
unique_transactions.week.min(), bestsellers_previous_week.week.min()

(94, 95)

In [152]:
# Give every (week, customer) row all bestseller rows of the same week.
candidates_bestsellers = pd.merge(
    unique_transactions,
    bestsellers_previous_week,
    on='week',
)

In [153]:
# One row per customer (their first row in unique_transactions), relabelled
# with the test week.
test_set_transactions = (
    unique_transactions
    .drop_duplicates('customer_id')
    .reset_index(drop=True)
    .assign(week=TEST_WEEK)
)

In [154]:
# Bestseller candidates for the test week: every customer paired with the
# bestseller rows labelled TEST_WEEK.
candidates_bestsellers_test_week = pd.merge(
    test_set_transactions,
    bestsellers_previous_week,
    on='week'
)

In [155]:
# Stack training-week and test-week bestseller candidates, drop the rank
# (it is merged back later as a feature) and align the column order with
# candidates_last_purchase.
# NOTE: the cell overwrites its own input, so it is not safe to run twice.
stacked_candidates = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
without_rank = stacked_candidates.drop(columns='bestseller_rank')
target_columns = candidates_last_purchase.columns.tolist()

candidates_bestsellers = without_rank[target_columns]

In [156]:
# Real transactions are the positive examples.  candidates_last_purchase was
# copied before this column existed, so it does not carry the label.
transactions['purchased'] = 1

In [157]:
# Positives first, then the two candidate sets; candidates have no
# `purchased` value and are labelled 0.
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
# Assign the filled column back instead of calling fillna(inplace=True) on
# the attribute: the chained in-place form warns in pandas 2.x and does not
# modify `data` once Copy-on-Write is enabled.
data['purchased'] = data['purchased'].fillna(0)

In [162]:
# Keep the first occurrence of each (customer, article, week).  Real
# transactions were concatenated first, so a purchased row wins over a
# candidate row for the same key.
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

In [163]:
# Size of the example set and share of positive rows.
data.shape, data.purchased.mean()

((59796931, 7), 0.042306552488454635)

In [167]:
# Add bestseller_rank as a feature; rows whose (week, article) is not a
# bestseller get NaN here (filled in a later cell).
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)

In [168]:
# First week present in the example set.
data.week.min()

94

In [169]:
# Drop the earliest week and give non-bestsellers the sentinel rank 999.
# NOTE: not idempotent — each re-run removes one more week.
data = data[data.week != data.week.min()].copy()
data.bestseller_rank = data.bestseller_rank.fillna(999)

In [170]:
# Attach article and customer attributes; left joins keep every example row.
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')

In [171]:
# Order rows by (week, customer_id) so each customer's rows within a week are
# contiguous; the ranker's group sizes below rely on this ordering.
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [173]:
# Everything before TEST_WEEK is training data; TEST_WEEK rows (deduplicated
# per customer / article / sales channel) are the rows to score.
train = data[data.week != TEST_WEEK]
test = data[data.week==TEST_WEEK].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [174]:
# Feature columns passed to the ranker.
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'bestseller_rank']

# Reduced feature set tried as an alternative (disabled):
# columns_to_use = ['article_id', 'bestseller_rank', 'sales_channel_id']


In [175]:
%%time

# Feature matrix and binary label for training; feature matrix for scoring.
train_X = train[columns_to_use]
train_y = train['purchased']
test_X = test[columns_to_use]

CPU times: user 1.02 s, sys: 671 ms, total: 1.69 s
Wall time: 1.68 s


In [176]:
# Query-group sizes for the ranker: number of rows per (week, customer_id).
# groupby returns groups in sorted key order, which matches the row order of
# `train` because `data` was sorted by the same keys beforehand.
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [177]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so dependencies are visible in one place.
from lightgbm.sklearn import LGBMRanker

# LambdaRank model with DART boosting; feature importances are reported by gain.
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type = "dart",
    n_estimators = 300,
    importance_type='gain',
    verbose=100,
    num_leaves = 255,
    max_depth = 8,
)

In [178]:
%%time

# Train the ranker; `group` gives the size of each consecutive
# (week, customer) block of rows in train_X.
ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.844095
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.153488
[LightGBM] [Debug] init for col-wise cost 3.193031 seconds, init for row-wise cost 1.525365 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1174
[LightGBM] [Info] Number of data points in the train set: 58138944, number of used features: 18
[LightGBM] [Debug] Trained a tree with leaves = 254 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 251 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 253 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 255 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 255 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 255 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 25

[LightGBM] [Debug] Trained a tree with leaves = 253 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 254 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 253 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 254 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 255 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 255 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 253 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 253 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 253 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 254 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 253 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 254 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 252 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 254 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 249 and depth = 8
[LightGBM]

[LightGBM] [Debug] Trained a tree with leaves = 255 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 253 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 253 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 255 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 255 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 239 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 249 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 254 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 254 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 253 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 255 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 246 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 253 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 254 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 253 and depth = 8
[LightGBM]

[LightGBM] [Debug] Trained a tree with leaves = 255 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 245 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 255 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 254 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 247 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 253 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 242 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 249 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 253 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 252 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 246 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 253 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 255 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 251 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 249 and depth = 8
[LightGBM]

[LightGBM] [Debug] Trained a tree with leaves = 245 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 238 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 243 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 255 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 253 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 250 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 255 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 252 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 241 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 251 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 251 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 255 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 250 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 252 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 251 and depth = 8
[LightGBM]

In [179]:
# Print each feature with its share of the total importance, largest first.
importances = ranker.feature_importances_
total_importance = importances.sum()
for idx in importances.argsort()[::-1]:
    print(columns_to_use[idx], importances[idx]/total_importance)

# Values printed by an earlier run, kept for comparison:
# # bestseller_rank 0.9196792705997064
# # article_id 0.026285480731471945
# # product_type_no 0.013504447845095592
# # department_no 0.0091643043478062
# # colour_group_code 0.006066296941045826
# # garment_group_no 0.006018620705832099
# # graphical_appearance_no 0.0032828171393240045
# # age 0.003127522463078104
# # section_no 0.0029876189004976114
# # perceived_colour_value_id 0.002548262435245271
# # perceived_colour_master_id 0.0023937136361779562
# # postal_code 0.0021755290232670283
# # index_group_no 0.0008668293782069826
# # index_code 0.0008546754890484832
# # club_member_status 0.0006268607336245004
# # fashion_news_frequency 0.00018769383289481714
# # Active 0.00016006989051367204
# # FN 6.998590716348927e-05

bestseller_rank 0.9623808561704634
article_id 0.010463750521857954
product_type_no 0.006350165940639954
age 0.004480889100241997
department_no 0.003655688420848908
garment_group_no 0.0026753015885279713
colour_group_code 0.0017566770836988741
section_no 0.0016775366779412852
graphical_appearance_no 0.0015009023581218609
postal_code 0.001378068711398064
perceived_colour_value_id 0.0010109675633908864
perceived_colour_master_id 0.0007899964475396099
club_member_status 0.0004989191419984502
index_code 0.00046779188984241504
index_group_no 0.0003639615975027452
fashion_news_frequency 0.0002739457540119103
Active 0.00019098831660904715
FN 8.359271536473126e-05


In [180]:
%time

test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()


CPU times: user 10 µs, sys: 0 ns, total: 10 µs
Wall time: 19.1 µs


In [181]:
# Submission template; provides the customer ids to predict for.
sub = pd.read_csv('sample_submission.csv')

In [182]:
%%time
# For every customer in the template: model-ranked articles first (none if the
# customer has no candidates), padded with the fallback bestsellers, cut to 12.
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, []) + bestsellers_last_week
    preds.append(pred[:12])

CPU times: user 4.61 s, sys: 168 ms, total: 4.78 s
Wall time: 4.77 s


In [183]:
# Turn each list of integer article ids into the space-separated,
# zero-prefixed string expected in the submission, and add the integer
# customer id used for joining with the validation data.
# NOTE: `preds` is overwritten with strings, so this cell must not be re-run
# without rebuilding `preds` first.
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds
sub['cust_id'] = customer_hex_id_to_int(sub.customer_id)

In [184]:
# Reload the raw transactions and keep the rows dated 2020-09-16 or later.
# NOTE(review): presumably this date is the first day of TEST_WEEK — confirm,
# or reuse `valid` from the split cell instead of a hard-coded date.
valid_data = pd.read_parquet('transactions_train.parquet')
valid_data = valid_data[valid_data.t_dat>='2020-09-16'].copy()

In [185]:
# One row per customer with the list of articles bought in the validation period.
valid_unq = valid_data.groupby('customer_id')['article_id'].apply(list).reset_index()
# Build the space-separated, zero-prefixed id string directly from the list
# elements.  The previous version parsed the text of str(list), which only
# works while every element prints as a bare integer.
valid_unq['valid_true'] = valid_unq['article_id'].map(
    lambda ids: ' '.join('0' + str(int(a)) for a in ids)
)

In [186]:
# Attach each validation customer's predicted string; the inner join keeps
# only customers that appear in the submission.
valid_unq = valid_unq.drop('article_id', axis=1).merge(sub[['cust_id','prediction']], left_on='customer_id',
                                          right_on='cust_id', how='inner')

In [187]:
def apk(actual, predicted, k=10):
    """Average precision at k.

    Parameters
    ----------
    actual : sequence
        Relevant items (order does not matter).
    predicted : sequence
        Predicted items, best first; only the first ``k`` are considered and
        repeated predictions of the same item are counted once.
    k : int, default 10
        Cut-off.

    Returns
    -------
    float
        Sum of precision@i over the positions i of the hits, divided by
        ``min(len(actual), k)``.  Returns 0.0 when ``actual`` is empty.
    """
    # NOTE: this definition replaces the `apk` imported from
    # `average_precision` at the top of the notebook.

    # Without relevant items the normaliser below would be zero.
    if len(actual) == 0:
        return 0.0

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # Count a hit only the first time the item is predicted.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """Mean of ``apk`` over paired sequences of relevant and predicted items."""
    per_row_scores = []
    for relevant, ranked in zip(actual, predicted):
        per_row_scores.append(apk(relevant, ranked, k))
    return np.mean(per_row_scores)

In [188]:
# MAP@12 of the predictions against the validation purchases.
mapk(
    valid_unq['valid_true'].map(lambda x: x.split()), 
    valid_unq['prediction'].map(lambda x: x.split()), 
    k=12
)

# Scores recorded from earlier experiments:
## 0.02368 with 300 trees
## 0.02370 with 255 n leaves and depth 8 and 300 trees
## 0.02299 with most bought n second most bought and with 255 n leaves and depth 8 and 300 trees
## 0.02264 with all 5 and with 255 n leaves and depth 8 and 300 trees
## 0.02312 with multiplied best seller rank and with 255 n leaves and depth 8 and 300 trees

0.022848648448014364

In [34]:
# (customer, article) pairs bought in week 103.  The throw-away `tmp` column
# that used to be added to this slice and dropped again was never used and
# triggered a SettingWithCopyWarning, so it is gone.
previous_week = transactions[transactions['week'] == 103][['customer_id', 'article_id']]

# The 100 most purchased articles of week 103.
top_products = pd.DataFrame(transactions[transactions['week'] == 103].value_counts('article_id').iloc[:100].index.tolist(),
                            columns=['article_id'])
top_products['tmp'] = 1

# Cross join through the constant key: every customer x every top product.
# NOTE: relies on `transactions['tmp']` having been set in another cell.
previous_week_top = transactions[['tmp','customer_id']].drop_duplicates().merge(top_products, on='tmp')

previous_week_top = previous_week_top.drop('tmp', axis=1)

# Candidate set: last week's own purchases plus the top products.
cand = pd.concat([previous_week, previous_week_top]).drop_duplicates()


In [74]:
def score_and_best(actual, predict, Lookback):
    """Print recall statistics and return the best-matching recommended articles.

    Parameters
    ----------
    actual : DataFrame with columns ``customer_id`` and ``article_id``
        Pairs that were really purchased.
    predict : DataFrame with columns ``customer_id`` and ``article_id``
        Candidate pairs.
    Lookback : value stored unchanged in the ``WeeksLookback`` column.

    Returns
    -------
    DataFrame with columns ``article_id``, ``BestSellerRank`` (1 = article
    matched by the most customers) and ``WeeksLookback``.  At most a quarter
    of the distinct candidate articles are returned; fewer when fewer
    articles were matched.
    """
    # The unused customer count that read the notebook-global `valid` has been
    # removed; the function now depends only on its arguments.
    n_keep = predict['article_id'].nunique() // 4

    act_tot = len(actual)
    pre_tot = len(predict)
    df = actual.merge(predict, on=['customer_id', 'article_id'], how='inner')
    correct = df.shape[0]

    # Guard both ratios so an empty input or zero matches does not raise.
    recall = correct / act_tot * 100 if act_tot else 0.0
    factor = pre_tot // correct if correct else 'n/a'
    print(f"[+] Recall = {recall:.1f}% ({correct}/{act_tot})")
    print(f"[+] Multiple Factor = {factor} ({pre_tot}/{correct})")

    df = df.groupby('article_id')['customer_id'].count().rank(
        method='dense').sort_values(ascending=False).reset_index(
        name='BestSellerRank').iloc[:n_keep].copy()

    # Size the new columns from the rows actually kept: there can be fewer
    # matched articles than n_keep, and fixed-length lists then failed with a
    # length-mismatch error.
    df['BestSellerRank'] = list(range(1, len(df) + 1))
    df['WeeksLookback'] = [Lookback] * len(df)

    return df

In [75]:
# Evaluate the "top products for every customer" candidates against `valid`.
# NOTE(review): `k` is a very short name for a notebook-global DataFrame.
k = score_and_best(valid[['customer_id', 'article_id']], 
      previous_week_top[['customer_id', 'article_id']], 10)

[+] Recall = 7.5% (17925/240311)
[+] Multiple Factor = 2302 (41274500/17925)


In [76]:
# Inspect the ranked articles returned by score_and_best.
k

Unnamed: 0,article_id,BestSellerRank,WeeksLookback
0,924243001,1,10
1,924243002,2,10
2,909370001,3,10
3,923758001,4,10
4,918522001,5,10
5,866731001,6,10
6,915529005,7,10
7,919273002,8,10
8,915529003,9,10
9,714790020,10,10


In [2]:
def score(actual, predict):
    """Print recall and candidate-to-hit ratio of ``predict`` against ``actual``.

    Both arguments are DataFrames with ``customer_id`` and ``article_id``
    columns.  "Recall" is the share of ``actual`` rows that also appear in
    ``predict``; "Multiple Factor" is the number of candidate rows per hit
    (integer division).  Nothing is returned.
    """
    act_tot = len(actual)
    pre_tot = len(predict)
    correct = actual.merge(predict, on=['customer_id', 'article_id'], how='inner').shape[0]

    # Guard both ratios so an empty `actual` or zero matches does not raise.
    recall = correct / act_tot * 100 if act_tot else 0.0
    factor = pre_tot // correct if correct else 'n/a'

    print(f"[+] Recall = {recall:.1f}% ({correct}/{act_tot})")
    print(f"[+] Multiple Factor = {factor} ({pre_tot}/{correct})")

In [3]:
%%time

# Reload the raw tables.  This replaces the filtered `transactions` built
# earlier in the notebook.
# NOTE(review): the cells below use `valid`, which is not defined anywhere
# from this point on — it must still be in the kernel from the split cell.
transactions = pd.read_parquet('transactions_train.parquet')
customers = pd.read_parquet('customers.parquet')
articles = pd.read_parquet('articles.parquet')

CPU times: user 2.73 s, sys: 1.8 s, total: 4.53 s
Wall time: 1.58 s


In [5]:
# Constant join key used by the cells below to build a cross join
# (every customer x every top product).
transactions['tmp'] = 1

In [6]:
# (customer, article) pairs bought in week 103.  The throw-away `tmp` column
# that used to be added to this slice and dropped again was never used and
# triggered a SettingWithCopyWarning, so it is gone.
previous_week = transactions[transactions['week'] == 103][['customer_id', 'article_id']]

# The 200 most purchased articles of week 103.
top_products = pd.DataFrame(transactions[transactions['week'] == 103].value_counts('article_id').iloc[:200].index.tolist(),
                            columns=['article_id'])
top_products['tmp'] = 1

# Cross join through the constant key: every customer x every top product.
previous_week_top = transactions[['tmp','customer_id']].drop_duplicates().merge(top_products, on='tmp')

previous_week_top = previous_week_top.drop('tmp', axis=1)

# Candidate set: last week's own purchases plus the top products.
cand = pd.concat([previous_week, previous_week_top]).drop_duplicates()


In [7]:
# Recall / multiple factor of the current `cand` against `valid`.
score(valid[['customer_id', 'article_id']], 
      cand[['customer_id', 'article_id']])

# Figures from an earlier run, presumably kept for comparison:
# [+] Recall = 9.2% (22040/240311)
# [+] Multiple Factor = 2143 (47236859/22040)

[+] Recall = 12.7% (30522/240311)
[+] Multiple Factor = 2710 (82730459/30522)


In [9]:
# Same call as the previous cell but with a different recorded output, so
# `cand` must have been rebuilt in between (NOTE(review): hidden state — the
# cell that produced this `cand` is not identifiable from the notebook order).
score(valid[['customer_id', 'article_id']], 
      cand[['customer_id', 'article_id']])

# Figures from an earlier run, presumably kept for comparison:
# [+] Recall = 6.6% (15786/240311)
# [+] Multiple Factor = 1502 (23725532/15786)

[+] Recall = 8.8% (21228/240311)
[+] Multiple Factor = 1953 (41472332/21228)


In [10]:
# (customer, article) pairs bought in week 103.  The throw-away `tmp` column
# that used to be added to this slice and dropped again was never used and
# triggered a SettingWithCopyWarning, so it is gone.
previous_week = transactions[transactions['week'] == 103][['customer_id', 'article_id']]

# The 50 most purchased articles of week 103.
top_products = pd.DataFrame(transactions[transactions['week'] == 103].value_counts('article_id').iloc[:50].index.tolist(),
                            columns=['article_id'])
top_products['tmp'] = 1

# Cross join through the constant key: every customer x every top product.
previous_week_top = transactions[['tmp','customer_id']].drop_duplicates().merge(top_products, on='tmp')

previous_week_top = previous_week_top.drop('tmp', axis=1)

# Candidate set: last week's own purchases plus the top products.
cand = pd.concat([previous_week, previous_week_top]).drop_duplicates()


In [11]:
# Recall / multiple factor of the current `cand` against `valid`.
score(valid[['customer_id', 'article_id']], 
      cand[['customer_id', 'article_id']])

# Figures from an earlier run, presumably kept for comparison:
# [+] Recall = 4.7% (11266/240311)
# [+] Multiple Factor = 1062 (11972953/11266)

[+] Recall = 6.1% (14589/240311)
[+] Multiple Factor = 1428 (20846353/14589)


In [27]:
# (customer, article) pairs bought in week 103.  The throw-away `tmp` column
# that used to be added to this slice and dropped again was never used and
# triggered a SettingWithCopyWarning, so it is gone.
previous_week = transactions[transactions['week'] == 103][['customer_id', 'article_id']]

# The 500 most purchased articles of week 103.
top_products = pd.DataFrame(transactions[transactions['week'] == 103].value_counts('article_id').iloc[:500].index.tolist(),
                            columns=['article_id'])
top_products['tmp'] = 1

# Cross join through the constant key: every customer x every top product.
previous_week_top = transactions[['tmp','customer_id']].drop_duplicates().merge(top_products, on='tmp')

previous_week_top = previous_week_top.drop('tmp', axis=1)

# Candidate set: last week's own purchases plus the top products.
cand = pd.concat([previous_week, previous_week_top]).drop_duplicates()


In [13]:
# Recall / multiple factor of the current `cand` against `valid`.
score(valid[['customer_id', 'article_id']], 
      cand[['customer_id', 'article_id']])

# Figures from an earlier run, presumably kept for comparison:
# [+] Recall = 14.7% (35239/240311)
# [+] Multiple Factor = 3342 (117787579/35239)

[+] Recall = 20.8% (50078/240311)
[+] Multiple Factor = 4123 (206521579/50078)


In [29]:
# Same measurement for the top-product candidates alone (without each
# customer's own previous-week purchases).
score(valid[['customer_id', 'article_id']], 
      previous_week_top[['customer_id', 'article_id']])

# Recorded result:
# [+] Recall = 19.9% (47753/240311)
# [+] Multiple Factor = 4321 (206372500/47753)

[+] Recall = 19.9% (47753/240311)
[+] Multiple Factor = 4321 (206372500/47753)


In [16]:
# Load a precomputed bestseller table and shift its week label forward by one.
# NOTE(review): presumably the file stores the week the sales happened in and
# the shift turns it into the week being predicted — confirm.
# NOTE: `+=` is not idempotent; re-running the second line alone shifts again.
vbf = pd.read_parquet('valid_best_sell_all.parquet')
vbf.week += 1

In [17]:
# (customer, article) pairs bought in week 103.  The throw-away `tmp` column
# that used to be added to this slice and dropped again was never used and
# triggered a SettingWithCopyWarning, so it is gone.
previous_week = transactions[transactions['week'] == 103][['customer_id', 'article_id']]

# Distinct articles listed for week 104 in the precomputed table.
top_products = vbf[vbf.week==104][['article_id']].drop_duplicates()
top_products['tmp'] = 1

# Cross join through the constant key: every customer x every top product.
previous_week_top = transactions[['tmp','customer_id']].drop_duplicates().merge(top_products, on='tmp')

previous_week_top = previous_week_top.drop('tmp', axis=1)

# Candidate set: last week's own purchases plus the top products.
cand = pd.concat([previous_week, previous_week_top]).drop_duplicates()

In [18]:
# Recall / multiple factor of the current `cand` against `valid`.
score(valid[['customer_id', 'article_id']], 
      cand[['customer_id', 'article_id']])

# Recorded result:
# [+] Recall = 3.9% (9420/240311)
# [+] Multiple Factor = 1250 (11775261/9420)

[+] Recall = 3.9% (9420/240311)
[+] Multiple Factor = 1250 (11775261/9420)


In [21]:
# (customer, article) pairs bought in week 103.  The throw-away `tmp` column
# that used to be added to this slice and dropped again was never used and
# triggered a SettingWithCopyWarning, so it is gone.
previous_week = transactions[transactions['week'] == 103][['customer_id', 'article_id']]

# Distinct articles listed for week 104 in the precomputed table ...
top_products = vbf[vbf.week==104][['article_id']].drop_duplicates()
top_products['tmp'] = 1

# ... combined with the 200 most purchased articles of week 103.
tp = pd.DataFrame(transactions[transactions['week'] == 103].value_counts('article_id').iloc[:200].index.tolist(),
                            columns=['article_id'])
tp['tmp'] = 1

top_products = pd.concat([top_products, tp], axis=0).drop_duplicates()

# Cross join through the constant key: every customer x every top product.
previous_week_top = transactions[['tmp','customer_id']].drop_duplicates().merge(top_products, on='tmp')

previous_week_top = previous_week_top.drop('tmp', axis=1)

# Candidate set: last week's own purchases plus the top products.
cand = pd.concat([previous_week, previous_week_top]).drop_duplicates()

In [22]:
# Recall / multiple factor of the current `cand` against `valid`.
score(valid[['customer_id', 'article_id']], 
      cand[['customer_id', 'article_id']])

# Recorded result:
# [+] Recall = 12.8% (30780/240311)
# [+] Multiple Factor = 2768 (85206551/30780)

[+] Recall = 12.8% (30780/240311)
[+] Multiple Factor = 2768 (85206551/30780)


In [25]:
# Baseline: only each customer's own week-103 purchases as candidates.
score(valid[['customer_id', 'article_id']], 
      previous_week[['customer_id', 'article_id']])

# Recorded result:
# [+] Recall = 2.2% (5228/240311)
# [+] Multiple Factor = 48 (255241/5228)

[+] Recall = 2.2% (5228/240311)
[+] Multiple Factor = 48 (255241/5228)


In [26]:
# Top-product candidates alone (without each customer's own purchases).
score(valid[['customer_id', 'article_id']], 
      previous_week_top[['customer_id', 'article_id']])

# Recorded result:
# [+] Recall = 11.6% (27808/240311)
# [+] Multiple Factor = 3057 (85025470/27808)

[+] Recall = 11.6% (27808/240311)
[+] Multiple Factor = 3057 (85025470/27808)
