# data preparation

In [1]:
# Colab-only: `files.upload()` opens a browser widget to upload files from the local machine.
from google.colab import files

# The returned mapping is keyed by uploaded file name; the shell command below
# expects one of the uploads to be named kaggle.json.
uploaded = files.upload()

# Echo the name and size of each uploaded file as a sanity check.
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

# Then move kaggle.json into the folder where the API expects to find it.
# chmod 600 restricts the credentials file to the current user.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 63 bytes


In [2]:
# Download the competition archive with the Kaggle CLI (needs ~/.kaggle/kaggle.json in place).
!kaggle competitions download -c h-and-m-personalized-fashion-recommendations

Downloading h-and-m-personalized-fashion-recommendations.zip to /content
100% 28.7G/28.7G [08:12<00:00, 94.0MB/s]
100% 28.7G/28.7G [08:12<00:00, 62.6MB/s]


In [3]:
# Download the pre-computed feature dataset (pickles and count CSVs read further down).
!kaggle datasets download -d post-hm-features

Downloading post-hm-features.zip to /content
 98% 921M/936M [00:36<00:00, 32.1MB/s]
100% 936M/936M [00:36<00:00, 27.0MB/s]


In [4]:
!unzip h-and-m-personalized-fashion-recommendations.zip articles.csv
!unzip h-and-m-personalized-fashion-recommendations.zip customers.csv
!unzip h-and-m-personalized-fashion-recommendations.zip transactions_train.csv
!unzip h-and-m-personalized-fashion-recommendations.zip sample_submission.csv

Archive:  h-and-m-personalized-fashion-recommendations.zip
  inflating: articles.csv            
Archive:  h-and-m-personalized-fashion-recommendations.zip
  inflating: customers.csv           
Archive:  h-and-m-personalized-fashion-recommendations.zip
  inflating: transactions_train.csv  
Archive:  h-and-m-personalized-fashion-recommendations.zip
  inflating: sample_submission.csv   


In [5]:
!unzip post-hm-features.zip user_last_dict_valid.pickle
!unzip post-hm-features.zip user_last_dict.pickle
!unzip post-hm-features.zip article2vec_idx.pickle
!unzip post-hm-features.zip sales_channel_count_valid.csv
!unzip post-hm-features.zip item_count_valid.csv
!unzip post-hm-features.zip sales_channel_count_full.csv
!unzip post-hm-features.zip item_count_full.csv

Archive:  post-hm-features.zip
  inflating: user_last_dict_valid.pickle  
Archive:  post-hm-features.zip
  inflating: user_last_dict.pickle   
Archive:  post-hm-features.zip
  inflating: article2vec_idx.pickle  
Archive:  post-hm-features.zip
  inflating: sales_channel_count_valid.csv  
Archive:  post-hm-features.zip
  inflating: item_count_valid.csv    
Archive:  post-hm-features.zip
  inflating: sales_channel_count_full.csv  
Archive:  post-hm-features.zip
  inflating: item_count_full.csv     


# import modules

In [6]:
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import scipy.sparse as sps
from sklearn import preprocessing
from scipy.sparse import coo_matrix
from lightgbm.sklearn import LGBMRanker

import warnings
warnings.filterwarnings('ignore')

In [7]:
# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/306007
# https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py

def apk(actual, predicted, k=12):
    """
    Average precision at k for a single pair of lists.

    Only the first ``k`` entries of ``predicted`` are scored. An entry counts
    as a hit when it appears in ``actual`` and has not already appeared
    earlier in the (truncated) prediction list, so repeated predictions are
    credited once.

    Parameters
    ----------
    actual : list
        Elements that should have been predicted (order is irrelevant).
    predicted : list
        Predicted elements, best first (order matters).
    k : int, optional
        Maximum number of predictions taken into account.

    Returns
    -------
    score : float
        Sum of precision values at each hit position divided by
        ``min(len(actual), k)``; ``0.0`` when ``actual`` is empty.
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hit_count = 0.0
    precision_sum = 0.0
    for position, item in enumerate(top_k):
        # Skip misses and items that were already credited at an earlier rank.
        if item not in actual or item in top_k[:position]:
            continue
        hit_count += 1.0
        precision_sum += hit_count / (position + 1.0)

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=12):
    """
    Mean average precision at k over paired lists of lists.

    Parameters
    ----------
    actual : list
        One list of relevant elements per query (order inside each list is
        irrelevant).
    predicted : list
        One list of predicted elements per query (order inside each list
        matters).
    k : int, optional
        Maximum number of predictions taken into account per query.

    Returns
    -------
    score : float
        ``np.mean`` of ``apk`` over the zipped pairs; pairs beyond the
        shorter of the two inputs are ignored by ``zip``.
    """
    per_query_scores = []
    for relevant, ranked in zip(actual, predicted):
        per_query_scores.append(apk(relevant, ranked, k))
    return np.mean(per_query_scores)

In [8]:
# True: hold out the most recent week as `valid` and use the *_valid feature files;
# False: train on every week and use the *_full feature files.
VALID = True
# Number of candidate articles requested per customer from prepare_candidates.
ELEMENTS = 100
# Number of most recent weeks of transactions kept for training.
TRAIN_WEEK = 6

# data 

In [9]:
# Raw competition tables, read from the CSVs extracted into the working directory.
df = pd.read_csv("transactions_train.csv")
article_df = pd.read_csv("articles.csv")
customer_df = pd.read_csv("customers.csv")
# NOTE(review): sub_df is not referenced again in the visible part of this notebook.
sub_df = pd.read_csv("sample_submission.csv")

In [10]:
# Integer-typed article attributes (everything except the key itself) used as item features.
article_columns = [
    column
    for column in article_df.columns
    if column != "article_id" and "int" in str(article_df[column].dtype)
]

In [11]:
df["t_dat"] = pd.to_datetime(df["t_dat"])

# One row per distinct transaction date, in chronological order. The week counter
# below relies on consecutive rows being consecutive dates, so sort explicitly
# instead of trusting the row order of the CSV.
date_week_df = (
    df[["t_dat"]]
    .drop_duplicates()
    .sort_values("t_dat")
    .reset_index(drop=True)
)

# Shift by 5 days before taking the ISO week so that week boundaries move 5 days
# earlier than the ISO Monday. `Series.dt.week` was removed in pandas 2.0;
# `dt.isocalendar().week` is the replacement (cast from UInt32 so `diff` can go negative
# at year boundaries).
iso_week = (date_week_df["t_dat"] + pd.DateOffset(days=5)).dt.isocalendar().week.astype("int64")

# week_no is a running count of week changes: 0 for the earliest week, +1 each time
# the shifted ISO week differs from that of the previous date.
date_week_df["week_no"] = iso_week.diff(1).fillna(0).ne(0).cumsum()

df = pd.merge(df, date_week_df, on="t_dat", how="left")

df = df.sort_values(['t_dat', 'customer_id'])

# Most recent week in the data; used as the hold-out week when VALID is True.
test_week = df.week_no.max()

In [12]:
# In validation mode, hold out the most recent week and train only on earlier weeks.
if VALID:
    valid = df[df.week_no == test_week].reset_index(drop=True)
    df = df[df.week_no < test_week].reset_index(drop=True)

# Keep only the TRAIN_WEEK most recent weeks of the remaining transactions.
week_cutoff = df.week_no.max() - TRAIN_WEEK
cons_week = [week for week in df.week_no.unique() if week > week_cutoff]
df = df[df.week_no.isin(cons_week)].reset_index(drop=True)
del date_week_df
df.shape

(1614095, 6)

# prepare candidates

In [13]:
def make_purchase_dict(input_df):
    """
    Count how many times each customer bought each article.

    Parameters
    ----------
    input_df : mapping of column name -> sequence
        Must provide aligned 'customer_id' and 'article_id' columns
        (e.g. a transactions DataFrame).

    Returns
    -------
    dict
        ``{customer_id: {article_id: purchase_count}}``, in first-seen order.
    """
    counts_by_customer = {}

    for cust_id, art_id in zip(input_df['customer_id'], input_df['article_id']):
        article_counts = counts_by_customer.setdefault(cust_id, {})
        article_counts[art_id] = article_counts.get(art_id, 0) + 1

    return counts_by_customer

In [14]:
def repurchase_candidates(customers_id, n_candidates = 12):
    """
    Candidates taken from each customer's entry in the pre-computed
    ``user_last_dict`` pickle.

    Parameters
    ----------
    customers_id : iterable
        Customer ids to generate candidates for (expected to be unique).
    n_candidates : int, optional
        Maximum number of articles kept per customer.

    Returns
    -------
    pandas.DataFrame
        One row per (customer_id, article_id) candidate. Customers that are
        missing from the pickle get no rows.

    Notes
    -----
    Reads 'user_last_dict_valid.pickle' when VALID is set, otherwise
    'user_last_dict.pickle'. Presumably each value is that customer's most
    recently purchased articles -- TODO confirm against the code that built
    the pickle. ``pickle.load`` must only be used on trusted files.
    """
    pickle_path = 'user_last_dict_valid.pickle' if VALID else 'user_last_dict.pickle'
    with open(pickle_path, mode='rb') as f:
        user_last_dict = pickle.load(f)

    prediction_dict = {
        cust_id: user_last_dict[cust_id][:n_candidates]
        for cust_id in customers_id
        if cust_id in user_last_dict
    }

    negatives_df = pd.DataFrame({
        'customer_id': list(prediction_dict.keys()),
        'negatives': list(prediction_dict.values()),
    })
    # One row per candidate article instead of one list per customer.
    return negatives_df.explode('negatives').rename(columns = {'negatives': 'article_id'})

In [15]:
def most_popular_candidates(customers_id, n_candidates = 12):
    """
    Give every customer the same list of globally best-selling recent articles.

    Popularity is the transaction count in the global ``df`` from
    2020-09-09 onwards when VALID is set, otherwise from 2020-09-16 onwards.

    Parameters
    ----------
    customers_id : iterable
        Customer ids to generate candidates for (expected to be unique).
    n_candidates : int, optional
        Number of top articles assigned to each customer.

    Returns
    -------
    pandas.DataFrame
        One row per (customer_id, article_id) candidate.
    """
    window_start = pd.to_datetime('2020-09-09') if VALID else pd.to_datetime('2020-09-16')
    recent_sales = df[df['t_dat'] >= window_start].copy()

    # value_counts() is ordered by descending frequency, so the head is the top sellers.
    top_articles = list(recent_sales['article_id'].value_counts().index)[:n_candidates]
    prediction_dict = {cust_id: top_articles for cust_id in customers_id}

    negatives_df = pd.DataFrame({
        'customer_id': list(prediction_dict.keys()),
        'negatives': list(prediction_dict.values()),
    })
    # One row per candidate article instead of one list per customer.
    return negatives_df.explode('negatives').rename(columns = {'negatives': 'article_id'})

In [16]:
def popular_candidates_by_postal_code(customers_id, n_candidates = 12):
    """
    Candidates made of the best-selling recent articles in each customer's postal code.

    Popularity is counted on the global ``df`` from 2020-09-09 onwards when
    VALID is set, otherwise from 2020-09-16 onwards. Customers whose postal
    code is unknown, or has no sales in that window, fall back to the list
    of the postal code with the most transactions.

    Parameters
    ----------
    customers_id : iterable
        Customer ids to generate candidates for (expected to be unique).
    n_candidates : int, optional
        Maximum number of articles kept per postal code.

    Returns
    -------
    pandas.DataFrame
        One row per (customer_id, article_id) candidate.
    """
    prediction_dict = {}

    # Build the set of popular article_ids per postal_code.
    if VALID:
        tmp = df[(df['t_dat'] >= pd.to_datetime('2020-09-09'))].copy()
    else:
        tmp = df[df['t_dat'] >= pd.to_datetime('2020-09-16')].copy()

    tmp = pd.merge(tmp, customer_df[['customer_id', 'postal_code']], on ='customer_id', how="left")
    most_common_postal_code = tmp['postal_code'].value_counts().index[0]

    tmp = tmp.groupby(['postal_code'])['article_id'].value_counts().reset_index(name='count')
    # Descending count inside each postal code so head() keeps the top sellers.
    tmp = tmp.sort_values(["postal_code", "count"], ascending=False).reset_index(drop=True)
    tmp = tmp.groupby("postal_code").head(n_candidates)
    tmp = tmp.groupby("postal_code")["article_id"].apply(list).reset_index()
    tmp = dict(zip(tmp.postal_code, tmp.article_id))
    fallback_articles = tmp[most_common_postal_code]

    # Mapping from customer to postal_code.
    user_postal_dict = dict(zip(customer_df.customer_id, customer_df.postal_code))

    for cust_id in customers_id:
        # Explicit lookups instead of a bare `except:` so that only "unknown
        # customer" and "postal code without sales" trigger the fallback and
        # genuine errors are no longer swallowed.
        postal_code = user_postal_dict.get(cust_id)
        prediction_dict[cust_id] = tmp.get(postal_code, fallback_articles)

    k = list(prediction_dict.keys())
    v = list(prediction_dict.values())
    negatives_df = pd.DataFrame({'customer_id': k, 'negatives': v})
    negatives_df = (
          negatives_df
          .explode('negatives')
          .rename(columns = {'negatives': 'article_id'})
      )
    return negatives_df

In [17]:
def popular_candidates_by_sales_channel(customers_id, n_candidates = 12):
    """
    Candidates made of the best-selling recent articles of sales channels 1 and 2.

    Popularity is counted per ``sales_channel_id`` on the global ``df`` from
    2020-09-09 onwards when VALID is set, otherwise from 2020-09-16 onwards.
    Every customer receives the same list: the top articles of channel 1
    followed by the top articles of channel 2 (so up to ``2 * n_candidates``
    entries, possibly with duplicates across channels).

    Parameters
    ----------
    customers_id : iterable
        Customer ids to generate candidates for (expected to be unique).
    n_candidates : int, optional
        Maximum number of articles kept per sales channel.

    Returns
    -------
    pandas.DataFrame
        One row per (customer_id, article_id) candidate.
    """
    if VALID:
        tmp = df[(df['t_dat'] >= pd.to_datetime('2020-09-09'))].copy()
    else:
        tmp = df[df['t_dat'] >= pd.to_datetime('2020-09-16')].copy()

    # No customer attributes are needed here: the previous merge with
    # customer_df only added a postal_code column that was never read.
    tmp = tmp.groupby(['sales_channel_id'])['article_id'].value_counts().reset_index(name='count')
    # Descending count inside each channel so head() keeps the top sellers.
    tmp = tmp.sort_values(['sales_channel_id', "count"], ascending=False).reset_index(drop=True)
    tmp = tmp.groupby('sales_channel_id').head(n_candidates)
    tmp = tmp.groupby('sales_channel_id')["article_id"].apply(list).reset_index()
    tmp = dict(zip(tmp.sales_channel_id, tmp.article_id))

    # The list is identical for every customer, so build it once. `.get`
    # tolerates a window in which one of the two channels has no sales.
    channel_articles = tmp.get(1, []) + tmp.get(2, [])
    prediction_dict = {cust_id: channel_articles for cust_id in customers_id}

    k = list(prediction_dict.keys())
    v = list(prediction_dict.values())
    negatives_df = pd.DataFrame({'customer_id': k, 'negatives': v})
    negatives_df = (
          negatives_df
          .explode('negatives')
          .rename(columns = {'negatives': 'article_id'})
      )
    return negatives_df

In [18]:
def popular_candidates_by_gender(customers_id, n_candidates = 12):
    """
    Candidates made of the best-selling recent articles within each customer's
    inferred "gender" bucket.

    Each recent transaction is scored from the article's ``index_group_name``
    (Ladieswear = 1, Menswear = 0, everything else listed in ``mp`` = 0.5).
    A customer's mean score is snapped to 1 (>= 0.75), 0 (<= 0.25) or 0.5
    (in between); customers without a score get 0.5. Popularity is counted
    on the global ``df`` from 2020-09-09 onwards when VALID is set, otherwise
    from 2020-09-16 onwards.

    Parameters
    ----------
    customers_id : iterable
        Customer ids to generate candidates for; each must be present in
        ``customer_df``.
    n_candidates : int, optional
        Maximum number of articles kept per bucket.

    Returns
    -------
    pandas.DataFrame
        One row per (customer_id, article_id) candidate.

    Raises
    ------
    KeyError
        If a customer is missing from ``customer_df`` or their bucket has no
        sales in the window.
    """
    prediction_dict = {}

    if VALID:
        tmp = df[(df['t_dat'] >= pd.to_datetime('2020-09-09'))].copy()
    else:
        tmp = df[df['t_dat'] >= pd.to_datetime('2020-09-16')].copy()

    tmp = pd.merge(tmp, article_df[["article_id", "index_group_name"]], on='article_id', how='left')
    mp = {'Ladieswear':1, 'Baby/Children':0.5, 'Menswear':0, 'Sport':0.5, 'Divided':0.5}
    tmp["gender"] = tmp["index_group_name"].map(mp)
    user_g = tmp.groupby('customer_id').gender.mean().reset_index()
    # Drop the per-transaction score; the per-customer bucket is merged back below.
    tmp = tmp.drop("gender", axis=1)
    user_g.loc[user_g.gender>=0.75, "gender"] = 1
    user_g.loc[user_g.gender<=0.25, "gender"] = 0
    user_g.loc[(user_g.gender<0.75) & (user_g.gender>0.25), "gender"] = 0.5

    tmp = pd.merge(tmp, user_g, on ='customer_id', how="left")
    all_cust_gender = pd.merge(customer_df[["customer_id"]], user_g, on="customer_id", how="left")
    # Assign the filled column back: `col.fillna(..., inplace=True)` on a selected
    # column is chained assignment and leaves the frame untouched under Copy-on-Write.
    all_cust_gender["gender"] = all_cust_gender["gender"].fillna(0.5)
    all_cust_gender = dict(zip(all_cust_gender.customer_id, all_cust_gender.gender))

    tmp = tmp.groupby(['gender'])['article_id'].value_counts().reset_index(name='count')
    # Descending count inside each bucket so head() keeps the top sellers.
    tmp = tmp.sort_values(['gender', "count"], ascending=False).reset_index(drop=True)
    tmp = tmp.groupby('gender').head(n_candidates)
    tmp = tmp.groupby('gender')["article_id"].apply(list).reset_index()
    tmp = dict(zip(tmp.gender, tmp.article_id))

    for cust_id in customers_id:
        prediction_dict[cust_id] = tmp[all_cust_gender[cust_id]]

    k = list(prediction_dict.keys())
    v = list(prediction_dict.values())
    negatives_df = pd.DataFrame({'customer_id': k, 'negatives': v})
    negatives_df = (
          negatives_df
          .explode('negatives')
          .rename(columns = {'negatives': 'article_id'})
      )
    return negatives_df

In [19]:
def popular_candidates_by_age(customers_id, n_candidates = 12):
    """
    Candidates made of the best-selling recent articles within each customer's
    age decade.

    Ages are bucketed with ``age // 10``; a missing age is replaced by -1
    first, which puts those customers in their own bucket (-1). Popularity is
    counted on the global ``df`` from 2020-09-09 onwards when VALID is set,
    otherwise from 2020-09-16 onwards.

    Parameters
    ----------
    customers_id : iterable
        Customer ids to generate candidates for; each must be present in
        ``customer_df``.
    n_candidates : int, optional
        Maximum number of articles kept per age bucket.

    Returns
    -------
    pandas.DataFrame
        One row per (customer_id, article_id) candidate.

    Raises
    ------
    KeyError
        If a customer is missing from ``customer_df`` or their age bucket has
        no sales in the window.
    """
    prediction_dict = {}

    if VALID:
        tmp = df[(df['t_dat'] >= pd.to_datetime('2020-09-09'))].copy()
    else:
        tmp = df[df['t_dat'] >= pd.to_datetime('2020-09-16')].copy()

    age_df = customer_df[["customer_id", "age"]].copy()
    # Assign the filled column back: `col.fillna(..., inplace=True)` on a selected
    # column is chained assignment and leaves the frame untouched under Copy-on-Write.
    age_df["age"] = age_df["age"].fillna(-1) // 10

    tmp = pd.merge(tmp, age_df, on='customer_id', how='left')
    all_cust_age = dict(zip(age_df.customer_id, age_df.age))

    tmp = tmp.groupby(['age'])['article_id'].value_counts().reset_index(name='count')
    # Descending count inside each bucket so head() keeps the top sellers.
    tmp = tmp.sort_values(['age', "count"], ascending=False).reset_index(drop=True)
    tmp = tmp.groupby('age').head(n_candidates)
    tmp = tmp.groupby('age')["article_id"].apply(list).reset_index()
    tmp = dict(zip(tmp.age, tmp.article_id))

    for cust_id in customers_id:
        prediction_dict[cust_id] = tmp[all_cust_age[cust_id]]

    k = list(prediction_dict.keys())
    v = list(prediction_dict.values())
    negatives_df = pd.DataFrame({'customer_id': k, 'negatives': v})
    negatives_df = (
          negatives_df
          .explode('negatives')
          .rename(columns = {'negatives': 'article_id'})
      )
    return negatives_df

In [20]:
def similar_candidates(customers_id, n_candidates = 12):
    """
    Candidates obtained by mapping articles through ``article2vec_idx``.

    For customers present in the ``user_last_dict`` pickle, their first
    ``n_candidates`` articles are each replaced by the article that
    ``article2vec_idx`` maps them to. All other customers get the mapped
    versions of the globally best-selling recent articles.

    Parameters
    ----------
    customers_id : iterable
        Customer ids to generate candidates for (expected to be unique).
    n_candidates : int, optional
        Maximum number of source articles mapped per customer.

    Returns
    -------
    pandas.DataFrame
        One row per (customer_id, article_id) candidate.

    Notes
    -----
    ``article2vec_idx`` is indexed by the row position of an article in
    ``article_df`` and yields another row position; presumably the nearest
    neighbour in an embedding space -- TODO confirm against the code that
    built the pickle. ``pickle.load`` must only be used on trusted files.
    """
    # Two-way mapping between article_id and its row position in article_df.
    article_index = article_df[["article_id"]].reset_index()
    ind2art = dict(zip(article_index.index, article_index.article_id))
    art2ind = dict(zip(article_index.article_id, article_index.index))
    del article_index

    if VALID:
        pickle_path, window_start = 'user_last_dict_valid.pickle', pd.to_datetime('2020-09-09')
    else:
        pickle_path, window_start = 'user_last_dict.pickle', pd.to_datetime('2020-09-16')

    with open(pickle_path, mode='rb') as f:
        user_last_dict = pickle.load(f)
    recent_sales = df[df['t_dat'] >= window_start].copy()

    with open('article2vec_idx.pickle', mode='rb') as f:
        article2vec_idx = pickle.load(f)

    def to_similar(article_ids):
        # article_id -> row position -> mapped row position -> article_id
        return [ind2art[article2vec_idx[art2ind[art_id]]] for art_id in article_ids]

    top_articles = list(recent_sales['article_id'].value_counts().index)[:n_candidates]
    fallback_articles = to_similar(top_articles)

    prediction_dict = {}
    for cust_id in customers_id:
        if cust_id not in user_last_dict:
            prediction_dict[cust_id] = fallback_articles
            continue
        prediction_dict[cust_id] = to_similar(user_last_dict[cust_id][:n_candidates])

    negatives_df = pd.DataFrame({
        'customer_id': list(prediction_dict.keys()),
        'negatives': list(prediction_dict.values()),
    })
    # One row per candidate article instead of one list per customer.
    return negatives_df.explode('negatives').rename(columns = {'negatives': 'article_id'})

In [21]:
def prepare_candidates(user_ids, n_candidates = 12):
    """
    Build the candidate (customer_id, article_id) table for the given users.

    Only the global most-popular generator is active. The other generators
    (repurchase, postal code, sales channel, gender, age, similar articles)
    are defined in this notebook but are not wired in here; to combine
    several sources, concatenate their frames with ``pd.concat(..., axis=0)``
    before the de-duplication step.

    Parameters
    ----------
    user_ids : iterable
        Customer ids to generate candidates for.
    n_candidates : int, optional
        Number of candidates requested from each generator.

    Returns
    -------
    pandas.DataFrame
        De-duplicated candidates with a fresh index. Also prints the share
        of rows that survived de-duplication.
    """
    all_cands = most_popular_candidates(user_ids, n_candidates)
    rows_before = len(all_cands)

    all_cands = (
        all_cands
        .drop_duplicates(subset=['customer_id', 'article_id'])
        .reset_index(drop=True)
    )
    rows_after = len(all_cands)

    print("ratio: ", rows_after / rows_before)

    return all_cands

In [22]:
# make candidates
# Training candidates are generated only for customers that appear in the training window.
train_users = df['customer_id'].unique()
data = prepare_candidates(train_users, ELEMENTS)

ratio:  1.0


In [23]:
# Sanity check: number of candidate rows generated per customer.
print(data["customer_id"].value_counts())

0011e0bd4c39195ff342c0ca0ac0601ce2b943a826550674298a5ba49d9f8e70    100
1efc1b074f84469b30732f9266b7d9d5f6a1d1be414e39f58648a6d97b7aa3bb    100
1f3c951c93f03c1a07267720a8343c6848b8c36703b7f1eebb9fc656867619cf    100
1f1d1633cc47e03568e17246972fa49ebf1198b673d8daa9ef5fefbb9b38c1ba    100
1f1595007d369c12d0038db5ee65da9b3bd82dc5358490abcb490c292f82b9a6    100
                                                                   ... 
d725cf9861007f14048ed898bd37695db661d9bfa8b37564ce50a2bbf07ee1e1    100
d712162e45a05d1487729b758fb0e7c125792ff2d4b62d3ac5795581e00418f2    100
d6fd8c11811aa9ec8ca785ffba095e1dde7b6188a91cf59514b40b0fa9ab6c92    100
d6f97a9acb97889011e42d3faca5e6ad6fb5075cb2bf5eb816b89948328c5811    100
fff5bd112051feb2367276df143f79bc69126814c73e21bb9d3a58f43a0c5f58    100
Name: customer_id, Length: 312215, dtype: int64


In [24]:
# Candidate recall on the held-out week: share (in %) of hold-out purchases that
# appear among the generated candidates, and how many distinct articles were hit.
# `valid` only exists in validation mode, so skip the check otherwise instead of
# failing with a NameError.
if VALID:
    prev_length = valid.shape[0]

    tmp_df = pd.merge(valid, data, on = ["customer_id", "article_id"], how="inner")
    after_length = tmp_df.shape[0]
    print((tmp_df["article_id"].nunique(), after_length * 100 / prev_length))

(100, 6.297256471821931)

# add label

In [25]:
# Label each generated candidate with 1 if the customer bought that article
# anywhere in df (the training window), otherwise 0.
positive = (
    df[["customer_id", "article_id"]]
    .drop_duplicates(["customer_id", "article_id"])
    .assign(purchased=1)
)
print(positive.shape)
positive_length = positive.shape[0]

data = pd.merge(data, positive, on = ["customer_id", "article_id"], how="left")
del positive
# Candidates without a matching purchase come out of the left merge as NaN.
# Assign the filled column back: `col.fillna(..., inplace=True)` on a selected column
# is chained assignment and leaves the frame untouched under Copy-on-Write.
data["purchased"] = data["purchased"].fillna(0)

# Second number: share of distinct purchased pairs that the candidates cover.
print(data.shape, data[data.purchased==1].shape[0] / positive_length)

# Rows of one customer must be contiguous: the ranker's group sizes are derived
# from a groupby on customer_id later on.
data = data.sort_values(['customer_id', 'article_id']).reset_index(drop=True)

(1426468, 3)
(31221500, 3) 0.06475013810334336


In [26]:
# negative down sampling
# Keep every positive row and a random half of each customer's negative rows.
seed = 42
is_positive = data['purchased'] == 1
pos_index = data[is_positive].index.tolist()
neg_index = (
    data[data['purchased'] == 0]
    .groupby("customer_id")
    .sample(frac = 0.5, random_state=seed)
    .index
    .tolist()
)
# Sorting the kept labels restores the original row order (customer_id, article_id).
all_index = sorted(pos_index + neg_index)
data = data.loc[all_index].reset_index(drop=True)
del all_index, neg_index, pos_index, is_positive

In [27]:
# Overall size / positive rate, then the highest and lowest per-customer positive rate.
print(data.shape, data.purchased.mean(), list(data.columns))
purchase_rate_by_customer = data.groupby("customer_id")["purchased"].mean()
print(
      purchase_rate_by_customer.max(),
      purchase_rate_by_customer.min(),
    )

(15679230, 3) 0.005890850507327209 ['customer_id', 'article_id', 'purchased']
0.21428571428571427 0.0


# add features

In [28]:
# Pre-computed per-article count tables; the *_valid files are used in validation
# mode and the *_full files otherwise.
sales_channel_count = pd.read_csv(
    "sales_channel_count_valid.csv" if VALID else "sales_channel_count_full.csv"
)
item_count = pd.read_csv(
    "item_count_valid.csv" if VALID else "item_count_full.csv"
)
    
def encoding(df, categoricals):
    """
    Label-encode the given columns of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten with integer codes.
    categoricals : iterable of str
        Names of the columns to encode; each gets its own LabelEncoder.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in (mutated).
    """
    for column in categoricals:
        encoder = preprocessing.LabelEncoder()
        df[column] = encoder.fit_transform(list(df[column]))

    return df

In [29]:
# Replace postal_code in customer_df with integer codes so it can be used as a
# numeric model feature. encoding() mutates customer_df in place, so from here
# on customer_df.postal_code holds codes rather than the raw values.
categoricals = ["postal_code"]
customer_df = encoding(customer_df, categoricals)

In [30]:
def make_features(tmp):
    """
    Attach article, count and customer features to a candidate frame.

    Parameters
    ----------
    tmp : pandas.DataFrame
        Candidates with at least 'customer_id' and 'article_id'. Its
        'article_id' column is cast to int in place before merging.

    Returns
    -------
    pandas.DataFrame
        ``tmp`` left-joined with the integer article attributes, the
        ``item_count`` and ``sales_channel_count`` tables (all on
        article_id), and the customer's age and postal_code (on customer_id).
    """
    tmp["article_id"] = tmp["article_id"].astype(int)

    feature_tables = (
        (article_df[["article_id"] + article_columns], 'article_id'),
        (item_count, 'article_id'),
        (sales_channel_count, 'article_id'),
        (customer_df[["customer_id", "age", "postal_code"]], 'customer_id'),
    )
    # Left joins keep every candidate row and preserve their order.
    for table, key in feature_tables:
        tmp = tmp.merge(table, on=key, how='left')
    return tmp

In [31]:
# Attach the model features to the labelled training candidates and list the resulting columns.
data = make_features(data) 
print(data.columns)

Index(['customer_id', 'article_id', 'purchased', 'product_code',
       'product_type_no', 'graphical_appearance_no', 'colour_group_code',
       'perceived_colour_value_id', 'perceived_colour_master_id',
       'department_no', 'index_group_no', 'section_no', 'garment_group_no',
       'item_count', 'count_in1', 'count_in2', 'channel_ratio', 'age',
       'postal_code'],
      dtype='object')


In [32]:
# Model inputs: count / ratio features, customer attributes, then the integer article attributes.
columns_to_use = ["count_in1", "count_in2", "channel_ratio", "item_count", "age", "postal_code"]
columns_to_use = columns_to_use + article_columns

# Group sizes for the ranker: number of candidate rows per customer, in sorted
# customer_id order (data must be ordered by customer_id for these to line up).
train_baskets = data.groupby('customer_id')['article_id'].count().to_numpy()

# model training

In [33]:
# Fill the float32 design matrix one column at a time and drop each source column
# right after copying it, so the frame and the matrix never fully coexist in memory.
X_train = np.empty((len(data), len(columns_to_use)), dtype=np.float32)

for column_pos, column_name in enumerate(columns_to_use):
    X_train[:, column_pos] = data[column_name].astype(np.float32)
    del data[column_name]

y_train = data["purchased"].astype(np.float32)

In [34]:
# LambdaRank model scoring candidates within each customer's group.
ranker = LGBMRanker(
    # learning task
    objective="lambdarank",
    metric="ndcg",
    # boosting setup
    boosting_type="dart",
    n_estimators=30,
    learning_rate=0.02,
    # reporting / reproducibility
    importance_type='gain',
    verbose=10,
    random_state=0,
)

In [35]:
# `group` holds the number of consecutive rows that belong to each customer.
ranker.fit(X_train, y_train, group=train_baskets)

LGBMRanker(boosting_type='dart', importance_type='gain', learning_rate=0.02,
           metric='ndcg', n_estimators=30, objective='lambdarank',
           random_state=0, verbose=10)

In [36]:
# feature importance
# Share of total importance per feature, most important first.
total_importance = ranker.feature_importances_.sum()
for feature_pos in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[feature_pos], ranker.feature_importances_[feature_pos] / total_importance)

item_count 0.6883961269036631
channel_ratio 0.1043701512718029
count_in1 0.049304772854364194
age 0.041779669790377526
count_in2 0.036957395183681865
product_code 0.035341590819253
department_no 0.017684134050500382
colour_group_code 0.00769238696733875
garment_group_no 0.006773255273778621
perceived_colour_master_id 0.0037436615357423117
product_type_no 0.0035830097181401947
section_no 0.0019224062830674098
graphical_appearance_no 0.0019216608011139974
perceived_colour_value_id 0.0005297785471757314
index_group_no 0.0
postal_code 0.0


# candidate selection

In [37]:
# Score every known customer, in batches, collecting customer_id -> ranked article list.
pred_users = customer_df.customer_id.unique()

final_outputs = {}
batch_size = 100_000
user_nums = len(pred_users)

In [38]:
for bucket in tqdm(range(0, user_nums, batch_size)):
    # Generate and featurise the candidates of one batch of customers.
    candidates = prepare_candidates(pred_users[bucket: bucket+batch_size], ELEMENTS)
    # Not read by make_features or columns_to_use in this notebook; kept as-is.
    candidates["week_no"] = test_week + 1
    candidates = make_features(candidates)

    # Column-wise fill keeps peak memory low (no intermediate float64 copy of all features).
    X_test = np.empty((len(candidates), len(columns_to_use)), dtype=np.float32)

    for idx, feature in enumerate(columns_to_use):
        X_test[:,idx] = candidates[feature].astype(np.float32)

    outputs = ranker.predict(X_test)
    candidates["preds"] = outputs

    # Best-scored articles first within each customer. sort_values returns a new frame,
    # which avoids sorting a column-subset selection in place (SettingWithCopy).
    candidates = (
        candidates[["customer_id", "article_id", "preds"]]
        .sort_values(['customer_id', 'preds'], ascending=False)
    )

    candidates = candidates.groupby('customer_id').head(100).reset_index(drop=True)
    candidates = candidates.groupby('customer_id')['article_id'].apply(list).reset_index()
    candidates = dict(zip(candidates.customer_id, candidates.article_id))

    # Update in place: rebuilding the dict with {**a, **b} re-copied every
    # previously stored customer on each batch.
    final_outputs.update(candidates)

  0%|          | 0/14 [00:00<?, ?it/s]

ratio:  1.0


  7%|▋         | 1/14 [00:40<08:49, 40.70s/it]

ratio:  1.0


 14%|█▍        | 2/14 [01:20<08:04, 40.40s/it]

ratio:  1.0


 21%|██▏       | 3/14 [02:00<07:19, 39.99s/it]

ratio:  1.0


 29%|██▊       | 4/14 [02:42<06:48, 40.86s/it]

ratio:  1.0


 36%|███▌      | 5/14 [03:21<06:02, 40.22s/it]

ratio:  1.0


 43%|████▎     | 6/14 [04:00<05:17, 39.68s/it]

ratio:  1.0


 50%|█████     | 7/14 [04:39<04:36, 39.53s/it]

ratio:  1.0


 57%|█████▋    | 8/14 [05:16<03:52, 38.83s/it]

ratio:  1.0


 64%|██████▍   | 9/14 [05:54<03:11, 38.40s/it]

ratio:  1.0


 71%|███████▏  | 10/14 [06:35<02:36, 39.22s/it]

ratio:  1.0


 79%|███████▊  | 11/14 [07:15<01:58, 39.51s/it]

ratio:  1.0


 86%|████████▌ | 12/14 [07:54<01:18, 39.36s/it]

ratio:  1.0


 93%|█████████▎| 13/14 [08:34<00:39, 39.60s/it]

ratio:  1.0


100%|██████████| 14/14 [09:04<00:00, 38.93s/it]


In [44]:
# recall check
# https://stackoverflow.com/questions/39011511/pandas-expand-rows-from-list-data-available-in-column
def recall_check(input_df, flg=False):
    """
    Measure how well ``final_outputs`` covers the purchases in ``input_df``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Transactions with 'customer_id' and 'article_id' columns.
    flg : bool, optional
        When True, also print MAP@12 of the ranked lists against the
        purchases in ``input_df`` and the frequency of each distinct top-12
        prediction string.

    Returns
    -------
    tuple
        ``(n_distinct_articles_hit, recall_percent)`` where recall is the
        share of ``input_df`` rows whose (customer, article) pair appears in
        that customer's list in ``final_outputs``; 0.0 when ``input_df`` is
        empty.
    """
    keys = set(input_df.customer_id)
    cons_dict = {cust_id: arts for cust_id, arts in final_outputs.items() if cust_id in keys}
    cons_df = pd.DataFrame(list(cons_dict.items()), columns=['customer_id', 'article_id'])
    # One row per predicted article instead of one list per customer.
    cons_df = cons_df.explode('article_id').reset_index(drop=True)
    prev_length = input_df.shape[0]

    tmp_df = pd.merge(input_df, cons_df, on = ["customer_id", "article_id"], how="inner")
    after_length = tmp_df.shape[0]

    if flg:
        # Purchases and predictions as space-separated, '0'-prefixed article id strings.
        valid_check = input_df.groupby('customer_id')['article_id'].apply(list).reset_index()
        valid_check = valid_check.rename({'article_id':'prediction'},axis=1)
        valid_check['prediction'] = valid_check.prediction.apply(lambda x: ' '.join(['0'+str(k) for k in x]))

        cons_df = cons_df.groupby('customer_id')['article_id'].apply(list).reset_index()
        cons_df = cons_df.rename({'article_id':'prediction'},axis=1)
        cons_df['prediction'] = cons_df.prediction.apply(lambda x: ' '.join(['0'+str(k) for k in x]))

        # Align predictions with the customers (and order) of valid_check.
        sub_check = cons_df.set_index('customer_id').loc[valid_check.customer_id].reset_index()
        print("t", mapk(valid_check.prediction.str.split(), sub_check.prediction.str.split(), k=12))

        # Distribution of the top-12 lists being scored. This used to read `preds`,
        # a name that is not defined anywhere in the notebook (NameError on a fresh kernel).
        top12 = sub_check.prediction.str.split().str[:12].str.join(' ')
        print(top12.value_counts())

    # Guard against an empty input frame instead of dividing by zero.
    recall_percent = after_length * 100 / prev_length if prev_length else 0.0
    return tmp_df["article_id"].nunique(), recall_percent

# `valid` (the held-out week) only exists in validation mode.
if VALID:
    print("last", recall_check(valid, True))
# Recall against the two most recent weeks before test_week, for comparison.
print("last-1", recall_check(df[df.week_no == test_week-1]))
print("last-2", recall_check(df[df.week_no == test_week-2]))

t 0.006123625795922799
0918292001 0915526001 0751471001 0448509014 0706016001 0372860002 0866731001 0685814003 0685814001 0372860001 0706016003 0715624001    10224
0918292001 0915526001 0751471001 0706016001 0448509014 0896152002 0863595006 0866731001 0372860002 0372860001 0706016003 0751471043     8691
0751471001 0918292001 0706016001 0751471043 0915526001 0863595006 0896152002 0915529003 0448509014 0372860002 0866731001 0372860001     6719
0751471043 0751471001 0918292001 0863595006 0706016001 0896152002 0915526001 0915529003 0448509014 0372860002 0866731001 0372860001     6048
0918292001 0915526001 0751471001 0448509014 0706016001 0896152002 0863595006 0866731001 0372860002 0372860001 0706016003 0751471043     5556
0896152002 0896169002 0751471001 0751471043 0863595006 0706016001 0918292001 0915526001 0783346001 0915529003 0865929003 0850917001     5460
0918292001 0751471001 0915526001 0706016001 0751471043 0863595006 0896152002 0915529003 0448509014 0372860002 0866731001 0372860001

(2303, 7.38584584143048)
(18109, 76.77841726054983)
(17113, 57.64744556161371)

In [None]:
# Persist the ranked candidate lists; validation and full runs go to separate files.
output_path = 'final_outputs_valid.pickle' if VALID else 'final_outputs.pickle'
with open(output_path, mode='wb') as f:
    pickle.dump(final_outputs, f)