- https://making.lyst.com/lightfm/docs/lightfm.html
- fix the mistake in the validation labels
- take item and user features into account

In [1]:
import os
import tqdm

import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from sklearn import preprocessing

import itertools
from multiprocessing import Pool

# One seed feeds every source of randomness configured in this notebook.
SEED = 42
# NOTE(review): CPython reads PYTHONHASHSEED at interpreter start-up, so this
# assignment presumably only affects processes launched afterwards — confirm.
os.environ['PYTHONHASHSEED'] = f'{SEED}'
np.random.seed(SEED)

# True: the most recent week is held out and scored locally.
# False: every transaction is used for training.
VALID = True

In [2]:
def apk(actual, predicted, k=12):
    """Return the average precision at ``k`` for a single query.

    Parameters
    ----------
    actual : list
        Relevant items (order is irrelevant).
    predicted : list
        Ranked predictions, best first; only the first ``k`` are scored.
    k : int, optional
        Cut-off rank.

    Returns
    -------
    float
        Sum of precision-at-rank over the ranks that hit a relevant item,
        divided by ``min(len(actual), k)``; ``0.0`` when ``actual`` is empty.
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0
    for rank, item in enumerate(top_k, start=1):
        # Skip misses, and repeats of an item that was already ranked higher.
        if item not in actual or item in top_k[:rank - 1]:
            continue
        hits += 1.0
        precision_sum += hits / rank

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)


def mapk(actual, predicted, k=12):
    """
    Compute the mean average precision at k.

    Pairs up the two inputs element by element, scores each pair with
    ``apk`` and averages the scores.

    Parameters
    ----------
    actual : list
             A list of lists of relevant elements
             (order doesn't matter in the inner lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the inner lists)
    k : int, optional
        The maximum number of predicted elements scored per pair

    Returns
    -------
    score : double
            The mean of the per-pair average precision at k
    """
    per_query_scores = []
    for truth, ranked in zip(actual, predicted):
        per_query_scores.append(apk(truth, ranked, k))
    return np.mean(per_query_scores)

## Configs

In [3]:
# Number of recommendations produced per customer.
K = 12

# LightFM hyper-parameters.
NO_COMPONENTS = 50
LEARNING_RATE = 0.1
EPOCHS = 100
ITEM_ALPHA = 1e-6
USER_ALPHA = 1e-6
# Author's note: 1e-5 seems to work better for cold users.

# Parallelism: threads used while fitting, worker processes used while predicting.
NO_THREADS = 32
num_cores = 4

## Load the data

In [4]:
# Root of the competition input data; every file below lives under it.
main_dir = "../input/h-and-m-personalized-fashion-recommendations"
images_dir = f"{main_dir}/images/"

# article_id is forced to str in every file that is given this dtype mapping.
article_id_dtype = {'article_id': str}

customers = pd.read_csv(f"{main_dir}/customers.csv")
articles = pd.read_csv(f"{main_dir}/articles.csv", dtype=article_id_dtype)
sample_submission = pd.read_csv(
    f"{main_dir}/sample_submission.csv",
    dtype=article_id_dtype,
)
# Transaction dates are parsed up front so week arithmetic can be done on them.
train = pd.read_csv(
    f"{main_dir}/transactions_train.csv",
    dtype=article_id_dtype,
    parse_dates=['t_dat'],
)

# dataset

In [5]:
# item
# Integer-typed article columns are the item features; everything else except
# the id is dropped from the frame.
article_columns = [
    name for name, dtype in articles.dtypes.items() if "int" in str(dtype)
]
articles = articles[['article_id', *article_columns]]

# Disabled experiment: extra image-derived features.
#image_feats = pd.read_csv("../input/hm-table-dataset/autox_features.csv")
#articles = pd.concat([articles, image_feats], axis=1)
#articles.fillna(-999, inplace=True)
#del image_feats
#article_columns = [i for i in articles.columns if "int" in str(articles[i].dtype) or "float" in str(articles[i].dtype)]
article_columns

['product_code',
 'product_type_no',
 'graphical_appearance_no',
 'colour_group_code',
 'perceived_colour_value_id',
 'perceived_colour_master_id',
 'department_no',
 'index_group_no',
 'section_no',
 'garment_group_no']

In [6]:
# user
# Customer columns used as user features.
user_columns = ['FN', 'Active', 'club_member_status', 'fashion_news_frequency', 'postal_code']

# Missing values get the sentinel -1 before encoding.
customers.fillna(-1, inplace=True)

# Replace each feature column by integer codes from its own LabelEncoder.
for column in user_columns:
    lbl = preprocessing.LabelEncoder()
    customers[column] = lbl.fit_transform(list(customers[column]))
#user_columns += ['age']
user_columns

['FN', 'Active', 'club_member_status', 'fashion_news_frequency', 'postal_code']

In [7]:
# Register every customer, article and feature name with the LightFM Dataset.
dataset = Dataset()
dataset.fit(
    users=customers['customer_id'],
    items=articles['article_id'],
    user_features=user_columns,
    item_features=article_columns,
)

num_users, num_topics = dataset.interactions_shape()
print(f'Number of users: {num_users}, Number of topics: {num_topics}.')

# Get the mappings:
#   uid_map: customer_id -> model-internal user id
#   iid_map: article_id  -> model-internal item id
uid_map, ufeature_map, iid_map, ifeature_map = dataset.mapping()

# Reverse lookups, used to turn model output back into external ids.
inv_uid_map = dict(zip(uid_map.values(), uid_map.keys()))
inv_iid_map = dict(zip(iid_map.values(), iid_map.keys()))

Number of users: 1371980, Number of topics: 105542.


In [8]:
train["t_dat"] = pd.to_datetime(train["t_dat"])

# One row per distinct purchase date. The explicit sort makes the numbering
# below independent of the row order of `train`.
date_week_df = (
    train[["t_dat"]]
    .drop_duplicates("t_dat")
    .sort_values("t_dat")
    .reset_index(drop=True)
)

# ISO weeks start on Monday; shifting every date forward by 5 days makes the
# buckets run Wednesday..Tuesday instead. `isocalendar().week` replaces the
# deprecated `Series.dt.week` accessor.
# Assumes t_dat has no missing values (the int64 cast would fail otherwise).
iso_week = (
    (date_week_df["t_dat"] + pd.DateOffset(days=5))
    .dt.isocalendar()
    .week.astype("int64")
)

# A new bucket starts wherever the ISO week differs from the previous date.
# The first row always compares unequal to the shifted-in NaN, hence the -1
# so that numbering starts at 0.
date_week_df["week_no"] = iso_week.ne(iso_week.shift()).cumsum() - 1

train = pd.merge(train, date_week_df, on="t_dat", how="left")
# The most recent bucket is the one held out for validation.
test_week = train.week_no.max()

del date_week_df

  after removing the cwd from sys.path.


In [9]:
#item_last_week = train.groupby("article_id")["week_no"].max().reset_index()
#weak_weight_items = item_last_week[item_last_week.week_no < 50]["article_id"].values
#weak_weight_id = [iid_map[ele] for ele in weak_weight_items]
#del item_last_week

In [10]:
if VALID:
    # Train on everything before the last week; the last week is the hold-out.
    train_set = train[train.week_no < test_week]
    # Columns are selected by name rather than by position so a change in the
    # CSV column order cannot silently feed the wrong ids.
    (interactions, weights) = dataset.build_interactions(
        train_set[["customer_id", "article_id"]].values
    )

    # Ground truth: one row per customer with their last-week purchases joined
    # into a space-separated string (the column is named 'prediction' to mirror
    # the submission format).
    valid = train[train.week_no == test_week].groupby('customer_id')['article_id'].apply(list).reset_index()
    valid = valid.rename({'article_id': 'prediction'}, axis=1)
    valid['prediction'] = valid.prediction.apply(lambda ids: ' '.join(map(str, ids)))

    # Inactive: seen in training, but not within the 9 weeks before the hold-out.
    recent_buyers = set(train_set[train_set.week_no > test_week - 10]["customer_id"])
    inactive_users = list(set(train_set.customer_id) - recent_buyers)
    # Cold: appear in the hold-out week but never in training.
    cold_users = list(set(valid.customer_id) - set(train_set.customer_id))
    del train_set, recent_buyers
else:
    (interactions, weights) = dataset.build_interactions(
        train[["customer_id", "article_id"]].values
    )

# The feature matrices are now kept; previously the return values were dropped,
# so the builds were wasted work. The ids are iterated directly instead of
# materialising one row Series per `.iloc[index]` lookup.
# NOTE(review): every entity is tagged with the same list of feature *names*
# (column names, not per-row values), and the fit call visible in this file
# does not pass these matrices to the model — confirm the intended use.
item_features = dataset.build_item_features(
    (article_id, article_columns) for article_id in articles["article_id"]
)
user_features = dataset.build_user_features(
    (customer_id, user_columns) for customer_id in customers["customer_id"]
)

print(interactions.shape)
del train

(1371980, 105542)


In [11]:
#change_index = np.isin(weights.col, weak_weight_id)
#weights.data[change_index] = 0.9

# fitting

In [12]:
# LightFM model with loss='bpr'; embedding size, learning rate and the
# user/item alpha penalties come from the config constants, and the random
# state is seeded with SEED.
model = LightFM(loss='bpr', 
                no_components=NO_COMPONENTS, 
                learning_rate=LEARNING_RATE,                 
                random_state=np.random.RandomState(SEED),
                user_alpha=USER_ALPHA,
                item_alpha=ITEM_ALPHA
               )

# NOTE(review): only the interaction matrix is passed here — no user_features
# or item_features — so the side features registered on the Dataset do not
# take part in this fit; confirm whether that is intended.
# The interaction weights are likewise unused (sample_weight is commented out).
# fit() is the cell's last expression, so the notebook echoes its return value.
model.fit(interactions=interactions, 
          epochs=EPOCHS, 
          verbose=1, 
          num_threads=NO_THREADS,
          #sample_weight=weights,
         )

Epoch: 100%|██████████| 100/100 [1:14:25<00:00, 44.66s/it]


<lightfm.lightfm.LightFM at 0x7f58fd2c72d0>

In [13]:
#!pip3 install pickle5
#import pickle5 as pickle
#with open('../input/lightfm1/lightFM1.pickle', "rb") as fh:
#    trained_model = pickle.load(fh)

# Submission

In [14]:
# Customers to score: the hold-out customers when validating, otherwise
# everyone listed in the sample submission.
test_X = (valid if VALID else sample_submission).customer_id.values


def lfn_user(x):
    """Translate an external customer_id into the model-internal user id."""
    return uid_map[x]


test_X_m = list(map(lfn_user, test_X))

print(len(test_X_m))

68984


In [15]:
def predict_submission():
    """Score every user in ``test_X_m`` in parallel and return the predictions.

    The internal user ids in the module-level ``test_X_m`` are split into
    ``num_cores`` nearly equal chunks, each chunk is handed to
    ``make_predict`` in a worker process, and the per-chunk results are
    concatenated in the original user order.

    Returns
    -------
    list
        One entry per user, each a length-1 array holding the
        space-separated recommendation string (a row of the ``(n, 1)``
        arrays returned by ``make_predict``).
    """
    # One chunk per worker. np.array_split copes with lengths that are not a
    # multiple of num_cores, so this works for any num_cores >= 1 rather than
    # only the previously hard-coded 4.
    user_chunks = [list(chunk) for chunk in np.array_split(test_X_m, num_cores)]

    # The context manager shuts the pool down once map() has returned, so no
    # worker processes are left behind.
    with Pool(num_cores) as pool:
        per_chunk = pool.map(make_predict, user_chunks)

    # pool.map preserves input order, so chaining keeps users aligned with test_X_m.
    return list(itertools.chain.from_iterable(per_chunk))

def make_predict(usr_chunk):
    """Return the top-``K`` recommendation string for each user in a chunk.

    Parameters
    ----------
    usr_chunk : iterable of int
        Model-internal user ids.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(usr_chunk), 1)``; each row holds the ``K``
        highest-scoring article ids joined by single spaces, best first.
    """
    # Internal ids of every item; each user is scored against all of them.
    item_array = np.array(list(iid_map.values()))
    n_items = len(item_array)

    preds = []
    for usr_ in usr_chunk:
        # predict() takes parallel user/item arrays, so repeat the user id
        # once per item (np.full avoids building a large Python list first).
        scores = model.predict(np.full(n_items, usr_), item_array)
        # argsort yields positions into item_array; negate for descending order.
        top_positions = np.argsort(-scores)[:K]
        # Translate position -> internal item id -> article_id explicitly,
        # instead of assuming positions and internal ids coincide.
        preds.append(
            ' '.join(inv_iid_map[item_array[pos]] for pos in top_positions).strip()
        )

    return np.array(preds).reshape(-1, 1)

In [16]:
%%time
# Runs make_predict over every id in test_X_m and collects one recommendation string per user.
final_predictions = predict_submission()

CPU times: user 969 ms, sys: 646 ms, total: 1.62 s
Wall time: 10min 57s


In [17]:
# One row per scored customer: external id plus its recommendation string.
# final_predictions is in the same order as test_X.
final_sub = pd.DataFrame({
    'customer_id': np.asarray(test_X).reshape(-1),
    'prediction': np.asarray(final_predictions).reshape(-1),
})

# Write the predictions into the submission frame by customer_id.
# Assigning `final_sub['prediction']` through `.loc[mask, ...]` aligns on the
# row index instead, which attaches predictions to the wrong customers (or
# NaN) whenever final_sub is not row-for-row aligned with sample_submission.
prediction_by_customer = (
    final_sub.drop_duplicates('customer_id')
    .set_index('customer_id')['prediction']
)
scored = sample_submission['customer_id'].map(prediction_by_customer)
# Customers that were not scored keep whatever the sample submission held.
sample_submission['prediction'] = scored.fillna(sample_submission['prediction'])
sample_submission.to_csv('submission.csv', index=False)

In [18]:
# local score
if VALID:
    sub_check = final_sub.copy().set_index('customer_id').reset_index()

    # Both frames list customers in the same order, so the token lists pair up row by row.
    actual_lists = valid.prediction.str.split()
    predicted_lists = sub_check.prediction.str.split()

    # "t": MAP@12 over every validation customer.
    print("t", mapk(actual_lists, predicted_lists, k=12))

    # "i" / "c": the same metric restricted to inactive / cold customers,
    # followed by the number of validation customers in that segment.
    for tag, segment in (("i", inactive_users), ("c", cold_users)):
        in_valid = valid.customer_id.isin(segment)
        in_sub = sub_check.customer_id.isin(segment)
        print(tag,
              mapk(actual_lists[in_valid], predicted_lists[in_sub], k=12),
              valid[in_valid].shape[0])

    print("  ")
    # How often each exact recommendation string occurs.
    print(sub_check["prediction"].value_counts())

t 0.0051678753192868875
i 0.005933723515352707 19048
c 0.0001270168871568728 5572
  
0754362003 0697564037 0664405005 0731743001 0682236001 0757903001 0764646003 0762286004 0750397013 0780418001 0809278006 0724906003    5572
0120129001 0534164001 0215589001 0300024058 0567532004 0433444001 0510465001 0775382001 0442915001 0752657001 0469562002 0262277011       5
0573937001 0294008002 0652924004 0751551001 0368979001 0779551002 0750330002 0652924010 0779554002 0640542002 0294008005 0624257001       4
0573937001 0294008002 0652924004 0751551001 0368979001 0779551002 0750330002 0652924010 0640542002 0779554002 0624257001 0294008005       3
0573937001 0294008002 0652924004 0751551001 0368979001 0779551002 0750330002 0652924010 0779554002 0624257001 0294008005 0698387001       3
                                                                                                                                       ... 
0699923008 0783707002 0685811002 0853612001 0783707047 0656677019 050162004