- https://making.lyst.com/lightfm/docs/lightfm.html
- modify mistake in valid label

In [1]:
import os
import cv2
import tqdm

import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k


from numba import jit
import itertools
from multiprocessing import Pool

SEED = 42
np.random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)

VALID = True

In [2]:
def apk(actual, predicted, k=12):
    if len(predicted)>k:
        predicted = predicted[:k]
        
    score = 0.0
    num_hits = 0.0
    
    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)


def mapk(actual, predicted, k=12):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

## Configs

In [3]:
# default number of recommendations
K = 12
EPOCHS = 50

# model learning rate
LEARNING_RATE = 0.25
# no of latent factors
NO_COMPONENTS = 10

# no of threads to fit model
NO_THREADS = 32
# regularisation for both user and item features
ITEM_ALPHA=1e-6
USER_ALPHA=1e-6

## Load the data

In [4]:
main_dir = "../input/h-and-m-personalized-fashion-recommendations"
images_dir = main_dir+"/images/" 
customers = pd.read_csv(main_dir+"/customers.csv")
articles = pd.read_csv(main_dir+"/articles.csv", dtype={'article_id': str})
sample_submission = pd.read_csv(main_dir+"/sample_submission.csv", dtype={'article_id': str})

train = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv',  dtype={'article_id': str}, parse_dates=['t_dat'])

## dataset

In [5]:
dataset = Dataset()
dataset.fit(users=customers['customer_id'], 
            items=articles['article_id']) # consider both new user and item

num_users, num_topics = dataset.interactions_shape()
print(f'Number of users: {num_users}, Number of topics: {num_topics}.')

Number of users: 1371980, Number of topics: 105542.


In [6]:
train["t_dat"] = pd.to_datetime(train["t_dat"])

date_week_df = train.drop_duplicates("t_dat")[["t_dat"]].reset_index(drop=True)
date_week_df['week_no'] = (date_week_df['t_dat'] + pd.DateOffset(days=5)).dt.week
date_week_df["week_no"] = date_week_df["week_no"].diff(1)
date_week_df["week_no"].fillna(0, inplace=True)
date_week_df["week_no"] = date_week_df["week_no"] != 0
date_week_df["week_no"] = date_week_df["week_no"].cumsum()

train = pd.merge(train, date_week_df, on="t_dat", how="left")

#train.sort_values(['t_dat', 'customer_id'], inplace=True)

test_week = train.week_no.max()
del date_week_df

  after removing the cwd from sys.path.


In [7]:
train_set = train[train.week_no < test_week]
val_set = train[train.week_no == test_week]

(interactions, weights) = dataset.build_interactions(train_set.iloc[:, 1:3].values)
(val_interactions, val_weights) = dataset.build_interactions(val_set.iloc[:, 1:3].values)
#build_item_features
#build_user_features
print(interactions.shape, val_interactions.shape)
del train

(1371980, 105542) (1371980, 105542)


In [8]:
valid = val_set.groupby('customer_id')['article_id'].apply(list).reset_index()
valid = valid.rename({'article_id':'prediction'},axis=1)
valid['prediction'] = valid.prediction.apply(lambda x: ' '.join([str(k) for k in x]))

inactive_users = list(set(train_set.customer_id) - set(train_set[train_set.week_no > test_week - 10]["customer_id"]))
cold_users = list(set(valid.customer_id) - set(train_set.customer_id))
del train_set, val_set

# fitting

In [9]:
model = LightFM(loss='bpr', no_components=NO_COMPONENTS, 
                 learning_rate=LEARNING_RATE,                 
                 random_state=np.random.RandomState(SEED))
model.fit(interactions=interactions, epochs=EPOCHS, verbose=1, num_threads=NO_THREADS)

Epoch: 100%|██████████| 50/50 [19:36<00:00, 23.53s/it]


<lightfm.lightfm.LightFM at 0x7ff555525450>

In [10]:
#Load Trained Model

#!pip3 install pickle5
#import pickle5 as pickle
#with open('../input/lightfm1/lightFM1.pickle', "rb") as fh:
#    trained_model = pickle.load(fh)

#if VALID:
#    val_precision = precision_at_k(model, val_interactions, k=K).mean()
#    print(val_precision)

# Submission

In [11]:
#Get the mappings
'''
uid = mapping from customer_id to model equivalent user_id
iid = mapping from article_id to  model equivalent article_id
'''
uid_map, ufeature_map, iid_map, ifeature_map = dataset.mapping() 
'''
create inverse mappings
'''
inv_uid_map = {v:k for k, v in uid_map.items()}
inv_iid_map = {v:k for k, v in iid_map.items()}

#convert submission user_id and article_id to model equivalent user_id and article_id

if VALID:
    test_X = valid.customer_id.values
else:
    test_X = sample_submission.customer_id.values    

lfn_user = lambda x: uid_map[x]
test_X_m = [lfn_user(tx) for tx in test_X]

print(len(test_X_m))

68984


In [12]:
num_cores=4

def predict_submission():
    df_split = np.array_split(test_X_m, num_cores)
    chunk_size = len(df_split[0])
    
    chunk1 = list(df_split[0])
    chunk2 = list(df_split[1])
    chunk3 = list(df_split[2])
    chunk4 = list(df_split[3])
    
    #list of items for multiprocessing, 4 since using 4 cores
    all_chunks = [chunk1, chunk2, chunk3, chunk4]
    
    pool = Pool(num_cores)
    result = pool.map(make_predict, all_chunks)
    
    pred_combined = list(itertools.chain(result[0], result[1], result[2], result[3]))

    return pred_combined

def make_predict(usr_chunk):
    customer_ids = []
    preds = []
    item_array = np.array(list(iid_map.values()))

    for usr_ in usr_chunk:
        m_opt = model.predict(np.array([usr_] * len(iid_map)), item_array)
        pred = np.argsort(-m_opt)[:K]
        #customer_ids.append(inv_uid_map[usr_])
        preds.append(' '.join([inv_iid_map[p] for p in pred]).strip())
        #break
    
    #customer_ids = np.array(customer_ids).reshape(-1, 1)
    preds = np.array(preds).reshape(-1, 1)
    return preds

In [13]:
%%time
final_predictions = predict_submission()

CPU times: user 1.05 s, sys: 704 ms, total: 1.76 s
Wall time: 11min


In [14]:
final_sub = pd.DataFrame(data=np.concatenate((np.array(test_X).reshape(-1,1), np.array(final_predictions)), axis=1).reshape(-1, 2),
                         columns=['customer_id', 'prediction'])

sample_submission.loc[sample_submission.customer_id.isin(final_sub.customer_id), 'prediction'] = ( 
    final_sub['prediction']
    )
sample_submission.to_csv('submission.csv', index=False)

In [15]:
# local score
if VALID:
    sub_check = final_sub.copy()
    sub_check = sub_check.set_index('customer_id').reset_index()
    print("t", mapk(valid.prediction.str.split(), sub_check.prediction.str.split(), k=12))    
    print("i", 
            mapk(valid[valid.customer_id.isin(inactive_users)].prediction.str.split(), 
               sub_check[sub_check.customer_id.isin(inactive_users)].prediction.str.split(), k=12),
            valid[valid.customer_id.isin(inactive_users)].shape[0])
    print("c", 
            mapk(valid[valid.customer_id.isin(cold_users)].prediction.str.split(), 
               sub_check[sub_check.customer_id.isin(cold_users)].prediction.str.split(), k=12),
            valid[valid.customer_id.isin(cold_users)].shape[0])
    
    print("  ")
    print(sub_check["prediction"].value_counts())  

t 0.0007800701522315257
i 0.0006007137748659168 19048
c 1.0207879913551126e-05 5572
  
0111586001 0234432001 0417951005 0457466002 0111593001 0158340001 0436261001 0123173001 0671607001 0774258001 0728162009 0767304002    2
0619881007 0582138002 0573716007 0610730026 0567494002 0534044001 0591439009 0662344008 0443000001 0719589004 0634248003 0639323001    1
0621381022 0621381012 0621381016 0621381019 0621381027 0621381001 0621381014 0724791002 0755754002 0732842008 0724791006 0777743001    1
0851363004 0877773003 0837741005 0886390002 0889669006 0857812007 0864513001 0877773004 0852809001 0873604003 0848359001 0806916008    1
0753379001 0838197003 0831998007 0784848001 0761308001 0860305002 0880238005 0890631005 0761308002 0844059003 0633136010 0798901008    1
                                                                                                                                      ..
0663282007 0810187004 0756684005 0761218004 0761432001 0756662004 0622966001 0809240001 074