- https://making.lyst.com/lightfm/docs/lightfm.html
- fix the mistake in the validation labels
- take item and user features into account

In [1]:
import os
import tqdm

import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from sklearn import preprocessing

import itertools
from multiprocessing import Pool

# One seed shared by the hash-seed environment variable and NumPy's global RNG.
# NOTE(review): PYTHONHASHSEED set at runtime presumably only reaches child
# processes started afterwards, not the running interpreter - confirm.
SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

# True  -> predict for the validation customers and print a local score.
# False -> predict for every customer listed in the sample submission.
VALID = True

In [2]:
def apk(actual, predicted, k=12):
    """
    Computes the average precision at k for a single query.

    Parameters
    ----------
    actual : sequence or None
             The relevant elements (order doesn't matter). Lists, tuples and
             NumPy arrays are accepted.
    predicted : sequence
                The predicted elements, best first (order matters)
    k : int, optional
        The maximum number of predicted elements that are scored

    Returns
    -------
    score : float
            The average precision at k, or 0.0 when `actual` is empty or None
    """
    # `not actual` raises for multi-element NumPy arrays, so the emptiness
    # check is done on the length, and before `actual` is used for lookups.
    if actual is None or len(actual) == 0:
        return 0.0

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # A repeated prediction is only credited at its first occurrence.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)


def mapk(actual, predicted, k=12):
    """
    Mean average precision at k over paired ground-truth / prediction lists.

    Parameters
    ----------
    actual : iterable of lists
             For each query, the elements that should have been predicted
             (order within a list is irrelevant)
    predicted : iterable of lists
                For each query, the predicted elements, best first
                (order within a list matters)
    k : int, optional
        The maximum number of predicted elements scored per query

    Returns
    -------
    score : double
            The mean of the per-query average precision at k
    """
    # The two inputs are paired positionally; zip stops at the shorter one.
    per_query_scores = []
    for truth, ranked in zip(actual, predicted):
        per_query_scores.append(apk(truth, ranked, k))
    return np.mean(per_query_scores)

## Configs

In [3]:
# number of recommendations produced per user
K = 12

# --- model hyper-parameters ---
# number of latent factors
NO_COMPONENTS = 40
# model learning rate
LEARNING_RATE = 0.25
# regularisation strength for user and item features respectively
USER_ALPHA = 1e-6
ITEM_ALPHA = 1e-6

# --- training run ---
EPOCHS = 50
# number of threads used to fit the model
NO_THREADS = 32

# number of worker processes used for the final predictions
num_cores = 4

## Load the data

In [4]:
main_dir = "../input/h-and-m-personalized-fashion-recommendations"
images_dir = main_dir + "/images/"

# Article ids are read as strings so that they are not converted to integers.
_id_as_str = {'dtype': {'article_id': str}}

customers = pd.read_csv(main_dir + "/customers.csv")
articles = pd.read_csv(main_dir + "/articles.csv", **_id_as_str)
sample_submission = pd.read_csv(main_dir + "/sample_submission.csv", **_id_as_str)

# Transactions additionally get their date column parsed into datetimes.
train = pd.read_csv(main_dir + '/transactions_train.csv', parse_dates=['t_dat'], **_id_as_str)

## dataset

In [5]:
# item
# Keep the article id plus every column whose dtype name contains "int".
article_columns = [
    name for name, dtype in articles.dtypes.items() if "int" in str(dtype)
]
articles = articles[['article_id', *article_columns]]

#image_feats = pd.read_csv("../input/hm-table-dataset/autox_features.csv")
#articles = pd.concat([articles, image_feats], axis=1)
#articles.fillna(-999, inplace=True)
#del image_feats
#article_columns = [i for i in articles.columns if "int" in str(articles[i].dtype) or "float" in str(articles[i].dtype)]

In [6]:
# user
user_columns = ['FN', 'Active', 'club_member_status', 'fashion_news_frequency', 'postal_code']

# Missing values become -1 in every column before encoding.
customers = customers.fillna(-1)

# Each user column is replaced by integer codes from its own LabelEncoder.
for feat in user_columns:
    lbl = preprocessing.LabelEncoder()
    customers[feat] = lbl.fit_transform(list(customers[feat]))

In [7]:
# Register every customer and article id, plus the user/item feature names,
# so that LightFM can assign them internal indices.
dataset = Dataset()
dataset.fit(
    users=customers['customer_id'],
    items=articles['article_id'],
    user_features=user_columns,
    item_features=article_columns,
)

# interactions_shape() returns (number of users, number of items).
num_users, num_topics = dataset.interactions_shape()
print(f'Number of users: {num_users}, Number of topics: {num_topics}.')

Number of users: 1371980, Number of topics: 105542.


In [8]:
# Make sure the transaction date is a datetime column before doing date maths.
train["t_dat"] = pd.to_datetime(train["t_dat"])

# One row per distinct calendar day, in chronological order. The explicit sort
# matters: the week counter below compares each day with the previous row.
date_week_df = (
    train[["t_dat"]]
    .drop_duplicates("t_dat")
    .sort_values("t_dat")
    .reset_index(drop=True)
)

# ISO week of (date + 5 days): the shift makes each weekly bucket run from
# Wednesday to Tuesday instead of Monday to Sunday.
# `.dt.week` is deprecated, so the ISO calendar accessor is used instead; the
# cast to int64 keeps the subtraction below on ordinary signed integers.
shifted_iso_week = (
    (date_week_df["t_dat"] + pd.DateOffset(days=5))
    .dt.isocalendar()
    .week.astype("int64")
)

# week_no is a running counter that increases by one whenever the shifted ISO
# week differs from the previous day's (this also handles year boundaries,
# where the raw ISO week number wraps around).
week_changed = shifted_iso_week.diff(1).fillna(0) != 0
date_week_df["week_no"] = week_changed.cumsum()

train = pd.merge(train, date_week_df, on="t_dat", how="left")

# The last week in the data is the hold-out (validation) week.
test_week = train.week_no.max()

del date_week_df

  after removing the cwd from sys.path.


In [9]:
# Everything before the last week is used for fitting; the last week is held out.
train_set = train[train.week_no < test_week]
val_set = train[train.week_no == test_week]

# build_interactions expects (user id, item id) pairs. The columns are selected
# by name so the result does not depend on the column order of the CSV.
pair_columns = ["customer_id", "article_id"]
(interactions, weights) = dataset.build_interactions(train_set[pair_columns].values)
(val_interactions, val_weights) = dataset.build_interactions(val_set[pair_columns].values)

# Iterate over the id columns directly instead of materialising one row at a
# time with `.iloc[index]`, which is far slower on frames of this size.
# NOTE(review): every article (and every customer) is given the same list of
# feature names here, so these matrices carry no per-entity information, and
# they are not passed to the model fit in this notebook - confirm the intent.
item_features = dataset.build_item_features(
    ((article_id, article_columns) for article_id in articles["article_id"])
)
user_features = dataset.build_user_features(
    ((customer_id, user_columns) for customer_id in customers["customer_id"])
)

print(interactions.shape, val_interactions.shape)
del train

(1371980, 105542) (1371980, 105542)


In [10]:
# Ground truth for the hold-out week: one row per customer, with the purchased
# article ids joined into a single space-separated string.
valid = (
    val_set.groupby('customer_id')['article_id']
    .apply(list)
    .reset_index()
    .rename(columns={'article_id': 'prediction'})
)
valid['prediction'] = valid['prediction'].map(
    lambda article_ids: ' '.join(str(article) for article in article_ids)
)

# inactive: customers present in the training data but with no purchase in the
#           most recent training weeks (week_no > test_week - 10).
# cold:     customers of the hold-out week that never appear in the training data.
train_customers = set(train_set.customer_id)
recent_customers = set(train_set.loc[train_set.week_no > test_week - 10, "customer_id"])
inactive_users = list(train_customers - recent_customers)
cold_users = list(set(valid.customer_id) - train_customers)
del train_set, val_set, train_customers, recent_customers

# fitting

In [11]:
# Hyper-parameters come from the config cell; the RNG is seeded for repeatability.
lightfm_params = dict(
    loss='bpr',
    no_components=NO_COMPONENTS,
    learning_rate=LEARNING_RATE,
    random_state=np.random.RandomState(SEED),
    user_alpha=USER_ALPHA,
    item_alpha=ITEM_ALPHA,
)
model = LightFM(**lightfm_params)

# Fit on the training interactions only (no user/item feature matrices are passed).
model.fit(interactions=interactions, epochs=EPOCHS, verbose=1, num_threads=NO_THREADS)

Epoch: 100%|██████████| 50/50 [37:37<00:00, 45.15s/it]


<lightfm.lightfm.LightFM at 0x7fae1e4ac9d0>

In [12]:
#Load Trained Model

#!pip3 install pickle5
#import pickle5 as pickle
#with open('../input/lightfm1/lightFM1.pickle', "rb") as fh:
#    trained_model = pickle.load(fh)

# Submission

In [13]:
# Get the mappings
#   uid_map: customer_id -> internal user index used by the model
#   iid_map: article_id  -> internal item index used by the model
uid_map, ufeature_map, iid_map, ifeature_map = dataset.mapping()

# Reverse lookups: internal index -> original id.
inv_uid_map = dict(zip(uid_map.values(), uid_map.keys()))
inv_iid_map = dict(zip(iid_map.values(), iid_map.keys()))

# Customers to predict for: the hold-out customers when validating, otherwise
# everybody listed in the sample submission.
test_X = valid.customer_id.values if VALID else sample_submission.customer_id.values


def lfn_user(x):
    """Translate a customer_id into the model's internal user index."""
    return uid_map[x]


test_X_m = list(map(lfn_user, test_X))

print(len(test_X_m))

68984


In [14]:
def predict_submission():
    """
    Predict the top articles for every user in `test_X_m` using worker processes.

    The internal user indices in `test_X_m` are split into `num_cores`
    contiguous chunks, each chunk is handed to `make_predict` in its own
    process, and the per-chunk results are stitched back together in the
    original order (`Pool.map` preserves the order of its inputs).

    Returns
    -------
    pred_combined : list
        One entry per user, in the order of `test_X_m`; each entry is one row
        of the array returned by `make_predict`.
    """
    # One chunk per worker. The number of chunks follows `num_cores` instead of
    # being fixed at four, so changing the setting neither crashes nor drops users.
    all_chunks = [list(chunk) for chunk in np.array_split(test_X_m, num_cores)]

    # The context manager releases the worker processes once map() has returned.
    with Pool(num_cores) as pool:
        result = pool.map(make_predict, all_chunks)

    pred_combined = list(itertools.chain.from_iterable(result))

    return pred_combined

def make_predict(usr_chunk):
    """
    Build the space-separated top-K article string for each user in a chunk.

    Parameters
    ----------
    usr_chunk : iterable of int
        Internal (model) user indices.

    Returns
    -------
    preds : numpy.ndarray
        Array with one row and one column per user, holding the K best-scoring
        article ids joined by single spaces, in the order of `usr_chunk`.
    """
    # Internal item indices of the whole catalogue; computed once per chunk.
    item_array = np.array(list(iid_map.values()))
    n_items = len(item_array)

    preds = []
    for usr_ in usr_chunk:
        # Score this user against every item.
        scores = model.predict(np.full(n_items, usr_), item_array)

        # argsort yields positions within `item_array`; translate them back to
        # internal item indices instead of assuming position == index.
        top_positions = np.argsort(-scores)[:K]
        top_items = item_array[top_positions]

        preds.append(' '.join(inv_iid_map[item] for item in top_items).strip())

    return np.array(preds).reshape(-1, 1)

In [15]:
%%time
# Run the multi-process prediction for every entry of test_X_m and time the cell.
final_predictions = predict_submission()

CPU times: user 876 ms, sys: 467 ms, total: 1.34 s
Wall time: 9min 24s


In [16]:
# One row per predicted customer: the original customer id and its
# space-separated list of recommended article ids.
final_sub = pd.DataFrame({
    'customer_id': np.asarray(test_X),
    'prediction': np.asarray(final_predictions).reshape(-1),
})

# Copy the predictions into the submission frame by customer id. Assigning
# `final_sub['prediction']` directly would align on the row index, which hands
# customers somebody else's prediction (or NaN) whenever `final_sub` covers
# only a subset of the submission rows.
pred_by_customer = (
    final_sub.drop_duplicates('customer_id')
    .set_index('customer_id')['prediction']
)
has_prediction = sample_submission['customer_id'].isin(pred_by_customer.index)
sample_submission.loc[has_prediction, 'prediction'] = (
    sample_submission.loc[has_prediction, 'customer_id'].map(pred_by_customer)
)
sample_submission.to_csv('submission.csv', index=False)

In [17]:
# local score
if VALID:
    sub_check = final_sub.copy().set_index('customer_id').reset_index()

    def _segment_score(users):
        """Return (MAP@12, number of validation customers) restricted to `users`."""
        truth = valid[valid.customer_id.isin(users)]
        guess = sub_check[sub_check.customer_id.isin(users)]
        score = mapk(truth.prediction.str.split(), guess.prediction.str.split(), k=12)
        return score, truth.shape[0]

    # t = all validation customers, i = inactive customers, c = cold-start customers
    print("t", mapk(valid.prediction.str.split(), sub_check.prediction.str.split(), k=12))
    for tag, segment_users in (("i", inactive_users), ("c", cold_users)):
        segment_map, segment_size = _segment_score(segment_users)
        print(tag, segment_map, segment_size)

    print("  ")
    # How often each distinct recommendation list occurs.
    print(sub_check["prediction"].value_counts())

t 0.0033021174700348186
i 0.003568400955732471 19048
c 2.5638396061942365e-05 5572
  
0724906003 0699071004 0815024001 0799797001 0537895008 0689365057 0691489006 0519847021 0786791002 0683395002 0767249003 0759160002    5572
0294008002 0573937001 0751551001 0652924004 0640542002 0779551002 0368979001 0294008005 0742274001 0750330002 0698387001 0624257001       3
0120129001 0215589001 0534164001 0300024058 0510465001 0775382001 0567532004 0300024013 0433444001 0752657001 0120129014 0300024009       2
0706016001 0706016002 0706016003 0399223001 0706016015 0573085028 0706016006 0399223029 0573085043 0706016019 0573085004 0562245046       2
0220094001 0636505001 0713200003 0728197001 0733123004 0806261002 0823317002 0794454002 0689047004 0787696003 0744180001 0720506007       1
                                                                                                                                       ... 
0658030011 0658030006 0658030005 0399087030 0399087014 0399136004 06580300