## Hybrid Recommendation Algorithm
### Existing User
#### Collaborative Filtering: 
Save user actions such as purchasing history, browsing history and generate interaction matrix for collaborative filtering algorithm, then recommend items to users based on their similarity with other users.

### New User
#### Content-Based Filtering:
Ask user for basic demographic information and recommend items based on their similarity with other items.
Item features such as price, cluster (category) also included in the recommendation algorithm.


In [61]:
import pandas as pd
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k
import numpy as np
from scipy import sparse
from lightfm import LightFM
from sklearn.base import clone


In [62]:
# Accomodate new users, items into old model
# credits: https://github.com/lyst/lightfm/issues/347#issuecomment-707829342
class LightFMResizable(LightFM):
    """A LightFM that resizes the model to accomodate new users,
    items, and features"""

    def fit_partial(
        self,
        interactions,
        user_features=None,
        item_features=None,
        sample_weight=None,
        epochs=1,
        num_threads=1,
        verbose=False,
    ):
        try:
            self._check_initialized()
            self._resize(interactions, user_features, item_features)
        except ValueError:
            # This is the first call so just fit without resizing
            pass

        super().fit_partial(
            interactions,
            user_features,
            item_features,
            sample_weight,
            epochs,
            num_threads,
            verbose,
        )

        return self

    def _resize(self, interactions, user_features=None, item_features=None):
        """Resizes the model to accommodate new users/items/features"""

        no_components = self.no_components
        no_user_features, no_item_features = interactions.shape  # default

        if hasattr(user_features, "shape"):
            no_user_features = user_features.shape[-1]
        if hasattr(item_features, "shape"):
            no_item_features = item_features.shape[-1]

        if (
            no_user_features == self.user_embeddings.shape[0]
            and no_item_features == self.item_embeddings.shape[0]
        ):
            return self

        new_model = clone(self)
        new_model._initialize(no_components, no_item_features, no_user_features)

        # update all attributes from self._check_initialized
        for attr in (
            "item_embeddings",
            "item_embedding_gradients",
            "item_embedding_momentum",
            "item_biases",
            "item_bias_gradients",
            "item_bias_momentum",
            "user_embeddings",
            "user_embedding_gradients",
            "user_embedding_momentum",
            "user_biases",
            "user_bias_gradients",
            "user_bias_momentum",
        ):
            # extend attribute matrices with new rows/cols from
            # freshly initialized model with right shape
            old_array = getattr(self, attr)
            old_slice = [slice(None, i) for i in old_array.shape]
            new_array = getattr(new_model, attr)
            new_array[tuple(old_slice)] = old_array
            setattr(self, attr, new_array)

        return self

In [63]:
df = pd.read_csv('data.csv',index_col=0)
df.dtypes

ProductTitle    object
Image           object
Price            int64
cluster          int64
dtype: object

In [64]:
def get_user_features():
    user_feaures = ['sex:M','sex:F','sex:P']
    for i in range(100): user_feaures.append("age:"+str(i))
    return user_feaures
# dummy user for testing purposes
users = [
    {
    'id': 1,
    'age': 19,
    'sex': 'F',
    },
     {
    'id': 2,
    'age': 40,
    'sex': 'M',
    },   
]
def get_item_features():
    item_features = []
    for i in df['cluster'].unique(): item_features.append("cluster:"+str(i))
    for p in df['Price'].unique(): item_features.append("price:"+str(int(p)))
    return item_features
dummy_user_interaction = [(1,0,2)]

#### Generate Dataset for Model

In [65]:
dataset = Dataset()
user_features = get_user_features()
item_features = get_item_features()
dataset.fit([x['id'] for x in users],df.index,user_features=user_features,item_features=item_features)

In [66]:
n_users, n_items = dataset.interactions_shape()
n_users, n_items

(2, 477893)

In [67]:
user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()

In [68]:
def get_user_interaction(userId:int,it=dict()):
    ret = []
    for x in df.index:
        if x in it:
            ret.append((userId,x,it[x]))
        else: 
            ret.append((userId,x,0))
    return ret
def gen_user_feature(user:object):
    feat = []
    for x in user:
        if x=='id': continue
        feat.append(x + ':' + str(user[x]))
    return (user['id'],feat)
def gen_item_feature(id,item:object):
    try:

        return [id,['cluster:'+str(item['cluster']),'price:'+str(int(item['Price']))]]
    except Exception as e:
        print(" erro in ",id,item)
        raise(e)

In [69]:
(interactions, weights) = dataset.build_interactions(dummy_user_interaction)

In [70]:
nfs = [ gen_user_feature(users[0]),gen_user_feature(users[1]) ]
ufs = dataset.build_user_features( nfs )

In [71]:
feats = []
for x in df.index:
    feats.append(gen_item_feature(x,df.loc[x]))
ifs = dataset.build_item_features(feats)
# get row 
# [(x,['cluster:'+str(df.loc[x]['cluster'])]) for x in df.index]

In [72]:
user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()
id_item_map = dict(((x,y) for y,x in item_id_map.items()))
# user_feature_map

#### Train Model

In [73]:
model = LightFMResizable(loss='warp')
model.fit(interactions,user_features=ufs,item_features=ifs,sample_weight=weights)

<__main__.LightFMResizable at 0x7f3b0e9ea2e0>

#### Predict

In [74]:
# helper function to add and update user model
# add interaction data for old users
LIMIT = 10000
all_items = np.arange(LIMIT)
def update_user(user_id, item_id, count):
    (new_interactions,new_wts) = dataset.build_interactions([(user_id, item_id, count)])
    ufs = dataset.build_user_features(nfs)
    model.fit_partial(new_interactions,user_features=ufs,item_features=ifs, sample_weight=new_wts)
# add interaction data for new users
def add_user(user):
    global nfs
    dataset.fit_partial(users=[user['id']])
    nfs += ([ gen_user_feature(user) ])
def get_item(id):
    return df.loc[id]
# get recommendation for existing user
def recommend(userId):
    y = model.predict(userId,all_items)
    return [get_item(id_item_map[x]) for x in np.argsort(-y)] [:4]
    # return np.argsort(-y)[:4]
# get recommendation for new user who has not interacted with any item
def format_newuser_input(user_feature_map, user_feature_list):
  normalised_val = 1.0 
  target_indices = []
  for feature in user_feature_list:
    try:
        target_indices.append(user_feature_map[feature])
    except KeyError:
        print("new user feature encountered '{}'".format(feature))
        pass

  new_user_features = np.zeros(len(user_feature_map.keys()))
  for i in target_indices:
    new_user_features[i] = normalised_val
  new_user_features = sparse.csr_matrix(new_user_features)
  return(new_user_features)
def new_user_recommend(user):
    new_user_features = gen_user_feature(user)[-1]
    new_ufs = format_newuser_input(user_feature_map, new_user_features)
    y = model.predict(0,np.arange(n_items),user_features=new_ufs)
    return [get_item(id_item_map[x]) for x in np.argsort(-y)] [:4]
    # return  np.argsort(y)[:4]

    

#### Performance Metric for Recommendation System

Generate test and train based on the user type, then calculate the performance metric for recommendation system.

In [75]:
import random
def gen_interaction(userId,ids):
    ret = []
    for id in random.sample(ids,20):
        ret.append((userId,id,1))
    return ret
def gen_user(start,end,age,gender):
    for i in range(start,end+1):
        add_user({'id':i,'age':age,'sex':gender})
    return list(range(start,end+1))

In [76]:
male_train_ids = [453679,38061,53265,126759,374918,393513,452102,483388,484573,503464,8183,18431,19824,21974,25451,26354,28797,29912,30621,32804,58555,61183,68714,71165,129763,170781,171007,171638,172478,172899,175269,229622,235198,236116,238871,240714,240716,263936,296733,329742,329796,331363,331661,332846,334096,335316,336620,345662,359996,365562,393660,403206,438215,466607,479923,480442,481823,483245,483329,484265,484731,484841,484866,485696,486444,486806,487112,487259,487312,487554,488537,488596,489911,490294,490344,492681,493610,495097,496735,497063,502624,502748,502756,502786,502968,503044,503135,503388,503430,509969,509970,509981,542312,462919,2825,2946,4037,5142,6688,8426,183998,184015,304909,483357,485755,487766,490077,494975,495019,495384,546990,15764,19040,33164,33379,57620,58542,84951,124891,125119,125260,125633,157404,233896,234521,234800,235380,235389,236807,237370,237527,266669,330722,330737,332226,334021,336836,336973,396468,423791,480059,480148,482837,483272,483583,483654,485602,487082,487431,487522,487637,487696,487742,487770,488031,488098,488297,488769,490442,491994,492761,492825,492834,492955,493316,494574,494984,495029,495061,495117,495185,496408,498013,498095,547172,108205,131795,139065,1319,1935,2200,2791,2859,2911,3077,4048,4561,4808,5501,6356,6691,7369,7520,7598,8215,8237,8279,8286,8800,9029,104956,106623,107071,110957,113478,115244,115296,117896,118392,118719,120388,120691,121479,140821,381889,385584,385621,385703,388230,389139,390136,416758,417323,417949,418393,418597,418690,420074,420179,420487,420532,420610,420676,420689,420755,420983,421045,421308,421628,421876,422514,422717,422806,423174,424084,424091,424968,426363,426670,426930,427156,427618,428977,429000,430177,430479,430628,430770,431284,432253,432519,435444,479366,479624,479658,479694,479695,479701,479770,479950,479954,480041,480042,480134,480547,480906,481018,482057,483519,484215,485071,485211,487118,487511,487626,488484,489011,489987,490119,490138,490170,491137,491236,491276,491334,491778,492703,492932,493804,494937,82326,84759,84779,84923,329910,373522,155449,5232,12470,22381,24325,57979,76668,111890,123753,150381,153430,162566,231920,242693,262110,285115,311961,330762,338939,343372,370765,381325,394413,437662,475158,515043,1180,1629,2391,2859,2863,3330,3362,3371,4048,4990,5148,5343,5433,6023,6245,6432,6691,6829,8074,8195,8411,9319,9406,10482,12068,12277,13164,13188,13242,14610,15031,15134,16687,18792,20597,21493,21809,22295,22534,23017,23594,24809,33940,34689,34690,34691,34872,35739,36069,46126,56457,59259,66458,69040,77077,79087,80769,81182,84732,107952,114006,124228,124504,124592,130089,150218,154722,157591]
male_test_ids =  [349413,503337,3744,6420,7795,8472,23455,267519,267842,267908,268011,268263,342192,344313,345380,345889,347751,856,2983,3863,3967,4018,4900,5054,6340,6820,6909,9093,23956,37115,37220,38272,38664,38681,40076,41586,41922,42123,43135,43198,43983,44392,44830,45615,46142,46709,46846,46878,48461,48584,48966,50073,52952,53024,53068,57788,79720,193271,267088,267107,267184,267201,267271,267308,267343,267361,267409,267520,267545,267644,267645,267680,267691,267758,267765,267777,267788,267813,267864,267934,267979,268066,268076,268112,268274,268276,338746,339975,341604,342283,342359,342396,343108,343229,344256,344609,344674,346381,347834,348508,6822,10245,28611,28650,38048,40091,47851,63467,74577,86210,86578,87666,97443,97580,98031,99023,99395,179640,181864,186871,186872,186873,186874,186875,190070,268225,344611,347268,349403,375296,448304,451695,455358,455809,456075,467472,468695,4495,5231,21489,21533,21658,23689,23774,24163,24349,25343,25491,25676,25713,25717,25789,25844,25860,26103,26218,26235,26808,27272,27328,27703,28085,28201,28371,28429,28691,30254,30331,30383,30427,30460,30519,30552,30680,30686,31203,31207,31211,31215,31231,36673,36695,36706,36724,36774,37419,37966,38184,38286,38480,39066,39108,39238,39419,39696,40005,40043,40549,40885,41086,13787,34480,82438,125935,400475,506415,5857,10838,27013,61028,63280,70944,74293,84925,110130,115483,149587,154919,192740,266524,290615,320021,343887,405067,502692,550119,12366,26354,29191,35254,46080,68483,129475,129993,153167,153168,165406,165619,194142,194719,204924,205150,235648,267683,270819,329796,331887,394567,405141,480442,487554,487653,488056,488262,503388,503430,503464,505964,505965,59692,183993,461497,1380,5110,7114,8340,9003,9168,11557,16797,18053,18350,19092,20050,24011,24901,25949,27087,36142,37164,39555,44974,49232,50441,52031,52055,61279,62165,62799,63103,64582,65473,67981,68134,69366,71312,72361,72626,73577,73619]

female_train_ids = [29689,30059,329796,503415,17695,1407,2003,3020,6660,7660,7846,8515,9352,10432,11387,12344,14319,14371,15555,17894,18321,18358,21079,21353,22490,22945,23321,23349,23438,23591,23625,23632,23815,24719,24925,25268,25355,26424,26618,27055,28151,29175,29269,31133,32536,32545,33175,33735,33872,33875,33940,34501,34646,34852,34855,34864,34880,37205,37779,38916,39023,39249,39421,40539,49269,49275,49598,51155,51683,55626,55686,57289,57520,57611,57713,57910,57924,58087,58553,58807,59008,59385,59421,59485,59736,60990,61395,61462,65673,66107,67188,67201,70478,70890,71507,71605,71985,76682,77930,78560,58553,151555,160562,25555,57736,58276,113714,128196,164109,232003,242989,243040,243113,353755,371744,6332,22247,26846,42408,43717,43768,47875,48026,51418,57589,57686,57821,57884,58513,58528,58673,61783,72062,79737,106401,110253,117886,138552,147708,152526,153746,157179,162580,162626,164310,194228,194798,215530,217544,224312,227076,232404,243071,243376,243639,243642,265884,271954,344250,350082,361023,370902,370939,371034,371035,371052,371073,371096,371134,371183,371423,371424,371433,371434,371439,371442,371443,371444,371445,371447,371448,371449,371450,371452,371453,371459,371461,371463,371464,371465,371466,371467,371470,371472,371474,371476,371480,371481,371482,371483,148691,110633,116433,157073,221222,225120,357360,393938,530818,532742,536088,541490,22339,25968,26027,27292,27620,28207,28538,37260,39074,39314,39508,39700,40187,40346,40772,41033,41529,42029,42263,42415,43412,45491,46424,46852,46855,46983,47213,47360,47504,48002,48127,48280,48496,48671,48863,48896,49229,49799,49826,50252,50295,50364,50909,51084,51530,51634,51967,52027,52051,52671,52775,53064,53384,54317,54420,54541,54578,54899,54921,54991,55312,55368,55591,55762,55840,55933,56053,56226,56581,56744,56956,95632,101999,104239,104413,104446,104470,104942,104987,105177,105180,105289,105608,105823,105958,106191,106395,106448,152782,30593,31138,160646,329315,329352,152971,159838,162020,165619,322590,502668,164722,4138,32593,33293,34116,34408,53284,118677,149587,151449,151887,152252,152261,154919,156061,162553,165015,167848,220132,227603,232568,311120,327483,329844,329910,330337,331345,332774,333592,334965,336565,372345,376132,392071,392096,400019,400027,400033,403540,404463,415736,437218,475298,502692,502735,503017,503108,503250,549042,129440,8578,11063,24624,24878,33164,33179,33379,34264,46624,46648,49137,50396,50436,58158,70854,72856,78833,125548,126052,126469,126771,128180,128647,129586,129778,129783,158878,160700,162806,163349,166178,167424,171530,223240,231920,243201,249077,249322]
female_test_ids = [115690,116768,118428,122359,134067,134666,134776,135169,138233,140405,140638,141189,141324,141463,141691,141934,142522,143609,145388,145522,146001,147970,148948,149335,158129,331776,415786,534184,544731,1951,2145,2349,2593,2612,3222,4221,4849,5182,5270,5362,6080,6573,6745,6831,6940,7954,7985,8028,8058,8240,8308,8567,8568,8571,9445,9615,10231,10390,11578,11613,11652,11807,11819,11940,12166,12333,12436,12486,12672,13105,13299,13380,13528,13707,14115,14132,14882,15136,15355,15574,16541,17304,17826,18248,18381,18543,18918,19350,19728,19943,22373,22603,22697,22835,23172,23349,23426,23527,24795,25843,502550,6691,8215,10432,16322,18447,23095,23934,24783,27469,31143,70303,71292,80637,84045,84205,84681,84774,85041,99165,114697,116309,118044,128541,128714,134281,134948,137671,138672,147676,152103,152910,153993,157544,159605,160112,160462,162070,166341,166628,166664,167194,171524,173246,188092,193837,232944,253815,253882,254621,259187,259842,261988,265710,267291,267328,267735,268067,268389,269102,312029,323505,327742,333005,344515,345685,349324,365653,369508,371086,372906,373792,381158,393823,438447,475566,502668,536741,796,838,860,935,1004,1064,1240,1396,1552,1935,2041,2132,2505,2622,2707,2859,2979,3077,3127,3153,3723,3762,150334,152039,152911,157233,158529,158899,158939,161566,163528,164068,27516,150125,153209,153335,154856,155023,155938,156794,156989,157514,158082,159248,159940,162595,162965,163426,163961,164094,164179,164305,164591,164681,164833,164839,165025,165267,166172,166189,166388,166652,166688,167630,167884,168109,168301,168419,311140,313342,316007,332391,377700,381392,549564,26613,26744,27007,27155,27604,27807,57866,149452,149482,149508,149556,149575,149592,149600,149604,149610,149702,149757,149768,149817,149845,149883,149918,149932,150068,150089,150111,150115,150175,150195,150210,150260,150290,150296,150462,150639,150751,150783,150861,150931,150949,150980,151012,151038,151094,151111,151118,332207,30703,154757,549042,28825,29175,112377,126132,27013,31027,104795,108220,115668,126429,226253,318628,332275,332858,333381,403048,416074,25865,34116,34408,53284,105487,106401,107952,110250,113715,120663,122407,126014,126988,152311,154722,155110,159811,159838,161517,161587,161694,171062,171153,220132,232568,239055,259057,259387,260093,261325,263927,321270,327874,329910,334264,372422,376355,385917,386478,389833,403540,404463,415736,502667,502668,503250,548791,5093,6529,7246,10231,19537,19824,20719,20733,20864,24937,25581,25785,28732,32299,32474,33179,34264,45812,45842,50396,50436,52499,53473,55017,58099,58487,62104,65685,66076,72550,74762,77370]


In [77]:
# adding users
AGE = 20
cap = 30
m_train_uids = gen_user(10,10+cap,AGE,'M')
m_test_uids = gen_user(10+cap,10+2*cap,AGE,'M')
f_train_uids = gen_user(10+2*cap,10+3*cap,AGE,'F')
f_test_uids = gen_user(10+3*cap,10+4*cap,AGE,'F')


In [78]:
train_it = []
for g in ['M','F']:
    for uid in (m_train_uids if g=='M' else f_train_uids):
        train_it.extend(gen_interaction(uid,male_train_ids  if g=='M' else female_train_ids))
test_it = []
for g in ['M','F']:
    for uid in (m_test_uids if g=='M' else f_test_uids):
        test_it.extend(gen_interaction(uid,male_test_ids  if g=='M' else female_test_ids))


In [79]:
train_interaction,train_wts = dataset.build_interactions(train_it)
test_interaction,_ = dataset.build_interactions(test_it)

In [80]:
ufs = dataset.build_user_features(nfs)
ufs.todense().shape

(123, 226)

Evaluate the model performance based on the following metrics:
- ROC AUC : Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores.
- Precision@k : Precision at k (P@k) metric computes the percentage of recommended items in the top-k list that are relevant to the user.
- Recall@k : Recall at k (R@k) metric computes the percentage of relevant items that are recommended in the top-k list.

In [83]:
# auc
model.fit(train_interaction,user_features=ufs,item_features=ifs,sample_weight=train_wts,epochs=10)
train_auc = auc_score(model,train_interaction,user_features=ufs,item_features=ifs).mean()
test_auc = auc_score(model,test_interaction,user_features=ufs,item_features=ifs).mean()
print("Train AUC: %.2f" % (train_auc))
print("Train AUC: %.2f" % (test_auc))

Train AUC: 1.00
Train AUC: 0.69


In [84]:
#precision@k
train_precision = precision_at_k(model,train_interaction,user_features=ufs,item_features=ifs).mean()
test_precision = precision_at_k(model,test_interaction,user_features=ufs,item_features=ifs).mean()
print("Train Precision @k %.2f" % train_precision);
print("Test Precision @k%.2f" % test_precision);


Train Precision @k 0.06451613
Test Precision @k 0.0016129032


In [85]:
#recall@k
train_recall = recall_at_k(model,train_interaction,user_features=ufs,item_features=ifs).mean()
test_recall = recall_at_k(model,test_interaction,user_features=ufs,item_features=ifs).mean()
print("Train recall @k %.2f" % train_recall);
print("Test recall @k %.2f" % test_recall);

Train recall @k 0.03225806451612904
Test recall @k 0.0008064516129032258


#### Misc
save data for server

In [138]:
import pickle
# pcickle both the model and the dataset
# pickle.dump(model, open('model.pkl','wb'),protocol=pickle.HIGHEST_PROTOCOL)
# pickle.dump(dataset, open('dataset.pkl','wb'),protocol=pickle.HIGHEST_PROTOCOL)
# pickle.dump(nfs, open('nfs.pkl','wb'),protocol=pickle.HIGHEST_PROTOCOL)
# pickle.dump(ifs, open('ifs.pkl','wb'),protocol=pickle.HIGHEST_PROTOCOL)
# pickle.dump(user_feature_map, open('user_feature_map.pkl','wb'),protocol=pickle.HIGHEST_PROTOCOL)
# pickle.dump(id_item_map, open('id_item_map.pkl','wb'),protocol=pickle.HIGHEST_PROTOCOL)
