## Hybrid Recommendation Algorithm
### Existing User
#### Collaborative Filtering: 
Record user actions such as purchase history and browsing history, build an interaction matrix for the collaborative filtering algorithm, and then recommend items to users based on their similarity to other users.

### New User
#### Content-Based Filtering:
Ask user for basic demographic information and recommend items based on their similarity with other items.
Item features such as price and cluster (category) are also included in the recommendation algorithm.


In [430]:
import pandas as pd
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k
import numpy as np
from scipy import sparse
from lightfm import LightFM
from sklearn.base import clone


In [431]:
# Accommodate new users and items in an already-fitted model
# credits: https://github.com/lyst/lightfm/issues/347#issuecomment-707829342
class LightFMResizable(LightFM):
    """LightFM variant whose parameter arrays are enlarged when a later
    ``fit_partial`` call brings more users, items, or features than the
    model currently holds."""

    def fit_partial(
        self,
        interactions,
        user_features=None,
        item_features=None,
        sample_weight=None,
        epochs=1,
        num_threads=1,
        verbose=False,
    ):
        """Resize the model if it is already initialised, then delegate to
        ``LightFM.fit_partial`` with the same arguments.

        Returns ``self``.
        """
        try:
            self._check_initialized()
            self._resize(interactions, user_features, item_features)
        except ValueError:
            # Interpreted as "first call, nothing to resize yet". Note that a
            # ValueError raised inside _resize is swallowed here as well.
            pass

        super().fit_partial(
            interactions,
            user_features,
            item_features,
            sample_weight,
            epochs,
            num_threads,
            verbose,
        )

        return self

    def _resize(self, interactions, user_features=None, item_features=None):
        """Swap the model's parameter arrays for larger ones when the
        incoming data needs more user/item feature rows.

        The feature counts default to the shape of ``interactions`` and are
        overridden by the last dimension of ``user_features`` /
        ``item_features`` when those expose a ``shape``. Returns ``self``.
        """
        user_feature_count, item_feature_count = interactions.shape  # default

        if hasattr(user_features, "shape"):
            user_feature_count = user_features.shape[-1]
        if hasattr(item_features, "shape"):
            item_feature_count = item_features.shape[-1]

        needs_resize = (
            user_feature_count != self.user_embeddings.shape[0]
            or item_feature_count != self.item_embeddings.shape[0]
        )
        if not needs_resize:
            return self

        # A freshly initialised clone supplies arrays of the target shape.
        template = clone(self)
        template._initialize(
            self.no_components, item_feature_count, user_feature_count
        )

        # Same attributes, in the same order, as the original explicit list:
        # the six item_* arrays first, then the six user_* arrays.
        suffixes = (
            "embeddings",
            "embedding_gradients",
            "embedding_momentum",
            "biases",
            "bias_gradients",
            "bias_momentum",
        )
        for side in ("item", "user"):
            for suffix in suffixes:
                name = side + "_" + suffix
                current = getattr(self, name)
                # Region of the new array that the current values occupy.
                region = tuple(slice(None, extent) for extent in current.shape)
                enlarged = getattr(template, name)
                enlarged[region] = current
                setattr(self, name, enlarged)

        return self

In [432]:
# Load the product catalogue; the first CSV column becomes the DataFrame index.
df = pd.read_csv('data.csv',index_col=0)
# Display the dtype of every column.
df.dtypes

ProductTitle    object
Image           object
Price            int64
cluster          int64
dtype: object

In [433]:
def get_user_features():
    """Return the vocabulary of user feature labels.

    The list starts with the three ``sex:`` labels and continues with
    ``age:0`` through ``age:99``.
    """
    sex_labels = ['sex:M', 'sex:F', 'sex:P']
    age_labels = ["age:" + str(age) for age in range(100)]
    return sex_labels + age_labels
# Two placeholder users, present only so the pipeline can be exercised.
users = [
    {'id': 1, 'age': 19, 'sex': 'F'},
    {'id': 2, 'age': 40, 'sex': 'M'},
]
def get_item_features():
    """Return every item feature label found in the module-level ``df``.

    One ``cluster:<value>`` label per distinct ``cluster`` value comes first,
    followed by one ``price:<value>`` label per distinct ``Price`` value
    (each price cast to ``int``).
    """
    cluster_labels = ["cluster:" + str(c) for c in df['cluster'].unique()]
    price_labels = ["price:" + str(int(p)) for p in df['Price'].unique()]
    return cluster_labels + price_labels
# Single seed interaction, later passed to dataset.build_interactions.
# Presumably (user_id, item_id, weight) — confirm against the LightFM Dataset docs.
dummy_user_interaction = [(1,0,2)]

#### Generate Dataset for Model

In [434]:
# Register the known user ids, the item ids (df.index) and both feature
# vocabularies with a fresh LightFM Dataset.
dataset = Dataset()
user_features = get_user_features()
item_features = get_item_features()
dataset.fit([x['id'] for x in users],df.index,user_features=user_features,item_features=item_features)

In [435]:
# Shape of the interaction matrix as the dataset stands right now.
# n_items is captured here and reused later by new_user_recommend().
n_users, n_items = dataset.interactions_shape()
n_users, n_items

(2, 477893)

In [436]:
# Snapshot of the dataset's four mappings (user ids, user features,
# item ids, item features); taken again further down once features are built.
user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()

In [437]:
def get_user_interaction(userId:int, it=None):
    """Build one ``(userId, item, value)`` triple for every item in ``df.index``.

    Args:
        userId: value placed in the first slot of every triple.
        it: optional mapping from item index label to interaction value.
            Items absent from the mapping get the value 0. ``None`` (the
            default) means "no recorded interactions".

    Returns:
        A list of triples, in ``df.index`` order.
    """
    # A None sentinel replaces the former shared ``dict()`` default so that no
    # mutable object is shared between calls.
    if it is None:
        it = {}
    return [(userId, x, it[x] if x in it else 0) for x in df.index]
def gen_user_feature(user: object):
    """Turn a user record into ``(user id, [feature labels])``.

    Every key except ``'id'`` yields one ``"<key>:<value>"`` label, in the
    record's own key order.
    """
    labels = [key + ':' + str(user[key]) for key in user if not key == 'id']
    return (user['id'], labels)
def gen_item_feature(id, item: object):
    """Turn a catalogue row into ``[id, [cluster label, price label]]``.

    Args:
        id: identifier stored in the first slot of the result.
        item: row-like object indexable by ``'cluster'`` and ``'Price'``.

    Returns:
        ``[id, ['cluster:<cluster>', 'price:<int(Price)>']]``.

    Raises:
        Whatever the lookups or the ``int`` conversion raise; the offending
        id and row are printed first.
    """
    try:
        cluster_label = 'cluster:' + str(item['cluster'])
        price_label = 'price:' + str(int(item['Price']))
    except Exception:
        # Say which row is malformed, then re-raise the original exception
        # untouched (bare ``raise`` keeps its traceback as-is).
        print("error in", id, item)
        raise
    return [id, [cluster_label, price_label]]

In [438]:
# Convert the seed interaction list into the interaction and weight matrices
# that are handed to model.fit below.
(interactions, weights) = dataset.build_interactions(dummy_user_interaction)

In [439]:
# One (user id, [feature labels]) entry per record in `users`. Iterating the
# whole list (instead of indexing users[0] and users[1]) keeps this correct
# if the list of seed users changes size.
nfs = [gen_user_feature(user) for user in users]
ufs = dataset.build_user_features(nfs)

In [440]:
# Pair every catalogue row with its cluster/price labels, then let the
# dataset encode them as the item feature matrix.
feats = [gen_item_feature(item_id, df.loc[item_id]) for item_id in df.index]
ifs = dataset.build_item_features(feats)
# get row 
# [(x,['cluster:'+str(df.loc[x]['cluster'])]) for x in df.index]

In [441]:
# Re-read the dataset mappings and invert the item-id one, so a value from
# item_id_map can be turned back into its original key.
user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()
id_item_map = {mapped: original for original, mapped in item_id_map.items()}
# user_feature_map

#### Train Model

In [466]:
# WARP-loss model, fitted on the seed interaction together with the user and
# item feature matrices built above.
model = LightFMResizable(loss='warp')
model.fit(interactions,user_features=ufs,item_features=ifs,sample_weight=weights)

<__main__.LightFMResizable at 0x7f3b0e980610>

#### Predict

In [467]:
# helper function to add and update user model
# add interaction data for old users
# recommend() scores only the item indices 0 .. LIMIT-1 held in all_items.
LIMIT = 10000
all_items = np.arange(LIMIT)
def update_user(user_id, item_id, count):
    """Feed one additional interaction into the module-level ``model``.

    Builds an interaction/weight matrix pair holding only
    ``(user_id, item_id, count)``, rebuilds the user feature matrix from the
    module-level ``nfs`` list, and calls ``model.fit_partial`` with them and
    the existing item feature matrix ``ifs``.
    """
    single_interaction = [(user_id, item_id, count)]
    new_interactions, new_wts = dataset.build_interactions(single_interaction)
    refreshed_ufs = dataset.build_user_features(nfs)
    model.fit_partial(
        new_interactions,
        user_features=refreshed_ufs,
        item_features=ifs,
        sample_weight=new_wts,
    )
# register a new user with the dataset and remember their features
def add_user(user):
    """Make ``user['id']`` known to ``dataset`` and append the user's
    ``(id, [feature labels])`` entry to the module-level ``nfs`` list."""
    dataset.fit_partial(users=[user['id']])
    nfs.append(gen_user_feature(user))
def get_item(id):
    """Return the row of the module-level ``df`` whose index label is ``id``."""
    item_row = df.loc[id]
    return item_row
# get recommendation for existing user
def recommend(userId):
    """Return the four best-scoring catalogue rows for an existing user.

    Only the item indices in ``all_items`` are scored.

    Args:
        userId: user identifier passed straight to ``model.predict``.
            Presumably this must be LightFM's internal user index (see
            ``user_id_map``) rather than the external id — TODO confirm.

    Returns:
        A list of four ``df`` rows, best score first.
    """
    # NOTE(review): the model is fitted with user_features/item_features, but
    # predict is called without them here — confirm that is intended.
    scores = model.predict(userId, all_items)
    # Rank first and keep four indices, so only those four rows are fetched
    # from df (previously every scored item was looked up and then discarded).
    top_indices = np.argsort(-scores)[:4]
    return [get_item(id_item_map[x]) for x in top_indices]
# get recommendation for new user who has not interacted with any item
def format_newuser_input(user_feature_map, user_feature_list):
    """Encode a list of feature labels as a single-row sparse matrix.

    The row has ``len(user_feature_map)`` columns; the column that
    ``user_feature_map`` assigns to each known label is set to 1.0. Labels
    missing from the map are reported on stdout and otherwise ignored.
    """
    normalised_val = 1.0
    matched_columns = []
    for label in user_feature_list:
        try:
            column = user_feature_map[label]
        except KeyError:
            print("new user feature encountered '{}'".format(label))
        else:
            matched_columns.append(column)

    dense_row = np.zeros(len(user_feature_map))
    for column in matched_columns:
        dense_row[column] = normalised_val
    return sparse.csr_matrix(dense_row)
def new_user_recommend(user):
    """Return the four best-scoring catalogue rows for a user described only
    by features.

    Args:
        user: user record; ``gen_user_feature`` reads ``user['id']`` and
            turns every other key into a feature label.

    Returns:
        A list of four ``df`` rows, best score first.
    """
    new_user_features = gen_user_feature(user)[-1]
    new_ufs = format_newuser_input(user_feature_map, new_user_features)
    # new_ufs has a single row, so user index 0 selects this user's features.
    # NOTE(review): item_features is not passed although the model is fitted
    # with it — confirm that is intended.
    scores = model.predict(0, np.arange(n_items), user_features=new_ufs)
    # Rank first and keep four indices, so only those four rows are fetched
    # from df instead of one row per catalogue item.
    top_indices = np.argsort(-scores)[:4]
    return [get_item(id_item_map[x]) for x in top_indices]

    

#### Performance Metric for Recommendation System

Generate train and test sets based on the user type, then calculate the performance metrics for the recommendation system.

In [468]:
import random
# NOTE(review): SIZE is not referenced anywhere else in the visible notebook.
SIZE = 20
def gen_interaction(userId,ids):
    """Return one ``(userId, item id, 1)`` triple per entry of ``ids``,
    in the same order."""
    return [(userId, item_id, 1) for item_id in ids]
def gen_user(start,end,age,gender):
    """Register users with ids ``start`` .. ``end`` (both inclusive), all
    sharing the given ``age`` and ``gender``, and return the list of ids."""
    new_ids = list(range(start, end + 1))
    for new_id in new_ids:
        add_user({'id': new_id, 'age': age, 'sex': gender})
    return new_ids

In [469]:
# Search terms, split by user type and by train/test role.
# NOTE(review): 'handbags' appears in both the female train and test terms.
male_train_terms = ['boys shoes', 'helmet', 'watches for men', 'cool hoodies for boys']
male_test_terms = ['men\'s  dress', 'men\'s shorts', 'men\'s sunglasses',' classic watches ']

female_train_terms = ['women\'s dresses', 'high heels', 'handbags', 'makeup']
female_test_terms = ['women\'s winter coats', 'women\'s heels','girls top','handbags']
# Item ids obtained from the search terms above with the search-based algorithm.
# NOTE(review): male_train_ids and male_test_ids share the ids 27524 and 58641.
male_train_ids = [173357,333098,380969,451515,451675,452551,460693,468030,1738,2483,394585,10312,10324,10343,10347,10355,10426,10432,10448,10452,377213,521045,521107,17604,26679,27524,58641,333165,336905,336952,236116,329796,551034,551047,23547,29191,331641,393513,550825,28797]
male_test_ids =  [117825,138282,147046,532784,533296,533545,540792,542829,543907,547486,113440,368557,544612,546121,546208,116440,532522,532744,533046,533168,105169,109310,112581,362566,379966,416600,417115,423692,425726,430281,27524,42476,58641,333486,333557,333794,333829,336661,336783,336906]

# NOTE(review): female_test_ids is element-for-element identical to
# female_train_ids, so the female "test" items are the training items —
# confirm whether a separate test list was intended.
female_train_ids = [109973,265312,416392,530618,534157,544407,547492,539016,541736,545724,4138,9336,9403,24500,49995,126825,192874,192898,193222,193689,27584,38679,41065,45072,46754,49846,50199,50505,51642,51848,26757,58663,371812,371900,371968,372322,372371,372735,372753,372780]
female_test_ids = [109973,265312,416392,530618,534157,544407,547492,539016,541736,545724,4138,9336,9403,24500,49995,126825,192874,192898,193222,193689,27584,38679,41065,45072,46754,49846,50199,50505,51642,51848,26757,58663,371812,371900,371968,372322,372371,372735,372753,372780]


In [470]:
# adding users
AGE = 20
# gen_user() treats both bounds as inclusive, so every group below holds
# cap + 1 users and consecutive groups must not share a boundary id.
train_cap = 5000 # size parameter of each training group
m_train_uids = gen_user(10,10+train_cap,AGE,'M')
f_train_uids = gen_user(10+2*train_cap,10+3*train_cap,AGE,'F')
test_cap = 2000 # size parameter of each test group
# Each test group starts one id past the last id of the group before it.
# Previously the boundary ids were shared: one user was in both the female
# train and test groups, and one was registered as both 'F' and 'M'.
f_test_uids = gen_user(10+3*train_cap+1,10+3*train_cap+1 + test_cap,AGE,'F')
m_test_uids = gen_user(10+3*train_cap + test_cap+2,10+3*train_cap + 2*test_cap+2,AGE,'M')


In [471]:
# Every male user interacts with every male item id and every female user
# with every female item id; male users are emitted first in both lists.
train_it = []
for uids, item_ids in ((m_train_uids, male_train_ids), (f_train_uids, female_train_ids)):
    for uid in uids:
        train_it.extend(gen_interaction(uid, item_ids))

test_it = []
for uids, item_ids in ((m_test_uids, male_test_ids), (f_test_uids, female_test_ids)):
    for uid in uids:
        test_it.extend(gen_interaction(uid, item_ids))


In [472]:
# Encode both interaction lists; the weights of the test split are discarded.
train_interaction,train_wts = dataset.build_interactions(train_it)
test_interaction,_ = dataset.build_interactions(test_it)

In [473]:
# Rebuild the user feature matrix now that the generated users are in nfs.
ufs = dataset.build_user_features(nfs)
# Read the shape straight off the sparse matrix; densifying it first would
# allocate every zero entry only to report the same tuple.
ufs.shape

(4003, 4106)

Evaluate the model performance based on the following metrics:
- ROC AUC : Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores.
- Precision@k : Precision at k (P@k) metric computes the percentage of recommended items in the top-k list that are relevant to the user.
- Recall@k : Recall at k (R@k) metric computes the percentage of relevant items that are recommended in the top-k list.

In [474]:
# auc
# Fit on the training interactions, then score both splits and report the
# mean AUC of each.
model.fit(train_interaction,user_features=ufs,item_features=ifs,sample_weight=train_wts,epochs=50)
train_auc = auc_score(model,train_interaction,user_features=ufs,item_features=ifs).mean()
test_auc = auc_score(model,test_interaction,user_features=ufs,item_features=ifs).mean()
print("Train AUC: %.2f" % (train_auc))
# This line reports the test split; it was previously labelled "Train AUC".
print("Test AUC: %.2f" % (test_auc))

Train AUC: 1.00
Train AUC: 0.81


In [475]:
# precision@k, averaged over users; k is left at the library default.
train_precision = precision_at_k(
    model, train_interaction, user_features=ufs, item_features=ifs
).mean()
test_precision = precision_at_k(
    model, test_interaction, user_features=ufs, item_features=ifs
).mean()
print("Train Precision @k ", train_precision)
print("Test Precision @k ", test_precision)

Train Precision @k  0.9891109
Test Precision @k  0.59995


In [476]:
# recall@k, averaged over users; k is left at the library default.
train_recall = recall_at_k(
    model, train_interaction, user_features=ufs, item_features=ifs
).mean()
test_recall = recall_at_k(
    model, test_interaction, user_features=ufs, item_features=ifs
).mean()
print("Train recall @k %.2f" % train_recall)
print("Test recall @k ", test_recall)

Why are the metrics so low?
Mainly due to the test data, which was generated from fixed parameters and may not do justice to the model's performance.

#### Misc
save data for server

In [None]:
import pickle
# pickle both the model and the dataset
# pickle.dump(model, open('model.pkl','wb'),protocol=pickle.HIGHEST_PROTOCOL)
# pickle.dump(dataset, open('dataset.pkl','wb'),protocol=pickle.HIGHEST_PROTOCOL)
# pickle.dump(nfs, open('nfs.pkl','wb'),protocol=pickle.HIGHEST_PROTOCOL)
# pickle.dump(ifs, open('ifs.pkl','wb'),protocol=pickle.HIGHEST_PROTOCOL)
# pickle.dump(user_feature_map, open('user_feature_map.pkl','wb'),protocol=pickle.HIGHEST_PROTOCOL)
# pickle.dump(id_item_map, open('id_item_map.pkl','wb'),protocol=pickle.HIGHEST_PROTOCOL)
