## Hybrid Recommendation System

Both Content based filtering and Collaborative filtering has its advantages and disadvantages and we can combine both these models to form a hybrid model. This can be achieved in multiple ways: either by building CF and CBF separately and then combining them or by doing them one after the other or incorporating one into the other. Here we are going to use a library called LightFM to implement the Hybrid model.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Dataset
from surprise import Reader, KNNWithMeans, SVD
from ast import literal_eval
from lightfm.data import Dataset
from lightfm.datasets import fetch_movielens
from lightfm import LightFM, cross_validation
from lightfm.evaluation import precision_at_k, auc_score, recall_at_k



Load the data into the dataframe. The data is scaled down by using the ones only in the links file.

In [3]:
df = pd.read_csv('movies_metadata.csv',low_memory=False)
ratings = pd.read_csv('ratings.csv',low_memory=False)
links = pd.read_csv('links_small.csv',low_memory=False)
links = links[links['tmdbId'].isna() == False]['tmdbId']
links_mov = pd.to_numeric(df['id'],errors="coerce")
ratings = ratings[ratings['movieId'].isin(links)]
ratings = ratings[ratings['movieId'].isin(links_mov)]
ratings = ratings.reset_index(drop=True)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1246,5.0,1425941556
4,1,1968,4.0,1425942148


In [4]:
df['id'] = pd.to_numeric(df['id'],errors="coerce")
df = df[df['id'].isin(links)]

In [5]:
movies = df.copy()
movies= df[['id','title','genres']]
mov_links = ratings[ratings['movieId'].isna() == False]['movieId']
movies = movies[movies['id'].isin(mov_links)]
movies = movies.dropna(subset=['title']).reset_index(drop=True)
movies['genres'] = [list(set([y['name'] for y in x])) for x in movies['genres'].apply(literal_eval)]
movies

Unnamed: 0,id,title,genres
0,862.0,Toy Story,"[Comedy, Animation, Family]"
1,8844.0,Jumanji,"[Adventure, Fantasy, Family]"
2,949.0,Heat,"[Drama, Thriller, Action, Crime]"
3,710.0,GoldenEye,"[Thriller, Adventure, Action]"
4,1408.0,Cutthroat Island,"[Adventure, Action]"
...,...,...,...
2431,131634.0,The Hunger Games: Mockingjay - Part 2,"[Adventure, Action, Science Fiction]"
2432,5900.0,The Dress,"[Comedy, Drama]"
2433,81704.0,Romeos,[Drama]
2434,4912.0,Confessions of a Dangerous Mind,"[Thriller, Drama, Comedy, Romance, Crime]"


In [6]:
movies[movies[['id','title']].duplicated()]

Unnamed: 0,id,title,genres
1737,5511.0,Le Samouraï,"[Drama, Thriller, Crime]"
2426,168538.0,"Nana, the True Key of Pleasure","[Drama, Comedy]"
2434,4912.0,Confessions of a Dangerous Mind,"[Thriller, Drama, Comedy, Romance, Crime]"


In [7]:
movies.drop_duplicates(['id','title'],inplace=True)
movies = movies.reset_index(drop=True)
movies

Unnamed: 0,id,title,genres
0,862.0,Toy Story,"[Comedy, Animation, Family]"
1,8844.0,Jumanji,"[Adventure, Fantasy, Family]"
2,949.0,Heat,"[Drama, Thriller, Action, Crime]"
3,710.0,GoldenEye,"[Thriller, Adventure, Action]"
4,1408.0,Cutthroat Island,"[Adventure, Action]"
...,...,...,...
2428,150540.0,Inside Out,"[Drama, Comedy, Animation, Family]"
2429,131634.0,The Hunger Games: Mockingjay - Part 2,"[Adventure, Action, Science Fiction]"
2430,5900.0,The Dress,"[Comedy, Drama]"
2431,81704.0,Romeos,[Drama]


In [8]:
dataset = Dataset()
dataset.fit(ratings['userId'], ratings['movieId'])
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 261457, num_items 2433.


In [9]:
dataset.fit_partial(items=movies['id'], item_features=movies['title'])
interactions, weights = dataset.build_interactions([tuple(i) for i in ratings.drop(['timestamp', 'rating'], axis = 1).values])
print(repr(interactions)) 

<261457x2433 sparse matrix of type '<class 'numpy.int32'>'
	with 8052909 stored elements in COOrdinate format>


In [10]:
item_features = dataset.build_item_features(((x, [y]) for x, y in zip(movies['id'], movies['title'])))
print(repr(item_features))

<2433x4835 sparse matrix of type '<class 'numpy.float32'>'
	with 4866 stored elements in Compressed Sparse Row format>


In [11]:
train, test = cross_validation.random_train_test_split(interactions, test_percentage=0.25)
model_lfm = LightFM(loss='warp')
model_lfm.fit(train, item_features=item_features, epochs=40)

<lightfm.lightfm.LightFM at 0x1e0060a7288>

In [12]:
def get_recommended_movies(model, user_ids):
    n_users, n_items = train.shape
    for user_id in user_ids:
        best = ratings[(ratings.userId == user_id) ].movieId.values 
        known_positives =  movies.loc[movies['id'].isin(best)].title.values
        scores = model.predict(user_id, np.arange(n_items))
        top_items = movies['title'][np.argsort(-scores)]
        print('User %s' % user_id)
        print(' Known positives:')
        for x in known_positives[:3]:
            print('  %s' % x)
        print(' Recommended: ')
        for x in top_items[:3]:
             print('  %s' % x)


In [13]:
get_recommended_movies(model_lfm, [3, 25, 451])

User 3
 Known positives:
  Once Were Warriors
  Sleepless in Seattle
  Reservoir Dogs
 Recommended: 
  Beverly Hills Cop III
  The Fugitive
  East of Eden
User 25
 Known positives:
  Psycho
  Night on Earth
  Titanic
 Recommended: 
  Cool Hand Luke
  Beverly Hills Cop III
  My Son the Fanatic
User 451
 Known positives:
  Beyond Rangoon
  Once Were Warriors
  Three Colors: Red
 Recommended: 
  Beverly Hills Cop III
  Angels and Insects
  Timecop


In [14]:
def print_eval(model_lfm):
    precision = precision_at_k(model_lfm, train, k=10, item_features=item_features).mean()
    auc = auc_score(model_lfm, train, item_features=item_features).mean()
    recall = recall_at_k(model_lfm, train, item_features=item_features).mean()
    
    print('AUC: %.2f.' % (auc))
    print('Precision: %.2f.' % (precision))
    print('Recall: %.2f.' % (recall))    

In [15]:
print_eval(model_lfm)

AUC: 0.98.
Precision: 0.35.
Recall: 0.33.
