### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import re
import os
import pickle
from sklearn.preprocessing import LabelEncoder
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm.evaluation import auc_score, precision_at_k
from scipy.sparse import csr_matrix, coo_matrix
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



### Import Data

In [2]:
movie = pd.read_pickle('../assets/movie.pkl')
rating = pd.read_pickle('../assets/new_rating.pkl')

FileNotFoundError: [Errno 2] No such file or directory: '../assets/new_rating.pkl'

In [4]:
num_of_rows = 3000000
new_rating = rating.sample(num_of_rows)

### Helpers

In [5]:
def get_cutoff(data, target):
    
    """This function will identify summary statistics from our dataset.
    Input:  
    data (dataframe): the original dataframe
    target (str): a column that we want to get the summary statistics
    Output:
    summaries (tuple): contains 25% and 75% from the dataset
    """
    
    # check if the values under the target are int/float
    
    value_type = data[target].dtypes
    if value_type == 'O':
        print(f"{target}'s data type is not int or float. Please check it again.'")

    
    # get the summary stats
    summary_stats = data[target].describe()
    
    low = summary_stats['25%']
    high = summary_stats['75%']
    
    # combine the results as tuple
    summaries = (low, high)
    
    return summaries

In [11]:
user_movie_interaction = pd.pivot_table(new_rating, 
                                        index = 'userId', 
                                        columns='movieId', 
                                        values='rating')
user_movie_interaction.fillna(0,inplace=True)
user_movie_csr = csr_matrix(user_movie_interaction.values)



In [12]:
n_users, n_items = user_movie_interaction.shape

In [105]:
def get_user_features(item_df, user_df):
    
    # select features from the dataset
    item_features = ['clean_movie_title', 'new_genres']
    item_df = item_df[item_features]

    user_features = ['movieId', 'rating', 'movie_popularities', 'groupped_year']
    user_df = user_df[user_features]
    
    # create a list to store the results
    user_data = []
    
    # unique user in the dataset
    unique_user = list(set(user_df.index))
    unique_genre = sorted(set(map(lambda genre: \
                                  genre.strip(),','.join(map(lambda genre: str(genre), item_df['new_genres'].values))\
                                  .split(','))))[:-1]
    
    # get user specific information
    for user in unique_user:
        user_specific_df = user_df[user_df.index == user]
        total_reviews = len(user_specific_df)
        genres = item_df.loc[user_specific_df['movieId']]['new_genres'].values
        user_genre_list = list(genres)
        user_genre_list_split = (','.join(map(lambda genre: str(genre), user_genre_list))).split(',')
        
        user_genre_score = {}
        
        # normalize the score
        for genre in unique_genre:
            user_genre_score[genre] = user_genre_list_split.count(genre)/total_reviews
    
        user_data.append([user, user_genre_score])
    
    return user_data
            

In [15]:
movie_metadata = movie.set_index('movieId')
user_metadata = new_rating.set_index('userId')

In [19]:
user_feats = get_user_features(movie_metadata, user_metadata)

In [63]:
unique_genre = sorted(set(map(lambda genre: \
                                  genre.strip(),','.join(map(lambda genre: str(genre), movie['new_genres'].values))\
                                  .split(','))))

In [64]:
dataset = Dataset(user_identity_features = False)
dataset.fit(new_rating.sort_values(by = 'userId')['userId'].unique(),
            new_rating.sort_values(by = 'movieId')['movieId'].unique(),
            user_features = unique_genre,
            item_features = ["Name"])
user_features = dataset.build_user_features(user_feats,
                                            normalize=True)

In [65]:
model_hybrid = LightFM(loss='warp')

for epoch in tqdm(range(10)):
    model_hybrid.fit(user_movie_csr, 
                     user_features=user_features,
                     epochs= 10, 
                     num_threads= 32)

100%|███████████████████████████████████████████| 10/10 [01:55<00:00, 11.54s/it]


In [25]:
unique_users = new_rating.sort_values(by = 'userId')['userId'].unique()

In [107]:
def recommend_top5(model, movie, dataset, user_id=None, new_user_feature=None, k=5):
    
    nmovie=movie.set_index('movieId')
    
    max_user_id = max(rating.userId)
    if user_id is None:
        user_id = max_user_id +1
    if user_id > max_user_id:
        for genre in unique_genre:
            if genre not in list(new_user_feature.keys()):
                new_user_feature[genre] = 0
        dataset.fit_partial(users=[user_id],user_features=unique_genre)
        new_user_feature = [user_id,new_user_feature]
        new_user_feature = dataset.build_user_features([new_user_feature],normalize=False)

    user_id_map = dataset.mapping()[0][user_id] 
    scores = model.predict(user_id_map, np.arange(n_items),user_features=new_user_feature)
    rank = np.argsort(-scores)
    selected_movie_id =np.array(list(dataset.mapping()[2].keys()))[rank]
    top_items = nmovie.loc[selected_movie_id]

        
    return top_items['clean_movie_title'][:5]

### Testing

change a bit so that we can see what they've watched and their recommended movies

In [127]:
recommend_top5(model_hybrid,
            movie,
            dataset,
            user_id = 2)

movieId
77561              Iron Man 2
89745           Avengers, The
6365     Matrix Reloaded, The
2571              Matrix, The
72998                  Avatar
Name: clean_movie_title, dtype: object

In [112]:
recommend_top5(model_hybrid,
            movie,
            dataset,
            user_id = None,
            new_user_feature= {'Comedy':1, 'Mystery': 0.5, 'Adventure':0.8})

movieId
2918           Ferris Bueller's Day Off
223                              Clerks
141                       Birdcage, The
344          Ace Ventura: Pet Detective
231     Dumb & Dumber (Dumb and Dumber)
Name: clean_movie_title, dtype: object

In [126]:
recommend_top5(model_hybrid,
            movie,
            dataset,
            user_id = None,
            new_user_feature= {'Comedy':0.6, 'Mystery': 0.5, 'Adventure':0.8})

movieId
2918           Ferris Bueller's Day Off
231     Dumb & Dumber (Dumb and Dumber)
380                           True Lies
223                              Clerks
1197                Princess Bride, The
Name: clean_movie_title, dtype: object

In [113]:
rand_user_info = {}

for genre in unique_genre:
    rand_score = round(np.random.rand(1)[0], 4)
    rand_user_info[genre] = rand_score

In [131]:
recommend_top5(model_hybrid,
            movie,
            dataset,
            user_id = None,
            new_user_feature= rand_user_info)

movieId
364           Lion King, The
68954                     Up
51662                    300
116797    The Imitation Game
115210                  Fury
Name: clean_movie_title, dtype: object