### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import re
import os
import pickle
import random
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm.evaluation import auc_score, precision_at_k
from scipy.sparse import csr_matrix
from lightfm.cross_validation import random_train_test_split
from tqdm import tqdm 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



### Import Data

In [2]:
movie = pd.read_pickle('../assets/movie.pkl')
rating = pd.read_pickle('../assets/new_rating.pkl')

select number of rows in the rating dataset due to a computation issue

### Config

Setting up config for the light fm model

In [3]:
config = {}

SEED = 42
USER_ALPHA = 1e-6
THREAD = 32
COMPONENTS = 20
EPOCHS = 50
NUM_ROWS = 3000000
LOSS = 'warp'

config['seed'] = SEED
config['user_alpha'] = USER_ALPHA
config['thread'] = THREAD
config['components'] = COMPONENTS
config['epochs'] = EPOCHS
config['num_rows'] = NUM_ROWS
config['loss'] = LOSS

### Helpers

In [4]:
def get_user_features(item_df, user_df):
    
    """This function will get information from users regarding their total movie reviews 
    and the genres of the movies that they reviewed.
    
    Input:
    item_df (dataframe): the original dataframe that contains movie information
    user_df (dataframe): the original dataframe that contains rating information
    
    Output:
    user_data (list): a combination of user id and their information. 
                      their information is composed of total movies they reviewed and the genre.
                      the score is then calculated based on the total movies that they have reviewed
    """
    # select features from the dataset
    item_features = ['movieId', 'clean_movie_title', 'new_genres']
    item_df = item_df[item_features]

    user_features = ['userId', 'movieId', 'rating']
    user_df = user_df[user_features]
    
    item_df = item_df.set_index('movieId')
    user_df = user_df.set_index('userId')
    
    # create a list to store the results
    user_data = []
    
    # unique user in the dataset
    unique_user = list(set(user_df.index))
    unique_genre = sorted(set(map(lambda genre: \
                                  genre.strip(),','.join(map(lambda genre: str(genre), item_df['new_genres'].values))\
                                  .split(','))))[:-1]
    
    # get user specific information
    for user in unique_user:
        user_specific_df = user_df[user_df.index == user]
        total_reviews = len(user_specific_df)
        genres = item_df.loc[user_specific_df['movieId']]['new_genres'].values
        user_genre_list = list(genres)
        user_genre_list_split = (','.join(map(lambda genre: str(genre), user_genre_list))).split(',')
        
        user_genre_score = {}
        
        # normalize the score
        for genre in unique_genre:
            user_genre_score[genre] = user_genre_list_split.count(genre)/total_reviews
    
        user_data.append([user, user_genre_score])
    
    return user_data
            

### Movie transform

In [5]:
new_rating = rating.sample(config['num_rows'])

##### User Item Interaction

In [6]:
user_movie_interaction = pd.pivot_table(new_rating, 
                                        index = 'userId', 
                                        columns='movieId', 
                                        values='rating')
user_movie_interaction.fillna(0, inplace = True)
user_movie_csr = csr_matrix(user_movie_interaction.values)



In [7]:
unique_genre = sorted(set(map(lambda genre: \
                                  genre.strip(),','.join(map(lambda genre: str(genre), movie['new_genres'].values))\
                                  .split(','))))

In [8]:
user_feats = get_user_features(movie, new_rating)

In [9]:
sorted_user = new_rating.sort_values(by = 'userId')['userId'].unique()
sorted_item = new_rating.sort_values(by = 'movieId')['movieId'].unique()

### Train Test Split

In [10]:
user_movie_csr_train, user_movie_csr_test = random_train_test_split(user_movie_csr, 0.3)

- cf

In [11]:
dataset = Dataset()
dataset.fit(sorted_user,
            sorted_item,
            item_features = ["Name"])

In [13]:
model_cf = LightFM(loss = config['loss'],
                   no_components = config['components'],
                   user_alpha = config['user_alpha'],
                   random_state = np.random.RandomState(config['seed']))

for epoch in tqdm(range(config['epochs'])):
    model_cf.fit(user_movie_csr_train, 
                 epochs = config['epochs'],
                 num_threads = config['thread']  j

100%|████████████████████████████████████████| 50/50 [1:43:54<00:00, 124.69s/it]


In [14]:
auc_train_cf = auc_score(model_cf, user_movie_csr_train).mean()
auc_test_cf = auc_score(model_cf, user_movie_csr_test).mean()
auc_train_cf, auc_test_cf

(0.9940751, 0.9726679)

In [15]:
precision_train_cf = precision_at_k(model_cf, user_movie_csr_train, k = 7).mean()
precision_test_cf = precision_at_k(model_cf, user_movie_csr_test, k = 7).mean()
precision_train_cf, precision_test_cf

(0.09427207, 0.018750219)

- hybrid

use hybrid model

In [16]:
dataset = Dataset()
dataset.fit(sorted_user,
            sorted_item,
            user_features = unique_genre,
            item_features = ["Name"])
user_features = dataset.build_user_features(user_feats,
                                            normalize=True)

In [17]:
model_hybrid = LightFM(loss = config['loss'],
                   no_components = config['components'],
                   user_alpha = config['user_alpha'])

for epoch in tqdm(range(config['epochs'])):
    model_hybrid.fit(user_movie_csr_train, 
                 user_features = user_features,
                 epochs = config['epochs'],
                 num_threads = config['thread'])

100%|████████████████████████████████████████| 50/50 [2:31:32<00:00, 181.85s/it]


In [18]:
auc_train = auc_score(model_hybrid, user_movie_csr_train, user_features = user_features).mean()
auc_test = auc_score(model_hybrid, user_movie_csr_test, user_features = user_features).mean()
auc_train, auc_test

(0.987656, 0.9809609)

In [19]:
precision_train = precision_at_k(model_hybrid, user_movie_csr_train, user_features = user_features, k = 7).mean()
precision_test = precision_at_k(model_hybrid, user_movie_csr_test, user_features = user_features, k = 7).mean()
precision_train, precision_test 

(0.084207825, 0.03900354)

### Export the results

In [22]:
movie_path = '../assets/final_movie.pkl'
rating_path = '../assets/final_rating.pkl'
model_path = '../assets/model.pkl'
user_item_path = '../assets/user_movie_interaction.pkl'
dataset_path = '../assets/dataset.pkl'
user_feats_path = '../assets/user_feats.pkl'
csr_path = '../assets/csr.pkl'

In [21]:
movie.to_pickle(movie_path)
new_rating.to_pickle(rating_path)

In [23]:
with open(model_path, 'wb') as m:
    pickle.dump(model_hybrid, m)

In [24]:
with open(user_item_path, 'wb') as u:
    pickle.dump(user_movie_interaction, u)

In [25]:
with open(dataset_path, 'wb') as d:
    pickle.dump(dataset, d)

In [26]:
with open(user_item_path, 'wb') as u:
    pickle.dump(user_feats, u)

In [28]:
with open(csr_path, 'wb') as u:
    pickle.dump(user_movie_csr, u)