# Example of SGD

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error


In [2]:
# Toy user x item rating matrix; np.nan marks a rating that was not observed.
# (np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.)
R = np.array([[4, np.nan, np.nan, 2, np.nan],
              [np.nan, 5, np.nan, 3, 1],
              [np.nan, np.nan, 3, 4, 4],
              [5, 2, 1, 2, np.nan]])

num_users, num_items = R.shape

#latent dimension k
k = 3

In [3]:
np.random.seed(123)
# Initialise P (num_users x k) and Q (num_items x k) with draws from a
# normal distribution whose standard deviation is 1/k.
P = np.random.normal(scale =1./k, size =(num_users, k))
Q = np.random.normal(scale = 1./k, size =(num_items, k))

In [4]:
P, Q

(array([[-0.36187687,  0.33244848,  0.09432617],
        [-0.50209824, -0.19286675,  0.55047885],
        [-0.80889308, -0.14297088,  0.42197875],
        [-0.28891347, -0.22629538, -0.03156966]]),
 array([[ 0.49712988, -0.21296733, -0.14799399],
        [-0.14478376,  0.73531003,  0.7289287 ],
        [ 0.33468463,  0.1287288 ,  0.24578953],
        [ 0.49691068, -0.31194462,  0.39194301],
        [-0.41796022, -0.21258383,  0.3023684 ]]))

In [5]:
#Loss RMSE

def get_rmse(R, P, Q, non_zeros):
    """Return the RMSE between the observed ratings in R and the P.Q^T prediction.

    Parameters
    ----------
    R : 2-D array of actual ratings.
    P, Q : factor matrices; the prediction is np.dot(P, Q.T).
    non_zeros : iterable of (row_idx, col_idx, rating) tuples identifying the
        observed entries of R. Only the first two fields are read here.

    Returns
    -------
    Square root of sklearn's mean_squared_error over the listed entries.
    """
    full_pred_matrix = np.dot(P, Q.T)

    # Split the observed (row, col) coordinates into parallel index lists so
    # they can be used for fancy indexing on both matrices.
    row_idx = [entry[0] for entry in non_zeros]
    col_idx = [entry[1] for entry in non_zeros]

    R_observed = R[row_idx, col_idx]
    pred_observed = full_pred_matrix[row_idx, col_idx]

    return np.sqrt(mean_squared_error(R_observed, pred_observed))

In [6]:
# Observed ratings as (user_idx, item_idx, rating) triples.
# NaN > 0 evaluates to False, so missing entries of R are skipped.
non_zeros = [(i,j,R[i,j]) for i in range(num_users) for j in range(num_items) if R[i,j] >0]

steps = 1000
learning_rate = 0.01
r_lambda = 0.01

#Update P and Q with SGD
for step in range(steps):
    for i,j,r in non_zeros:
        # prediction error for this single observed rating
        eij = r - np.dot(P[i,:],Q[j,:].T)
        # SGD update with L2 regularisation (r_lambda)
        P[i, :] = P[i,:] + learning_rate * (eij * Q[j, :] - r_lambda * P[i,:])
        Q[j, :] = Q[j,:] + learning_rate * (eij * P[i, :] - r_lambda * Q[j,:])

    # Evaluate once per pass over the data (not once per rating) and
    # report every 10th step.
    rmse = get_rmse(R,P,Q, non_zeros)

    if(step % 10) == 0:
        print(" ### iteration step : ",step," rmse : ", rmse)

In [None]:
pred_matrix = np.dot(P,Q.T)
print('predicted matrix:\n',np.round(pred_matrix, 3))

# TMDB Movie Dataset

In [None]:
import warnings; warnings.filterwarnings('ignore')

### Using Genre for Content Based Filtering 

In [None]:
movies = pd.read_csv('tmdb_5000_movies.csv')
print(movies.shape)
movies.head(1)

In [None]:
# Take an explicit copy: later cells assign new/converted columns into
# movies_df, which on a plain column slice of `movies` triggers pandas'
# SettingWithCopy behaviour.
movies_df = movies[['id', 'title', 'genres', 'vote_average','vote_count',
                   'popularity','keywords','overview']].copy()

In [None]:
movies_df.head(1)

In [None]:
pd.set_option('max_colwidth', 100)
movies_df[['genres','keywords']][:2]

In [None]:
#convert genres to list
from ast import literal_eval

movies_df['genres'] = movies_df['genres'].apply(literal_eval)
movies_df['keywords'] = movies_df['keywords'].apply(literal_eval)

In [None]:
movies_df['genres'][:1], movies_df['keywords'][:1]

In [None]:
#extract only genre name
movies_df['genres'] = movies_df['genres'].apply(lambda x: [y['name'] for y in x])
movies_df['keywords'] = movies_df['keywords'].apply(lambda x: [y['name'] for y in x])


In [None]:
movies_df[['genres','keywords']].head(1)

In [None]:
# Content Similarity Using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Join each movie's genre list into one space-separated string so it can be tokenised.
movies_df['genres_literal'] = movies_df['genres'].apply(lambda x : (' ').join(x))
# min_df=1 keeps every term that occurs in at least one document. The integer
# 0 used previously is rejected by parameter validation in recent
# scikit-learn releases (an int min_df must be >= 1).
count_vect = CountVectorizer(min_df = 1, ngram_range = (1,2))
genre_mat = count_vect.fit_transform(movies_df['genres_literal'])
print(genre_mat.shape)

In [None]:

vars(genre_mat)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
#represent genre's similarity each other
genre_sim = cosine_similarity(genre_mat, genre_mat)
print(genre_sim.shape)
genre_sim
temp = pd.DataFrame(genre_sim)
temp

In [None]:
#use argsort to find index of higher similarity row

genre_sim_sorted_idx = genre_sim.argsort()[:, ::-1]
genre_sim_sorted_idx[:2]

In [None]:
def find_sim_movie(df, sorted_idx, title, top_n =10):
    """Return the rows of df most similar to the movie(s) named `title`.

    df         : DataFrame with a 'title' column.
    sorted_idx : 2-D array; row i lists row positions ordered by similarity to row i.
    title      : exact title to look up.
    top_n      : number of leading columns of sorted_idx to take.

    The selected index block is printed before the rows are returned.
    NOTE(review): the index labels of the matching rows are used as row
    positions into sorted_idx and df.iloc, so this presumably expects df to
    carry a default 0..n-1 index -- confirm.
    """
    matches = df.loc[df['title'] == title]
    match_labels = matches.index.values

    neighbours = sorted_idx[match_labels, :top_n]
    print(neighbours)

    flat_positions = neighbours.reshape(-1)
    return df.iloc[flat_positions]

In [None]:
similar_movies = find_sim_movie(movies_df, genre_sim_sorted_idx, 'The Godfather', 10)
similar_movies[['title','vote_average']][:10]

In [None]:
movies_df[['title', 'vote_average', 'vote_count']].sort_values('vote_average', ascending = False)[:10]

In [None]:
"""
USE WEIGHTED RATING USED BY IMDB
v : vote_count
m : minimum_count
R : vote_average of each movie
C : average rating of total movie rating
"""

C = movies_df['vote_average'].mean()
m = movies_df['vote_count'].quantile(0.6) # 60th percentile of vote_count (60% of movies have this many votes or fewer)
round(C, 3) , round(m,3)

In [None]:
percentile = 0.6
m = movies_df['vote_count'].quantile(percentile)
C = movies_df['vote_average'].mean()

def weighted_vote_average(record):
    """Blend a movie's own vote_average with the global mean C.

    `record` must provide 'vote_count' and 'vote_average'. The module-level
    `m` and `C` are read as the vote-count threshold and the overall mean
    rating; the more votes a movie has relative to m, the more its own
    average counts.
    """
    votes = record['vote_count']
    rating = record['vote_average']

    own_weight = votes / (votes + m)
    prior_weight = m / (m + votes)

    return (own_weight * rating) + (prior_weight * C)

movies_df['weighted_vote'] = movies_df.apply(weighted_vote_average, axis=1)

In [None]:
movies_df.columns

In [None]:
# 'weighted_vote' was added to movies_df (not to the raw `movies` frame),
# so the ranking has to be read from movies_df.
movies_df[['title','vote_average','vote_count','weighted_vote']].sort_values('weighted_vote', ascending = False)

In [None]:
def find_sim_movie(df, sorted_idx, title, top_n =10):
    """Return up to top_n similar movies, ranked by 'weighted_vote'.

    df         : DataFrame with 'title' and 'weighted_vote' columns.
    sorted_idx : 2-D array; row i lists row positions ordered by similarity to row i.
    title      : exact title to look up.
    top_n      : maximum number of rows to return.

    The first 2*top_n similarity candidates are taken, the queried movie
    itself is removed, and the remainder is sorted by 'weighted_vote'
    (descending) before truncating to top_n rows.
    NOTE(review): index labels are used as row positions, so df presumably
    has a default 0..n-1 index -- confirm.
    """
    title_movie = df[df['title'] == title]
    title_idx = title_movie.index.values

    similar_indexes = sorted_idx[title_idx, :top_n*2]
    similar_indexes = similar_indexes.reshape(-1)

    # Delete the input title. np.isin (rather than `!=`) also works when
    # several rows share the title, where the shapes of the two arrays differ.
    similar_indexes = similar_indexes[~np.isin(similar_indexes, title_idx)]

    # With several matching rows the candidate lists can overlap; keep each
    # candidate once, preserving first-seen order.
    similar_indexes = list(dict.fromkeys(similar_indexes.tolist()))

    return df.iloc[similar_indexes].sort_values('weighted_vote', ascending = False)[:top_n]

similar_movies = find_sim_movie(movies_df, genre_sim_sorted_idx, 'The Godfather', 10)
similar_movies[['title','vote_average','weighted_vote']]

## Item Based Collaborative Filtering

In [None]:
movies = pd.read_csv('Movielens/movies.csv')
ratings = pd.read_csv('Movielens/ratings.csv')
movies.shape , ratings.shape

In [None]:
movies.head()

In [None]:
ratings.head()

In [None]:
ratings = ratings[['userId', 'movieId', 'rating']]
ratings_matrix = ratings.pivot_table('rating', index = 'userId', columns = 'movieId')
ratings_matrix.head(3)

In [None]:
rating_movies = pd.merge(ratings, movies, on ='movieId')

ratings_matrix = rating_movies.pivot_table('rating', index = 'userId', columns = 'title')

ratings_matrix = ratings_matrix.fillna(0)
ratings_matrix.head(3)

In [None]:
#Similarity between Movies

ratings_matrix_T = ratings_matrix.transpose()
ratings_matrix_T.head(3)

In [None]:
item_sim = cosine_similarity(ratings_matrix_T, ratings_matrix_T)

item_sim_df = pd.DataFrame(data = item_sim, index = ratings_matrix.columns, columns = ratings_matrix.columns)

print(item_sim_df.shape)
item_sim_df.head(3)

In [None]:
item_sim_df['Godfather, The (1972)'].sort_values(ascending=False)[:10]

In [None]:
item_sim_df['Inception (2010)'].sort_values(ascending=False)[:10]

In [None]:
def predict_rating(ratings_arr, item_sim_arr):
    """Predict ratings as similarity-weighted sums of each user's ratings.

    ratings_arr  : (users x items) rating array.
    item_sim_arr : (items x items) similarity array.

    Each predicted column is divided by the sum of absolute similarities
    taken along axis=1 of item_sim_arr.
    """
    weighted_sums = ratings_arr.dot(item_sim_arr)
    sim_totals = np.abs(item_sim_arr).sum(axis=1)
    # reshape to a single row so the division broadcasts across users
    return weighted_sums / sim_totals.reshape(1, -1)

ratings_pred = predict_rating(ratings_matrix.values, item_sim_df.values)
ratings_pred_matrix = pd.DataFrame(data=ratings_pred, index = ratings_matrix.index, columns = ratings_matrix.columns)

ratings_pred_matrix.head(3)

In [None]:
from sklearn.metrics import mean_squared_error

def get_mse(pred, actual):
    """Mean squared error over the entries where `actual` is non-zero.

    pred, actual : arrays of the same shape; zeros in `actual` are treated
    as "no rating" and excluded from the comparison.
    """
    # Locate the rated entries once and reuse the index for both arrays.
    rated = actual.nonzero()
    # sklearn's signature is (y_true, y_pred); MSE is symmetric, so the value
    # is unchanged, but the conventional order avoids confusion.
    return mean_squared_error(actual[rated].flatten(), pred[rated].flatten())

get_mse(ratings_pred, ratings_matrix.values)

In [None]:
# predict_rating weights every movie's similarity, which lowers accuracy.
# This variant only uses the n movies with the highest similarity to each item.

def predict_rating_topsim(ratings_arr, item_sim_arr, n =20):
    """Predict ratings using only the n most similar items per target item.

    ratings_arr  : (users x items) rating array.
    item_sim_arr : (items x items) similarity array.
    n            : number of most-similar items to use for each column.

    Returns an array with the same shape as ratings_arr.
    """
    pred = np.zeros(ratings_arr.shape)

    for col in range(ratings_arr.shape[1]):
        # Positions of the n largest similarities in this column, in
        # descending order. Kept as a plain 1-D index array: wrapping it in a
        # list makes NumPy return (1, n)-shaped selections.
        top_n_items = np.argsort(item_sim_arr[:, col])[:-n-1:-1]
        weights = item_sim_arr[col, :][top_n_items]

        # One matrix-vector product fills the whole column for all users.
        pred[:, col] = ratings_arr[:, top_n_items].dot(weights) / np.sum(np.abs(weights))

    return pred

In [None]:
ratings_pred = predict_rating_topsim(ratings_matrix.values, item_sim_df.values, n =20)

In [None]:
#Top 20 mse
print(get_mse(ratings_pred, ratings_matrix.values))

ratings_pred_matrix = pd.DataFrame(data=ratings_pred, index = ratings_matrix.index, columns = ratings_matrix.columns)

In [None]:
ratings_matrix.shape[1]

In [None]:
user_rating_id = ratings_matrix.iloc[9,:]
user_rating_id[user_rating_id>0].sort_values(ascending=False)[:10]

In [None]:
def get_unseen_movies(ratings_matrix, userId):
    """List the columns (movies) of ratings_matrix that `userId` has not rated.

    ratings_matrix : DataFrame indexed by user id, one column per movie.
    userId         : index label of the user.

    A movie counts as seen when the user's rating is > 0, so both 0 and NaN
    entries are reported as unseen. Column order is preserved.
    """
    user_rating = ratings_matrix.loc[userId, :]

    # A set gives O(1) membership tests instead of scanning a list for every movie.
    already_seen = set(user_rating[user_rating>0].index.tolist())

    return [movie for movie in ratings_matrix.columns.tolist() if movie not in already_seen]

In [None]:
def recommend_movie_by_userId(pred_df, userId, unseen_list, top_n =10):
    """Return the top_n highest predicted scores for `userId` among unseen_list.

    pred_df     : DataFrame of predicted ratings (users x movies).
    userId      : index label of the user.
    unseen_list : column labels to consider.
    top_n       : number of entries to keep after sorting in descending order.
    """
    candidate_scores = pred_df.loc[userId, unseen_list]
    ranked = candidate_scores.sort_values(ascending=False)
    return ranked[:top_n]

unseen_list = get_unseen_movies(ratings_matrix, 9)

recomm_movies = recommend_movie_by_userId(ratings_pred_matrix, 9, unseen_list, top_n = 20)

recomm_movies = pd.DataFrame(recomm_movies.values, index = recomm_movies.index, columns =['pred_score'])

recomm_movies

## Latent Collaborative Filtering Using Matrix Factorization (SGD)

In [None]:
#Loss RMSE

def get_rmse(R, P, Q, non_zeros):
    """Return the RMSE between the observed ratings in R and the P.Q^T prediction.

    NOTE(review): this repeats the get_rmse defined in the SGD example at the
    top of the notebook; the later definition is the one in effect after
    both cells have run.

    Parameters
    ----------
    R : 2-D array of actual ratings.
    P, Q : factor matrices; the prediction is np.dot(P, Q.T).
    non_zeros : iterable of (row_idx, col_idx, rating) tuples identifying the
        observed entries of R. Only the first two fields are read here.

    Returns
    -------
    Square root of sklearn's mean_squared_error over the listed entries.
    """
    full_pred_matrix = np.dot(P, Q.T)

    # Parallel row/column index lists for fancy indexing into both matrices.
    row_idx = [entry[0] for entry in non_zeros]
    col_idx = [entry[1] for entry in non_zeros]

    R_observed = R[row_idx, col_idx]
    pred_observed = full_pred_matrix[row_idx, col_idx]

    return np.sqrt(mean_squared_error(R_observed, pred_observed))

In [None]:
def matrix_factorization(R, K, steps=2000, learning_rate = 0.01, r_lambda = 0.01):
    """Factorise rating matrix R into P (users x K) and Q (items x K) with SGD.

    Parameters
    ----------
    R : 2-D NumPy array of ratings; only entries > 0 are used for training
        (NaN entries compare False and are skipped).
    K : number of latent factors.
    steps : number of full passes over the observed ratings.
    learning_rate : SGD step size.
    r_lambda : L2 regularisation strength.

    Returns
    -------
    (P, Q) such that np.dot(P, Q.T) approximates R on the observed entries.

    Side effects: reseeds NumPy's global RNG with 123 and prints the training
    RMSE every 10th step (computed with the notebook's get_rmse).
    """
    num_users, num_items = R.shape

    np.random.seed(123)
    P = np.random.normal(scale = 1./K, size=(num_users, K))
    Q = np.random.normal(scale = 1./K, size=(num_items, K))

    # Collect observed ratings as (user_idx, item_idx, rating) triples.
    # np.nonzero walks the mask in row-major order, i.e. the same order as a
    # nested "for i ... for j ..." scan, without a Python-level loop over
    # every cell of R.
    rows, cols = np.nonzero(R > 0)
    non_zeros = list(zip(rows.tolist(), cols.tolist(), R[rows, cols].tolist()))

    #SGD
    for step in range(steps):
        for i, j, r in non_zeros:
            # prediction error for this single observed rating
            eij = r - np.dot(P[i, :], Q[j, :].T)

            P[i, :] = P[i, :] + learning_rate * (eij * Q[j, :] - r_lambda * P[i, :])
            Q[j, :] = Q[j, :] + learning_rate * (eij * P[i, :] - r_lambda * Q[j, :])

        rmse = get_rmse(R, P, Q, non_zeros)
        if(step % 10) == 0:
            print("### iteration step : ",step," rmse : ", rmse)

    return P,Q

non_zeros = [(i,j,R[i,j]) for i in range(num_users) for j in range(num_items) if R[i,j] >0]

      


In [None]:
movies = pd.read_csv('Movielens/movies.csv')
ratings = pd.read_csv('Movielens/ratings.csv')
ratings = ratings[['userId', 'movieId', 'rating']]
ratings_matrix = ratings.pivot_table('rating', index = 'userId', columns = 'movieId')

In [None]:
ratings_matrix.head(1)

In [None]:
rating_movies = pd.merge(ratings, movies, on ='movieId')
ratings_matrix = rating_movies.pivot_table('rating', index = 'userId', columns ='title')
ratings_matrix.head(1)

In [None]:
ratings_matrix

In [None]:
P,Q = matrix_factorization(ratings_matrix.values, K=50, steps =1000, learning_rate = 0.01, r_lambda = 0.01)

pred_matrix = np.dot(P, Q.T)

In [None]:
pd.DataFrame(Q)

In [None]:
ratings_pred_matrix = pd.DataFrame(data = pred_matrix, index = ratings_matrix.index, columns = ratings_matrix.columns)

ratings_pred_matrix.head(3)

In [None]:
def get_unseen_movies(ratings_matrix, userId):
    """List the columns (movies) of ratings_matrix that `userId` has not rated.

    ratings_matrix : DataFrame indexed by user id, one column per movie.
    userId         : index label of the user.

    A movie counts as seen when the user's rating is > 0, so both 0 and NaN
    entries are reported as unseen. Column order is preserved.
    """
    user_rating = ratings_matrix.loc[userId, :]

    # A set gives O(1) membership tests instead of scanning a list for every movie.
    already_seen = set(user_rating[user_rating>0].index.tolist())

    return [movie for movie in ratings_matrix.columns.tolist() if movie not in already_seen]

In [None]:
def recommend_movie_by_userId(pred_df, userId, unseen_list, top_n =10):
    """Return the top_n highest predicted scores for `userId` among unseen_list.

    pred_df     : DataFrame of predicted ratings (users x movies).
    userId      : index label of the user.
    unseen_list : column labels to consider.
    top_n       : number of entries to keep after sorting in descending order.
    """
    candidate_scores = pred_df.loc[userId, unseen_list]
    ranked = candidate_scores.sort_values(ascending=False)
    return ranked[:top_n]

unseen_list = get_unseen_movies(ratings_matrix, 9)

recomm_movies = recommend_movie_by_userId(ratings_pred_matrix, 9, unseen_list, top_n = 20)

recomm_movies = pd.DataFrame(recomm_movies.values, index = recomm_movies.index, columns =['pred_score'])

recomm_movies

In [None]:
unseen_list = get_unseen_movies(ratings_matrix, 9)

recomm_movies = recommend_movie_by_userId(ratings_pred_matrix, 9, unseen_list, top_n=20)

recomm_movies = pd.DataFrame(data = recomm_movies.values, index = recomm_movies.index, columns = ['pred_score'])
recomm_movies