In [1]:
#Libraries
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from scipy import sparse
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import correlation
from sklearn.decomposition import TruncatedSVD
import warnings
from scipy.sparse.linalg import svds


In [2]:
# Load the merged ratings file, drop the 'timestamp' and '_merge' columns,
# discard rows with any missing value, and store user ids as integers.
df = (
    pd.read_stata('Dataset_merged.dta')
    .drop(columns=['timestamp', '_merge'])
    .dropna()
    .assign(userid=lambda ratings: ratings['userid'].astype(int))
)

In [3]:
# Preview only the first rows instead of rendering the whole ratings frame.
df.head(10)

Unnamed: 0,userid,movieid,rating,title
0,55889,1,3.5,Toy Story (1995)
1,64965,1,4.5,Toy Story (1995)
2,61730,1,3.0,Toy Story (1995)
3,68941,1,5.0,Toy Story (1995)
4,34902,1,5.0,Toy Story (1995)
5,3366,1,4.0,Toy Story (1995)
6,52070,1,3.0,Toy Story (1995)
7,35235,1,4.0,Toy Story (1995)
8,55417,1,4.0,Toy Story (1995)
9,49932,1,5.0,Toy Story (1995)


In [4]:
# Random 70/30 split of the rating rows into train and test parts;
# the fixed random_state makes the partition repeatable across runs.
df_train, df_test = train_test_split(df, random_state=0, test_size=0.3)

In [5]:
# Training rating matrix, one row per movieid and one column per userid.
df_movie_features1 = df_train.pivot(index='movieid', columns='userid', values='rating')
# (movie, user) pairs with no rating in the training split are stored as 0.
df_movie_features1 = df_movie_features1.fillna(0)
# Same values in compressed-sparse-row form.
mat_movie_features_train = csr_matrix(df_movie_features1.values)


In [6]:
# Test rating matrix, one row per movieid and one column per userid.
df_movie_features2 = df_test.pivot(index='movieid', columns='userid', values='rating')
# (movie, user) pairs with no rating in the test split are stored as 0.
df_movie_features2 = df_movie_features2.fillna(0)
# Same values in compressed-sparse-row form.
mat_movie_features_test = csr_matrix(df_movie_features2.values)


**User-Based kNN method**

**Matrix Factorization**

In [12]:
# User-by-title rating matrices for the matrix-factorization section.
# Rows are userids, columns are movie titles; missing ratings become 0.

# Train
R_df_train = df_train.pivot_table(values='rating', index='userid', columns='title')
R_df_train = R_df_train.fillna(0)

# Test
R_df_test = df_test.pivot_table(values='rating', index='userid', columns='title')
R_df_test = R_df_test.fillna(0)

In [8]:
# Quick look at the first five users of the training user-by-title matrix.
R_df_train.head(n=5)

title,'Round Midnight (1986),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),*batteries not included (1987),...All the Marbles (a.k.a. The California Dolls) (1981),...And God Created Woman (Et Dieu... crÃÂ©a la femme) (1956),...And God Spoke (1993),...And Justice for All (1979),"1, 2, 3, Sun (Un, deuz, trois, soleil) (1993)",...,Zorba the Greek (Alexis Zorbas) (1964),"Zorro, the Gay Blade (1981)",Zulu (1964),Zus & Zo (2001),[Rec] (2007),eXistenZ (1999),ffolks (a.k.a. North Sea Hijack) (1980),loudQUIETloud,xXx,"ÃÂge d'or, L' (1930)"
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# De-mean the training rating matrix user by user.
# R_df_train is the user-by-title training frame; `.values` yields the
# underlying NumPy array (DataFrame.as_matrix() no longer exists in pandas >= 1.0).
R = R_df_train.values
# One mean per user (row). NOTE: unrated titles were filled with 0 when the
# frame was built, so they count towards this mean.
user_ratings_mean = np.mean(R, axis=1)
# Reshape to a column vector so each user's mean broadcasts across that row.
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

  


In [11]:
# Truncated SVD of the de-meaned rating matrix, keeping k = 2 latent factors.
# The three results are the left factors, the singular values and the right factors.
U, sigma, Vt = svds(R_demeaned, k=2)

KeyboardInterrupt: 

In [None]:
# Expand the 1-D vector of singular values into a diagonal matrix for the
# matrix products that follow. np.diag applied to a 2-D array extracts the
# diagonal instead, so only convert while sigma is still 1-D; this keeps the
# cell safe to run more than once.
if np.ndim(sigma) == 1:
    sigma = np.diag(sigma)

In [None]:
# Rebuild the rating estimates from the truncated factors and add each user's
# mean rating back (column-vector reshape so it broadcasts across the row).
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
# Label the predictions with the training matrix's userids and titles.
# Assumes U/sigma/Vt were factorized from R_df_train, so rows and columns line up.
preds_df = pd.DataFrame(
    all_user_predicted_ratings,
    index=R_df_train.index,
    columns=R_df_train.columns,
)

In [None]:
# Quick look at the predicted ratings for the first five rows.
preds_df.head(n=5)

In [None]:
#mat_movie_features_train2.shape

In [None]:
#X = mat_movie_features_train2.values.T
#X.shape

In [None]:
#SVD = TruncatedSVD(n_components = 12, random_state = 17)
#matrix = SVD.fit_transform(X)
#matrix.shape

In [None]:
#warnings.filterwarnings("ignore", category=RuntimeWarning)
#corr = np.corrcoef(matrix)
#corr.shape