## Load Data

In [0]:
import pandas as pd
import numpy as np

In [0]:
# Every raw input lives in one project directory. Change DATA_DIR to run the
# notebook on another machine instead of editing each read_csv call.
DATA_DIR = "/Users/Sloth/desktop/bt4222_project"

# Merged movie metadata. The "Unnamed: 0" column is dropped right after
# loading (presumably an index written by an earlier to_csv -- confirm).
movies_info = pd.read_csv(DATA_DIR + "/movie_data_merged_v1.csv").drop(columns="Unnamed: 0")

# users.dat has no header row, so column names are supplied here. Its fields
# are separated by "::"; a multi-character separator needs the python engine.
user_info = pd.read_csv(
    DATA_DIR + "/users.dat",
    sep=r"::",
    names=["user", "gender", "age", "occupation", "zip_code"],
    engine="python",
)

# Ratings table; joined with the user and movie tables in the next cell.
movie_ratings = pd.read_csv(DATA_DIR + "/training_ratings_for_kaggle_comp.csv")

In [0]:
# Attach the user attributes (gender, age, occupation, zip code) to every
# rating. The left join keeps ratings whose user has no row in user_info.
ratings_v1 = pd.merge(
    movie_ratings,
    user_info,
    how="left",
    left_on="user",
    right_on="user",
)

# Attach movie metadata. The inner join keeps only ratings whose "movie" id
# has a matching "movie_id" in movies_info.
ratings_v2 = pd.merge(
    ratings_v1,
    movies_info,
    how="inner",
    left_on="movie",
    right_on="movie_id",
)

In [0]:
# Shuffle into a NEW frame instead of overwriting ratings_v2. The cell is then
# idempotent: re-running it reproduces the same split rather than reshuffling
# an already-shuffled frame.
ratings_shuffled = ratings_v2.sample(frac=1, axis=0, random_state=1).reset_index(drop=True)

n_rows = ratings_shuffled.shape[0]
one_tenth = n_rows // 10

# Roughly 80% train / 10% validation / 10% test. The test slice also absorbs
# the few rows left over by the integer division.
train = ratings_shuffled.iloc[:8 * one_tenth]
validation = ratings_shuffled.iloc[8 * one_tenth:9 * one_tenth]
test = ratings_shuffled.iloc[9 * one_tenth:]

# The three slices must partition the shuffled frame exactly.
assert len(train) + len(validation) + len(test) == n_rows

## Set up data into surprise package formats

In [0]:
from surprise import SVD,NMF,SVDpp,Dataset,Reader
from surprise.model_selection import train_test_split

# Tell surprise that ratings lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))

# surprise expects exactly three columns, in (user, item, rating) order.
rating_columns = ['user', 'movie', 'rating']
data_1 = Dataset.load_from_df(train[rating_columns], reader)
data_2 = Dataset.load_from_df(validation[rating_columns], reader)
data_3 = Dataset.load_from_df(test[rating_columns], reader)

# Train on every training rating. build_full_trainset() replaces the earlier
# train_test_split(test_size=0.00001) workaround, which randomly (and without
# a seed) held a handful of ratings out of the training data.
trainset = data_1.build_full_trainset()

# Evaluation sets as lists of (user, movie, rating) tuples, the form accepted
# by an algorithm's test() method. This replaces train_test_split(test_size=1.0).
valset = data_2.build_full_trainset().build_testset()
testset = data_3.build_full_trainset().build_testset()


In [0]:
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score

def diagnose_reccomendation(model,data,threshold=3.5):
  """Print how well predicted ratings work as recommend / don't-recommend calls.

  A rating counts as a recommendation when it is at least ``threshold``. The
  true ratings and the model's estimates are both binarised this way and then
  compared.

  Args:
    model: fitted object whose ``test(data)`` returns prediction records with
      ``r_ui`` (true rating) and ``est`` (estimated rating) fields.
    data: test set handed straight to ``model.test``.
    threshold: minimum rating treated as "recommend". Defaults to 3.5.

  Prints, in order: the 2x2 confusion matrix (rows = actual, columns =
  predicted, each ordered [not recommended, recommended]), the accuracy, and
  the F1 score. Returns None.

  Raises:
    ValueError: if ``model.test(data)`` produces no predictions.
  """
  pred = model.test(data)
  if len(pred) == 0:
    # Without this guard the frame below has no columns and the failure
    # surfaces as an unhelpful KeyError on "r_ui".
    raise ValueError("no predictions to evaluate: model.test(data) returned nothing")
  pred_rec = pd.DataFrame(pred)
  # Vectorised comparisons instead of a per-row lambda.
  real_rec = pred_rec["r_ui"] >= threshold
  predict_rec = pred_rec["est"] >= threshold
  # Fixed labels keep the matrix 2x2 (same layout) even when a small subset
  # happens to contain only one class.
  print(confusion_matrix(real_rec, predict_rec, labels=[False, True]))
  print(accuracy_score(real_rec, predict_rec))
  print(f1_score(real_rec, predict_rec))
  

## SVD++

In [0]:
# SVD++ with 200 latent factors. random_state fixes the otherwise random
# initialisation so the validation metrics can be reproduced on a re-run.
svd_pp_model = SVDpp(
    verbose=True,
    n_epochs=20,
    n_factors=200,
    reg_all=0.05,
    random_state=1,
)
# NOTE: the SVD++ model takes a very long time to train.
svd_pp_model.fit(trainset)

In [0]:
# Validation-set diagnostics for the SVD++ model.
diagnose_reccomendation(model=svd_pp_model, data=valset)

[[13412  6789]
 [ 5563 22443]]
0.7437716514199183
0.7841993081519271


## SVD

In [0]:
# Baseline SVD: 20 epochs, every other hyper-parameter left at the library
# default. random_state makes the reported metrics reproducible.
svd_model_base = SVD(verbose=False, n_epochs=20, random_state=1)
svd_model_base.fit(trainset)
diagnose_reccomendation(svd_model_base, valset)

[[13365  6836]
 [ 6029 21977]]
0.7331300433546996
0.7735792604586494


## NMF

In [0]:
# NMF with default hyper-parameters. random_state makes the reported metrics
# reproducible.
nmf_model = NMF(verbose=False, random_state=1)
nmf_model.fit(trainset)
diagnose_reccomendation(nmf_model, valset)

[[13424  6777]
 [ 7076 20930]]
0.7126350944883523
0.7513506721949993


SVD++ has the best performance so far, with 74.4% accuracy on the validation set.
However, it takes much longer to train for only a marginal improvement over SVD (73.3%).

SVD seems most promising

## SVD - Tuned

In [0]:
# Raise the number of epochs and latent factors to increase the latent-vector
# dimensionality (model complexity), with reg_all=0.07 as regularisation.
# random_state makes the reported metrics reproducible.
svd_model_impv = SVD(
    verbose=False,
    n_epochs=55,
    n_factors=550,
    reg_all=0.07,
    random_state=1,
)
svd_model_impv.fit(trainset)

diagnose_reccomendation(svd_model_impv, valset)

[[13477  6724]
 [ 5513 22493]]
0.7461571970875599
0.7861524212292261


In [0]:
# Check accuracy for users with at most MAX_TRAIN_RATINGS ratings in the
# training set. Users with no training ratings at all never appear in
# value_counts(), so they are not part of this check.
MAX_TRAIN_RATINGS = 10
num_ratings = train["user"].value_counts()
few_ratings = num_ratings[num_ratings <= MAX_TRAIN_RATINGS]
few_rating_users = set(few_ratings.index)

# Keep only the validation entries whose first element (the user id) belongs
# to one of those sparsely-rated users.
valset_few_ratings = [entry for entry in valset if entry[0] in few_rating_users]
diagnose_reccomendation(svd_model_impv, valset_few_ratings)

[[ 4  3]
 [ 6 15]]
0.6785714285714286
0.7692307692307692


Although this is not too bad, the SVD model still has a cold-start problem
for new users and new movies.

In those cases, it may be a better idea to use the XGBoost prediction model.

## Final check on testset

In [0]:
# Final diagnostics for the tuned SVD model on the held-out test set.
diagnose_reccomendation(model=svd_model_impv, data=testset)

[[13388  6833]
 [ 5445 22544]]
0.7453225471893798
0.7859707840881358


In [0]:
from surprise import dump
# Persist the tuned SVD model, read it back, and repeat the test-set
# diagnostics on the restored copy to confirm the round trip.
model_path = "/Users/Sloth/Desktop/bt4222_Project/svd_model"
dump.dump(file_name=model_path, algo=svd_model_impv)

# dump.load returns a (predictions, algorithm) pair; only the algorithm is
# needed here. NOTE: surprise's dump module is pickle-based, so only load
# files from a trusted source.
restored = dump.load(model_path)
loaded_model = restored[1]
diagnose_reccomendation(loaded_model, testset)

[[13388  6833]
 [ 5445 22544]]
0.7453225471893798
0.7859707840881358
