# Matrix Factorization

  1. Preparation of Dataset
  2. Modeling
  3. Model Tuning
  4. Final Model and Prediction

## Import Necessary Libraries

In [4]:
# Install the Surprise recommender library into the notebook's environment.
# NOTE(review): the install is unpinned, so the resolved version may change between runs.
!pip install surprise
import pandas as pd
# Remove the cap on displayed columns so wide DataFrames are shown in full.
pd.set_option("display.max_columns",None)
# Surprise building blocks: rating-scale Reader, SVD algorithm, Dataset loader, accuracy metrics.
from surprise import Reader, SVD, Dataset, accuracy
# Model-selection helpers: grid search, hold-out split, and cross-validation
# (cross_validate is not called in the cells visible here).
from surprise.model_selection import GridSearchCV, train_test_split, cross_validate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Import Dataset

In [7]:
# Location of the ratings CSV (a mounted Google Drive path).
# NOTE(review): absolute, environment-specific path; adjust when running elsewhere.
DATASET_PATH = "/content/drive/MyDrive/Colab Notebooks/datasets/movie_dataset.csv"

# Read the ratings into memory and preview the first rows.
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0.1,Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,2457839,356,Forrest Gump (1994),Comedy|Drama|Romance|War,4.0,4.0,1996-08-24 09:28:42
1,2457840,356,Forrest Gump (1994),Comedy|Drama|Romance|War,7.0,4.0,2002-01-16 19:02:55
2,2457841,356,Forrest Gump (1994),Comedy|Drama|Romance|War,8.0,5.0,1996-06-05 13:44:19
3,2457842,356,Forrest Gump (1994),Comedy|Drama|Romance|War,9.0,4.0,2001-07-01 20:26:38
4,2457843,356,Forrest Gump (1994),Comedy|Drama|Romance|War,10.0,3.0,1999-11-25 02:32:02


## Preparation of Dataset

In [8]:
# Wide user x title view of the ratings (rows: userId, columns: title).
# It is not referenced again in the cells shown here.
user_movie_df = df.pivot_table(values=["rating"], index=["userId"], columns=["title"])

# Surprise expects a three-column frame in the order: user, item, rating.
ratings_long = df[["userId", "movieId", "rating"]]

# NOTE(review): the scale is declared as 1..5; confirm the data holds no
# ratings outside that range (for example half-star values below 1).
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_long, reader)

## Modeling

In [11]:
# Hold-out evaluation of an SVD model with default hyper-parameters.
# The same fixed seed is given to the split and to the model so that a
# Restart & Run All reproduces the same partition, the same factor
# initialisation, and therefore the same RMSE.
SEED = 42

# 75 % of the ratings are used for fitting; 25 % are held out for testing.
trainset, testset = train_test_split(data, test_size=0.25, random_state=SEED)

svd_model = SVD(random_state=SEED)
svd_model.fit(trainset)

# Predict the held-out ratings and report the root-mean-squared error.
predictions = svd_model.test(testset)
accuracy.rmse(predictions)

RMSE: 0.9317


0.9317394515089114

In [25]:
# Show every rating row recorded for user 1.
is_user_1 = df["userId"] == 1
df.loc[is_user_1]

Unnamed: 0.1,Unnamed: 0,movieId,title,genres,userId,rating,timestamp
66172,3612352,541,Blade Runner (1982),Action|Sci-Fi|Thriller,1.0,4.0,2005-04-02 23:30:03


In [16]:
# Estimate user 1's rating for movie 541 with the hold-out model.
# No true rating (r_ui) is supplied, so the verbose line reports r_ui = None.
user_id, movie_id = 1.0, 541
svd_model.predict(uid=user_id, iid=movie_id, verbose=True)

user: 1.0        item: 541        r_ui = None   est = 4.02   {'was_impossible': False}


Prediction(uid=1.0, iid=541, r_ui=None, est=4.024221737267684, details={'was_impossible': False})

## Model Tuning

In [17]:
# Hyper-parameter grid: 5 epoch counts x 4 learning rates = 20 combinations.
param_grid = dict(
    n_epochs=[5, 10, 15, 20, 25],
    lr_all=[0.002, 0.005, 0.007, 0.009],
)

# 3-fold cross-validated search over the grid, scored on RMSE and MAE,
# using all available cores (n_jobs=-1) and printing joblib progress.
gs = GridSearchCV(
    SVD,
    param_grid,
    measures=["rmse", "mae"],
    cv=3,
    n_jobs=-1,
    joblib_verbose=True,
)
gs.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   45.3s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.1min finished


In [18]:
# Best RMSE found by the grid search.
best_rmse = gs.best_score["rmse"]
best_rmse

0.930617829392412

In [19]:
# Hyper-parameter combination that produced the best RMSE.
best_rmse_params = gs.best_params["rmse"]
best_rmse_params

{'n_epochs': 5, 'lr_all': 0.005}

## Final Model and Prediction

In [20]:
# Refit SVD with the grid-search winner (best RMSE) on every available rating.
# A fixed seed makes the factor initialisation, and hence the fitted model,
# reproducible across runs.
svd_model_final = SVD(**gs.best_params["rmse"], random_state=42)

# Build the full trainset under its own name instead of rebinding `data`:
# `data` stays a surprise Dataset, so this cell and the grid-search cell
# can be re-run without failing on a Trainset.
full_trainset = data.build_full_trainset()
svd_model_final.fit(full_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f8aa617f160>

In [24]:
# Show every rating row recorded for user 1 (reference for the prediction below).
is_user_1 = df["userId"] == 1
df.loc[is_user_1]

Unnamed: 0.1,Unnamed: 0,movieId,title,genres,userId,rating,timestamp
66172,3612352,541,Blade Runner (1982),Action|Sci-Fi|Thriller,1.0,4.0,2005-04-02 23:30:03


In [21]:
# Estimate user 1's rating for movie 541 with the tuned final model.
# No true rating (r_ui) is supplied, so the verbose line reports r_ui = None.
user_id, movie_id = 1.0, 541
svd_model_final.predict(uid=user_id, iid=movie_id, verbose=True)

user: 1.0        item: 541        r_ui = None   est = 4.25   {'was_impossible': False}


Prediction(uid=1.0, iid=541, r_ui=None, est=4.246023844796393, details={'was_impossible': False})