# Matrix Factorization

  1. Preparation of Dataset
  2. Modeling
  3. Model Tuning
  4. Final Model and Prediction

# Import Necessary Libraries

In [None]:
!pip install surprise
import numpy as np
import pandas as pd
pd.set_option("display.max_columns",None)
pd.set_option("display.max_rows", None)
pd.set_option("display.width", 500)
from surprise import Reader, SVD, Dataset, accuracy
from surprise.model_selection import GridSearchCV, train_test_split, cross_validate

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3163006 sha256=013c1be924e9069c6e165bc99461592d2c7ba06b7f923c8fa608e63d58d3180b
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


# Import Dataset

In [None]:
# Read the ratings CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/datasets/movie_dataset.csv"
)
df.head(5)

Unnamed: 0.1,Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,2457839,356,Forrest Gump (1994),Comedy|Drama|Romance|War,4.0,4.0,1996-08-24 09:28:42
1,2457840,356,Forrest Gump (1994),Comedy|Drama|Romance|War,7.0,4.0,2002-01-16 19:02:55
2,2457841,356,Forrest Gump (1994),Comedy|Drama|Romance|War,8.0,5.0,1996-06-05 13:44:19
3,2457842,356,Forrest Gump (1994),Comedy|Drama|Romance|War,9.0,4.0,2001-07-01 20:26:38
4,2457843,356,Forrest Gump (1994),Comedy|Drama|Romance|War,10.0,3.0,1999-11-25 02:32:02


# Preparation of Dataset

In [None]:
# Build user_movie_df: ratings pivoted with users as rows and titles as columns.
user_movie_df = df.pivot_table(
    index=["userId"],
    columns=["title"],
    values=["rating"],
)

# Tell surprise which scale the ratings move on.
# NOTE(review): confirm the data contains no ratings outside 1-5 (e.g. 0.5).
reader = Reader(rating_scale=(1, 5))

# Prepare the (user, item, rating) data in the form the surprise library expects.
data = Dataset.load_from_df(df[["userId", "movieId", "rating"]], reader)

# Modeling

In [None]:
# Split the data into training and test sets, fit the model on the training
# set, then evaluate it on the test set with RMSE.
# A fixed random_state is passed to both the split and the SVD model so the
# reported RMSE can be reproduced after a kernel restart.
train_data, test_data = train_test_split(data, test_size=0.25, random_state=42)
svd_model = SVD(random_state=42)
svd_model.fit(train_data)
predictions = svd_model.test(test_data)
accuracy.rmse(predictions)

RMSE: 0.9353


0.9353372844214818

In [None]:
# Count how many rating rows each movieId has.
df.loc[:, "movieId"].value_counts()

356       66172
541       30526
4422        644
130219        1
Name: movieId, dtype: int64

In [None]:
# Show the rows belonging to userId 10.
df.loc[df["userId"] == 10]

Unnamed: 0.1,Unnamed: 0,movieId,title,genres,userId,rating,timestamp
4,2457843,356,Forrest Gump (1994),Comedy|Drama|Romance|War,10.0,3.0,1999-11-25 02:32:02


In [None]:
# Ask the baseline model for its estimated rating of item 356 by user 10.
svd_model.predict(
    uid=10,
    iid=356,
    verbose=True,
)

user: 10         item: 356        r_ui = None   est = 3.68   {'was_impossible': False}


Prediction(uid=10, iid=356, r_ui=None, est=3.6780627639537125, details={'was_impossible': False})

# Model Tuning

In [None]:
# Candidate values to search over for `n_epochs` and `lr_all`.
param_grid = {
    "n_epochs": [5, 10, 15, 20, 25],
    "lr_all": [0.002, 0.005, 0.007, 0.009],
}

# cv: split the data into three parts; each time build the model on two of
#     them and test it on the remaining one.
# n_jobs: -1 means use the processors at full capacity.
# joblib_verbose: whether progress reporting should be produced.
gs = GridSearchCV(
    SVD,
    param_grid,
    measures=["rmse", "mae"],
    cv=3,
    n_jobs=-1,
    joblib_verbose=True,
)
gs.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   28.3s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   39.6s finished


In [None]:
# Look up the grid search's best score for the RMSE measure.
gs.best_score["rmse"]

0.9303445050910867

In [None]:
# Look up the parameter combination the grid search selected for the RMSE measure.
gs.best_params["rmse"]

{'n_epochs': 10, 'lr_all': 0.002}

# Final Model and Prediction

In [None]:
# Build the final model from the RMSE-selected parameters and fit it on the
# full training set built from all of `data`.
# The trainset is bound to its own name rather than overwriting `data`:
# rebinding `data` would turn it from a Dataset into a Trainset, so re-running
# this cell (or any earlier cell that expects a Dataset) would fail.
svd_model_final = SVD(**gs.best_params["rmse"])
full_trainset = data.build_full_trainset()
svd_model_final.fit(full_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f5574dc1030>

In [None]:
# Show user 10's recorded rows again, for comparison with the prediction below.
df.loc[df["userId"] == 10]

Unnamed: 0.1,Unnamed: 0,movieId,title,genres,userId,rating,timestamp
4,2457843,356,Forrest Gump (1994),Comedy|Drama|Romance|War,10.0,3.0,1999-11-25 02:32:02


In [None]:
# Ask the final model for its estimated rating of item 356 by user 10.
svd_model_final.predict(
    uid=10,
    iid=356,
    verbose=True,
)

user: 10         item: 356        r_ui = None   est = 4.03   {'was_impossible': False}


Prediction(uid=10, iid=356, r_ui=None, est=4.026772879034854, details={'was_impossible': False})