In [None]:
import pandas as pd
import numpy as np

from surprise import NormalPredictor, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise import SVD, SVDpp, NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise import Dataset
from surprise.reader import Reader
from surprise.model_selection import cross_validate

# Evaluation settings shared by the cross-validation runs in this notebook.
measure_set = ['RMSE', 'MSE', 'MAE']  # accuracy measures computed on every fold
cv_num = 5                            # number of cross-validation folds

# MovieLens-100k ratings shipped with surprise (downloaded if needed).
data = Dataset.load_builtin('ml-100k')
data


In [None]:
import pyodbc
import pandas as pd
# Load the student/course ratings from a local SQL Server database and wrap
# them in a surprise Dataset bound to `df` (the name later cells evaluate on).
db = "UCM"
table = "dbo.[student_course_dataset]"

# The identifiers above are notebook constants (not user input), so building
# the statement by concatenation is acceptable here.
query = "SELECT * FROM " + db + "." + table

conn = pyodbc.connect(
    'Driver={SQL Server};'
    'Server=DESKTOP-8LSE8HT;'
    f'Database={db};'  # reuse `db` so the query and the connection cannot drift apart
    'Trusted_Connection=yes;'
)
try:
    ratings_raw = pd.read_sql_query(query, conn)
finally:
    # Always release the connection, even if the query fails.
    conn.close()

# Reader() keeps surprise's default rating scale, and load_from_df expects
# exactly three columns in the order (user id, item id, rating).
# NOTE(review): `SELECT *` returns whatever the table holds — confirm the table
# has exactly those three columns in that order and that ratings fit the
# default scale.
reader = Reader()
df = Dataset.load_from_df(ratings_raw, reader)


### Normal Predictor
Algorithm predicting a random rating based on the distribution of the training set, which is assumed to be NORMAL.

The prediction $\hat{r}_{ui}$ is drawn from a normal distribution $\mathcal{N}(\hat{\mu}, \hat{\sigma}^2)$, where the mean $\hat{\mu}$ and the variance $\hat{\sigma}^2$ are estimated from the training data using Maximum Likelihood Estimation:

 * $\hat{\mu} = \frac{1}{|R_{train}|} \sum_{r_{ui}\epsilon R_{train}}r_{ui}$
 
 * $\hat{\sigma}^2 = \sum_{r_{ui} \epsilon R_{train} } \frac{(r_{ui} - \hat{\mu})^2}{|R_{train}|}$
 
 
For comparison, the baseline prediction (used later, e.g. by k-NN with Baseline) is $\hat{r_{ui}} = b_{ui} = \mu + b_u + b_i$, where $b_u$ is the bias of user u and $b_i$ is the bias of item i; for an unknown user, $b_u$ is assumed to be zero.

In [None]:
# Normal Predictor: cross-validate the random-rating predictor on MovieLens.
# The scores are kept in `cval`; pass verbose=True to cross_validate to also
# print them.
rs = NormalPredictor()
cval = cross_validate(algo=rs, data=data, measures=measure_set, cv=cv_num)

## k-NN Algorithms

### Base k-NN Algorithm
Predictions $\hat{r_{ui}}$ are set with:

 * $\hat{r_{ui}} = \frac{\sum_{v \epsilon N_i^k(u)} sim(u,v) * r_{vi} }{\sum_{v \epsilon N_i^k(u)} sim(u,v) }$
 
 or 
 
 * $\hat{r_{ui}} = \frac{\sum_{j \epsilon N_u^k(i)} sim(i,j) * r_{uj} }{\sum_{j \epsilon N_u^k(i)} sim(i,j) }$
  
  
### k-NN with Means Algorithm
Predictions $\hat{r_{ui}}$ are set with:

 * $\hat{r_{ui}} = \mu_u + \frac{\sum_{v \epsilon N_i^k(u)} sim(u,v) * (r_{vi} - \mu_v) }{\sum_{v \epsilon N_i^k(u)} sim(u,v) }$
 
 or 
 
 * $\hat{r_{ui}} = \mu_i + \frac{\sum_{j \epsilon N_u^k(i)} sim(i,j) * (r_{uj} - \mu_j) }{\sum_{j \epsilon N_u^k(i)} sim(i,j) }$
  
  
### k-NN with Z-Score Algorithm
Predictions $\hat{r_{ui}}$ are set with:

 * $\hat{r_{ui}} = \mu_u + \sigma_u * \frac{\sum_{v \epsilon N_i^k(u)} sim(u,v) * (r_{vi} - \mu_v)/\sigma_v }{\sum_{v \epsilon N_i^k(u)} sim(u,v) }$
 
 or 
 
 * $\hat{r_{ui}} = \mu_i + \sigma_i * \frac{\sum_{j \epsilon N_u^k(i)} sim(i,j) * (r_{uj} - \mu_j)/\sigma_j }{\sum_{j \epsilon N_u^k(i)} sim(i,j) }$
  
  
### k-NN with Baseline Algorithm
Predictions $\hat{r_{ui}}$ are set with:

 * $\hat{r_{ui}} = b_{ui} + \frac{\sum_{v \epsilon N_i^k(u)} sim(u,v) * (r_{vi} - b_{vi}) }{\sum_{v \epsilon N_i^k(u)} sim(u,v) }$
 
 or 
 
 * $\hat{r_{ui}} = b_{ui} + \frac{\sum_{j \epsilon N_u^k(i)} sim(i,j) * (r_{uj} - b_{uj}) }{\sum_{j \epsilon N_u^k(i)} sim(i,j) }$
  

In [None]:
# Cross-validate the four k-NN collaborative-filtering variants on MovieLens
# with identical neighbourhood settings, and keep every result so the variants
# can be compared (previously each run overwrote `cval` and nothing was shown).
knn_params = dict(
    k=40,     # (maximum) number of neighbours used for aggregation
    min_k=1,  # minimum number of neighbours required for aggregation; the
              # fallback when too few are found differs per variant (see the
              # surprise k-NN documentation)
    # sim_options=...  # optional dictionary of options for the similarity measure
    # verbose=True
)
# KNNBaseline additionally accepts bsl_options (baseline estimate settings).

knn_algorithms = {
    "KNNBasic": KNNBasic,            # basic k-NN
    "KNNWithMeans": KNNWithMeans,    # k-NN with mean-centred ratings
    "KNNWithZScore": KNNWithZScore,  # k-NN with z-score normalised ratings
    "KNNBaseline": KNNBaseline,      # k-NN on top of a baseline rating
}

knn_results = {}
for knn_name, knn_class in knn_algorithms.items():
    rs = knn_class(**knn_params)
    cval = cross_validate(
        rs,
        data,
        measures=measure_set,
        cv=cv_num,
        # verbose=True
    )
    knn_results[knn_name] = cval

# One row per variant, one column per cross_validate entry, averaged over folds.
knn_summary = pd.DataFrame(
    {
        knn_name: {entry: np.mean(values) for entry, values in result.items()}
        for knn_name, result in knn_results.items()
    }
).T
knn_summary

## Matrix Factorization-based algorithms

### SVD

Intuitively, a matrix can be viewed as a composition of transformations applied to some basis, and the SVD breaks the matrix down into those transformations. In 3D, for example, a matrix may combine scaling and rotations, and the SVD decomposition recovers each of those individual components.

The SVD algorithm has several variants. When baselines are not used, this algorithm is equivalent to Probabilistic Matrix Factorization.

The prediction $\hat{r_{ui}}$ is set as:
 * $\hat{r_{ui}} = \mu + b_u + b_i + q_i^Tp_u$ 

If the user u is unknown, then the bias $b_u$ and the factors $p_u$ are assumed to be zero. The same applies for item i with $b_i$ and $q_i$ 

To estimate all the unknown values, the following regularized squared error is minimized.

 * $ \sum_{r_{ui} \epsilon R_{train}} (r_{ui} - \hat{r_{ui}})^2 + \lambda (b_i^2 + b_u^2 + ||q_i||^2 + ||p_u||^2)$
 
This minimum is calculated using a straightforward gradient descent. 

 * $b_u \leftarrow b_u + \gamma (e_{ui} - \lambda b_u)$
 * $b_i \leftarrow b_i + \gamma (e_{ui} - \lambda b_i)$
 * $p_u \leftarrow p_u + \gamma (e_{ui}*q_i - \lambda p_u)$
 * $q_i \leftarrow q_i + \gamma (e_{ui}*p_u - \lambda q_i)$
 
Where $e_{ui} = r_{ui} - \hat{r_{ui}}$. These steps are performed over all the ratings of the training set and repeated for the number of epochs given by the `n_epochs` parameter. The baselines are initialized to 0. User and item factors are randomly initialized according to a normal distribution, which can be adjusted with the `init_mean` and `init_std_dev` parameters. In addition, the learning rate $\gamma$ and the regularization term $\lambda$ can be manually adjusted; their defaults are 0.005 and 0.02 respectively.

In the unbiased version of this algorithm (`biased = False`), the prediction is simply

 * $\hat{r_{ui}} = q_i^Tp_u$

which is equivalent to Probabilistic Matrix Factorization.

### SVDpp

The SVD++ (SVDpp) algorithm is similar to the SVD algorithm but also takes implicit ratings into account. The prediction $\hat{r_{ui}}$ is set as:

 * $\hat{r_{ui}} = \mu + b_u + b_i + q_i^T(p_u + |I_u|^{\frac{-1}{2}} \sum_{j \epsilon I_u} y_j)$
 
In this variation, the $y_j$ terms are a new set of item factors that capture implicit ratings. Here, an implicit rating describes the fact that a user u rated an item j, regardless of the rating value.

Moreover if the user u is unknown, then the bias $b_u$ and the factors $p_u$ are assumed to be zero. The same applies for item i with $b_i$, $q_i$ and $y_i$.

The baselines are initialized to 0. User and item factors are randomly initialized according to a normal distribution, which can be adjusted with the parameters init_mean and init_std_dev. The learning rate and regularization terms are controlled with lr_all and reg_all.

### NMF - Non-negative Matrix Factorization

Intuitively, NMF factorizes a matrix into two matrices, where the starting matrix and both matrices from the decomposition contain only non-negative elements.

This is a collaborative filtering algorithm based on NMF, very close to SVD. The prediction $\hat{r_{ui}}$ is set as:

 * $\hat{r_{ui}} = q_i^Tp_u$

Where the user and item factors are kept positive. This is optimized using (regularized) stochastic gradient descent (SGD) with step sizes chosen to ensure that the factors stay non-negative.

At each step of the SGD procedure, the factors f of user u and item i are updated as follows:

 * $p_{uf} \leftarrow p_{uf} * \frac{\sum_{i \epsilon I_u} q_{if} * r_{ui}}{\sum_{i \epsilon I_u} q_{if} * \hat{r_{ui}} + \lambda_u |I_u| p_{uf} }$
 
 * $q_{if} \leftarrow q_{if} * \frac{\sum_{u \epsilon U_i} p_{uf} * r_{ui}}{\sum_{u \epsilon U_i} p_{uf} * \hat{r_{ui}} + \lambda_i |U_i| q_{if} }$
 
Where both lambda terms are for regularization. Note that this algorithm is highly dependent on the initial values. The user and item factors are uniformly initialized between init_low and init_high. A biased version is also available, this is done by setting the biased parameter to True.

 * $\hat{r_{ui}} = \mu + b_u + b_i + q_i^Tp_u$
 
Furthermore, the baselines are optimized in the same way as in the SVD algorithm. While yielding better accuracy, the biased version seems highly prone to overfitting, so reducing the number of factors or increasing the regularization term may help.

In [None]:
# Cross-validate the matrix-factorization models (SVD, SVD++, NMF) on MovieLens,
# printing the per-fold scores of each run.

# Settings shared by SVD and SVDpp.
sgd_params = dict(
    n_factors=100,
    n_epochs=20,
    init_mean=0,
    init_std_dev=0.1,
    lr_all=0.005,   # learning rate for all parameters
    reg_all=0.02,   # regularization term for all parameters
    verbose=False,
    # Optional per-parameter overrides:
    #   lr_bu, lr_bi, lr_pu, lr_qi     -> learning rates for bu, bi, pu, qi
    #   reg_bu, reg_bi, reg_pu, reg_qi -> regularization terms for bu, bi, pu, qi
    #   random_state                   -> used for random initialization
)

# NMF settings.
nmf_params = dict(
    n_factors=15,
    n_epochs=20,
    biased=False,
    verbose=False,
    # Optional:
    #   reg_pu, reg_qi, reg_bu, reg_bi -> regularization parameters
    #   lr_bu, lr_bi                   -> learning rate parameters
    #   init_low / init_high           -> lower / higher bound for the random
    #                                     initialization of factors
    #   random_state                   -> used for random initialization
)

factorization_specs = (
    (SVD, dict(sgd_params, biased=True)),  # the famous SVD algorithm
    (SVDpp, sgd_params),
    (NMF, nmf_params),
)

for algo_class, algo_kwargs in factorization_specs:
    rs = algo_class(**algo_kwargs)
    cval = cross_validate(rs, data, measures=measure_set, cv=cv_num, verbose=True)

## SlopeOne

This algorithm can easily be used as a performance baseline. This is because Slope One uses a simpler form of linear regression when making predictions: instead of $f(x) = ax + b$ it uses $f(x) = x + b$. This version has been shown to be much more accurate than linear regression in some instances, and it takes much less storage.

SlopeOne is a simple collaborative filtering algorithm, and its implementation in surprise is straightforward. The prediction $\hat{r_{ui}}$ is set as:

 * $\hat{r_{ui}} = \mu_u + \frac{1}{|R_i(u)|} * \sum_{j \epsilon R_i(u)} dev(i,j)$
 
$R_i(u)$ is the set of relevant items, i.e. the set of items j rated by u that also have at least one common user with i. The dev(i,j) term is defined as the average difference between the ratings of i and j, taken over the set $U_{ij}$ of users who rated both items:

 * $dev(i,j) = \frac{1}{|U_{ij}|} * \sum_{u \epsilon U_{ij}} (r_{ui} - r_{uj})$

In [None]:
# Cross-validate SlopeOne (constructed with default settings) on MovieLens
# and print the per-fold scores.
rs = SlopeOne()
cval = cross_validate(algo=rs, data=data, measures=measure_set, cv=cv_num, verbose=True)

## Co-Clustering
A collaborative filtering algorithm based on co-clustering. Users and items are assigned to clusters $C_u$ and $C_i$, and to co-clusters $C_{ui}$. The prediction $\hat{r_{ui}}$ is set with:

 * $\hat{r_{ui}} = \bar{C_{ui}} + (\mu_u - \bar{C_u}) + (\mu_i - \bar{C_i})$
 
 $\bar{C_{ui}}$ is the average rating of co-cluster $C_{ui}$, $\bar{C_u}$ is the average rating of u's cluster, and $\bar{C_i}$ is the average rating of i's cluster. If the user is unknown, the prediction is $\hat{r_{ui}} = \mu_i$. If the item is unknown, the prediction is $\hat{r_{ui}} = \mu_u$. If both the user and item are unknown, the prediction is just $\hat{r_{ui}} = \mu$.

In [None]:
# Cross-validate co-clustering on MovieLens and print the per-fold scores.
coclustering_params = dict(
    n_cltr_u=3,   # number of user clusters
    n_cltr_i=3,   # number of item clusters
    n_epochs=20,  # number of iterations of the optimization loop
    # random_state and verbose are also accepted
)
rs = CoClustering(**coclustering_params)
cval = cross_validate(algo=rs, data=data, measures=measure_set, cv=cv_num, verbose=True)

In [None]:
# Re-run a selection of the algorithms on the student/course ratings (`df`).
# For each one: print its label, build it, cross-validate it, print the scores.

# Settings shared by SVD and SVDpp.
sgd_params = dict(
    n_factors=100,
    n_epochs=20,
    init_mean=0,
    init_std_dev=0.1,
    lr_all=0.005,   # learning rate for all parameters
    reg_all=0.02,   # regularization term for all parameters
    verbose=False,
    # Optional per-parameter overrides:
    #   lr_bu, lr_bi, lr_pu, lr_qi     -> learning rates for bu, bi, pu, qi
    #   reg_bu, reg_bi, reg_pu, reg_qi -> regularization terms for bu, bi, pu, qi
    #   random_state                   -> used for random initialization
)

# (printed label, algorithm class, constructor arguments, print per-fold table?)
student_runs = (
    (
        "KNNBasic:",
        KNNBasic,
        dict(
            k=40,     # number of neighbors
            min_k=1,  # minimum number of neighbors to take into account for
                      # aggregation; when not met the global average is used
            # sim_options: a dictionary of options for the similarity measure
        ),
        False,
    ),
    ("SVD:", SVD, dict(sgd_params, biased=True), True),
    ("SVDpp:", SVDpp, sgd_params, True),
    (
        "CoClustering:",
        CoClustering,
        dict(
            n_cltr_u=3,   # number of user clusters
            n_cltr_i=3,   # number of item clusters
            n_epochs=20,  # number of iterations of the optimization loop
        ),
        True,
    ),
)

for label, algo_class, algo_kwargs, show_folds in student_runs:
    print(label)
    rs = algo_class(**algo_kwargs)
    cval = cross_validate(rs, df, measures=measure_set, cv=cv_num, verbose=show_folds)
    print(cval)

In [None]:
# SlopeOne on the student/course ratings (`df`), printing the per-fold scores.
# NOTE(review): this run uses 10 folds, whereas the other runs on `df` use
# cv_num — confirm the difference is intentional before comparing scores.
print("Slope One:")
rs = SlopeOne()
cval = cross_validate(algo=rs, data=df, measures=measure_set, cv=10, verbose=True)


