In [5]:
import pandas as pd
import numpy as np
from random import randrange

In [6]:
def Split_Model(file_name, k):
    
    def rating_mat(file_name):
        df = pd.read_csv(file_name)
        df_pivot = df.pivot(index='userId', columns='movieId', values='rating')
        mat = df_pivot.values
        return mat
    
    def dataSplit(mat, split = k):
        n = split
        all_indices = np.argwhere(~np.isnan(mat))
        np.random.shuffle(all_indices)
        train_ind, test_ind = all_indices[:int(n*len(all_indices))], all_indices[int(n*len(all_indices))+1:]
        train = np.empty(mat.shape)
        test = np.empty(mat.shape)
        train[:] = np.nan
        test[:] = np.nan
        for i in train_ind:
            train[i[0], i[1]] = mat[i[0], i[1]]
        for j in test_ind:
            test[j[0], j[1]] = mat[j[0], j[1]]
        
        return train, test
    
    def trainingModel(train):
        '''Learn the vectors p_u and q_i for users and items'''

        n_factors = 15  # number of factors
        alpha = .01  # learning rate
        n_epochs = 10  # number of iteration of the SGD procedure

    # Randomly initialize the user and item factors.
        p = np.random.normal(0, .1, (train.shape[0], n_factors))
        q = np.random.normal(0, .1, (train.shape[1], n_factors))
    
        rating_present = np.argwhere(~np.isnan(train))

        for _ in range(n_epochs):
            for i in rating_present:
                err = train[i[0],i[1]] - np.dot(p[i[0]], q[i[1]])
            # Update vectors p_u and q_i
                p[i[0]] += alpha * err * q[i[1]]
                q[i[1]] += alpha * err * p[i[0]]
            
        return p , q
    
    def testingModel(test, p, q):
        SE = 0
        rating_present_test = np.argwhere(~np.isnan(test))
        for j in rating_present_test:
            error = (test[j[0],j[1]] - np.dot(p[j[0]],q[j[1]]))**2
            SE += error
        MSE = (SE/len(rating_present_test))
        print('The MSE for the model is',k, MSE)
        return MSE
    
    mat = rating_mat(file_name)
    train, test = dataSplit(mat)
    p, q = trainingModel(train)
    testingModel(test, p, q)
    

In [7]:
for t in range(2, 10):
    k = 0.1*t
    Split_Model('movie_ratings.csv', k)

The MSE for the model is 0.2 3.414105656875951
The MSE for the model is 0.30000000000000004 2.2867490553242136
The MSE for the model is 0.4 1.884767647147273
The MSE for the model is 0.5 1.6390743646106047
The MSE for the model is 0.6000000000000001 1.510265623079752
The MSE for the model is 0.7000000000000001 1.3978398931661327
The MSE for the model is 0.8 1.37764408225474
The MSE for the model is 0.9 1.3064373812578163


In [4]:
Split_Model('movie_ratings.csv')

The MSE for the model is 1.3862230906145325


In [9]:
def KFOLD_model(file_name, k):
    def rating_mat(file_name):
        df = pd.read_csv(file_name)
        df_pivot = df.pivot(index='userId', columns='movieId', values='rating')
        mat = df_pivot.values
        return mat
    
    def trainingModelKFOLD(train_ind):
    ## Learn the vectors p_u and q_i for users and items
        n_factors = 15  # number of factors
        alpha = .01  # learning rate
        n_epochs = 10  # number of iteration of the SGD procedure
        train = np.empty(mat.shape)
    
        for i in train_ind:
            train[i[0], i[1]] = mat[i[0], i[1]]

    ## Randomly initialize the user and item factors.
        p = np.random.normal(0, .1, (train.shape[0], n_factors))
        q = np.random.normal(0, .1, (train.shape[1], n_factors))
        for _ in range(n_epochs):
            for i in train_ind:
                err = train[i[0],i[1]] - np.dot(p[i[0]], q[i[1]])
            # Update vectors p_u and q_i
                p[i[0]] += alpha * err * q[i[1]]
                q[i[1]] += alpha * err * p[i[0]]
        return p , q
    
    def testingModelKFOLD(test_ind, p, q):
        SE = 0
        test = np.empty(mat.shape)
    
        for j in test_ind:
            test[j[0], j[1]] = mat[j[0], j[1]]
    
        for j in test_ind:
            error = (test[j[0],j[1]] - np.dot(p[j[0]],q[j[1]]))**2
            SE += error
        MSE = (SE/len(rating_present))**0.5
        return MSE
    
    def cross_validation_split(dataset, folds=k):
        dataset_split = list()
        dataset_copy = list(dataset)
        fold_size = int(len(dataset) / folds)
        for i in range(folds):
            fold = list()
            while len(fold) < fold_size:
                index = randrange(len(dataset_copy))
                fold.append(dataset_copy.pop(index))
            dataset_split.append(fold)
        return dataset_split
    
    mat = rating_mat(file_name)
    rating_present = np.argwhere(~np.isnan(mat))
    kfold = cross_validation_split(rating_present)
    MSE = 0
    for i in range(len(kfold)):
        test_ind = kfold[i]
        temp = list(kfold)
        temp.pop(i)
        train_ind = list()
        for j in temp:
            for i in j:
                train_ind.append(i)
        p, q = trainingModelKFOLD(train_ind)
        MSE += testingModelKFOLD(test_ind, p, q)
    Final_MSE = MSE / len(kfold)
    print('The MSE for K-Fold is ',k, (k-1)/k, Final_MSE)

In [4]:
KFOLD_model('movie_ratings.csv')

The MSE for K-Fold is  0.5197907415708798


In [10]:
for k in range(2, 10):
    KFOLD_model('movie_ratings.csv', k)

The MSE for K-Fold is  2 0.5 0.9208249453314504
The MSE for K-Fold is  3 0.6666666666666666 0.69725904312868
The MSE for K-Fold is  4 0.75 0.5868887307234306
The MSE for K-Fold is  5 0.8 0.5192234490626204
The MSE for K-Fold is  6 0.8333333333333334 0.46928731359127224
The MSE for K-Fold is  7 0.8571428571428571 0.4328003011580859
The MSE for K-Fold is  8 0.875 0.40293676066303785
The MSE for K-Fold is  9 0.8888888888888888 0.37927397116940514
