In [1]:
import random
import math
import numpy as np
import scipy.io as sio
import matplotlib
import matplotlib.pyplot as plt
from numpy.matlib import repmat
import pandas as pd
from sklearn.preprocessing import normalize

%matplotlib inline

In [2]:
def eigsort(V, eigvals): 
    # Sort the eigenvalues from largest to smallest. Store the sorted
    # eigenvalues in the column vector lambd.
    lohival = np.sort(eigvals)
    lohiindex = np.argsort(eigvals)
    lambd = np.flip(lohival)
    index = np.flip(lohiindex)
    Dsort = np.diag(lambd)
    
    # Sort eigenvectors to correspond to the ordered eigenvalues. Store sorted
    # eigenvectors as columns of the matrix vsort.
    M = np.size(lambd)
    Vsort = np.zeros((M, M))
    for i in range(M):
        Vsort[:,i] = V[:,index[i]]
    return Vsort, Dsort

# normc(M) normalizes the columns of M to a length of 1.
def normc(Mat):
    return normalize(Mat, norm='l2', axis=0)

In [3]:
# import rawdata as numpy
rawdata = pd.read_csv('data/rating_cleaned_update.csv')
rawdata = rawdata.to_numpy(dtype=int)

In [4]:
# Randomly seperate rawdata into training and testing based on the user id. 80:20 split
ids = list(set(rawdata[:,0]))
num_users = len(ids)
random.shuffle(ids)
split = int(len(ids) * 0.8)
train_id = np.sort(ids[:split])
test_id = np.sort(ids[split:])
ids = np.sort(ids)
num_train_user = len(train_id)
num_test_user = len(test_id)
print(num_users, num_train_user, num_test_user)
print(ids, train_id, test_id)

54004 43203 10801
[    3     5     7 ... 73512 73513 73515] [    3     7     8 ... 73512 73513 73515] [    5    11    21 ... 73499 73502 73511]


In [5]:
# Base on the user id split, split the dataset as well.
train_data = []
test_data = []
train_index = 0
test_index = 0
for i in rawdata:
    if i[0] > train_id[train_index] and train_index < len(train_id)-1:
        train_index += 1
    if i[0] > test_id[test_index] and test_index < len(test_id)-1:
        test_index += 1
    
    if i[0] == train_id[train_index]:
        train_data.append(i)
    elif i[0] == test_id[test_index]:
        test_data.append(i)
    else:
        print("Error " + str(i[0]))

In [6]:
# Calculate the average score for each anime in the trainning set
average_dict = {}
for i in train_data:
    if i[1] in average_dict.keys():
        average_dict[i[1]][0] += i[2]
        average_dict[i[1]][1] += 1
    else:
        average_dict[i[1]] = [i[2], 1]
for i in average_dict.keys():
    average_dict[i] = average_dict[i][0]/average_dict[i][1]
num_anime = len(average_dict)
print(num_anime)

2815


In [7]:
# Some dictionary that can look up the id with index or index with id
user_idx_to_id = {idx: i for idx, i in enumerate(train_id)}
user_id_to_idx = {i: idx for idx, i in enumerate(train_id)}
anime_idx_to_id = {idx: i for idx, i in enumerate(average_dict.keys())}
anime_id_to_idx = {i: idx for idx, i in enumerate(average_dict.keys())}
# An array with average score of each anime according to the index of anime
average_anime = np.array([average_dict[i] for idx, i in enumerate(average_dict.keys())])

In [8]:
# Make training metrix, contains users rating - mean rating, note that if a user doesn't score one anime, 
# then it by default use the average
average_arr = np.ones((num_train_user, num_anime)) * average_anime
anime_user_arr = average_arr.copy()
for i in train_data:
    anime_user_arr[user_id_to_idx[i[0]],anime_id_to_idx[i[1]]] = i[2]
anime_user_arr -= average_arr

In [None]:
# Calculate the covariance matrix
covMat = np.matmul(anime_user_arr.transpose(), anime_user_arr)
print(covMat.shape)

(2815, 2815)


In [None]:
# Calculate the eigenvalues and eigenvectors
eigvals, V = np.linalg.eig(covMat)
print(V.shape)

(2815, 2815)


In [None]:
# Sort the eigenvalues and eigenectors
V, eigvals = eigsort(V, eigvals)

In [None]:
# Use the first user to regenerate
C = np.matmul(V.transpose(), anime_user_arr[0,:])

In [None]:
# Calculate the first user's regenerated matrix
R = np.matmul(V[:,:], C[:]) + average_anime

In [None]:
# Print and see there are no difference
print(R)
print(anime_user_arr[0] + average_anime)

[8.         6.         9.         ... 7.23036649 7.18562874 7.015625  ]
[8.         6.         9.         ... 7.23036649 7.18562874 7.015625  ]


In [None]:
# Now reshuffle train data
index_list = [i for i in range(len(train_data))]
random.shuffle(index_list)

In [None]:
# Take out one rating of each user and get a new train not enough information to calculate error
train_error_arr = average_arr.copy()
train_MSE_arr = []
met = set()
for i in index_list:
    one_data = train_data[i]
    if not one_data[0] in met:
        met.add(one_data[0])
        train_MSE_arr.append(one_data)
    else:
        train_error_arr[user_id_to_idx[one_data[0]],anime_id_to_idx[one_data[1]]] = one_data[2]
train_error_arr -= average_arr
print(train_error_arr.shape)

(43203, 2815)


In [None]:
# Calculate the regenerating matrix
C = np.matmul(V.transpose(), train_error_arr[:,:].transpose())

In [None]:
# Try different number of top array, and obtain the one with smallest error 
lowest = 0
lowest_value = 2
top_arr = [1, 10, 20, 50, 100, 150, 200, 500, 1000]
# Use all the data above to regenerate
for i in top_arr:
    # Calculate all users' regenerated matrix, use only top n eigenvectors
    R = np.matmul(V[:,:i], C[:i]).transpose() + average_arr
    # Calculate the difference between newly regenerated matrix and the hidden scores, this is training error
    difference = 0
    total = 0
    for j in train_MSE_arr:
        difference += (R[user_id_to_idx[j[0]],anime_id_to_idx[j[1]]] - j[2]) ** 2
        total += 1
    mse = difference/total
    rmse = math.sqrt(mse)
    if rmse < lowest_value:
        lowest_value = rmse
        lowest = i
    print(str(i) + ": " + str(mse) + " " + str(rmse))

1: 1.9144589308332196 1.3836397402623342
10: 1.7962157210788563 1.340229726979243
20: 1.756261745298915 1.3252402594620023
50: 1.7101173594572476 1.3077145558023155
100: 1.6815110860166143 1.2967309227502113
150: 1.6834723778109735 1.2974869470676664
200: 1.6858004632275088 1.298383788880433
500: 1.7214832397192388 1.3120530628443496


In [None]:
# Now calculate the test error, first prepare dictionary that can look up in testing set
test_user_idx_to_id = {idx: i for idx, i in enumerate(test_id)}
test_user_id_to_idx = {i: idx for idx, i in enumerate(test_id)}

In [None]:
# randomly shaffle test dataset
index_list = [i for i in range(len(test_data))]
random.shuffle(index_list)

In [None]:
# Do the same as above to the test dataset
test_average_arr = np.ones((num_test_user, num_anime)) * average_anime
test_error_arr = test_average_arr.copy()
test_MSE_arr = []
met = set()
for i in index_list:
    one_data = test_data[i]
    if not one_data[0] in met:
        met.add(one_data[0])
        test_MSE_arr.append(one_data)
    else:
        test_error_arr[test_user_id_to_idx[one_data[0]],anime_id_to_idx[one_data[1]]] = one_data[2]
test_error_arr -= test_average_arr
print(test_error_arr.shape)
print(len(test_MSE_arr))

In [None]:
C = np.matmul(V.transpose(), test_error_arr[:,:].transpose())
R = np.matmul(V[:,:lowest], C[:lowest]).transpose() + test_average_arr
# Obtain test error MSE and RMSE, use only top 100 eigenvectors
difference = 0
total = 0
for i in test_MSE_arr:
    difference += (R[test_user_id_to_idx[i[0]],anime_id_to_idx[i[1]]] - i[2]) ** 2
    total += 1
mse = difference/total
rmse = math.sqrt(mse)
print(mse, rmse)

In [None]:
# Obtain variance and STD
difference = 0
total = 0
for i in test_MSE_arr:
    difference += (average_anime[anime_id_to_idx[i[1]]] - i[2]) ** 2
    total += 1
mse = difference/total
rmse = math.sqrt(mse)
print(mse, rmse)