### Singular Value Thresholding (testing on joke rating dataset)

##### 1. Load the dataset

In [14]:
#python
import scipy.io as sio
import numpy as np
import random
import matplotlib.pyplot as plt

# Read the ratings matrix X from the Jester data file
d_jest = sio.loadmat('jesterdata.mat')
X = d_jest['X']
# Read the new user's file: y holds the known ratings, truey the true ratings
d_new = sio.loadmat('newuser.mat')
y, true_y = d_new['y'], d_new['truey']
# Dimensions of X; expected to be m = 100 by n = 7200
m, n = X.shape

# Training rows: entries of y that are not the -99 "unknown rating" marker
train_indices = (y != -99).squeeze()
num_train = np.count_nonzero(train_indices)

# Test rows: every rating the new user has not supplied
test_indices = ~train_indices
num_test = m - num_train

# Keep only the training rows and the first 20 columns of X as features
X_data = X[train_indices, :20]
y_data = y[train_indices]
y_test = true_y[test_indices]

In [3]:
# Sanity check: show the (rows, columns) shape of the training feature matrix.
# Rows are the entries selected by train_indices; columns are the first 20
# columns sliced from X when X_data was built.
print("The training features dataset X_data:", X_data.shape)

The training features dataset X_data: (25, 20)


##### 2. Create an incomplete copy of the training dataset

In [46]:
# Work on a copy so the original training data X_data stays untouched.
X_incomplete = X_data.copy()

# Dimensions of the training matrix.
# NOTE: this rebinds `n`, which the data-loading cell used for the number of
# columns of the full matrix X.
n, p = X_incomplete.shape

# Number of entries to blank out: 20% of all entries.
# int() truncates toward zero, so a fractional count is rounded DOWN.
NUM_VAC = int(X_incomplete.size/5)
# Sentinel value that marks a vacant (missing) entry.
VAC = -99

# Choose NUM_VAC *distinct* flat positions. Sampling without replacement
# guarantees exactly NUM_VAC vacancies; drawing (i, j) independently with
# random.randint could hit the same cell more than once and leave fewer
# vacancies than intended.
# No seed is set in this cell, so the vacancy pattern differs between runs.
vacant_flat = np.array(random.sample(range(X_incomplete.size), NUM_VAC), dtype=int)
vac_rows, vac_cols = np.unravel_index(vacant_flat, (n, p))
X_incomplete[vac_rows, vac_cols] = VAC


##### 3. use singular value thresholding to complete the matrix

In [89]:
# Initial estimate: start from an all-zero n-by-p matrix and copy in every
# entry of X_incomplete that is not the VAC marker; vacancies stay at 0.
X_guess = np.zeros([n, p])
known = X_incomplete != VAC
X_guess[known] = X_incomplete[known]

In [90]:
# set iteration limit k
k = 10
# set threshold for singular values
threshold = 40
# set error tolerance
max_err = 0.5

# Mask of observed (non-vacant) entries. It does not change between
# iterations, so it is computed once outside the loop.
observed = X_incomplete != VAC

for _ in range(k):

    # SVD of the current estimate; singular values below the threshold are
    # set to zero (hard thresholding), the rest are kept unchanged.
    u, s, vh = np.linalg.svd(X_guess)
    s[s < threshold] = 0

    # Place the remaining singular values on the main diagonal of an
    # n-by-p sigma matrix so it is conformable with the full u and vh.
    new_sigma = np.zeros([n, p])
    diag = np.arange(len(s))
    new_sigma[diag, diag] = s

    # Rebuild the matrix from the thresholded decomposition.
    X_new = u @ new_sigma @ vh
    # Restore the observed entries; only the vacancies keep the values
    # produced by the reconstruction.
    X_new[observed] = X_incomplete[observed]

    # Frobenius norm of the change between consecutive estimates.
    err = np.linalg.norm(X_guess - X_new)
    print("Error reduced to ", err)

    # Store the newest estimate BEFORE testing for convergence, so X_guess
    # always holds the latest completed matrix. Breaking first would leave
    # X_guess one iteration behind X_new.
    X_guess = X_new
    if err < max_err:
        break

print("Get an predicted matrix X")

Error reduced to  21.977317204734835
Error reduced to  7.207214039346938
Error reduced to  3.1457464916538544
Error reduced to  1.5580123083340172
Error reduced to  0.8206299510602088
Error reduced to  0.44815153492995663
Get an predicted matrix X
