In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the ml-100k `ub.base` / `ub.test` rating files. Both are tab-separated
# and have no header row, so the column names are supplied explicitly.
df_train = pd.read_csv(
    "../../data/ml-100k/ub.base",
    sep="\t",
    header=None,
    names=['user id', 'item id', 'rating', 'timestamp'],
)
df_test = pd.read_csv(
    "../../data/ml-100k/ub.test",
    sep="\t",
    header=None,
    names=['user id', 'item id', 'rating', 'timestamp'],
)

In [3]:
# Sanity check: (rows, columns) of the train and test rating tables.
(df_train.shape, df_test.shape)

((90570, 4), (9430, 4))

In [4]:
# User x item rating matrix for the training split: one row per 'user id',
# one column per 'item id', NaN wherever a user has not rated an item.
og_matrix = df_train.pivot(
    index='user id',
    columns='item id',
    values='rating',
)

In [5]:
# User x item rating matrix for the test split (NaN = not rated).
test_matrix = pd.pivot(
    data=df_test, index='user id', columns='item id', values='rating'
)

# Compute each split's distinct item ids once; the membership tests below
# would otherwise rebuild them for every element.
train_items = df_train['item id'].unique()
test_items = df_test['item id'].unique()

# Items rated only in the training split / only in the test split, in order
# of first appearance (same contents as before, built with one mask each).
not_present = list(train_items[~np.isin(train_items, test_items)])
not_present_og = list(test_items[~np.isin(test_items, train_items)])

# Give the test matrix exactly the training items as columns, sorted by id:
# train-only items become all-NaN float columns and test-only items are
# dropped. A single reindex replaces the concat + per-column drop + re-sort
# and keeps every column numeric instead of object dtype.
test_matrix = test_matrix.reindex(columns=np.sort(train_items))

In [6]:
# Per-item mean rating over the observed (non-NaN) training entries.
item_mean = og_matrix.mean(axis=0)

# Centre every item column on its mean; unobserved entries become 0, i.e.
# they are treated as "rated exactly at the item mean".
matrix = (og_matrix - item_mean).fillna(0)

# Reduced SVD: only the leading min(n_users, n_items) singular vectors are
# returned. The truncated reconstruction further down only ever slices the
# first k of them, so the full square factors are unnecessary work and memory.
# Note: `V` holds the right singular vectors as ROWS (numpy's `Vh`).
U, s, V = np.linalg.svd(matrix, full_matrices=False)

In [7]:
# Number of leading singular values / latent features to keep.
# (The evaluation loop further down rebinds `k` while sweeping several values.)
k = 10

In [8]:
def _masked_rmse(observed, predicted):
    """Root-mean-squared error over the entries of `observed` that are not NaN."""
    errors = np.asarray(observed, dtype=float) - predicted
    known = ~np.isnan(errors)
    if not known.any():
        # Nothing to score against; avoid a 0/0 division.
        return float("nan")
    return float(np.sqrt(np.mean(np.square(errors[known]))))


def get_results(U, s, V, k, train=None, test=None, item_means=None):
    """Evaluate a rank-`k` truncated-SVD reconstruction of the rating matrix.

    The prediction is ``U[:, :k] @ diag(s[:k]) @ V[:k, :]`` plus the per-item
    means. RMSE is computed only over entries that are present (non-NaN) in
    the corresponding rating matrix.

    Parameters
    ----------
    U, s, V : array-like
        Factors as returned by ``np.linalg.svd`` (``V`` holds the right
        singular vectors as rows).
    k : int
        Number of leading singular values / vectors to keep.
    train, test : DataFrame or 2-D array, optional
        Observed rating matrices with NaN for missing ratings. They are
        compared to the prediction positionally, so they must have the same
        shape and row/column order as the reconstruction. Default to the
        notebook-level ``og_matrix`` and ``test_matrix``.
    item_means : 1-D array-like, optional
        Per-item means added back to the reconstruction. Defaults to the
        notebook-level ``item_mean``.

    Returns
    -------
    (train_rmse, test_rmse) : tuple of float
        The same values that are printed, so callers can collect them.
    """
    # Fall back to the notebook globals so existing 4-argument calls still work.
    train = og_matrix if train is None else train
    test = test_matrix if test is None else test
    item_means = item_mean if item_means is None else item_means

    print(k)

    # We take only the k most significant features. Scaling the columns of U
    # by the singular values is equivalent to multiplying by diag(s) without
    # materialising the dense diagonal matrix.
    singular_values = np.asarray(s, dtype=float)
    rank = min(k, singular_values.shape[0])
    U_k = np.asarray(U)[:, :rank]
    V_k = np.asarray(V)[:rank, :]
    UsV = (U_k * singular_values[:rank]) @ V_k
    print(UsV.shape)

    # Undo the mean-centring; the 1-D means broadcast across all user rows.
    UsV = UsV + np.asarray(item_means, dtype=float)

    train_rmse = _masked_rmse(train, UsV)
    print("Train RMSE", train_rmse)

    test_rmse = _masked_rmse(test, UsV)
    print("Test RMSE", test_rmse)

    print(50*'=')

    return train_rmse, test_rmse

In [9]:
# Sweep several truncation ranks and print train/test RMSE for each one.
for k in (8, 10, 12, 14, 17):
    get_results(U, s, V, k)

8
(943, 1675)
Train RMSE 0.8861326668620242
Test RMSE 1.015653883191067
10
(943, 1675)
Train RMSE 0.8732436486681692
Test RMSE 1.0143116763796434
12
(943, 1675)
Train RMSE 0.8618170543911519
Test RMSE 1.0125982413804526
14
(943, 1675)
Train RMSE 0.8505886395341836
Test RMSE 1.0125974809841913
17
(943, 1675)
Train RMSE 0.8342582700722256
Test RMSE 1.0122006147967102
