In [1]:
import random
import operator
import pandas as pd
import numpy as np

In [2]:
rating_path = '../data/ml-latest-small/ratings.csv'
train_path = '../data/train_set.csv'
test_path = '../data/test_set.csv'

train_frac = 0.8
latent_dim = 10
sigma = 0.5
sigma_p = 0.5
sigma_q = 0.5
mu = 0

In [3]:
rating_pd = pd.read_csv(rating_path)
num_user = np.unique(rating_pd.values[:, 0]).shape[0]
num_movie = np.unique(rating_pd.values[:, 1]).shape[0]

# df_shuffled = rating_pd.sample(frac=1).reset_index(drop=True)
# train_set = df_shuffled.iloc[:int(train_frac * len(df_shuffled))]
# train_set.to_csv(path_or_buf=train_path, index=False)
# test_set = df_shuffled.iloc[int(train_frac * len(df_shuffled)):]
# test_set.to_csv(path_or_buf=test_path, index=False)
train_rating_pd = pd.read_csv(train_path)

In [4]:
r_matrix = np.zeros((num_user, num_movie))
p_matrix = np.random.normal(mu, sigma_p, (num_user, latent_dim))
q_matrix = np.random.normal(mu, sigma_q, (num_movie, latent_dim))

movie_dic = {}
idx = 0
for movie_id in np.unique(train_rating_pd.values[:, 1]):
    movie_dic[movie_id] = idx
    idx += 1
    
movie_IDD = set(rating_pd['movieId'].unique().tolist())
train_ID = set(train_rating_pd['movieId'].unique().tolist())
not_in_training_ID = movie_IDD - train_ID    

for index, row in train_rating_pd.iterrows():
    r_matrix[int(row['userId'] - 1), movie_dic[row['movieId']]] = row['rating']
    
sorted_dic = sorted(movie_dic.items(), key=operator.itemgetter(1))


## saving the movie_ID dictionary
with open('../output/movie_indexes.csv', 'w') as f:
    f.write('matrix_index,movie_id\n')
    for movie_id, index in sorted_dic:
        f.write('{},{}\n'.format(index, int(movie_id)))  
index = len(sorted_dic)
with open('../doc/movie_indexes.csv', 'a') as f:
    for movie_id in not_in_training_ID:
        f.write('{},{}\n'.format(index, int(movie_id)))
        index += 1

In [5]:
def MSE (r, p, q):
    p_qt_matrix = np.dot(p, q.T)
    result = r - p_qt_matrix
    I = np.zeros_like(r)
    I[r != 0] = 0.5
    result = np.power(result, 2)
    result = np.multiply(I, result)
    return np.sum(result)
    
def L2_loss(sig1, sig2, matrix):
    result = np.power(matrix, 2)
    result = np.sum(result)
    constant = sig1 / float(sig2 * 2)
    return constant * result  
    
def p_q_derivative (r, p, q, sigma, sigma_p, sigma_q):
    p_qt_matrix = np.dot(p, q.T)
    residual = r - p_qt_matrix
    I = np.zeros_like(r)
    I[r != 0] = 1
    residual = np.multiply(I, residual)
    p_derivative = -1.0 * np.dot(residual, q) + sigma / sigma_p * p
    q_derivative = -1.0 * np.dot(residual.T, p) + sigma / sigma_q * q
    return p_derivative, q_derivative  

In [None]:
big_lr = 0.001
small_lr = 0.0001
cnt = 0

error = MSE(r_matrix, p_matrix, q_matrix) + L2_loss(sigma, sigma_q, q_matrix) + L2_loss(sigma, sigma_p, p_matrix)

while (error >= 238):


    if error > 300:
        learning_rate = big_lr
    else:
        learning_rate = small_lr

    p_derivative, q_derivative = p_q_derivative (r_matrix, p_matrix, q_matrix, sigma, sigma_p, sigma_q)
    p_matrix = p_matrix - learning_rate * p_derivative
    q_matrix = q_matrix - learning_rate * q_derivative
    error = MSE(r_matrix, p_matrix, q_matrix)
    cnt += 1
    if cnt % 100 == 0:
        print(error)


32586.83864467114
28078.824178005358
21988.209204592855
18827.181616596827
16880.93604299368
15595.207402978862
14705.14916796104
14062.617284524049
13579.566212043554
13203.830961291424
12903.988639013824
12660.133334815395
12458.887188787023
12290.802558104326
12148.95495861565
12028.128086828316
11924.310295668716
11834.366341764035
11755.815096465398
11686.675189038606
11625.355154820641
11570.572641035145
11521.292731230917
11476.679413843036
11436.056741819542
11398.877578977603
11364.698456855383
11333.159363442379
11303.96746319763
11276.883882577
11251.712820347924


In [None]:
print(cnt, error)

In [None]:
pd.DataFrame(p_matrix).to_csv("../output/p_matrix_A2_final_{}.csv".format(latent_dim))
pd.DataFrame(q_matrix).to_csv("../output/q_matrix_A2_final_{}.csv".format(latent_dim))