In [1]:
import time
start = time.time()

import sys
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
train_file_path = './test/u3.base'
test_file_path = './test/u3.test'
result_file_path = train_file_path + "_prediction.txt"
columns = ["user_id", "item_id", "rating", "time_stamp"]

train_df = pd.read_csv(train_file_path, names=columns, delimiter='\t')
test_df = pd.read_csv(test_file_path, names=columns, delimiter='\t')

# We are not going to use the last column "time_stamp".
train_df.drop(columns=train_df.columns[-1], inplace=True)
test_df.drop(columns=test_df.columns[-1], inplace=True)

train_user_item_matrix = train_df.pivot(index='user_id', columns='item_id', values='rating')
test_user_item_matrix = test_df.pivot(index='user_id', columns='item_id', values='rating')

In [3]:
# test 빼기
def matrix_factorization(train_matrix, test_matrix, lmbd=0.02, k=1, std_dev=0.5, epochs=100, learning_rate=0.01):
    num_of_user = len(train_matrix.index)
    num_of_item = len(train_matrix.columns)

    u = std_dev * np.random.randn(num_of_user, k) # user embedding
    v = std_dev * np.random.randn(num_of_item, k) # item embedding

    bias_user = np.zeros((num_of_user, 1))
    bias_item = np.zeros((num_of_item, 1))

    matrix_mean = np.mean(train_matrix.values[~np.isnan(train_matrix.values)])

    train_not_nan_indices = train_matrix.stack(dropna=True).index.tolist()
    test_not_nan_indices = test_matrix.stack(dropna=True).index.tolist()

    train_set = [(i, j, train_matrix.loc[i, j]) for i, j in train_not_nan_indices]
    test_set = [(i, j, test_matrix.loc[i, j]) for i, j in test_not_nan_indices]

    train_user_item_matrix_index = train_user_item_matrix.index.tolist()
    train_user_item_matrix_columns = train_user_item_matrix.columns.tolist()

    train_avg_costs = []
    test_avg_costs = []

    for epoch in range(epochs):
        np.random.shuffle(train_set)

        train_avg_cost = 0
        test_avg_cost = 0

        for i, j, ground_truth in train_set:
            absolute_i = train_user_item_matrix_index.index(i)
            absolute_j = train_user_item_matrix_columns.index(j)

            logit = matrix_mean + bias_user[absolute_i] + bias_item[absolute_j] + u[absolute_i, :].dot(v[absolute_j, :].T)
            e = ground_truth - logit

            u[absolute_i, :] += learning_rate * (e * v[absolute_j, :] - lmbd * u[absolute_i, :])
            v[absolute_j, :] += learning_rate * (e * u[absolute_i, :] - lmbd * v[absolute_j, :])

            bias_user[absolute_i] += learning_rate * (e - lmbd * bias_user[absolute_i])
            bias_item[absolute_j] += learning_rate * (e - lmbd * bias_item[absolute_j])

            rms =  sqrt(mean_squared_error([ground_truth], logit))
            train_avg_cost += rms / len(train_set)

        for i, j, ground_truth in test_set:
            if i not in train_user_item_matrix_index or j not in train_user_item_matrix_columns:
                continue

            absolute_i = train_user_item_matrix_index.index(i)
            absolute_j = train_user_item_matrix_columns.index(j)

            logit = matrix_mean + bias_user[absolute_i] + bias_item[absolute_j] + u[absolute_i, :].dot(v[absolute_j, :].T)

            test_rms = sqrt(mean_squared_error([ground_truth], logit))
            test_avg_cost += test_rms / len(test_set)

        train_avg_costs.append(train_avg_cost)
        test_avg_costs.append(test_avg_cost)

        print('Epoch: {} / {}\ntrain cost: {}\ntest cost: {}'.format(epoch + 1, epochs, train_avg_cost, test_avg_cost))

        if epoch > 0:
            if test_avg_costs[-2] < test_avg_cost:
                return matrix_mean, u, v, bias_user, bias_item

    return matrix_mean, u, v, bias_user, bias_item

In [4]:
mean, u, v, bias_user, bias_item = matrix_factorization(train_user_item_matrix, test_user_item_matrix, k=1, learning_rate=0.01, std_dev=0.5)

Epoch: 1 / 100
train cost: 0.8402129925859583
test cost: 0.7820128186457829
Epoch: 2 / 100
train cost: 0.7697538677490802
test cost: 0.7602714176059256
Epoch: 3 / 100
train cost: 0.7510695148703496
test cost: 0.7517144592014056
Epoch: 4 / 100
train cost: 0.7426469900902949
test cost: 0.7476961270486939
Epoch: 5 / 100
train cost: 0.7369532011532859
test cost: 0.7463670793568119
Epoch: 6 / 100
train cost: 0.7337336310423787
test cost: 0.7448079136683505
Epoch: 7 / 100
train cost: 0.7308610750309621
test cost: 0.744021234429326
Epoch: 8 / 100
train cost: 0.7294235002994306
test cost: 0.7430494388294997
Epoch: 9 / 100
train cost: 0.7275363285646748
test cost: 0.742409925266472
Epoch: 10 / 100
train cost: 0.7262775498913565
test cost: 0.7429225909159588


In [5]:
predicted_user_item_matrix = u.dot(v.T) + mean + bias_user + bias_item.T
predicted_user_item_matrix[predicted_user_item_matrix < 0] = 0
predicted_user_item_matrix[predicted_user_item_matrix > 5] = 5

result_df = pd.DataFrame(predicted_user_item_matrix).apply(lambda x: np.round(x, 0))

result_df = result_df.set_index(train_user_item_matrix.index)
result_df.columns = train_user_item_matrix.columns

In [6]:
for i in range(test_df.shape[0]):
    user_id, item_id = test_df.values[i][0], test_df.values[i][1]
    if user_id not in result_df.index:
        result_df.loc[user_id] = round(mean)
    if item_id not in result_df.columns:
        result_df[item_id] = round(mean)

In [7]:
with open(result_file_path, 'w') as file:
    for a, b in result_df.stack().items():
        user = a[0]
        item = a[1]
        rating = b
        file.write(f"{user}\t{item}\t{rating}\n")

print("time :", time.time() - start)  # 현재시각 - 시작시간 = 실행 시간

time : 132.25130200386047


In [8]:
!mono ./test/PA4.exe ./test/u3

the number of ratings that didn't be predicted: 0
the number of ratings that were unproperly predicted [ex. >=10, <0, NaN, or format errors]: 0
If the counted number is large, please check your codes again.

The bigger value means that the ratings are predicted more incorrectly
RMSE: 0.9889894
