In [1]:
import time
start = time.time()

import sys
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
train_file_path = './test/u1.base'
test_file_path = './test/u1.test'
result_file_path = train_file_path + "_prediction.txt"
columns = ["user_id", "item_id", "rating", "time_stamp"]

train_df = pd.read_csv(train_file_path, names=columns, delimiter='\t')
test_df = pd.read_csv(test_file_path, names=columns, delimiter='\t')

# We are not going to use the last column "time_stamp".
train_df.drop(columns=train_df.columns[-1], inplace=True)
test_df.drop(columns=test_df.columns[-1], inplace=True)

train_user_item_matrix = train_df.pivot(index='user_id', columns='item_id', values='rating')
test_user_item_matrix = test_df.pivot(index='user_id', columns='item_id', values='rating')

In [40]:
for i, j in test_user_item_matrix.stack().index.tolist():
    if i in train_user_item_matrix.index and j in train_user_item_matrix.columns:


nan


In [41]:
train_not_nan = train_user_item_matrix.notnull()

In [45]:
train_not_nan.where(train_not_nan > 0).stack().index

MultiIndex([(  1,    1),
            (  1,    2),
            (  1,    3),
            (  1,    4),
            (  1,    5),
            (  1,    7),
            (  1,    8),
            (  1,    9),
            (  1,   11),
            (  1,   13),
            ...
            (943,  943),
            (943, 1011),
            (943, 1028),
            (943, 1044),
            (943, 1047),
            (943, 1067),
            (943, 1074),
            (943, 1188),
            (943, 1228),
            (943, 1330)],
           names=['user_id', 'item_id'], length=80000)

In [3]:
# test 빼기
def matrix_factorization(train_matrix, test_matrix, lmbd=0.02, k=1, std_dev=0.5, epochs=100, learning_rate=0.01):
    num_of_user = len(train_matrix.index)
    num_of_item = len(train_matrix.columns)

    train_matrix_np = train_matrix.values
    test_matrix_np = test_matrix.values

    u = std_dev * np.random.randn(num_of_user, k) # user embedding
    v = std_dev * np.random.randn(num_of_item, k) # item embedding

    bias_user = np.zeros((num_of_user, 1))
    bias_item = np.zeros((num_of_item, 1))

    train_not_nan = ~np.isnan(train_matrix_np)
    test_not_nan = ~np.isnan(test_matrix_np)

    matrix_mean = np.mean(train_matrix.values[train_not_nan])

    train_i_list, train_j_list = np.where(train_not_nan > 0)
    test_i_list, test_j_list = np.where(test_not_nan > 0)

    train_set = [(i, j, train_matrix.iloc[i, j]) for i, j in zip(train_i_list, train_j_list)]
    test_set = [(i, j, test_matrix.iloc[i, j]) for i, j in zip(test_i_list, test_j_list)]

    train_avg_costs = []
    test_avg_costs = []

    for epoch in range(epochs):
        np.random.shuffle(train_set)

        train_avg_cost = 0
        test_avg_cost = 0

        for i, j, ground_truth in train_set:
            logit = matrix_mean + bias_user[i] + bias_item[j] + u[i, :].dot(v[j, :].T)
            e = ground_truth - logit

            u[i, :] += learning_rate * (e * v[j, :] - lmbd * u[i, :])
            v[j, :] += learning_rate * (e * u[i, :] - lmbd * v[j, :])

            bias_user[i] += learning_rate * (e - lmbd * bias_user[i])
            bias_item[j] += learning_rate * (e - lmbd * bias_item[j])

            rms =  sqrt(mean_squared_error([ground_truth], logit))
            train_avg_cost += rms / len(train_set)

        for i, j, ground_truth in test_set:
            logit = matrix_mean + bias_user[i] + bias_item[j] + u[i, :].dot(v[j, :].T)
            test_rms = sqrt(mean_squared_error([ground_truth], logit))
            test_avg_cost += test_rms / len(test_set)

        train_avg_costs.append(train_avg_cost)
        test_avg_costs.append(test_avg_cost)

        print('Epoch: {} / {}\ntrain cost: {}\ntest cost: {}'.format(epoch + 1, epochs, train_avg_cost, test_avg_cost))

        if epoch > 0:
            if test_avg_costs[-2] < test_avg_cost:
                return matrix_mean, u, v, bias_user, bias_item

    return matrix_mean, u, v, bias_user, bias_item

In [4]:
# def make_pre_use_preference_matrix(user_item_matrix):
#     condition = pd.notnull(user_item_matrix)
#     temp = user_item_matrix.copy()
#     temp[condition] = 1.0
#     return temp

In [5]:
# train_pre_use = make_pre_use_preference_matrix(train_user_item_matrix)
# test_pre_use = make_pre_use_preference_matrix(test_user_item_matrix)

In [6]:
# from matrix_factorization import BaselineModel, KernelMF, train_update_test_split
#
# matrix_fact = KernelMF(n_epochs=20, n_factors=100, verbose=1, lr=0.001, reg=0.005)
# matrix_fact.fit(train_df[['user_id', 'item_id']], np.full(train_df.shape[0], 1.0))

In [7]:
# bias_user = matrix_fact.user_biases
# bias_item = matrix_fact.item_biases
# mean = matrix_fact.global_mean
# u = matrix_fact.user_features
# v = matrix_fact.item_features

In [8]:
# matrix_fact = KernelMF(n_epochs=130, n_factors=60, verbose=0, lr=0.001, reg=0.005)
# matrix_fact.fit(train_df[['user_id', 'item_id']], train_df['rating'])
#
# pred = matrix_fact.predict(test_df[['user_id', 'item_id']])
# rmse = mean_squared_error(test_df['rating'], pred, squared=False)
# print(f"\nTest RMSE: {rmse:.4f}")

In [9]:
# mean, u, v, bias_user, bias_item = matrix_factorization(train_pre_use, test_pre_use, learning_rate=0.01, std_dev=0.5, epochs=10, k = 1)
# predicted_pre_use_matrix = u.dot(v.T) + mean + bias_user + bias_item.T

In [10]:
# threshold = 0.5
# train_user_item_matrix[predicted_pre_use_matrix < threshold] = 0

In [21]:
mean, u, v, bias_user, bias_item = matrix_factorization(train_user_item_matrix, test_user_item_matrix, k=20, learning_rate=0.01, std_dev=0.5)

Epoch: 1 / 100
train cost: 1.0520693546806856
test cost: 1.0058305673040673
Epoch: 2 / 100
train cost: 0.8327288061424265
test cost: 0.9420007073414711
Epoch: 3 / 100
train cost: 0.762476925788685
test cost: 0.9178810742786467
Epoch: 4 / 100
train cost: 0.7264792444785568
test cost: 0.9061756487681195
Epoch: 5 / 100
train cost: 0.703582343531787
test cost: 0.9013349226175591
Epoch: 6 / 100
train cost: 0.6868793414654897
test cost: 0.8996113696474577
Epoch: 7 / 100
train cost: 0.6732226511233174
test cost: 0.8985998627184406
Epoch: 8 / 100
train cost: 0.6618807413225942
test cost: 0.8979259299779525
Epoch: 9 / 100
train cost: 0.651458721093697
test cost: 0.8985188520563034


In [22]:
predicted_user_item_matrix = u.dot(v.T) + mean + bias_user + bias_item.T
predicted_user_item_matrix[predicted_user_item_matrix < 0] = 0
predicted_user_item_matrix[predicted_user_item_matrix > 5] = 5

result_df = pd.DataFrame(predicted_user_item_matrix).apply(lambda x: np.round(x, 0))

result_df = result_df.set_index(train_user_item_matrix.index)
result_df.columns = train_user_item_matrix.columns

In [23]:
result_df.index

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            934, 935, 936, 937, 938, 939, 940, 941, 942, 943],
           dtype='int64', name='user_id', length=943)

In [24]:
for i in range(test_df.shape[0]):
    user_id, item_id = test_df.values[i][0], test_df.values[i][1]
    if user_id not in result_df.index:
        print('Userid' , user_id)
        result_df.loc[user_id] = round(mean)
    if item_id not in result_df.columns:
        print('itemid', item_id)
        result_df[item_id] = round(mean)

itemid 599
itemid 711
itemid 814
itemid 830
itemid 852
itemid 857
itemid 1156
itemid 1236
itemid 1309
itemid 1310
itemid 1320
itemid 1343
itemid 1348
itemid 1364
itemid 1373
itemid 1457
itemid 1458
itemid 1492
itemid 1493
itemid 1498
itemid 1505
itemid 1520
itemid 1533
itemid 1536
itemid 1543
itemid 1557
itemid 1561
itemid 1562
itemid 1563
itemid 1565
itemid 1582
itemid 1586


In [16]:
result_df

item_id,1,2,3,4,5,6,7,8,9,10,...,1533,1536,1543,1557,1561,1562,1563,1565,1582,1586
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,3.0,4.0,4.0,3.0,3.0,4.0,4.0,4.0,5.0,...,4,4,4,4,4,4,4,4,4,4
2,4.0,4.0,3.0,3.0,4.0,5.0,4.0,5.0,4.0,3.0,...,4,4,4,4,4,4,4,4,4,4
3,4.0,3.0,3.0,3.0,4.0,4.0,3.0,4.0,4.0,3.0,...,4,4,4,4,4,4,4,4,4,4
4,5.0,2.0,4.0,5.0,3.0,4.0,5.0,5.0,4.0,3.0,...,4,4,4,4,4,4,4,4,4,4
5,3.0,3.0,3.0,3.0,2.0,4.0,3.0,3.0,3.0,3.0,...,4,4,4,4,4,4,4,4,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,5.0,4.0,3.0,4.0,5.0,5.0,4.0,5.0,5.0,5.0,...,4,4,4,4,4,4,4,4,4,4
940,4.0,3.0,3.0,2.0,4.0,5.0,3.0,4.0,4.0,3.0,...,4,4,4,4,4,4,4,4,4,4
941,4.0,3.0,3.0,3.0,4.0,5.0,4.0,5.0,5.0,4.0,...,4,4,4,4,4,4,4,4,4,4
942,4.0,4.0,3.0,4.0,4.0,5.0,4.0,4.0,4.0,4.0,...,4,4,4,4,4,4,4,4,4,4


In [17]:
with open(result_file_path, 'w') as file:
    for a, b in result_df.stack().items():
        user = a[0]
        item = a[1]
        rating = b
        file.write(f"{user}\t{item}\t{rating}\n")

print("time :", time.time() - start)  # 현재시각 - 시작시간 = 실행 시간

time : 1076.2291750907898


In [18]:
!mono ./test/PA4.exe ./test/u1

the number of ratings that didn't be predicted: 0
the number of ratings that were unproperly predicted [ex. >=10, <0, NaN, or format errors]: 0
If the counted number is large, please check your codes again.

The bigger value means that the ratings are predicted more incorrectly
RMSE: 1.073848


In [None]:
# for k in range(1, 200, 2):
#     print("K:", k)
#     train_loss, test_loss, epoch = matrix_factorization(train_user_item_matrix, test_user_item_matrix, k)
#     with open('./log/logs.txt', 'a') as file:
#         file.write(f"K: {k}\ttrain_loss: {train_loss}\ttest_loss: {test_loss}\tepoch: {epoch}\n")

In [None]:
# def loguniform(low=0, high=1, size=None):
#     return np.exp(np.random.uniform(low, high, size))

In [None]:
# for lmbd in np.log10(np.arange(1, 1.07, 0.005)):
#     print("lambda:", lmbd)


In [None]:
# from scipy.stats import loguniform
# for k in range(1, 200, 2):
#     for lmbd in np.log10(np.arange(1, 1.07, 0.005)):
#         print(f"K: {k}\tlambda: {lmbd}")
#         train_loss, test_loss, epoch = matrix_factorization(train_user_item_matrix, test_user_item_matrix, k=k, lmbd=lmbd)
#         with open('./log/logs.txt', 'a') as file:
#             file.write(f"K: {k}\tlambda: {lmbd}\ttrain_loss: {train_loss}\ttest_loss: {test_loss}\tepoch: {epoch}\n")

In [None]:
# for k in range(1, 200, 2):
#     print("K:", k)
#     train_loss, test_loss, epoch = matrix_factorization(train_user_item_matrix, test_user_item_matrix, k)
#     with open('./log/logs.txt', 'a') as file:
#         file.write(f"K: {k}\ttrain_loss: {train_loss}\ttest_loss: {test_loss}\tepoch: {epoch}\n")

In [None]:
# train_loss, test_loss, epoch = matrix_factorization(train_user_item_matrix, test_user_item_matrix, k=)