In [1]:
import pandas as pd
import numpy as np
import sys
from sklearn.model_selection import train_test_split
# train_file = sys.argv[1]
# test_file = sys.argv[2]
train_file = "u5.base"
test_file = "u5.test"

k = 50  # number of iteration
alpha = 0.25  # learning rate
_lambda = 0.1  # parameter for regularizer
patience = 5  # number of patience that not improved from previous version
attr_num = 12  # number of columns in factorized matrix
column_names = ["user_id", "item_id", "rating", "time_stamp"]

dataframe = pd.read_csv(train_file, sep="\t", names=column_names, header=None)
testframe = pd.read_csv(test_file, sep="\t", names=column_names, header=None)

dataframe = dataframe.drop(["time_stamp"], axis=1)
testframe = testframe.drop(["time_stamp"], axis=1)

train_movie_last = dataframe["item_id"].max()
test_movie_last = testframe["item_id"].max()
last_movie = max(train_movie_last, test_movie_last)

In [2]:
train_data, val_data = train_test_split(dataframe, test_size=0.05, random_state=123)
print(train_data)
print(val_data)

       user_id  item_id  rating
6793        69     1134       5
12405      130       82       5
41047      404      323       3
11533      118      675       5
31975      320      403       4
...        ...      ...     ...
63206      650      218       3
61404      627      808       2
17730      197       39       2
28030      293      770       3
15725      178      483       4

[76000 rows x 3 columns]
       user_id  item_id  rating
23438      256     1057       2
78749      903     1073       3
43994      427      990       5
25959      277      844       4
20526      224      470       4
...        ...      ...     ...
15858      180       79       3
36880      368      219       2
7921        85       53       3
71524      772      259       2
44208      429      366       3

[4000 rows x 3 columns]


In [3]:
df_table = train_data.pivot("user_id", "item_id", "rating")
for i in range(1, last_movie+1):
    if not i in df_table.columns:
        df_table[i] = np.NaN
df_table = df_table[[i for i in range(1, last_movie+1)]]
print(df_table)

item_id  1     2     3     4     5     6     7     8     9     10    ...  \
user_id                                                              ...   
1         5.0   3.0   NaN   3.0   3.0   5.0   4.0   1.0   5.0   NaN  ...   
2         4.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   2.0  ...   
3         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
4         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
5         4.0   3.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
...       ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
939       NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
940       NaN   NaN   NaN   2.0   NaN   NaN   NaN   5.0   3.0   NaN  ...   
941       NaN   NaN   NaN   NaN   NaN   NaN   4.0   NaN   NaN   NaN  ...   
942       NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
943       NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   

item_id  16

In [4]:
import copy
pre_use_mat = df_table.replace([1.0, 2.0, 3.0, 4.0, 5.0], 1.0).values
pre_use_val = copy.deepcopy(val_data.values)
for temp in pre_use_val:
    temp[2] = 1
print(pre_use_val)
row = pre_use_mat.shape[0]
col = pre_use_mat.shape[1]
rated_indexes = np.argwhere(pre_use_mat == 1)
print(rated_indexes.shape)

[[ 256 1057    1]
 [ 903 1073    1]
 [ 427  990    1]
 ...
 [  85   53    1]
 [ 772  259    1]
 [ 429  366    1]]
(76000, 2)


In [5]:
def mean_squared_error_with_regularization(y, p, q):
    return ((y-np.dot(p, q))**2+_lambda*np.sqrt(np.sum(np.square(p))) + _lambda*np.sqrt(np.sum(np.square(q))) + 0.e-10)

def mf_uninteresting(index_arr, validation_set):
    p = np.random.rand(row, attr_num)  # number of parameter : row * 3
    q = np.random.rand(col, attr_num)  # number of parameter : col * 3
    not_improved_cnt = 0
    for _ in range(k):
        d_p = [[np.zeros(attr_num), 0] for _ in range(row)]
        d_q = [[np.zeros(attr_num), 0] for _ in range(col)]
        min_rmse_error = 9999999
        error = 0
        for i, j in index_arr:
            p_i = p[i]
            q_j = q[j]
            a = np.dot(p_i, q_j)
            # values of derivation p
            d_p[i][0] += _lambda*p_i - (1-a)*q_j
            d_p[i][1] += 1  # cnt
            d_q[j][0] += _lambda*q_j - (1-a)*p_i
            d_q[j][1] += 1
            error += mean_squared_error_with_regularization(
                1, p[i], q[j])
        for i in range(row): # current value - derivation_average«
            if d_p[i][1] != 0:
                p[i] -= alpha*(d_p[i][0]/d_p[i][1])
        for j in range(col):
            if d_q[j][1] != 0:
                q[j] -= alpha*(d_q[j][0]/d_q[j][1])
        error_ = error/len(index_arr)
        print("Training_error : ", error_)
        result_temp = np.dot(p, q.transpose())
        rmse_error = 0
        for user, movie, rating in validation_set:
            rmse_error += (result_temp[user-1][movie-1]-rating) ** 2
        rmse_error = np.sqrt(rmse_error/len(validation_set))
        print("Validation Error : ", rmse_error)
        if rmse_error < min_rmse_error:
            min_rmse_error = rmse_error
            best_p = p
            best_q = q
        else:
            not_improved_cnt += 1
            print(f"Did not improved from {min_rmse_error} to {rmse_error}")
            if not_improved_cnt >= patience:
                print("Early Stopped!!")
                return best_p, best_q
    return p, q

In [6]:
uninteresting_p, uninteresting_q = mf_uninteresting(rated_indexes, pre_use_val)

Training_error :  4.9818986225237625
Validation Error :  0.46923290347900387
Training_error :  0.4691921253374695
Validation Error :  0.38061012651861514
Training_error :  0.38847433143563836
Validation Error :  0.3144383513856024
Training_error :  0.33989710747774254
Validation Error :  0.2684676614284596
Training_error :  0.311167870848965
Validation Error :  0.2370717551385192
Training_error :  0.29328149741933496
Validation Error :  0.21501559403509826
Training_error :  0.2810979960387073
Validation Error :  0.19876582726670813
Training_error :  0.27204265431444324
Validation Error :  0.1862167771487675
Training_error :  0.2648597509083217
Validation Error :  0.1761403449654298
Training_error :  0.2589079720093116
Validation Error :  0.1678022617180038
Training_error :  0.25383227963162003
Validation Error :  0.1607461866051303
Training_error :  0.24941867196076786
Validation Error :  0.1546767487044411
Training_error :  0.2455280812356497
Validation Error :  0.14939476546656907
Tr

In [7]:
pre_use_mat_result = np.dot(uninteresting_p, uninteresting_q.transpose())
print(pre_use_mat_result)
uninteresting_rating_index = np.argwhere(pre_use_mat_result < 0.8)
print(uninteresting_rating_index.shape)

[[0.90545669 0.8958613  0.90920294 ... 1.52640241 1.88006362 1.8533642 ]
 [0.91611338 0.89126622 0.8981666  ... 1.46716209 1.87594721 1.82708742]
 [0.88923207 0.89768344 0.89270564 ... 1.45437031 1.97021738 1.74405085]
 ...
 [0.89377176 0.89864192 0.89945013 ... 1.40023059 1.88412192 1.68494968]
 [0.90054767 0.90330292 0.90692656 ... 1.51756843 1.86182809 1.82269148]
 [0.87624365 0.91011028 0.89481773 ... 1.431646   1.9426771  1.66784464]]
(6502, 2)


In [8]:
original_rating_mat = df_table.fillna(-1).values
for index in uninteresting_rating_index:
    original_rating_mat[index[0]][index[1]] = 1
result_rated_indexes = np.argwhere(original_rating_mat >= 0)
rating_sum = 0
cnt = 0
for index in result_rated_indexes:
    rating_sum += original_rating_mat[index[0]][index[1]]
    cnt += 1
rating_mean = rating_sum / cnt

In [9]:
def mean_squared_error_with_bias(y, p, q, b_p, b_q, mean):
    return ((y-mean-b_p-b_q-np.dot(p, q))**2+_lambda*np.sqrt(np.sum(np.square(p))) + _lambda*np.sqrt(np.sum(np.square(q))) + _lambda*(b_p**2) + _lambda*(b_q**2) + 0.e-10)


def mf_training(index_arr, data_arr, validation_set, mean):
    p = np.random.rand(row, attr_num)  # number of parameter : row * attr_num
    q = np.random.rand(col, attr_num)  # number of parameter : col * attr_num
    b_p = np.random.rand(row, 1)
    b_q = np.random.rand(col, 1)
    min_rmse_error = 9999999
    best_p = None
    best_q = None
    best_b_p = None
    best_b_q = None
    not_improved_cnt = 0
    for _ in range(k):
        d_p = [[np.zeros(attr_num), 0] for _ in range(row)]
        d_q = [[np.zeros(attr_num), 0] for _ in range(col)]
        d_b_p = [[0, 0] for _ in range(row)]
        d_b_q = [[0, 0] for _ in range(col)]
        acc_error = 0
        rmse_error = 0
        for i, j in index_arr:
            p_i = p[i]
            q_j = q[j]
            b_p_i = b_p[i][0]
            b_q_j = b_q[j][0]
            a = np.dot(p_i, q_j)
            # values of derivation p
            d_p[i][0] += _lambda*p_i - (data_arr[i][j]-b_p_i-b_q_j-mean-a)*q_j
            d_p[i][1] += 1  # cnt
            d_q[j][0] += _lambda*q_j - (data_arr[i][j]-b_p_i-b_q_j-mean-a)*p_i
            d_q[j][1] += 1
            d_b_p[i][0] += _lambda*b_p_i - (data_arr[i][j]-b_p_i-b_q_j-mean-a)
            d_b_p[i][1] += 1
            d_b_q[j][0] += _lambda*b_q_j - (data_arr[i][j]-b_p_i-b_q_j-mean-a)
            d_b_q[j][1] += 1
            acc_error += mean_squared_error_with_bias(
                data_arr[i][j], p[i], q[j], b_p_i, b_q_j, mean)


        # current value - derivation_average«
        for i in range(row):
            if d_p[i][1] != 0:
                p[i] -= alpha*(d_p[i][0]/d_p[i][1])
                b_p[i][0] -= alpha*(d_b_p[i][0]/d_b_p[i][1])
        # current value - derivation_average
        for j in range(col):
            if d_q[j][1] != 0:
                q[j] -= alpha*(d_q[j][0]/d_q[j][1])
                b_q[j][0] -= alpha*(d_b_q[j][0]/d_b_q[j][1])
        
        acc_error_ = acc_error/len(index_arr)
        print("Training Error : ", acc_error_)
        # for validation
        result_temp = np.dot(p, q.transpose())+mean
        result_temp += b_p
        result_temp += b_q.transpose()
        for user, movie, rating in validation_set:
            rmse_error += (result_temp[user-1][movie-1]-rating) ** 2
        rmse_error = np.sqrt(rmse_error/len(validation_set))
        print("Validation Error : ", rmse_error)
        if rmse_error < min_rmse_error:
            min_rmse_error = rmse_error
            best_p = p
            best_q = q
            best_b_p = b_p
            best_b_q = b_q
        else:
            not_improved_cnt += 1
            print(f"Did not improved from {min_rmse_error} to {rmse_error}")
            if not_improved_cnt >= patience:
                print("Early Stopped!!")
                return best_p, best_q, best_b_p, best_b_q
    return best_p, best_q, best_b_p, best_b_q

In [10]:
valid_set = val_data.values
print(valid_set)
p, q, b_p, b_q = mf_training(result_rated_indexes, original_rating_mat, valid_set, rating_mean)

[[ 256 1057    2]
 [ 903 1073    3]
 [ 427  990    5]
 ...
 [  85   53    3]
 [ 772  259    2]
 [ 429  366    3]]
Training Error :  18.49494407010105
Validation Error :  1.609518988120883
Training Error :  2.7912604346285144
Validation Error :  1.2041714683938494
Training Error :  1.5759283765206968
Validation Error :  1.0698242746213726
Training Error :  1.2264221688124333
Validation Error :  1.0237184846501954
Training Error :  1.1021532526491296
Validation Error :  1.0049047784568468
Training Error :  1.0459450587050072
Validation Error :  0.9952458424821545
Training Error :  1.014324312290359
Validation Error :  0.9892389241273549
Training Error :  0.9934854873621527
Validation Error :  0.9850132256427745
Training Error :  0.9782576410075877
Validation Error :  0.9818159647727435
Training Error :  0.9663565109178067
Validation Error :  0.9792856621386553
Training Error :  0.9566238233953639
Validation Error :  0.9772214207151104
Training Error :  0.9484072722844086
Validation Error

In [11]:
recommend_result = np.dot(p, q.transpose()) + rating_mean
recommend_result += b_p
recommend_result += b_q.transpose() 
recommend_result = np.where(recommend_result < 1.0, 1, recommend_result)
recommend_result = np.where(recommend_result > 5.0, 5, recommend_result)
print(recommend_result)

[[3.8674404  3.02817897 2.87603474 ... 4.66105786 5.         4.91383395]
 [3.84293021 3.23487815 3.16069365 ... 4.97854757 4.76759229 4.75432118]
 [3.12903246 2.59533681 2.41110135 ... 3.12746519 3.19199807 3.00652077]
 ...
 [4.28068499 3.66836956 3.37146357 ... 5.         5.         5.        ]
 [4.15825044 3.51921207 3.38821345 ... 4.79789538 4.80636034 4.93642154]
 [3.7193029  3.15266073 3.27482519 ... 4.20733583 3.82442496 3.76577295]]


In [12]:
rmse = 0
cnt = 0
for_write = []
for user, movie, rating in testframe.values:
    for_write.append([user, movie, recommend_result[user-1][movie-1]])
    rmse += (rating-recommend_result[user-1][movie-1])**2
rmse = np.sqrt(rmse/testframe.shape[0])
print(rmse)
df_write = pd.DataFrame(for_write)
df_write.to_csv(train_file+"_prediction.txt", sep="\t", index=None, header=None)

0.9422606135074956
