In [1]:
import pandas as pd
import numpy as np
import sys
from sklearn.model_selection import train_test_split
# train_file = sys.argv[1]
# test_file = sys.argv[2]
# Tab-separated rating files used for training and for the final evaluation.
train_file = "u1.base"
test_file = "u1.test"

# Hyper-parameters read as globals by the factorisation functions below.
k = 50          # maximum number of training epochs
alpha = 0.25    # learning rate
_lambda = 0.1   # regularisation strength
patience = 5    # non-improving epochs tolerated before early stopping
attr_num = 12   # number of latent factors per user / item

column_names = ["user_id", "item_id", "rating", "time_stamp"]

# The files carry no header row; the timestamp column is not used, so drop it right away.
dataframe = pd.read_csv(
    train_file, sep="\t", names=column_names, header=None
).drop(columns=["time_stamp"])
testframe = pd.read_csv(
    test_file, sep="\t", names=column_names, header=None
).drop(columns=["time_stamp"])

# Largest item id present in the training file; later used as the matrix width.
last_movie = dataframe["item_id"].max()

In [2]:
# Hold out 5% of the training ratings for validation; the fixed seed keeps the split repeatable.
train_data, val_data = train_test_split(
    dataframe,
    random_state=123,
    test_size=0.05,
)
print(train_data, val_data, sep="\n")

       user_id  item_id  rating
6793       119       70       3
12405      234      153       3
41047      540      628       3
11533      216      546       2
31975      457      252       4
...        ...      ...     ...
63206      776      192       5
61404      757      179       4
17730      308       24       4
28030      425      156       5
15725      286       47       4

[76000 rows x 3 columns]
       user_id  item_id  rating
23438      385       18       5
78749      932      429       5
43994      570      327       4
25959      405      999       1
20526      342     1011       3
...        ...      ...     ...
15858      286     1038       5
36880      503        1       5
7921       145      559       2
71524      867      198       5
44208      577       22       5

[4000 rows x 3 columns]


In [3]:
# Build the user x item rating matrix (NaN = no rating in the training split).
#
# * pivot() is called with keyword arguments: pandas 2.x no longer accepts them positionally.
# * reindex() adds every missing item column in one step (instead of inserting columns one by
#   one with np.NaN, an alias removed in NumPy 2.0) and also adds a row for any user id that
#   ended up with no rating in the training split. Later cells address this matrix as
#   [user_id - 1][item_id - 1], so both axes must cover 1..max id without gaps.
last_user = dataframe["user_id"].max()
df_table = train_data.pivot(
    index="user_id", columns="item_id", values="rating"
).reindex(
    index=range(1, last_user + 1),
    columns=range(1, last_movie + 1),
)
print(df_table)

item_id  1     2     3     4     5     6     7     8     9     10    ...  \
user_id                                                              ...   
1         5.0   3.0   4.0   3.0   3.0   NaN   4.0   1.0   5.0   NaN  ...   
2         4.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   2.0  ...   
3         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
4         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
5         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
...       ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
939       NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
940       NaN   NaN   NaN   2.0   NaN   NaN   4.0   5.0   3.0   NaN  ...   
941       5.0   NaN   NaN   NaN   NaN   NaN   4.0   NaN   NaN   NaN  ...   
942       NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
943       NaN   5.0   NaN   NaN   NaN   NaN   NaN   NaN   3.0   NaN  ...   

item_id  16

In [4]:
import copy

# Binary "pre-use" matrix: every rating value 1..5 becomes 1.0, unrated cells stay NaN.
pre_use_mat = df_table.replace([1.0, 2.0, 3.0, 4.0, 5.0], 1.0).values

# Validation triples for the pre-use model: same (user, item) pairs, target forced to 1.
pre_use_val = copy.deepcopy(val_data.values)
pre_use_val[:, 2] = 1
print(pre_use_val)

# Matrix dimensions, read as globals by the factorisation functions.
row, col = pre_use_mat.shape

# Zero-based (row, column) positions of every rated cell.
rated_indexes = np.argwhere(pre_use_mat == 1)
print(rated_indexes.shape)

[[385  18   1]
 [932 429   1]
 [570 327   1]
 ...
 [145 559   1]
 [867 198   1]
 [577  22   1]]
(76000, 2)


In [5]:
def mean_squared_error_with_bias(y, p, q, b_p, b_q, mean):
    """Return the regularised squared error of a single biased-MF prediction.

    The prediction is ``mean + b_p + b_q + p . q`` and ``y`` is the target.
    On top of the squared residual the function adds ``_lambda`` (module-level
    global) times the Euclidean norm of ``p``, the Euclidean norm of ``q``,
    ``b_p ** 2`` and ``b_q ** 2``.
    """
    residual = y - mean - b_p - b_q - np.dot(p, q)
    norm_p = np.sqrt(np.sum(np.square(p)))
    norm_q = np.sqrt(np.sum(np.square(q)))

    # Terms are added in the same order as before so the float result is unchanged.
    loss = residual ** 2
    loss = loss + _lambda * norm_p
    loss = loss + _lambda * norm_q
    loss = loss + _lambda * (b_p ** 2)
    loss = loss + _lambda * (b_q ** 2)
    # The literal 0.e-10 evaluates to exactly 0.0; it is kept as a no-op term.
    return loss + 0.0

def mf_uninteresting(index_arr, validation_set):
    """Fit the biased matrix factorisation of the binary "pre-use" matrix.

    Every training cell listed in ``index_arr`` has target 1 and the global
    offset of the model is fixed to 1, so a prediction is
    ``1 + b_p[i] + b_q[j] + p[i] . q[j]``.

    Training is full-batch: gradients are accumulated over all samples of an
    epoch, averaged per user row / item column, and applied once per epoch.

    Reads the module-level globals ``row``, ``col``, ``attr_num``, ``k``,
    ``alpha``, ``_lambda`` and ``patience``.

    Parameters
    ----------
    index_arr : sequence of (i, j)
        Zero-based (row, column) positions of the training cells.
    validation_set : iterable of (user, movie, rating)
        Validation triples; ids are used as ``[user - 1][movie - 1]``.

    Returns
    -------
    tuple
        ``(best_p, best_q, best_b_p, best_b_q)`` as they were at the epoch
        with the lowest validation RMSE. All four are ``None`` if no epoch
        ever beat the initial sentinel.
    """
    p = np.random.rand(row, attr_num)  # user factors, shape (row, attr_num)
    q = np.random.rand(col, attr_num)  # item factors, shape (col, attr_num)
    b_p = np.random.rand(row, 1)       # user biases
    b_q = np.random.rand(col, 1)       # item biases
    min_rmse_error = 9999999
    best_p = best_q = best_b_p = best_b_q = None
    # Counts every non-improving epoch; it is not reset after an improvement.
    not_improved_cnt = 0

    for _ in range(k):
        # Per-row / per-column gradient sums and the number of samples behind each sum.
        grad_p = np.zeros((row, attr_num))
        grad_q = np.zeros((col, attr_num))
        grad_b_p = np.zeros(row)
        grad_b_q = np.zeros(col)
        user_cnt = np.zeros(row, dtype=int)
        item_cnt = np.zeros(col, dtype=int)
        error = 0

        for i, j in index_arr:
            p_i = p[i]
            q_j = q[j]
            b_p_i = b_p[i][0]
            b_q_j = b_q[j][0]
            a = np.dot(p_i, q_j)
            # target (1) - user bias - item bias - global offset (1) - interaction
            residual = 1 - b_p_i - b_q_j - 1 - a

            grad_p[i] += _lambda * p_i - residual * q_j
            grad_q[j] += _lambda * q_j - residual * p_i
            grad_b_p[i] += _lambda * b_p_i - residual
            grad_b_q[j] += _lambda * b_q_j - residual
            user_cnt[i] += 1
            item_cnt[j] += 1
            error += mean_squared_error_with_bias(
                1, p_i, q_j, b_p_i, b_q_j, 1)

        # current value - learning rate * average gradient (only rows that had samples)
        seen_users = user_cnt != 0
        p[seen_users] -= alpha * (grad_p[seen_users] / user_cnt[seen_users, None])
        b_p[seen_users, 0] -= alpha * (grad_b_p[seen_users] / user_cnt[seen_users])
        # same update for the item side
        seen_items = item_cnt != 0
        q[seen_items] -= alpha * (grad_q[seen_items] / item_cnt[seen_items, None])
        b_q[seen_items, 0] -= alpha * (grad_b_q[seen_items] / item_cnt[seen_items])

        error_ = error/len(index_arr)
        print("Training_error : ", error_)

        # Validation RMSE on the full predicted matrix.
        result_temp = np.dot(p, q.transpose()) + 1
        result_temp += b_p
        result_temp += b_q.transpose()
        rmse_error = 0
        for user, movie, rating in validation_set:
            rmse_error += (result_temp[user-1][movie-1]-rating) ** 2
        rmse_error = np.sqrt(rmse_error/len(validation_set))
        print("Validation Error : ", rmse_error)

        if rmse_error < min_rmse_error:
            min_rmse_error = rmse_error
            # Snapshot copies: p, q, b_p and b_q are updated in place on the next
            # epoch, so storing bare references would silently track the latest
            # parameters instead of the best ones.
            best_p = p.copy()
            best_q = q.copy()
            best_b_p = b_p.copy()
            best_b_q = b_q.copy()
        else:
            not_improved_cnt += 1
            print(f"Did not improved from {min_rmse_error} to {rmse_error}")
            if not_improved_cnt >= patience:
                print("Early Stopped!!")
                return best_p, best_q, best_b_p, best_b_q
    return best_p, best_q, best_b_p, best_b_q

In [6]:
# Train the pre-use model on the rated cells and keep the four returned parameter arrays.
(
    uninteresting_p,
    uninteresting_q,
    uninteresting_b_p,
    uninteresting_b_q,
) = mf_uninteresting(index_arr=rated_indexes, validation_set=pre_use_val)

Training_error :  16.92184425664936
Validation Error :  1.0849807538867935
Training_error :  1.4986039381566432
Validation Error :  0.570752452773726
Training_error :  0.5370482361888429
Validation Error :  0.35120043528392014
Training_error :  0.2996280379973063
Validation Error :  0.25885960546466696
Training_error :  0.22798081083379806
Validation Error :  0.21551064328595246
Training_error :  0.19836339133902772
Validation Error :  0.1901662042399831
Training_error :  0.18143672537908836
Validation Error :  0.17227211182046595
Training_error :  0.16941923130138256
Validation Error :  0.15819218417501812
Training_error :  0.15984468059061752
Validation Error :  0.1464605088364247
Training_error :  0.15174030767201732
Validation Error :  0.13635559675379663
Training_error :  0.14463801505170246
Validation Error :  0.1274635153439676
Training_error :  0.13827723026213892
Validation Error :  0.11952323277792119
Training_error :  0.13249735584762473
Validation Error :  0.112359773728164

In [7]:
# Dense pre-use score for every (user, item) pair: interaction + offset 1 + user bias + item bias.
pre_use_mat_result = (
    np.dot(uninteresting_p, uninteresting_q.T)
    + 1
    + uninteresting_b_p
    + uninteresting_b_q.T
)
print(pre_use_mat_result)

# Cells scoring below 0.8 are the ones treated as "uninteresting" in the next step.
uninteresting_rating_index = np.argwhere(pre_use_mat_result < 0.8)
print(uninteresting_rating_index.shape)

[[1.00110966 0.98399605 1.00605248 ... 1.02003713 1.03909422 1.03100693]
 [0.99014735 1.0052759  1.00627177 ... 1.01915378 1.01863363 1.02316111]
 [0.98663037 1.0000644  0.99608784 ... 1.039364   1.00702774 1.00712479]
 ...
 [1.02379402 1.01276423 0.99151916 ... 1.0187864  0.99049357 0.99173722]
 [1.0060016  1.00709032 0.97670242 ... 1.01960336 1.05018873 1.00219333]
 [0.98371384 1.00001488 1.0059611  ... 1.05820256 0.99990524 1.04246289]]
(101, 2)


In [8]:
# Observed ratings with -1 as the "unrated" sentinel. An explicit copy guarantees a
# writable array that does not share memory with df_table (under pandas Copy-on-Write,
# .values can hand back a read-only view).
original_rating_mat = df_table.fillna(-1).to_numpy(copy=True)

# Impute the lowest rating (1) for cells flagged as uninteresting, but only where no
# rating was observed: a real rating must never be overwritten by an imputed one.
ui_rows = uninteresting_rating_index[:, 0]
ui_cols = uninteresting_rating_index[:, 1]
is_unrated = original_rating_mat[ui_rows, ui_cols] < 0
original_rating_mat[ui_rows[is_unrated], ui_cols[is_unrated]] = 1

# Every cell that now holds a value (observed or imputed) is a training sample.
rated_mask = original_rating_mat >= 0
result_rated_indexes = np.argwhere(rated_mask)

# Global mean over the training samples; used as the offset of the rating model.
rating_sum = original_rating_mat[rated_mask].sum()
cnt = int(rated_mask.sum())
rating_mean = rating_sum / cnt

In [9]:
def mean_squared_error_with_bias(y, p, q, b_p, b_q, mean):
    """Squared residual of one prediction plus its regularisation penalty.

    ``y`` is compared against ``mean + b_p + b_q + np.dot(p, q)``. The penalty
    is the module-level ``_lambda`` multiplied by: the Euclidean norm of ``p``,
    the Euclidean norm of ``q``, the squared ``b_p`` and the squared ``b_q``.
    """
    squared_residual = (y - mean - b_p - b_q - np.dot(p, q)) ** 2
    penalties = (
        _lambda * np.sqrt(np.sum(np.square(p))),
        _lambda * np.sqrt(np.sum(np.square(q))),
        _lambda * (b_p ** 2),
        _lambda * (b_q ** 2),
        0.e-10,  # evaluates to exactly 0.0; retained as a no-op term
    )
    total = squared_residual
    # Accumulate left to right, matching the original summation order.
    for term in penalties:
        total = total + term
    return total


def mf_training(index_arr, data_arr, validation_set, mean):
    """Fit a biased matrix factorisation to the ratings stored in ``data_arr``.

    A prediction is ``mean + b_p[i] + b_q[j] + p[i] . q[j]``. Training is
    full-batch: gradients are accumulated over all samples of an epoch,
    averaged per user row / item column, and applied once per epoch.

    Reads the module-level globals ``row``, ``col``, ``attr_num``, ``k``,
    ``alpha``, ``_lambda`` and ``patience``.

    Parameters
    ----------
    index_arr : sequence of (i, j)
        Zero-based (row, column) positions of the training cells.
    data_arr : 2-D indexable
        Target values, read as ``data_arr[i][j]``.
    validation_set : iterable of (user, movie, rating)
        Validation triples; ids are used as ``[user - 1][movie - 1]``.
    mean : float
        Global offset added to every prediction.

    Returns
    -------
    tuple
        ``(best_p, best_q, best_b_p, best_b_q)`` as they were at the epoch
        with the lowest validation RMSE. All four are ``None`` if no epoch
        ever beat the initial sentinel.
    """
    p = np.random.rand(row, attr_num)  # user factors, shape (row, attr_num)
    q = np.random.rand(col, attr_num)  # item factors, shape (col, attr_num)
    b_p = np.random.rand(row, 1)       # user biases
    b_q = np.random.rand(col, 1)       # item biases
    min_rmse_error = 9999999
    best_p = best_q = best_b_p = best_b_q = None
    # Counts every non-improving epoch; it is not reset after an improvement.
    not_improved_cnt = 0

    for _ in range(k):
        # Per-row / per-column gradient sums and the number of samples behind each sum.
        grad_p = np.zeros((row, attr_num))
        grad_q = np.zeros((col, attr_num))
        grad_b_p = np.zeros(row)
        grad_b_q = np.zeros(col)
        user_cnt = np.zeros(row, dtype=int)
        item_cnt = np.zeros(col, dtype=int)
        acc_error = 0
        rmse_error = 0

        for i, j in index_arr:
            p_i = p[i]
            q_j = q[j]
            b_p_i = b_p[i][0]
            b_q_j = b_q[j][0]
            a = np.dot(p_i, q_j)
            target = data_arr[i][j]
            # target - user bias - item bias - global offset - interaction
            residual = target - b_p_i - b_q_j - mean - a

            grad_p[i] += _lambda * p_i - residual * q_j
            grad_q[j] += _lambda * q_j - residual * p_i
            grad_b_p[i] += _lambda * b_p_i - residual
            grad_b_q[j] += _lambda * b_q_j - residual
            user_cnt[i] += 1
            item_cnt[j] += 1
            acc_error += mean_squared_error_with_bias(
                target, p_i, q_j, b_p_i, b_q_j, mean)

        # current value - learning rate * average gradient (only rows that had samples)
        seen_users = user_cnt != 0
        p[seen_users] -= alpha * (grad_p[seen_users] / user_cnt[seen_users, None])
        b_p[seen_users, 0] -= alpha * (grad_b_p[seen_users] / user_cnt[seen_users])
        # same update for the item side
        seen_items = item_cnt != 0
        q[seen_items] -= alpha * (grad_q[seen_items] / item_cnt[seen_items, None])
        b_q[seen_items, 0] -= alpha * (grad_b_q[seen_items] / item_cnt[seen_items])

        acc_error_ = acc_error/len(index_arr)
        print("Training Error : ", acc_error_)

        # Validation RMSE on the full predicted matrix.
        result_temp = np.dot(p, q.transpose())+mean
        result_temp += b_p
        result_temp += b_q.transpose()
        for user, movie, rating in validation_set:
            rmse_error += (result_temp[user-1][movie-1]-rating) ** 2
        rmse_error = np.sqrt(rmse_error/len(validation_set))
        print("Validation Error : ", rmse_error)

        if rmse_error < min_rmse_error:
            min_rmse_error = rmse_error
            # Snapshot copies: p, q, b_p and b_q are updated in place on the next
            # epoch, so storing bare references would silently track the latest
            # parameters instead of the best ones.
            best_p = p.copy()
            best_q = q.copy()
            best_b_p = b_p.copy()
            best_b_q = b_q.copy()
        else:
            not_improved_cnt += 1
            print(f"Did not improved from {min_rmse_error} to {rmse_error}")
            if not_improved_cnt >= patience:
                print("Early Stopped!!")
                return best_p, best_q, best_b_p, best_b_q
    return best_p, best_q, best_b_p, best_b_q

In [10]:
# Validation triples with their real ratings (unlike the binary ones used by the pre-use model).
valid_set = val_data.values
print(valid_set)

# Train the rating model on the observed + imputed cells, centred on the global mean.
p, q, b_p, b_q = mf_training(
    index_arr=result_rated_indexes,
    data_arr=original_rating_mat,
    validation_set=valid_set,
    mean=rating_mean,
)

[[385  18   5]
 [932 429   5]
 [570 327   4]
 ...
 [145 559   2]
 [867 198   5]
 [577  22   5]]
Training Error :  18.532965275232343
Validation Error :  1.5459544340060485
Training Error :  2.6845043136419493
Validation Error :  1.1546798088753594
Training Error :  1.5212368240442504
Validation Error :  1.0273268713263064
Training Error :  1.2043582157512847
Validation Error :  0.9834508068680977
Training Error :  1.0956617649453324
Validation Error :  0.9652222115256803
Training Error :  1.0464404229864788
Validation Error :  0.9558750296115038
Training Error :  1.0180243494424803
Validation Error :  0.9502365371881488
Training Error :  0.998714680718501
Validation Error :  0.9464462390222967
Training Error :  0.9842326157950738
Validation Error :  0.9437064549676044
Training Error :  0.9726865986171702
Validation Error :  0.9416207882621624
Training Error :  0.9631021826346718
Validation Error :  0.9399702103658613
Training Error :  0.9549186099016037
Validation Error :  0.9386242161

In [11]:
# Dense predicted rating for every (user, item) pair: interaction + global mean + both biases.
recommend_result = np.dot(p, q.T) + rating_mean + b_p + b_q.T

# Force predictions into the valid rating range: below 1 becomes 1, above 5 becomes 5.
recommend_result = np.where(
    recommend_result > 5.0,
    5,
    np.where(recommend_result < 1.0, 1, recommend_result),
)
print(recommend_result)

[[3.93362117 3.19661477 3.28158924 ... 2.55695569 3.5519061  3.26420007]
 [4.01649156 3.50438831 3.02579217 ... 3.10346694 3.6256568  3.57582077]
 [3.59330852 3.13171318 2.41290662 ... 2.07223211 3.31882997 2.77411857]
 ...
 [4.15213804 3.50593148 3.32397829 ... 2.98852406 3.86747191 3.50326272]
 [4.33564474 3.72929896 3.47706238 ... 3.29570767 4.03193773 3.82024666]
 [3.78831102 3.28384381 2.75445542 ... 2.98061059 3.44293146 3.32246295]]


In [12]:
# Score the test file and write one "user<TAB>item<TAB>prediction" line per test rating.
squared_error_sum = 0
for_write = []
for user, movie, rating in testframe.values:
    if user > row or movie > col:
        # Id outside the trained matrix: fall back to the mid-scale rating.
        predicted = 3
    else:
        predicted = recommend_result[user-1][movie-1]
    for_write.append([user, movie, predicted])
    squared_error_sum += (rating-predicted)**2
rmse = np.sqrt(squared_error_sum/testframe.shape[0])
print(rmse)

df_write = pd.DataFrame(for_write)
# index / header are documented as booleans; pass False explicitly instead of relying on None.
df_write.to_csv(train_file+"_prediction.txt", sep="\t", index=False, header=False)

0.9532818962833747
