In [1]:
import pandas as pd
import numpy as np
import sys
from sklearn.model_selection import train_test_split
# Input files. An earlier script form took these from the command line
# (sys.argv[1] / sys.argv[2]); they are fixed here for interactive use.
train_file = "u1.base"
test_file = "u1.test"

k = 50  # maximum number of training iterations
alpha = 0.25  # learning rate
_lambda = 0.1  # weight of the regularization terms
patience = 5  # non-improving validation checks after which training stops
attr_num = 12  # number of latent factors (columns of the factor matrices)
random_seed = 123  # seed for NumPy's global random number generator
column_names = ["user_id", "item_id", "rating", "time_stamp"]

# The factor matrices are initialised with np.random.rand, which draws from
# NumPy's global generator. Seeding it here makes a fresh-kernel "Run All"
# produce the same numbers every time.
np.random.seed(random_seed)

dataframe = pd.read_csv(train_file, sep="\t", names=column_names, header=None)
testframe = pd.read_csv(test_file, sep="\t", names=column_names, header=None)

# The timestamp is not used anywhere below.
dataframe = dataframe.drop(["time_stamp"], axis=1)
testframe = testframe.drop(["time_stamp"], axis=1)

# Largest item id seen in either file; used to size the item axis.
train_movie_last = dataframe["item_id"].max()
test_movie_last = testframe["item_id"].max()
last_movie = max(train_movie_last, test_movie_last)

In [2]:
# Hold out 5% of the training ratings for validation (random_state is fixed),
# then print the training part followed by the validation part.
split_parts = train_test_split(dataframe, test_size=0.05, random_state=123)
train_data, val_data = split_parts
for part in split_parts:
    print(part)

       user_id  item_id  rating
6793       119       70       3
12405      234      153       3
41047      540      628       3
11533      216      546       2
31975      457      252       4
...        ...      ...     ...
63206      776      192       5
61404      757      179       4
17730      308       24       4
28030      425      156       5
15725      286       47       4

[76000 rows x 3 columns]
       user_id  item_id  rating
23438      385       18       5
78749      932      429       5
43994      570      327       4
25959      405      999       1
20526      342     1011       3
...        ...      ...     ...
15858      286     1038       5
36880      503        1       5
7921       145      559       2
71524      867      198       5
44208      577       22       5

[4000 rows x 3 columns]


In [3]:
# User x item rating table with NaN where no rating exists.
#
# * pivot() is called with keyword arguments: recent pandas releases no longer
#   accept index/columns/values positionally.
# * reindex() forces BOTH axes onto the contiguous id ranges 1..N. Later cells
#   address this table positionally as [user - 1][movie - 1], so an id that is
#   absent from train_data must still get an (all-NaN) row / column instead of
#   shifting every following row / column by one.
last_user = max(dataframe["user_id"].max(), testframe["user_id"].max())
df_table = (
    train_data
    .pivot(index="user_id", columns="item_id", values="rating")
    .reindex(
        index=np.arange(1, last_user + 1),
        columns=np.arange(1, last_movie + 1),
    )
)
print(df_table)

item_id  1     2     3     4     5     6     7     8     9     10    ...  \
user_id                                                              ...   
1         5.0   3.0   4.0   3.0   3.0   NaN   4.0   1.0   5.0   NaN  ...   
2         4.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   2.0  ...   
3         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
4         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
5         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
...       ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
939       NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
940       NaN   NaN   NaN   2.0   NaN   NaN   4.0   5.0   3.0   NaN  ...   
941       5.0   NaN   NaN   NaN   NaN   NaN   4.0   NaN   NaN   NaN  ...   
942       NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
943       NaN   5.0   NaN   NaN   NaN   NaN   NaN   NaN   3.0   NaN  ...   

item_id  16

In [4]:
import copy

# Binary "was rated" view of the rating table: every rating value 1..5 is
# mapped to 1.0, anything else (the NaN cells) is left as it is.
rated_flags = df_table.replace([1.0, 2.0, 3.0, 4.0, 5.0], 1.0)
pre_use_mat = rated_flags.values

# Validation triples for the same binary task: an independent copy of the
# validation rows whose third column (the rating) is overwritten with 1.
pre_use_val = copy.deepcopy(val_data.values)
pre_use_val[:, 2] = 1
print(pre_use_val)

# Matrix dimensions, read as globals by the training functions.
row, col = pre_use_mat.shape

# Zero-based (row, column) positions of every rated cell.
rated_indexes = np.argwhere(pre_use_mat == 1)
print(rated_indexes.shape)

[[385  18   1]
 [932 429   1]
 [570 327   1]
 ...
 [145 559   1]
 [867 198   1]
 [577  22   1]]
(76000, 2)


In [5]:
def mean_squared_error_with_bias(y, p, q, b_p, b_q, mean):
    """Return the regularized squared error of a single prediction.

    The prediction is ``mean + b_p + b_q + dot(p, q)``. The result is the
    squared residual against ``y`` plus ``_lambda`` (module-level) times the
    Euclidean norm of ``p``, the Euclidean norm of ``q``, ``b_p**2`` and
    ``b_q**2``.

    NOTE(review): the factor penalties use the norm itself, not its square.
    """
    residual = y - mean - b_p - b_q - np.dot(p, q)
    p_norm = np.sqrt(np.sum(np.square(p)))
    q_norm = np.sqrt(np.sum(np.square(q)))
    # The trailing literal 0.e-10 evaluates to 0.0; it is kept so the summation
    # is term-for-term the same as before.
    return (residual**2 + _lambda*p_norm + _lambda*q_norm
            + _lambda*(b_p**2) + _lambda*(b_q**2) + 0.e-10)

def mf_uninteresting(index_arr, validation_set):
    """Fit a biased matrix factorization whose target is 1 for every given cell.

    The prediction for cell (i, j) is ``1 + b_p[i] + b_q[j] + dot(p[i], q[j])``
    and the training target for every pair in ``index_arr`` is the constant 1.
    Each iteration accumulates the gradients of all pairs, then moves every
    user / item that occurred at least once by ``alpha`` times its *average*
    gradient. After each iteration the RMSE on ``validation_set`` is printed
    and used for early stopping.

    Reads the module-level settings ``row``, ``col``, ``attr_num``, ``k``,
    ``alpha``, ``_lambda`` and ``patience`` and calls
    ``mean_squared_error_with_bias`` for the reported training error.

    Args:
        index_arr: sized sequence of zero-based ``(i, j)`` index pairs.
        validation_set: sized sequence of ``(user, movie, rating)`` triples
            with one-based user / movie ids.

    Returns:
        Tuple ``(best_p, best_q, best_b_p, best_b_q)``: the parameters of the
        iteration with the lowest validation RMSE (all ``None`` if no iteration
        ever improved on the initial sentinel, e.g. when ``k`` is 0).

    Raises:
        ValueError: if ``index_arr`` or ``validation_set`` is empty.
    """
    n_pairs = len(index_arr)
    n_val = len(validation_set)
    if n_pairs == 0 or n_val == 0:
        raise ValueError("index_arr and validation_set must both be non-empty")

    p = np.random.rand(row, attr_num)  # user factors: row x attr_num
    q = np.random.rand(col, attr_num)  # item factors: col x attr_num
    b_p = np.random.rand(row, 1)  # user biases
    b_q = np.random.rand(col, 1)  # item biases
    min_rmse_error = 9999999
    best_p = best_q = best_b_p = best_b_q = None
    not_improved_cnt = 0  # cumulative; never reset after an improvement
    for _ in range(k):
        # Gradient sums and occurrence counts per user / item.
        grad_p = np.zeros((row, attr_num))
        grad_q = np.zeros((col, attr_num))
        grad_b_p = np.zeros(row)
        grad_b_q = np.zeros(col)
        cnt_p = np.zeros(row)
        cnt_q = np.zeros(col)
        error = 0
        for i, j in index_arr:
            p_i = p[i]
            q_j = q[j]
            b_p_i = b_p[i][0]
            b_q_j = b_q[j][0]
            # target (1) minus prediction (1 + biases + dot product)
            residual = 1 - b_p_i - b_q_j - 1 - np.dot(p_i, q_j)
            grad_p[i] += _lambda*p_i - residual*q_j
            grad_q[j] += _lambda*q_j - residual*p_i
            grad_b_p[i] += _lambda*b_p_i - residual
            grad_b_q[j] += _lambda*b_q_j - residual
            cnt_p[i] += 1
            cnt_q[j] += 1
            error += mean_squared_error_with_bias(
                1, p_i, q_j, b_p_i, b_q_j, 1)

        # Step along the average gradient; rows / columns that never occurred
        # in index_arr are left untouched.
        seen_p = cnt_p > 0
        p[seen_p] -= alpha*(grad_p[seen_p]/cnt_p[seen_p, None])
        b_p[seen_p, 0] -= alpha*(grad_b_p[seen_p]/cnt_p[seen_p])
        seen_q = cnt_q > 0
        q[seen_q] -= alpha*(grad_q[seen_q]/cnt_q[seen_q, None])
        b_q[seen_q, 0] -= alpha*(grad_b_q[seen_q]/cnt_q[seen_q])

        error_ = error/n_pairs
        print("Training_error : ", error_)

        # Validation RMSE on the dense prediction matrix.
        result_temp = np.dot(p, q.transpose()) + 1
        result_temp += b_p
        result_temp += b_q.transpose()
        rmse_error = 0
        for user, movie, rating in validation_set:
            rmse_error += (result_temp[user-1][movie-1]-rating) ** 2
        rmse_error = np.sqrt(rmse_error/n_val)
        print("Validation Error : ", rmse_error)
        if rmse_error < min_rmse_error:
            min_rmse_error = rmse_error
            # Snapshot by value: p, q, b_p and b_q keep being updated in place,
            # so storing references would make "best" track the latest state.
            best_p = p.copy()
            best_q = q.copy()
            best_b_p = b_p.copy()
            best_b_q = b_q.copy()
        else:
            not_improved_cnt += 1
            print(f"Did not improved from {min_rmse_error} to {rmse_error}")
            if not_improved_cnt >= patience:
                print("Early Stopped!!")
                return best_p, best_q, best_b_p, best_b_q
    return best_p, best_q, best_b_p, best_b_q

In [6]:
# Train the binary "was rated" model and unpack its factor / bias arrays.
uninteresting_factors = mf_uninteresting(rated_indexes, pre_use_val)
(uninteresting_p, uninteresting_q,
 uninteresting_b_p, uninteresting_b_q) = uninteresting_factors

Training_error :  17.407734365394294
Validation Error :  1.097104637061853
Training_error :  1.51595840038517
Validation Error :  0.5751613782468652
Training_error :  0.5388598128577635
Validation Error :  0.35085229664618
Training_error :  0.2982650267008058
Validation Error :  0.2555458491766368
Training_error :  0.22598240687495302
Validation Error :  0.21056031700608946
Training_error :  0.19628161958074344
Validation Error :  0.18443952053596652
Training_error :  0.1794400026587089
Validation Error :  0.16625306427180755
Training_error :  0.16756994173896209
Validation Error :  0.15213638035048063
Training_error :  0.15815990150512704
Validation Error :  0.14049594280867855
Training_error :  0.15021576556196953
Validation Error :  0.1305407123771434
Training_error :  0.14326049723087061
Validation Error :  0.12181965981407912
Training_error :  0.13703087117494192
Validation Error :  0.11405260726141106
Training_error :  0.13136653412931917
Validation Error :  0.10705525285455388
T

In [7]:
# Dense prediction of the binary model: factor product + 1 + user bias
# (broadcast over columns) + item bias (broadcast over rows).
pre_use_mat_result = (
    np.dot(uninteresting_p, uninteresting_q.T) + 1
    + uninteresting_b_p
    + uninteresting_b_q.T
)
print(pre_use_mat_result)

# Zero-based (row, column) positions whose predicted score is below 0.8.
below_threshold = pre_use_mat_result < 0.8
uninteresting_rating_index = np.argwhere(below_threshold)
print(uninteresting_rating_index.shape)

[[0.9974128  1.00871151 1.00993438 ... 1.0461118  1.05965468 1.01233011]
 [0.99518347 1.03189709 1.02116543 ... 1.03576953 1.08251336 0.98469719]
 [0.9922712  0.99721983 1.03930941 ... 1.04433682 1.04439135 1.04335577]
 ...
 [1.00886765 1.00049514 0.99707499 ... 1.05011352 1.04323794 1.04859732]
 [0.99017354 1.00771792 0.98994783 ... 1.04554346 1.04979008 1.00082225]
 [0.99463969 0.98177072 0.98588911 ... 1.04214888 1.04833272 1.01939046]]
(200, 2)


In [8]:
# Rating matrix with -1 marking "no rating". An explicit copy is taken so the
# in-place edits below work on a writable array that is independent of
# df_table.
original_rating_mat = df_table.fillna(-1).to_numpy(copy=True)

# Impute a rating of 1 at the selected "uninteresting" positions, but only
# where no rating was observed: an existing rating must never be overwritten.
imp_rows = uninteresting_rating_index[:, 0]
imp_cols = uninteresting_rating_index[:, 1]
still_unrated = original_rating_mat[imp_rows, imp_cols] < 0
original_rating_mat[imp_rows[still_unrated], imp_cols[still_unrated]] = 1

# Positions and global mean of all (observed + imputed) ratings.
has_rating = original_rating_mat >= 0
result_rated_indexes = np.argwhere(has_rating)
rating_sum = original_rating_mat[has_rating].sum()
cnt = int(has_rating.sum())
if cnt == 0:
    raise ValueError("no ratings available to compute the global mean")
rating_mean = rating_sum / cnt

In [9]:
def mean_squared_error_with_bias(y, p, q, b_p, b_q, mean):
    """Regularized squared error for one (target, prediction) pair.

    Same formula as the helper of the same name defined earlier in the
    notebook: squared residual of ``y`` against
    ``mean + b_p + b_q + dot(p, q)``, plus ``_lambda`` (module-level) times the
    Euclidean norms of ``p`` and ``q`` and the squares of ``b_p`` and ``b_q``.
    """
    squared_residual = (y - mean - b_p - b_q - np.dot(p, q)) ** 2
    penalty_p = _lambda * np.sqrt(np.sum(np.square(p)))
    penalty_q = _lambda * np.sqrt(np.sum(np.square(q)))
    penalty_b_p = _lambda * (b_p ** 2)
    penalty_b_q = _lambda * (b_q ** 2)
    # 0.e-10 is literally 0.0; retained so the sum matches term for term.
    return (squared_residual + penalty_p + penalty_q
            + penalty_b_p + penalty_b_q + 0.e-10)


def mf_training(index_arr, data_arr, validation_set, mean):
    """Fit a biased matrix factorization to the ratings stored in ``data_arr``.

    The prediction for cell (i, j) is
    ``mean + b_p[i] + b_q[j] + dot(p[i], q[j])``. Each iteration accumulates
    the gradients of all pairs in ``index_arr``, then moves every user / item
    that occurred at least once by ``alpha`` times its *average* gradient.
    After each iteration the RMSE on ``validation_set`` is printed and used for
    early stopping.

    Reads the module-level settings ``row``, ``col``, ``attr_num``, ``k``,
    ``alpha``, ``_lambda`` and ``patience`` and calls
    ``mean_squared_error_with_bias`` for the reported training error.

    Args:
        index_arr: sized sequence of zero-based ``(i, j)`` index pairs to train
            on.
        data_arr: 2-D container read as ``data_arr[i][j]`` for each pair.
        validation_set: sized sequence of ``(user, movie, rating)`` triples
            with one-based user / movie ids.
        mean: constant offset added to every prediction.

    Returns:
        Tuple ``(best_p, best_q, best_b_p, best_b_q)``: the parameters of the
        iteration with the lowest validation RMSE (all ``None`` if no iteration
        ever improved on the initial sentinel, e.g. when ``k`` is 0).

    Raises:
        ValueError: if ``index_arr`` or ``validation_set`` is empty.
    """
    n_pairs = len(index_arr)
    n_val = len(validation_set)
    if n_pairs == 0 or n_val == 0:
        raise ValueError("index_arr and validation_set must both be non-empty")

    p = np.random.rand(row, attr_num)  # user factors: row x attr_num
    q = np.random.rand(col, attr_num)  # item factors: col x attr_num
    b_p = np.random.rand(row, 1)  # user biases
    b_q = np.random.rand(col, 1)  # item biases
    min_rmse_error = 9999999
    best_p = best_q = best_b_p = best_b_q = None
    not_improved_cnt = 0  # cumulative; never reset after an improvement
    for _ in range(k):
        # Gradient sums and occurrence counts per user / item.
        grad_p = np.zeros((row, attr_num))
        grad_q = np.zeros((col, attr_num))
        grad_b_p = np.zeros(row)
        grad_b_q = np.zeros(col)
        cnt_p = np.zeros(row)
        cnt_q = np.zeros(col)
        acc_error = 0
        for i, j in index_arr:
            p_i = p[i]
            q_j = q[j]
            b_p_i = b_p[i][0]
            b_q_j = b_q[j][0]
            target = data_arr[i][j]
            # target minus current prediction
            residual = target - b_p_i - b_q_j - mean - np.dot(p_i, q_j)
            grad_p[i] += _lambda*p_i - residual*q_j
            grad_q[j] += _lambda*q_j - residual*p_i
            grad_b_p[i] += _lambda*b_p_i - residual
            grad_b_q[j] += _lambda*b_q_j - residual
            cnt_p[i] += 1
            cnt_q[j] += 1
            acc_error += mean_squared_error_with_bias(
                target, p_i, q_j, b_p_i, b_q_j, mean)

        # Step along the average gradient; rows / columns that never occurred
        # in index_arr are left untouched.
        seen_p = cnt_p > 0
        p[seen_p] -= alpha*(grad_p[seen_p]/cnt_p[seen_p, None])
        b_p[seen_p, 0] -= alpha*(grad_b_p[seen_p]/cnt_p[seen_p])
        seen_q = cnt_q > 0
        q[seen_q] -= alpha*(grad_q[seen_q]/cnt_q[seen_q, None])
        b_q[seen_q, 0] -= alpha*(grad_b_q[seen_q]/cnt_q[seen_q])

        acc_error_ = acc_error/n_pairs
        print("Training Error : ", acc_error_)

        # Validation RMSE on the dense prediction matrix.
        result_temp = np.dot(p, q.transpose())+mean
        result_temp += b_p
        result_temp += b_q.transpose()
        rmse_error = 0
        for user, movie, rating in validation_set:
            rmse_error += (result_temp[user-1][movie-1]-rating) ** 2
        rmse_error = np.sqrt(rmse_error/n_val)
        print("Validation Error : ", rmse_error)
        if rmse_error < min_rmse_error:
            min_rmse_error = rmse_error
            # Snapshot by value: p, q, b_p and b_q keep being updated in place,
            # so storing references would make "best" track the latest state.
            best_p = p.copy()
            best_q = q.copy()
            best_b_p = b_p.copy()
            best_b_q = b_q.copy()
        else:
            not_improved_cnt += 1
            print(f"Did not improved from {min_rmse_error} to {rmse_error}")
            if not_improved_cnt >= patience:
                print("Early Stopped!!")
                return best_p, best_q, best_b_p, best_b_q
    return best_p, best_q, best_b_p, best_b_q

In [10]:
# Validation triples (user_id, item_id, rating) as a plain array.
valid_set = val_data.values
print(valid_set)

# Train the rating model on the observed + imputed cells.
trained_factors = mf_training(
    index_arr=result_rated_indexes,
    data_arr=original_rating_mat,
    validation_set=valid_set,
    mean=rating_mean,
)
p, q, b_p, b_q = trained_factors

[[385  18   5]
 [932 429   5]
 [570 327   4]
 ...
 [145 559   2]
 [867 198   5]
 [577  22   5]]
Training Error :  18.462249050925898
Validation Error :  1.5290144084114903
Training Error :  2.5960876844885314
Validation Error :  1.1528578021898075
Training Error :  1.5048524199053124
Validation Error :  1.026479974642444
Training Error :  1.2000075922767983
Validation Error :  0.9810697126053565
Training Error :  1.0931518578645003
Validation Error :  0.9615412540753968
Training Error :  1.0443024109265964
Validation Error :  0.9513707218131304
Training Error :  1.0161375921880025
Validation Error :  0.9452494924440429
Training Error :  0.9970886881627206
Validation Error :  0.9411934943923703
Training Error :  0.9828554669870946
Validation Error :  0.9383261545892847
Training Error :  0.9715291368656707
Validation Error :  0.9362025289880571
Training Error :  0.9621314433905157
Validation Error :  0.9345729456064156
Training Error :  0.9541044868533138
Validation Error :  0.9332868891

In [11]:
# Dense prediction: factor product + global mean + user bias (broadcast over
# columns) + item bias (broadcast over rows).
raw_scores = np.dot(p, q.T) + rating_mean + b_p + b_q.T

# Limit every prediction to the interval [1, 5].
floored_scores = np.where(raw_scores < 1.0, 1, raw_scores)
recommend_result = np.where(floored_scores > 5.0, 5, floored_scores)
print(recommend_result)

[[4.01021504 3.26506142 3.10947656 ... 3.35727684 3.60612274 3.25362958]
 [3.8702287  3.33689449 2.93689393 ... 3.22262552 3.79301417 3.34873667]
 [3.50373241 3.00974795 2.43817864 ... 2.61690333 3.27799646 3.08383335]
 ...
 [4.20383977 3.48976348 3.20729765 ... 3.29175891 3.98012062 3.59369447]
 [4.34146168 3.72965078 3.52932726 ... 3.51419389 4.07861372 3.82072158]
 [3.73341527 3.17375795 2.99631019 ... 3.25622851 3.60002131 3.33291253]]


In [12]:
# Score the test ratings and collect (user, movie, prediction) rows.
rmse = 0
cnt = 0
for_write = []
test_rows = testframe.values
for user, movie, rating in test_rows:
    predicted = recommend_result[user-1][movie-1]  # ids are one-based
    for_write.append([user, movie, predicted])
    rmse += (rating-predicted)**2
rmse = np.sqrt(rmse/testframe.shape[0])
print(rmse)

# Write the predictions as a tab-separated file next to the training file.
output_path = train_file+"_prediction.txt"
df_write = pd.DataFrame(for_write)
df_write.to_csv(output_path, sep="\t", index=None, header=None)

0.9543087860354109
