In [1]:
from collections import defaultdict
import numpy as np
from util.data_loader import DataLoader
import warnings
warnings.filterwarnings('ignore')



def pearson_coefficient(u: np.ndarray, v: np.ndarray) -> float:
    """Return the Pearson correlation coefficient between ``u`` and ``v``.

    Both vectors are centred on their own mean; the coefficient is the dot
    product of the centred vectors divided by the product of their Euclidean
    norms.

    Args:
        u: 1-D array of values.
        v: 1-D array of values, same length as ``u``.

    Returns:
        The correlation as a Python ``float``. ``0.0`` is returned when the
        coefficient is undefined: both inputs are empty, or at least one of
        them has zero variance (which includes single-element inputs).
    """
    u = np.asarray(u)
    v = np.asarray(v)
    # Short-circuit the empty case so np.mean() is never called on an empty
    # array (that would emit a RuntimeWarning and produce NaN).
    if u.size == 0 and v.size == 0:
        return 0.0

    u_diff = u - np.mean(u)
    v_diff = v - np.mean(v)
    numerator = np.dot(u_diff, v_diff)
    # np.dot(x, x) is the sum of squares, computed inside NumPy instead of
    # iterating the array with the builtin sum().
    denominator = np.sqrt(np.dot(u_diff, u_diff)) * np.sqrt(np.dot(v_diff, v_diff))
    if denominator == 0:
        return 0.0
    return float(numerator / denominator)


# User-based collaborative filtering: predict each test rating from the
# mean-centred ratings of positively correlated training users, then report
# the RMSE of the predictions.
dataset = DataLoader().main()
# Rows are users, columns are movies; cells are NaN where no rating exists.
user_movie_matrix = dataset.train.pivot(index="user_id", columns="movie_id", values="rating")
test = dataset.test.copy()
pred_love_items = defaultdict(list)
test_users = test.user_id.unique()

# Loop invariants: a plain ndarray view of the matrix and its "has a rating"
# mask, so the nested loops below index by position instead of doing a
# label-based .loc lookup for every pair of users.
rating_values = user_movie_matrix.to_numpy(dtype=float)
has_rating = ~np.isnan(rating_values)
# Fallback prediction for test users that do not appear in the training
# matrix at all (looking them up with .loc would raise KeyError).
global_avg = dataset.train["rating"].mean()

for test_user in test_users:
    test_rows = test["user_id"] == test_user

    # get_indexer returns -1 for labels missing from the index.
    target_pos = user_movie_matrix.index.get_indexer([test_user])[0]
    if target_pos == -1:
        test.loc[test_rows, "rating_pred"] = global_avg
        continue

    target_ratings = rating_values[target_pos]
    target_has_rating = has_rating[target_pos]

    # Collect every other training user whose correlation with the test user,
    # measured over the movies both have rated, is strictly positive.
    similarities = []
    similar_positions = []
    avgs = []
    for pos in range(rating_values.shape[0]):
        if pos == target_pos:
            continue
        common = target_has_rating & has_rating[pos]
        neighbour_common = rating_values[pos, common]
        coefficient = pearson_coefficient(target_ratings[common], neighbour_common)
        if coefficient > 0:
            similarities.append(coefficient)
            similar_positions.append(pos)
            # The neighbour's mean is taken over the co-rated movies only.
            avgs.append(np.mean(neighbour_common))

    similarities = np.array(similarities, dtype=float)
    similar_positions = np.array(similar_positions, dtype=np.intp)
    avgs = np.array(avgs, dtype=float)

    # Baseline prediction: the test user's own mean training rating. It is
    # kept for movies that are unknown to the training matrix or that no
    # similar user has rated.
    avg = user_movie_matrix.loc[test_user, :].mean()
    test_movies = test.loc[test_rows, "movie_id"].to_numpy()
    movie_positions = user_movie_matrix.columns.get_indexer(test_movies)
    predictions = np.full(len(test_movies), avg, dtype=float)

    for row, col in enumerate(movie_positions):
        if col == -1:
            continue
        neighbour_rates = rating_values[similar_positions, col]
        rate_exists = ~np.isnan(neighbour_rates)
        if not rate_exists.any():
            continue
        weight = similarities[rate_exists]
        deviation = neighbour_rates[rate_exists] - avgs[rate_exists]
        # Similarity-weighted average of the neighbours' deviations from
        # their own means, added to the test user's mean. Every weight is
        # > 0, so the normaliser cannot be zero.
        predictions[row] = avg + np.dot(weight, deviation) / weight.sum()

    # One positional assignment per user instead of one full-frame boolean
    # mask per predicted movie.
    test.loc[test_rows, "rating_pred"] = predictions

from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(test['rating'], test['rating_pred']))
print(f'rmse:{rmse:.3f}')


rmse:0.960
