In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import os
os.chdir('C:/RecSys/Data/')

In [2]:
# 데이터 읽어오기 
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('u.user', sep='|', names=u_cols, encoding='latin-1')

i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown', 
          'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 
          'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
          'Thriller', 'War', 'Western']
movies = pd.read_csv('u.item', sep='|', names=i_cols, encoding='latin-1')

r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('u.data', sep='\t', names=r_cols, encoding='latin-1')

ratings = ratings.drop('timestamp', axis=1)

movies = movies[['movie_id', 'title']]

In [3]:
# train_test_split
from sklearn.model_selection import train_test_split
x = ratings.copy()
y = ratings['user_id']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)

In [4]:
# Accuracy Test
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

def score(model):
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    y_pred = np.array([model(user, movie) for (user, movie) in id_pairs])
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)

In [5]:
rating_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')
rating_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1669,1670,1671,1672,1673,1674,1675,1677,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,3.0,4.0,3.0,,,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


---

# 3.3 기본 CF 알고리즘

neighbor을 고려하지 않은 전체 사용자를 대상으로 하는 CF 알고리즘  
① 모든 사용자 간 유사도 계산  
② 추천 대상이 되는 사람과 다른 사람의 유사도 추출  
③ 추천 대상이 되는 사람이 **평가하지 않은** 모든 아이템에 대해서 예상 평가값을 구함  
④ 예상 평가값이 높은 상위 N개의 아이템을 추천

In [6]:
# 코사인 유사도의 경우 null값이 있을 경우 error가 발생함
matrix_dummy = rating_matrix.copy().fillna(0)
matrix_dummy

movie_id,1,2,3,4,5,6,7,8,9,10,...,1669,1670,1671,1672,1673,1674,1675,1677,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,3.0,4.0,3.0,0.0,0.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# 코사인 유사도를 사용한 경우
from sklearn.metrics.pairwise import cosine_similarity

user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)
user_similarity

# # 연습문제 3-1
# # 피어슨 상관계수를 사용한다면? → user 기준으로 봐야하기 때문에 matrix_dummy를 transpose해야함
# t_matrix = matrix_dummy.transpose()
# t_matrix

# user_similarity = t_matrix.corr()
# user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)
# user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.086829,0.007674,0.053647,0.257768,0.350580,0.334526,0.215742,0.077507,0.293554,...,0.251692,0.084585,0.170805,0.154161,0.128482,0.095792,0.256938,0.094404,0.148812,0.323478
2,0.086829,1.000000,0.064178,0.173355,0.021821,0.230241,0.062355,0.086132,0.170589,0.158250,...,0.102946,0.150850,0.309968,0.403403,0.196270,0.241228,0.150615,0.132341,0.118590,0.071777
3,0.007674,0.064178,1.000000,0.223495,0.029890,0.037891,0.066846,0.093987,0.058419,0.044196,...,0.019450,0.057003,0.108127,0.026439,0.060492,0.000000,0.082850,0.049161,0.056293,0.020168
4,0.053647,0.173355,0.223495,1.000000,0.030395,0.091795,0.067221,0.203353,0.074258,0.081326,...,0.014834,0.000000,0.093864,0.134432,0.126446,0.000000,0.150330,0.124980,0.152107,0.061528
5,0.257768,0.021821,0.029890,0.030395,1.000000,0.185514,0.242025,0.132718,0.069915,0.135670,...,0.195749,0.096905,0.085362,0.046026,0.138028,0.064277,0.170561,0.046801,0.058622,0.239318
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.095792,0.241228,0.000000,0.000000,0.064277,0.089362,0.082838,0.072886,0.053231,0.096652,...,0.079755,0.325498,0.207625,0.223571,0.318472,1.000000,0.041447,0.159025,0.000000,0.065240
940,0.256938,0.150615,0.082850,0.150330,0.170561,0.266234,0.238484,0.201023,0.077725,0.251387,...,0.262378,0.057583,0.137233,0.162597,0.136125,0.041447,1.000000,0.130816,0.185459,0.200507
941,0.094404,0.132341,0.049161,0.124980,0.046801,0.088495,0.053995,0.120773,0.133940,0.100037,...,0.013596,0.188602,0.227115,0.076883,0.206723,0.159025,0.130816,1.000000,0.064758,0.066260
942,0.148812,0.118590,0.056293,0.152107,0.058622,0.212698,0.180456,0.117074,0.090633,0.209461,...,0.188601,0.027810,0.076427,0.094422,0.102309,0.000000,0.185459,0.064758,1.000000,0.124717


In [8]:
def cf_simple(user_id, movie_id):
    if movie_id in rating_matrix:
        # 현재 영화에 대한 모든 사용자의 rating값 가져오기
        movie_ratings = rating_matrix[movie_id].copy()
        # 현재 영화를 평가하지 않은 사용자의 index 가져오기
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        # 현재 영화를 평가하지 않은 사용자의 rating 제거
        movie_ratings = movie_ratings.dropna()
        
        # 현재 사용자와 다른 사용자 간의 similarity 가져오기
        sim_scores = user_similarity[user_id].copy()
        # 현재 영화를 평가하지 않은 사용자의 similarity값 제거
        sim_scores = sim_scores.drop(none_rating_idx)
        
        # 현재 영화를 평가한 모든 사용자의 가중평균값 구하기
        mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
    else:
        mean_rating = 3.0
    return mean_rating

score(cf_simple)

1.0150960244238976

---

# 3.4 이웃을 고려한 CF

In [9]:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('u.user', sep='|', names=u_cols, encoding='latin-1')
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown', 
          'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 
          'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
          'Thriller', 'War', 'Western']
movies = pd.read_csv('u.item', sep='|', names=i_cols, encoding='latin-1')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('u.data', sep='\t', names=r_cols, encoding='latin-1')

ratings = ratings.drop('timestamp', axis=1)
movies = movies[['movie_id', 'title']]
movies = movies.set_index('movie_id')

from sklearn.model_selection import train_test_split
x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)

rating_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')

from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)

In [10]:
# 이웃의 수를 반영하기 위해 score 함수 수정
def score(model, neighbor_size=0):
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    y_pred = np.array([model(user, movie) for (user, movie) in id_pairs])
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)

In [11]:
# Neighbor size를 정해서 예측치를 계산하는 함수 
def cf_knn(user_id, movie_id, neighbor_size=0):
    if movie_id in rating_matrix:
        movie_ratings = rating_matrix[movie_id].copy()
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        movie_ratings = movie_ratings.drop(none_rating_idx)
        
        sim_scores = user_similarity[user_id].copy()
        sim_scores = sim_scores.drop(none_rating_idx)
        
        if neighbor_size == 0:          
            mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
        else:                       
            if len(sim_scores) > 1: 
                # 지정된 neighbor size 값과 해당 영화를 평가한 총 사용자 수 중 작은 것으로 결정
                neighbor_size = min(neighbor_size, len(sim_scores))
                # array로 바꾸기 (argsort를 사용하기 위함)
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                # 유사도를 순서대로 정렬 (오름차순)
                user_idx = np.argsort(sim_scores)
                # 유사도를 neighbor size만큼 받기
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                # 영화 rating을 neighbor size만큼 받기
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                # 최종 예측값 계산 
                mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
            else:
                mean_rating = 3.0
    else:
        mean_rating = 3.0
    return mean_rating

# 정확도 계산
score(cf_knn, neighbor_size=30)

1.0120536602605283

In [12]:
# 전체 데이터로 full matrix와 cosine similarity 구하기
# 실제 추천 시스템에서는 train_test_split을 안 한다
rating_matrix = ratings.pivot_table(values='rating', index='user_id', columns='movie_id')

from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)

def recommender(user, n_items=10, neighbor_size=20):
    predictions = []
    # 이미 평가한 영화 제외함
    rated_index = rating_matrix.loc[user][rating_matrix.loc[user] > 0].index
    items = rating_matrix.loc[user].drop(rated_index)
    
    # 해당 item에 대한 예측 평점들을 prediction에 넣기
    for item in items.index:
        predictions.append(cf_knn(user, item, neighbor_size))
    
    # 예측 평점이 높은 상위 n_items개의 아이템을 recommendations에 넣기
    recommendations = pd.Series(data=predictions, index=items.index, dtype=float)
    recommendations = recommendations.sort_values(ascending=False)[:n_items]
    
    # 추천 영화 제목 반환
    recommended_items = movies.loc[recommendations.index]['title']
    return recommended_items

recommender(user=2, n_items=5, neighbor_size=30)

movie_id
1189                      Prefontaine (1997)
1293                         Star Kid (1997)
1500               Santa with Muscles (1996)
1467    Saint of Fort Washington, The (1993)
318                  Schindler's List (1993)
Name: title, dtype: object

---

# 3.5 최적의 이웃 크기 설정

추천의 정확도를 최대로 하는 최적의 이웃 크기가 존재함  
But, 얼마가 최적의 크기인지는 분야에 따라 차이가 있음  
때문에 적절한 실험(노가다)이 필요함!

In [13]:
rating_matrix = x_train.pivot_table(values='rating', index='user_id', columns='movie_id')

from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)

# 이웃 크기 10씩 증가시키면서 최적의 RMSE 구해보기
for neighbor_size in [10, 20, 30, 40, 50, 60]:
    print("Neighbor size = %d : RMSE = %.4f" %(neighbor_size, score(cf_knn, neighbor_size)))

Neighbor size = 10 : RMSE = 1.0121
Neighbor size = 20 : RMSE = 1.0121
Neighbor size = 30 : RMSE = 1.0121
Neighbor size = 40 : RMSE = 1.0121
Neighbor size = 50 : RMSE = 1.0121
Neighbor size = 60 : RMSE = 1.0121


---

# 3.6 사용자의 평가경향을 고려한 CF (정확도 개선 방법)

Eg. 평소에 2.0의 평점을 주던 사람이 3.0을 주는 것 ↔ 평소에 4.0을 주던 사람이 3.0을 주는 것

① 각 사용자의 평점평균을 구한다.  
② 각 아이템의 평점을 각 사용자의 평균에서의 차이(평점편차)로 변환한다.  
③ 현 사용자의 이웃을 구하고, 이웃의 아이템에 대한 평점편차와 유사도를 가중평균(편차예측값)한다.  
④ 현 사용자의 평균에 ③에서 구한 편차예측값을 더해준다.  
⑤ 예측값을 구할 수 없다면, 사용자의 평점평균으로 대체한다.

In [14]:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('u.user', sep='|', names=u_cols, encoding='latin-1')
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown', 
          'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 
          'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
          'Thriller', 'War', 'Western']
movies = pd.read_csv('u.item', sep='|', names=i_cols, encoding='latin-1')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('u.data', sep='\t', names=r_cols, encoding='latin-1')
 
ratings = ratings.drop('timestamp', axis=1)
movies = movies[['movie_id', 'title']]

from sklearn.model_selection import train_test_split
x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)

rating_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')

from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)

In [15]:
# 각 행의 열들의 평균을 구함
rating_mean = rating_matrix.mean(axis=1)
rating_bias = (rating_matrix.T - rating_mean).T

def cf_knn_bias(user_id, movie_id, neighbor_size=0):
    if movie_id in rating_matrix:
        movie_ratings = rating_bias[movie_id].copy()
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        movie_ratings = movie_ratings.drop(none_rating_idx)
        
        sim_scores = user_similarity[user_id].copy()
        sim_scores = sim_scores.drop(none_rating_idx)
        
        if neighbor_size == 0:
            prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
            prediction = rating_mean[user_id] + prediction
        else:
            if len(sim_scores) > 1:
                neighbor_size = min(neighbor_size, len(sim_scores))
                
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                
                user_idx = np.argsort(sim_scores)
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                
                prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
                prediction = rating_mean[user_id] + prediction
            else:
                prediction = rating_mean[user_id]
    else:
        prediction = rating_mean[user_id]
    return prediction

score(cf_knn_bias, neighbor_size=30)

0.9610916781007818

→ 지금까지의 RMSE 중 가장 낮은 것을 확인할 수 있음

---

## 연습문제 3-2

In [16]:
def recommender2(user, n_items=10, neighbor_size=20):
    predictions = []
    rated_index = rating_bias.loc[user][rating_bias.loc[user] > 0].index
    items = rating_bias.loc[user].drop(rated_index)
    
    for item in items.index:
        predictions.append(cf_knn_bias(user, item, neighbor_size))
    recommendations = pd.Series(data=predictions, index=items.index, dtype=float)
    recommendations = recommendations.sort_values(ascending=False)[:n_items]
    recommended_items = movies.loc[recommendations.index]['title']
    
    return recommended_items

recommender2(user=2, n_items=5, neighbor_size=30)

movie_id
1467                                     Cure, The (1995)
1500    Prisoner of the Mountains (Kavkazsky Plennik) ...
1463                  Stars Fell on Henrietta, The (1995)
1449                               Golden Earrings (1947)
1642                                    Angel Baby (1995)
Name: title, dtype: object

---

# 3.7 그 외의 CF 정확도 개선 방법

공통 아이템이 많은 사용자와의 유사도 > 공통 아이템이 적은 사용자와의 유사도  
→ 유사도를 신뢰도에 따라 달리 주자!

In [17]:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('u.user', sep='|', names=u_cols, encoding='latin-1')
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown', 
          'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 
          'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
          'Thriller', 'War', 'Western']
movies = pd.read_csv('u.item', sep='|', names=i_cols, encoding='latin-1')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('u.data', sep='\t', names=r_cols, encoding='latin-1')

ratings = ratings.drop('timestamp', axis=1)
movies = movies[['movie_id', 'title']]

from sklearn.model_selection import train_test_split
x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)

rating_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')

from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)

rating_mean = rating_matrix.mean(axis=1)
rating_bias = (rating_matrix.T - rating_mean).T

In [18]:
# 해당 영화를 봤으면 1, 안 봤으면 0
rating_binary1 = np.array((rating_matrix > 0).astype(float))
rating_binary2 = rating_binary1.transpose()

# counts는 각 사용자가 공통으로 평가한 영화의 수가 됨
counts = np.dot(rating_binary1, rating_binary2)
counts = pd.DataFrame(counts, index=rating_matrix.index, columns=rating_matrix.index).fillna(0)
counts

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,204.0,10.0,4.0,5.0,42.0,56.0,83.0,23.0,3.0,43.0,...,43.0,11.0,25.0,12.0,21.0,9.0,26.0,5.0,19.0,50.0
2,10.0,46.0,5.0,3.0,4.0,15.0,10.0,2.0,2.0,9.0,...,8.0,6.0,15.0,5.0,11.0,7.0,11.0,4.0,6.0,3.0
3,4.0,5.0,40.0,8.0,1.0,5.0,6.0,3.0,1.0,3.0,...,1.0,2.0,8.0,5.0,5.0,0.0,9.0,1.0,4.0,1.0
4,5.0,3.0,8.0,18.0,1.0,3.0,3.0,4.0,1.0,3.0,...,1.0,1.0,4.0,3.0,3.0,0.0,7.0,2.0,4.0,0.0
5,42.0,4.0,1.0,1.0,131.0,20.0,59.0,16.0,3.0,16.0,...,35.0,3.0,10.0,3.0,10.0,4.0,19.0,3.0,10.0,34.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,9.0,7.0,0.0,0.0,4.0,10.0,12.0,2.0,1.0,4.0,...,6.0,11.0,12.0,4.0,18.0,37.0,4.0,4.0,1.0,8.0
940,26.0,11.0,9.0,7.0,19.0,35.0,45.0,6.0,4.0,30.0,...,26.0,5.0,17.0,7.0,11.0,4.0,80.0,5.0,12.0,18.0
941,5.0,4.0,1.0,2.0,3.0,8.0,5.0,4.0,2.0,4.0,...,1.0,3.0,10.0,6.0,9.0,4.0,5.0,17.0,1.0,0.0
942,19.0,6.0,4.0,4.0,10.0,21.0,29.0,7.0,2.0,14.0,...,13.0,4.0,8.0,3.0,5.0,1.0,12.0,1.0,59.0,12.0


In [19]:
rating_mean

user_id
1      3.568627
2      3.652174
3      2.825000
4      4.444444
5      2.824427
         ...   
939    4.270270
940    3.412500
941    4.000000
942    4.220339
943    3.452381
Length: 943, dtype: float64

In [20]:
def cf_knn_bias(user_id, movie_id, neighbor_size=0):
    if movie_id in rating_bias:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = rating_bias[movie_id].copy()
        # movie_rating 중 결측값인 경우
        no_rating = movie_ratings.isna()
        # 공통 아이템 수
        common_counts = counts[user_id]
        # 신뢰도 임계치 설정 → 신뢰도가 일정 수준 이상인 사용자만을 계산에 포함                              
        low_significance = common_counts < SIG_LEVEL
        
        none_rating_idx = movie_ratings[no_rating | low_significance].index
        movie_ratings = movie_ratings.drop(none_rating_idx)
        sim_scores = sim_scores.drop(none_rating_idx)
        
        if neighbor_size == 0:
            prediction = np.dot(movie_ratings, sim_scores) / sim_scores.sum()
            prediction = rating_mean[user_id] + prediction
        else:
            if len(sim_scores) > MIN_RATINGS:
                neighbor_size = min(neighbor_size, sim_scores)
                
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                user_idx = np.argsort(sim_scores)
                
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                
                prediction = np.dot(movie_ratings, sim_scores) / sim_scores.sum()
                prediction = rating_mean[user_id] + prediction
            else:
                prediction = rating_mean[user_id]
    else:
        prediction = rating_mean[user_id]
    return prediction

In [21]:
SIG_LEVEL = 3
MIN_RATINGS = 2

score(cf_knn_bias, neighbor_size=30)

nan

---

## 연습문제 3-3

정확도를 더 개선하기 위한 예측값 조정

In [22]:
def cf_knn_bias(user_id, movie_id, neighbor_size=0):
    if movie_id in rating_bias:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = rating_bias[movie_id].copy()
        # movie_rating 중 결측값인 경우
        no_rating = movie_ratings.isna()
        # 공통 아이템 수
        common_counts = counts[user_id]
        # 신뢰도 임계치 설정 → 신뢰도가 일정 수준 이상인 사용자만을 계산에 포함                              
        low_significance = common_counts < SIG_LEVEL
        
        none_rating_idx = movie_ratings[no_rating | low_significance].index
        movie_ratings = movie_ratings.drop(none_rating_idx)
        sim_scores = sim_scores.drop(none_rating_idx)
        
        if neighbor_size == 0:
            prediction = np.dot(movie_ratings, sim_scores) / sim_scores.sum()
            prediction = rating_mean[user_id] + prediction
        else:
            if len(sim_scores) > MIN_RATINGS:
                neighbor_size = min(neighbor_size, sim_scores)
                
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                user_idx = np.argsort(sim_scores)
                
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                
                prediction = np.dot(movie_ratings, sim_scores) / sim_scores.sum()
                prediction = rating_mean[user_id] + prediction
            else:
                prediction = rating_mean[user_id]
    else:
        prediction = rating_mean[user_id]
    
    # 예측값이 1보다 작으면 1로, 5보다 크면 5로 수정
    if prediction < 1:
        prediction = 1
    elif prediction > 5:
        prediction = 5
    
    return prediction

In [23]:
SIG_LEVEL = 3
MIN_RATINGS = 2

score(cf_knn_bias, neighbor_size=30)

nan

---

# 3.8 사용자 기반 CF와 아이템 기반 CF

유사도를 계산하는 기준이 사용자인가 아이템인가?

사용자 기반 CF(User-Based CF : **UBCF**)  
: 데이터 크기가 적고 각 사용자에 대한 충분한 정보(구매, 평가)가 있는 경우  
→ 정확도가 좋으나 계산이 조금 느릴 수 있고, 데이터가 바뀔 때마다 업데이트를 많이 해야하기 때문  

아이템 기반 CF(Item-Based CF : **IBCF**)   
: 데이터가 크거나 각 사용자에 대한 충분한 정보가 없는 경우  
→ 정확도는 좀 떨어지더라도 계산이 빠르며, 데이터가 바뀌더라도 업데이트를 자주 하지 않아도 됨(결과에 큰 영향 X)

In [24]:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('u.user', sep='|', names=u_cols, encoding='latin-1')
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown', 
          'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 
          'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
          'Thriller', 'War', 'Western']
movies = pd.read_csv('u.item', sep='|', names=i_cols, encoding='latin-1')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('u.data', sep='\t', names=r_cols, encoding='latin-1')


ratings = ratings.drop('timestamp', axis=1)
movies = movies[['movie_id', 'title']]

from sklearn.model_selection import train_test_split
x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)

rating_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')

In [25]:
# item 유사도 구하기 → IBCF 적용
from sklearn.metrics.pairwise import cosine_similarity
rating_matrix_t = np.transpose(rating_matrix)
matrix_dummy = rating_matrix_t.copy().fillna(0)
item_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
item_similarity = pd.DataFrame(item_similarity, index=matrix_dummy.index, columns=matrix_dummy.index)
item_similarity

movie_id,1,2,3,4,5,6,7,8,9,10,...,1671,1672,1674,1675,1676,1677,1678,1679,1680,1682
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.315738,0.297080,0.344269,0.248066,0.094199,0.422083,0.344082,0.351522,0.217277,...,0.0,0.038713,0.0,0.000000,0.000000,0.041061,0.0,0.0,0.0,0.054748
2,0.315738,1.000000,0.198259,0.367348,0.231067,0.094258,0.337723,0.248052,0.158356,0.159327,...,0.0,0.064135,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.090701
3,0.297080,0.198259,1.000000,0.275710,0.103277,0.117301,0.269733,0.142555,0.210869,0.119757,...,0.0,0.000000,0.0,0.000000,0.000000,0.038348,0.0,0.0,0.0,0.115045
4,0.344269,0.367348,0.275710,1.000000,0.252114,0.053990,0.363876,0.371715,0.326621,0.184829,...,0.0,0.046357,0.0,0.109265,0.109265,0.043706,0.0,0.0,0.0,0.087412
5,0.248066,0.231067,0.103277,0.252114,1.000000,0.032112,0.230441,0.204300,0.158553,0.067482,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.109181
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,0.041061,0.000000,0.038348,0.043706,0.000000,0.000000,0.059735,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,1.000000,0.0,0.0,0.0,0.000000
1678,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,1.0,1.0,1.0,0.000000
1679,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,1.0,1.0,1.0,0.000000
1680,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,1.0,1.0,1.0,0.000000


In [26]:
def cf_IBCF(user_id, movie_id):
    if movie_id in rating_matrix:
        user_rating = rating_matrix_t[user_id]
        none_rating_idx = user_rating[user_rating.isnull()].index
        user_rating = user_rating.drop(none_rating_idx)
        
        sim_scores = item_similarity[movie_id]
        sim_scores = sim_scores.drop(none_rating_idx)
        
        mean_rating = np.dot(sim_scores, user_rating) / np.sum(sim_scores)
    else:
        mean_rating = 3.0
    return mean_rating

score(cf_IBCF)

1.0173565223260255

→ UBCF와 그렇게 큰 차이가 있지는 않음