In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### User(사용자) 데이터

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras

u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('/content/drive/MyDrive/recosys/u.user', sep = '|', names = u_cols, encoding = 'latin-1')
users = users.set_index('user_id')
users.head()

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


### Item(아이템) 데이터

In [3]:
i_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'IMDB URL', 'unknown', 'Action',
          'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film_Horror',
          'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'war', 'western']

movies = pd.read_csv('/content/drive/MyDrive/recosys/u.item', sep = '|', names = i_cols, encoding = 'latin-1')
movies = movies.set_index('movie_id')
movies.head()

Unnamed: 0_level_0,title,release_date,video_release_date,IMDB URL,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film_Horror,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,war,western
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### Ratings(평점) 데이터

In [4]:
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('/content/drive/MyDrive/recosys/u.data', sep = '\t', names = r_cols, encoding = 'latin-1')
ratings = ratings.set_index('user_id')
ratings.head()

Unnamed: 0_level_0,movie_id,rating,timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
196,242,3,881250949
186,302,3,891717742
22,377,1,878887116
244,51,2,880606923
166,346,1,886397596


### 인기 제품 추천 방식 - Best Seller 추천

개별 사용자에 대한 정보가 없거나 정확도에 관계없이 가장 간단한 추천을 제공해야 하는 상황에서 사용할 수 있는 방법은 모든 사람들에게 똑같이 인기있는 제품을 추천하는 방식이다


In [5]:
def recom_movie(n_items):
    movie_sort = movie_mean.sort_values(ascending = False)[:n_items]
    recom_movies = movies.loc[movie_sort.index]
    recommendations = recom_movies['title']
    return recommendations

movie_mean = ratings.groupby(['movie_id'])['rating'].mean()
recom_movie(5)

movie_id
814                         Great Day in Harlem, A (1994)
1599                        Someone Else's America (1995)
1201           Marlene Dietrich: Shadow and Light (1996) 
1122                       They Made Me a Criminal (1939)
1653    Entertaining Angels: The Dorothy Day Story (1996)
Name: title, dtype: object

In [6]:
def RMSE(y_true, y_pred):
  return np.sqrt(np.mean((np.array(y_true)-np.array(y_pred))**2))

rmse = []
for user in set(ratings.index):
  y_true = ratings.loc[user]['rating']
  y_pred = movie_mean[ratings.loc[user]['movie_id']]
  accuracy = RMSE(y_true, y_pred)
  rmse.append(accuracy)

print(f'100K 개의 영화 평점에 대한 추천 시스템의 성능(RMSE): {np.round(np.mean(rmse),4)}')

100K 개의 영화 평점에 대한 추천 시스템의 성능(RMSE): 0.996


### 사용자 집단별 추천

best-seller 방식보다 조금 더 발전한 방법으로 사용자들을 비슷한 특성끼리 묶은 후, 각 집단의 평점평균을 바탕으로 추천하는 것이다. 사용자들을 집단으로 묶는 기준으로는 나이, 성별, 직업 등이 있다. 같은 성별이나 나이, 직업이 비슷한 사람들끼리 영화 취향이 비슷할 것이라는 가정을 전제로 한다

In [7]:
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('/content/drive/MyDrive/recosys/u.data', sep = '\t', names = r_cols, encoding = 'latin-1')

u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('/content/drive/MyDrive/recosys/u.user', sep = '|', names = u_cols, encoding = 'latin-1')

#timestamp 제거
ratings = ratings.drop('timestamp', axis = 1)

#movie ID와 title 빼고 다른 데이터 제거
i_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'IMDB URL', 'unknown', 'Action',
          'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film_Horror',
          'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'war', 'western']

movies = pd.read_csv('/content/drive/MyDrive/recosys/u.item', sep = '|', names = i_cols, encoding = 'latin-1')
movies = movies[['movie_id', 'title']]

In [8]:
#train, test 분리
from sklearn.model_selection import train_test_split

x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.25, stratify = y)

#### 모델 별로 RMSE 계산하기

In [9]:
def RMSE(y_true, y_pred):
  return np.sqrt(np.mean((np.array(y_true)-np.array(y_pred))**2))

#모델 별 RMSE를 계산하는 함수
def score(model):
  id_pairs = zip(x_test['user_id'], x_test['movie_id'])
  y_pred = np.array([model(user, movie) for (user, movie) in id_pairs])
  y_true = np.array(x_test['rating'])
  return RMSE(y_true, y_pred)

#train data로 Full Matrix 구하기
rating_matrix = x_train.pivot(index = 'user_id', columns = 'movie_id', values = 'rating')
rating_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1668,1669,1670,1671,1672,1677,1678,1679,1680,1681
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,,3.0,,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,,,,4.0,,3.0,,...,,,,,,,,,,
941,,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [10]:
#전체 평균으로 예측치를 계산하는 기본 모델

def best_seller(user_id, movie_id):
  try:
    rating = train_mean[movie_id]  #해당 데이터가 평균 데이터에 존재한다면 평균값을 반환

  except:
    rating = 3.0  #해당 데이터가 평균 평점이 존재하지 않으면 기본값 3.0을 반환

  return rating

train_mean = x_train.groupby(['movie_id'])['rating'].mean()
print(np.round(score(best_seller),4))

1.0271


#### 성별로 집단 나누어 예측

In [11]:
x_train

Unnamed: 0,user_id,movie_id,rating
67655,655,300,3
2617,125,1180,3
9856,356,272,5
67184,903,191,5
20247,181,137,2
...,...,...,...
29601,303,1073,4
14706,405,719,1
60470,269,232,1
35196,537,239,2


In [12]:
users

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [13]:
#Full Matrix를 사용자 데이터와 merge
merged_ratings = pd.merge(x_train, users)
users = users.set_index('user_id')

#gender별 평점평균 계산
g_mean = merged_ratings[['movie_id', 'sex', 'rating']].groupby(['movie_id', 'sex'])['rating'].mean()

In [14]:
def cf_gender(user_id, movie_id):
  if movie_id in rating_matrix :
    gender = users.loc[user_id]['sex']

    if gender in g_mean[movie_id]:
      gender_rating = g_mean[movie_id][gender]

    else:
      gender_rating = 3.0

  else:
    gender_rating = 3.0

  return gender_rating

score(cf_gender)

1.034989674516658

#### 직업으로 집단 나누어 예측

In [15]:
o_mean = merged_ratings[['movie_id', 'occupation', 'rating']].groupby(['movie_id', 'occupation'])['rating'].mean()

In [16]:
def cf_occupation(user_id, movie_id):
  if movie_id in rating_matrix:
    occupation = users.loc[user_id]['occupation']

    if occupation in o_mean[movie_id]:
      occupation_rating = o_mean[movie_id][occupation]

    else:
      occupation_rating = 3.0

  else:
    occupation_rating = 3.0

  return occupation_rating

score(cf_occupation)

1.1241232216011365

#### 성별과 직업 모두 고려해서 나누어 예측

In [17]:
b_mean = merged_ratings[['movie_id', 'sex', 'occupation', 'rating']].groupby(['movie_id', 'sex', 'occupation'])['rating'].mean()

In [18]:
def cf_both(user_id, movie_id):
  if movie_id in rating_matrix:
    both = users.loc[user_id]

    if both is not None:
      gender = both['sex']
      occupation = both['occupation']
      if (movie_id in b_mean) and (gender, occupation) in b_mean[movie_id]:
          return b_mean[movie_id][(gender, occupation)]

  both_rating = 3.0

  return both_rating

score(cf_both)

1.144995118518433