## 라이브러리 불러오기
* pandas
  * 데이터 조작 및 분석 라이브러리
  * 테이블 형태의 데이터
* numpy
  * 다차원 배열과 행렬 연산
  * 데이터 과학 및 머신러닝 분야
<br>
<br>
* warnings를 사용하는 이유
  * 패키지의 버전이 달라서 발생하는 warning 메시지들을 제거

In [None]:
import pandas as pd
import numpy as np

import warnings
# Silence every warning for the rest of the session; the stated goal of this
# notebook is to hide the warnings caused by package-version mismatches.
warnings.filterwarnings('ignore')

## Read Data

In [None]:
# All three input files live in the same directory.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/data/ml-latest-small'

# NOTE: read_pickle can execute code embedded in the file; only load pickles
# that you created yourself or otherwise trust.
movies = pd.read_csv(f'{DATA_DIR}/movies.csv')
ratings = pd.read_pickle(f'{DATA_DIR}/ratings_updated.p')
genres = pd.read_pickle(f'{DATA_DIR}/genres.p')

> 이전에 추가했던 샘플 데이터 확인

In [None]:
ratings.tail()

Unnamed: 0,userId,movieId,rating,timestamp
44,1001,1866,2.5,1999-12-20 06:10:08
45,1001,2405,3.0,2003-03-05 16:19:57
46,1001,1343,3.0,1997-03-19 12:00:45
47,1001,1875,4.0,2003-03-31 02:13:13
48,1001,4221,2.5,2010-02-27 02:23:02


## Processing(전처리)

In [None]:
# Step 1. Inspect the tables (one random row per table)

movies.sample()

Unnamed: 0,movieId,title,genres
156,185,"Net, The (1995)",Action|Crime|Thriller


In [None]:
ratings.sample()

Unnamed: 0,userId,movieId,rating,timestamp
36059,246,5902,4.5,2012-11-28 18:31:36


In [None]:
genres.sample()

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
7001,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0,0


* 테이블 병합
  * `merge(how='inner')`
    * inner join은 default이다.
  * `merge(left_on='movieId', right_index=True)`
    * right_on이 index인 경우 right_index=True로 표기한다.

In [None]:
# Step 2. Merge the tables
# Attach each movie's genre columns to its rating rows: `movieId` on the left
# is matched against the index of `genres` (inner join, the pandas default).
ratings = pd.merge(
    ratings,
    genres,
    how='inner',
    left_on='movieId',
    right_index=True,
)
ratings.sample()

Unnamed: 0,userId,movieId,rating,timestamp,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
56537,376,165,3.5,2013-04-03 13:01:40,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


* 계산 편의를 위해 0 값 null 처리
  * NaN (Not a Number)

In [None]:
# Step 3. Turn the 0 genre flags into missing values
# Only the genre columns are touched: a blanket `ratings.replace(0, np.nan)`
# would also blank out any legitimate 0 in userId / movieId / rating.
# With NaN instead of 0, later mean() calls skip the genres a movie lacks.
ratings[genres.columns] = ratings[genres.columns].replace(0, np.nan)
ratings.sample()

Unnamed: 0,userId,movieId,rating,timestamp,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
86611,561,377,3.5,2017-04-01 23:58:47,,1.0,,,,,...,,,,,,1.0,,1.0,,


## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the rating rows as the test set; the fixed seed keeps the
# split reproducible.
train, test = train_test_split(
    ratings,
    test_size=0.1,
    random_state=33,
)

In [None]:
# Print (rows, columns) of the training split, then of the test split.
for split in (train, test):
    print(split.shape)

(90796, 24)
(10089, 24)


## User Profile
> User Profile이란? <br>
> userId가 각 genre에 대해서 평균 rating을 얼마로 남겼는지

In [None]:
# Preview one random row of the training split
train.sample()

Unnamed: 0,userId,movieId,rating,timestamp,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
84077,534,130520,4.0,2016-04-04 16:39:58,,,1.0,1.0,1.0,1.0,...,,,,,,,1.0,,,


In [None]:
genres.loc[3000]

(no genres listed)    0
Action                1
Adventure             1
Animation             1
Children              0
Comedy                0
Crime                 0
Documentary           0
Drama                 1
Fantasy               1
Film-Noir             0
Horror                0
IMAX                  0
Musical               0
Mystery               0
Romance               0
Sci-Fi                0
Thriller              0
War                   0
Western               0
Name: 3000, dtype: int64

In [None]:
# Column labels of the `genres` table; used below to select the genre columns.
genre_cols = genres.columns
genre_cols

Index(['(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
       'War', 'Western'],
      dtype='object')

* train df의 평점 col을 가중 평점 col로 변경
  * 새로운 열 = 각 영화의 장르 * 영화의 평점으로 가중치
  *    `train[cols] = train[cols] * train['rating']`

In [None]:
# Step 1. Replace the genre flags of `train` with rating-weighted values
# Row-wise multiplication (axis=0): every genre cell is multiplied by the
# rating of its own row, so a flagged genre now holds the rating and NaN
# stays NaN.
train[genre_cols] = train[genre_cols].mul(train['rating'], axis=0)
train.sample(3)

Unnamed: 0,userId,movieId,rating,timestamp,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
91831,596,1961,3.0,2018-09-01 18:41:06,,,,,,,...,,,,,,,,,,
36179,247,223,3.0,2016-07-04 20:54:19,,,,,,3.0,...,,,,,,,,,,
18390,117,225,3.0,1996-10-01 09:41:53,,,,,,,...,,,,,,,,3.0,,


> 해석:
<br>
> 3번 유저는 애니메이션, 어린이 장르보다는 호러나 미스터리 장르를 선호한다.

In [None]:
# Step 2. Per-user mean of the rating-weighted genre columns

# Same pattern for a hand-picked subset (pass the column names as a list):
# `train.groupby('userId')[['Action', 'Adventure']].mean()`
user_profile = train.groupby('userId')[genre_cols].mean()
user_profile.sample()

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
415,,3.895833,3.857143,4.0,4.0,3.982143,4.442308,4.5,4.290698,3.5,4.333333,3.875,4.0,3.5,4.25,3.973684,4.0,4.222222,4.5,4.75


In [None]:
user_profile.loc[266]

(no genres listed)         NaN
Action                3.704918
Adventure             3.641026
Animation             2.800000
Children              2.125000
Comedy                3.309859
Crime                 3.862069
Documentary           3.000000
Drama                 3.313725
Fantasy               3.294118
Film-Noir             4.000000
Horror                3.000000
IMAX                       NaN
Musical               3.000000
Mystery               2.666667
Romance               2.266667
Sci-Fi                3.580645
Thriller              3.604651
War                   4.571429
Western               3.000000
Name: 266, dtype: float64

### My user profile
* userId: 1000번, 1001번

In [None]:
user_profile.loc[[1000, 1001]]

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1000,,3.35,4.0,4.0,3.5,3.5,3.3,4.0,2.95,4.0,,2.833333,,2.5,4.0,3.666667,3.5,3.166667,,4.0
1001,,3.083333,3.25,2.0,2.5,3.0,3.333333,,3.333333,2.333333,,4.0,3.0,2.0,,2.75,,3.166667,2.5,3.5


## Predict
### - Sample data

> 특정 사용자의 취향 추측하기
- 다큐멘터리, 서부 영화 등을 좋아하고 뮤지컬은 좋아하지 않는다.
- 느와르는 본 적이 없다.

*주의! sample은 특정 인덱스 '13852'에 대한 정보이다.*



In [None]:
# Step 1. Pick one test row by its index label
# NOTE(review): this yields a single row only if label 13852 is unique in
# test.index — confirm.
sample = test.loc[13852]

In [None]:
# Step 2. Read the userId of the sampled row
sample_user = sample['userId']
sample_user

89

In [None]:
# Step 3. Look up that user's per-genre mean ratings
sample_user_profile = user_profile.loc[sample_user]
sample_user_profile

(no genres listed)    3.000000
Action                3.336538
Adventure             3.531250
Animation             4.088608
Children              3.909091
Comedy                3.450116
Crime                 3.325000
Documentary           4.500000
Drama                 3.367021
Fantasy               3.500000
Film-Noir                  NaN
Horror                4.266667
IMAX                  4.125000
Musical               2.852941
Mystery               3.833333
Romance               3.085714
Sci-Fi                3.387097
Thriller              3.738095
War                   3.227273
Western               4.500000
Name: 89, dtype: float64

> 특정 영화의 예상 평점 추측하기:
- sample의 movieId는 88069
- 이 영화의 장르는 Comedy와 Crime
- 장르의 평점(예상): 3.45, 3.33
  - sample_user_profile 참고
- 영화의 평점(예상): 3.39

*주의! sample은 특정 인덱스 '13852'에 대한 정보이다.*

In [None]:
# Step 1. Read the movieId of the sampled row
sample['movieId']

88069

In [None]:
# Step 2. Genre information of the sampled movie
# Look the movie up through the sampled row rather than a hard-coded id, so
# this cell keeps matching `sample` if a different row is picked.
movies[movies['movieId'] == sample['movieId']]

Unnamed: 0,movieId,title,genres
7641,88069,Delhi Belly (2011),Comedy|Crime


In [None]:
# Step 3-1. Estimate the rating for each genre
# Genres the movie does not carry are NaN in the row, so only the movie's
# own genres keep the user's mean rating.
sample_user_profile.mul(sample[genre_cols])

(no genres listed)         NaN
Action                     NaN
Adventure                  NaN
Animation                  NaN
Children                   NaN
Comedy                3.450116
Crime                    3.325
Documentary                NaN
Drama                      NaN
Fantasy                    NaN
Film-Noir                  NaN
Horror                     NaN
IMAX                       NaN
Musical                    NaN
Mystery                    NaN
Romance                    NaN
Sci-Fi                     NaN
Thriller                   NaN
War                        NaN
Western                    NaN
dtype: object

In [None]:
# Step 3-2. Estimate the movie rating
# Average the per-genre estimates; mean() skips the NaN entries.
per_genre_estimate = sample_user_profile.mul(sample[genre_cols])
per_genre_estimate.mean()

3.387558004640371

### 전체 데이터로 확장

> tqdm 라이브러리란?
<br>
> 진행 상황을 시각적으로 표시

In [None]:
from tqdm import tqdm_notebook

* `iterrows()` 함수
  * DataFrame을 행 단위로 순회하며 각 행의 인덱스와 데이터를 반환
* `user_profile.loc[user]`
  * 특정 사용자의 프로파일
* `row[genre_cols]`
  *  특정 행의 장르 관련 열 값

In [None]:
# Step1. 예측값 리스트 생성

# Build one predicted rating per test row, in test-row order.
predict = []
# iterrows() is a generator without a length, so the row count is passed as
# `total`; otherwise the progress bar cannot show a percentage or an ETA.
for idx, row in tqdm_notebook(test.iterrows(), total=len(test)):
    user = row['userId']
    if user not in user_profile.index:
        # The user has no rows in the training split, so there is no profile
        # to look up. Record NaN instead of raising KeyError.
        predict.append(np.nan)
        continue
    # user profile * item profile, averaged over the movie's genres
    # (NaN entries are skipped by mean()).
    predict.append((user_profile.loc[user] * row[genre_cols]).mean())

0it [00:00, ?it/s]

* 전체 데이터 평균 vs 유저 평균
  * (유저 평균이 더 정확할 수 있으나) 전체 데이터 평균으로 계산

> test 데이터프레임에 predict 열을 추가하고,
> <br>이 열의 일부 값이 결측치(NaN)일 때, 해당 값을 훈련 데이터의 평균 평점으로 대체하는 작업을 수행

In [None]:
# Store the predictions, then fall back to the mean rating of the training
# split wherever no prediction could be computed (NaN).
global_mean = train['rating'].mean()
test['predict'] = predict
test['predict'] = test['predict'].fillna(global_mean)

In [None]:
test[test['predict'].isnull()]

Unnamed: 0,userId,movieId,rating,timestamp,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,predict


## Model validation

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Root-mean-squared error between the true ratings and the predictions.
mse = mean_squared_error(
    y_true=test['rating'],
    y_pred=test['predict'],
)
rmse = np.sqrt(mse)
rmse

0.9262258382619217

### user profile,