# 2.1 데이터 읽기

In [1]:
import pandas as pd
import numpy as np

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import os
os.chdir('C:/RecoSys/Data/')

In [4]:
# user 데이터 읽어오기
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('u.user', sep='|', names=u_cols, encoding='latin-1')
users = users.set_index('user_id')
users.head()

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [5]:
# item 파일 읽어오기(영화 정보, 영화 평점)
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 
          'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 
          'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 
          'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
movies = pd.read_csv('u.item', sep='|', names=i_cols, encoding='latin-1')
moives = movies.set_index('movie_id')
moives.head()

Unnamed: 0_level_0,title,release date,video release date,IMDB URL,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [6]:
# 평점 데이터 읽어오기
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('u.data', sep='\t', names=r_cols, encoding='latin-1')
ratings = ratings.set_index('user_id')
ratings.head()

Unnamed: 0_level_0,movie_id,rating,timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
196,242,3,881250949
186,302,3,891717742
22,377,1,878887116
244,51,2,880606923
166,346,1,886397596


→ timestamp : '연도/날짜/시간'을 숫자로 변환한 값

---

# 2.2 인기제품 방식(Best-Seller 추천)

1) 개별 사용자에 대한 정보가 없음  
2) 때문에 정확도에 관계없이 가장 '간단한 추천'을 제공해야하는 상황  
→ **평균값**을 이용한 추천이 합리적!

In [7]:
def recom_movie1(n_items):
    movie_sort = movie_mean.sort_values(ascending=False)[:n_items]
    recom_movies = moives.loc[movie_sort.index]
    recommendations = recom_movies['title']
    return recommendations

movie_mean = ratings.groupby(['movie_id'])['rating'].mean()
recom_movie1(5)

movie_id
814                         Great Day in Harlem, A (1994)
1599                        Someone Else's America (1995)
1201           Marlene Dietrich: Shadow and Light (1996) 
1122                       They Made Me a Criminal (1939)
1653    Entertaining Angels: The Dorothy Day Story (1996)
Name: title, dtype: object

※ 함수구조  
① movie_mean : movie_id별 평점평균 매기기  
② moive_sort : movie_mean의 평점 상위 n_items개의 영화를 내림차순으로 정렬  
③ recom_moives : movie_sort의 movie_id에 해당하는 영화 정보(제목, 출시일, ..) 추출  
④ recommendations : 추출한 정보 중에서 movie_id와 해당하는 제목만 추출 

In [8]:
def recom_movies1(n_items):
    return movies.loc[movie_mean.sort_values(ascending=False)[:n_items].index]['title']

→ recom_movies1의 압축형 버전!

---

# 2.3 추천시스템의 정확도 추천

추천시스템의 성능(정확도) 측정  
실제값과 예측값 간의 차이가 적어야함  
Eg. **RMSE**

In [9]:
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

In [10]:
rmse = []
for user in set(ratings.index):
    y_true = ratings.loc[user]['rating']
    y_pred = movie_mean[ratings.loc[user]['movie_id']]
    accuracy = RMSE(y_true, y_pred)
    rmse.append(accuracy)

print(np.mean(rmse))

0.996007224010567


※ y_pred 구조  
① user(index)에 해당하는 ratings 정보 긁어오기  
② 그 중, 'movie_id'만 갖고옴  
→ 이렇게 되면 user_id, movie_id로만 구성된 series가 완성  
③ movie_mean은 movie_id, 평점 평균으로 구성된 series임  
→ 이 과정을 거쳐 어떤 user가 어떤 moive에 대해 어떤 평점을 주었는지 알 수 있음

---

# 2.4 사용자 집단별 추천

best-seller처럼 전체 사용자의 평점평균을 이용하는 방법보다는 비슷한 특성의 사람들을 묶어서 이를 바탕으로 추천을 하는 것이 합리적!

In [11]:
# 사용자, 영화, 평점 데이터 다시 읽어오기
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('u.user', sep='|', names=u_cols, encoding='latin-1')

i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown', 
          'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 
          'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
          'Thriller', 'War', 'Western']
movies = pd.read_csv('u.item', sep='|', names=i_cols, encoding='latin-1')
movies = movies[['movie_id', 'title']]

r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('u.data', sep='\t', names=r_cols, encoding='latin-1')
ratings = ratings.drop('timestamp', axis=1)

## train_test_split

In [12]:
from sklearn.model_selection import train_test_split
X = ratings.copy()
y = ratings['user_id']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)

train_test_split에서 stratify의 역할 : 계층적 데이터 추출 옵션  
→ 원래 데이터의 분포와 유사하게 데이터 추출

## 정확도 계산

In [13]:
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true)-np.array(y_pred))**2))

# 모델별 RMSE 구하기
def score(model):
    id_pairs = zip(X_test['user_id'], X_test['movie_id'])
    y_pred = np.array([model(user, movie) for (user, movie) in id_pairs])
    y_true = np.array(X_test['rating'])
    return RMSE(y_true, y_pred)

In [14]:
# return Full-Matrix
rating_matrix = X_train.pivot(index='user_id', columns='movie_id', values='rating')
rating_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,,,4.0,,,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [15]:
# user_id를 기준으로 X_train, users merge하기
merged_ratings = pd.merge(X_train, users)

# set_index → 나중에 사용자의 데이터를 확인하기 위해 필요함
users = users.set_index('user_id')

# gender별 평점평균 계산
g_mean = merged_ratings[['movie_id', 'sex', 'rating']].groupby(['movie_id', 'sex'])['rating'].mean()
g_mean

movie_id  sex
1         F      3.744681
          M      3.910204
2         F      3.142857
          M      3.072289
3         F      2.909091
                   ...   
1677      F      3.000000
1678      M      1.000000
1679      M      3.000000
1680      M      2.000000
1682      M      3.000000
Name: rating, Length: 3019, dtype: float64

In [16]:
# gender별 기준 추천
def cf_gender(user_id, movie_id):
    if movie_id in rating_matrix:
        gender = users.loc[user_id]['sex']
        if gender in g_mean[movie_id]:
            gender_rating = g_mean[movie_id][gender]
        else:
            gender_rating = 3.0
    else:
        gender_rating = 3.0
    return gender_rating

score(cf_gender)

1.0341914952195754

---

## 연습문제 2-2

In [17]:
# merged_df = pd.merge(X_train, users)
# s_o_mean = merged_df[['movie_id', 'sex', 'occupation', 'rating']].groupby(['movie_id', 'sex', 'occupation'])['rating'].mean()
# s_o_mean

In [18]:
# def cf_sex_ocupation(user_id, movie_id):
#     sex = users.loc[user_id]['sex']
#     occupation = users.loc[user_id]['occupation']
    
#     if movie_id in rating_matrix:
#         if (sex in s_o_mean[movie_id]) and (occupation in s_o_mean[movie_id]):
#             s_o_rating = s_o_mean[movie_id][sex, occupation]
#         else:
#             s_o_rating = 3.0
#     else:
#         s_o_rating = 3.0
#     return s_o_rating

# score(cf_sex_occupation)

---

# 2.5 내용 기반 필터링 추천(Content-based filtering : CB)

**① 각 아이템 간의 유사도 계산**  
Eg. 아이템의 내용이 텍스트라면 tf-idf가 유사도 측정 지표가 됨(가중치 계산)    
cosine similarity를 통해 유사도 계산  
**② 추천 대상이 되는 사용자가 선호하는 (좋게 평가한) 아이템을 선정**  
→ 아이템을 몇 개나 고를 것인가?  
**③ ②에서 선정된 아이템과 가장 유사도가 높은 N개의 아이템 찾기**  
**④ N개의 아이템 추천**  
→ 아이템을 어떻게 결합할 것인가?  
Eg. 아이템을 합쳐서 만드는 pooling 방법, 각 아이템의 점수에 따라 나열하는 방법

**※ tf-idf(term frequency-inverse document frequency)**  
- tf : 어떤 단어가 해당 아이템(문서)에 얼마나 자주 등장하는가?  
- idf : 그 단어는 해당 문서에서 다른 문서에 비해서 상대적으로 얼마나 더 자주 등장하는가?

In [19]:
# Data 읽기
movies = pd.read_csv('movies_metadata.csv', encoding='latin-1', low_memory=False)
movies = movies[['id', 'title', 'overview']]
movies.head(10)
len(movies)

45442

In [20]:
# Preprocessing
movies = movies.dropna()
movies['overview'] = movies['overview'].fillna(' ')
len(movies)

# 불용어 enlgish로 지정 → tf-idf 계산
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['overview'])

# Cosine 유사도 계산
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_sim = pd.DataFrame(cosine_sim, index=movies.index, columns=movies.index)

In [21]:
# title을 index로! value는 영화 index
indices = pd.Series(movies.index, index=movies['title'])

def content_recommender(title, n_of_recomm):
    # idx 추출
    idx = indices[title]
    
    # 코사인 유사도 계산(자기자신은 제외) 후 정렬
    sim_scores = cosine_sim[idx]
    sim_scores = sim_scores.sort_values(ascending=False)[1:n_of_recomm+1]
    
    # 코사인 유사도에 기반하여 영화 제목 추출
    return movies.loc[sim_scores.index]['title']

# 추천
print(content_recommender('The Lion King', 5))
print(content_recommender('The Dark Knight Rises', 10))

34664    How the Lion Cub and the Turtle Sang a Song
9339                               The Lion King 1Â½
9101                  The Lion King 2: Simba's Pride
42806                                           Prey
25637                                 Fearless Fagan
Name: title, dtype: object
12468                                      The Dark Knight
149                                         Batman Forever
1321                                        Batman Returns
15497                           Batman: Under the Red Hood
584                                                 Batman
21179    Batman Unmasked: The Psychology of the Dark Kn...
9216                    Batman Beyond: Return of the Joker
18021                                     Batman: Year One
19778              Batman: The Dark Knight Returns, Part 1
3085                          Batman: Mask of the Phantasm
Name: title, dtype: object
