## 데이터 확인
 - 원본 데이터 파일마다 구분자가 다르므로(u.user, u.item은 `|`, u.data는 tab) 읽을 때 `sep`을 알맞게 지정해야 함

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Read the user table (parsed with '|' as the separator, no header row in the
# file, so column names are supplied here) and index it by user_id.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    '../data/u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
).set_index('user_id')
users.head()

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [3]:
# Column names for the movie table: id, title, dates and URL, followed by one
# column per genre label.
# NOTE(review): "Childern's" looks like a misspelling of "Children's"; it is
# kept unchanged here because it is the column label the rest of the notebook
# (and the displayed output) sees.
i_cols = [
    'movie_id', 'title', 'release_date', 'video_release_date', 'IMDB_URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Childern's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
# Read the movie table (parsed with '|' as the separator) and index it by movie_id.
movies = pd.read_csv(
    '../data/u.item',
    sep='|',
    names=i_cols,
    encoding='latin-1',
).set_index('movie_id')
movies.head()

Unnamed: 0_level_0,title,release_date,video_release_date,IMDB_URL,unknown,Action,Adventure,Animation,Childern's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
# Read the ratings table (tab-separated) and index it by user_id; one row per
# (user_id, movie_id) rating with its timestamp.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    '../data/u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
).set_index('user_id')
ratings.head()

Unnamed: 0_level_0,movie_id,rating,timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
196,242,3,881250949
186,302,3,891717742
22,377,1,878887116
244,51,2,880606923
166,346,1,886397596


## 인기제품 방식
 - 개별 사용자 정보가 없거나 간단한 추천만 필요한 경우 -> 모든 사람에게 똑같은 추천을 제공해야 함
   - 이때는 가장 인기 있는 제품을 추천하는 것이 합리적 -> 영화별 평점 평균을 구해 평균이 높은 순서대로 추천

In [5]:
# Best-seller recommendation
def recom_movie1(n_items):
    """Return the titles of the `n_items` movies with the highest mean rating.

    Reads the module-level `ratings` and `movies` tables.

    Args:
        n_items: Number of titles to return.

    Returns:
        A Series of titles indexed by movie_id, ordered from the highest
        mean rating downwards.
    """
    movie_mean = ratings.groupby(['movie_id'])['rating'].mean()
    top_ids = movie_mean.sort_values(ascending=False).iloc[:n_items].index
    # `movies` is indexed by movie_id when first loaded, but a later cell
    # resets that index and keeps movie_id as a plain column. A bare
    # `movies.loc[top_ids]` would then match row positions instead of movie
    # ids, so always look titles up by movie_id explicitly.
    if 'movie_id' in movies.columns:
        titles = movies.set_index('movie_id')['title']
    else:
        titles = movies['title']
    recommendations = titles.loc[top_ids]
    return recommendations

def recom_movie2(n_items):
    """Compact variant: titles of the `n_items` movies with the highest mean rating.

    Reads the module-level `ratings` and `movies` tables.

    Args:
        n_items: Number of titles to return.

    Returns:
        A Series of titles indexed by movie_id, ordered from the highest
        mean rating downwards.
    """
    movie_mean = ratings.groupby(['movie_id'])['rating'].mean()
    top_ids = movie_mean.sort_values(ascending=False).iloc[:n_items].index
    # `movies` may have had its movie_id index reset to a plain column by a
    # later cell; key the lookup on movie_id either way so the ids are never
    # matched against row positions.
    by_id = movies.set_index('movie_id') if 'movie_id' in movies.columns else movies
    return by_id.loc[top_ids, 'title']

### 추천 시스템 정확도 측정
 - RMSE(Root Mean Squared Error) 활용 -> 예측값과 실제값 차이를 제곱해 평균한 뒤 제곱근을 취한 값 (작을수록 예측이 정확함)

In [6]:
def RMSE(y_true, y_pred):
    """Return the root-mean-squared error between two equally shaped inputs.

    Both arguments are converted with `np.array` before subtracting, so
    lists, tuples, arrays or Series are accepted.
    """
    residuals = np.array(y_true) - np.array(y_pred)
    mean_squared_error = np.mean(np.square(residuals))
    return np.sqrt(mean_squared_error)

In [7]:
# Baseline accuracy: predict every rating with the movie's overall mean rating,
# compute one RMSE per user, then report the average of those per-user values.
# Relies on `ratings` still being indexed by user_id at this point.
movie_mean = ratings.groupby(['movie_id'])['rating'].mean()
rmse = []
for user in set(ratings.index):
    # Fetch this user's rows once and reuse them for both columns.
    user_ratings = ratings.loc[user]
    y_true = user_ratings['rating']
    # Look up the overall mean of each movie this user rated.
    y_pred = movie_mean[user_ratings['movie_id']]
    accuracy = RMSE(y_true, y_pred)
    rmse.append(accuracy)
print(np.mean(rmse))

0.996007224010567


## 사용자 집단별 추천
 - best-seller 방식보다 비슷한 특성군으로 모아 추천을 진행
 - 각 집단의 평점평균을 바탕으로 추천
 - 비슷한 집단 설정 방식 -> 기본적으로는 성별, 나이 등이 존재, 더 나아가 데이터를 이용한 클러스터링도 하나의 방법이 될 것임
 - 아래 코드는 같은 성별의 사람들은 영화 취향이 비슷하다는 가정하에 추천을 진행


In [8]:
# Flatten both tables before splitting the data.
# movies: turn movie_id back into a regular column and keep only id + title.
movies.reset_index(inplace=True)
movies = movies.loc[:, ['movie_id', 'title']]
# ratings: discard the timestamp column and turn user_id back into a column.
ratings.drop(columns='timestamp', inplace=True)
ratings.reset_index(inplace=True)

In [9]:
# Train/test split.
# x is a copy of the full ratings table; y is the user_id column, passed as
# the stratification key so the split is stratified by user.
# NOTE(review): no random_state is passed, so the split (and the scores
# computed from it) will presumably differ between runs - confirm whether a
# fixed seed is wanted.
x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    stratify=y,
)

In [19]:
# Compute the RMSE of a given model on the test split.
def score(model):
    """Return the RMSE of `model` over every row of the module-level `x_test`.

    Args:
        model: Callable taking (user_id, movie_id) and returning a predicted
            rating.

    Returns:
        RMSE between the ratings in `x_test` and the model's predictions.
    """
    predictions = [
        model(uid, mid)
        for uid, mid in zip(x_test['user_id'], x_test['movie_id'])
    ]
    return RMSE(np.array(x_test['rating']), np.array(predictions))

In [20]:
# Reshape the training ratings into a matrix: one row per user_id, one column
# per movie_id, cells holding the rating.
rating_matrix = x_train.pivot(
    index='user_id',
    columns='movie_id',
    values='rating',
)

In [21]:
# Predict with the overall per-movie mean rating of the training split.
# The means are computed once here instead of inside best_seller(): score()
# calls the model once per test row, so regrouping x_train on every call
# repeats the same work for each prediction. Re-run this cell after
# re-splitting the data so the cached means match the current x_train.
train_mean = x_train.groupby('movie_id')['rating'].mean()


def best_seller(user_id, movie_id):
    """Predict a rating as the movie's mean rating in the training split.

    Args:
        user_id: Unused; present so the function matches the
            (user_id, movie_id) model signature that score() calls.
        movie_id: Movie whose mean training rating is returned.

    Returns:
        The mean training rating of `movie_id`, or 3.0 when the movie has no
        rating in the training split.
    """
    try:
        rating = train_mean[movie_id]
    except KeyError:
        # Movie never appears in x_train: fall back to a neutral rating.
        rating = 3.0
    return rating

score(best_seller)

1.0241374664529495

In [26]:
# Attach each rater's user attributes to the training ratings.
# The merge runs against a temporary copy of `users` with user_id as a column,
# so the global `users` table keeps its user_id index even if the merge fails
# or the cell is interrupted (cf_gender relies on users.loc[user_id]).
merged_ratings = pd.merge(x_train, users.reset_index(), on='user_id')

# Mean training rating for every (movie_id, sex) pair.
g_mean = merged_ratings.groupby(['movie_id', 'sex'])['rating'].mean()

In [27]:
# Preview the first (movie_id, sex) entries of the per-gender mean ratings.
g_mean.head()

movie_id  sex
1         F      3.766667
          M      3.922449
2         F      3.071429
          M      3.134146
3         F      2.300000
Name: rating, dtype: float64

In [37]:
# Gender-based recommendation
def cf_gender(user_id, movie_id):
    """Predict a rating as the movie's mean rating among users of the same sex.

    Reads the module-level `rating_matrix`, `users` and `g_mean`.

    Args:
        user_id: User whose `sex` value selects the group mean.
        movie_id: Movie to predict a rating for.

    Returns:
        The mean rating from `g_mean` for (movie_id, user's sex), or 3.0 when
        the movie is not a column of `rating_matrix` or has no mean for that
        sex.
    """
    fallback = 3.0
    # Guard: movie is not a column of the training rating matrix.
    if movie_id not in rating_matrix:
        return fallback
    gender = users.loc[user_id]['sex']
    # Per-sex means for this movie, looked up once and reused below.
    means_by_sex = g_mean[movie_id]
    return means_by_sex[gender] if gender in means_by_sex else fallback

score(cf_gender)

1.0316445942949637

## 번외
 - 나이(10살 단위 구간으로 나누어 코딩)
 - 직업