In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# data directory absolute path
data_dir = '../../../../data/'

In [3]:
# user information

u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(data_dir+'u.user', sep='|', names=u_cols, encoding='latin-1')
users = users.set_index('user_id')
users.head()

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [7]:
# item(movie) information

i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 
          'unknown', 'Action', 'Adventure', 'Animation', 'Childeren\'s', 'Comedy', 'Crime', 'Documentary', 'Drama',
          'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
         ]
movies = pd.read_csv(data_dir+'u.item', sep='|', names=i_cols, encoding='latin-1')
movies = movies.set_index('movie_id')
movies.head()

Unnamed: 0_level_0,title,release date,video release date,IMDB URL,unknown,Action,Adventure,Animation,Childeren's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [8]:
# rating information

r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(data_dir+'u.data', sep='\t', names=r_cols, encoding='latin-1')
ratings = ratings.set_index('user_id')
ratings.head()

Unnamed: 0_level_0,movie_id,rating,timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
196,242,3,881250949
186,302,3,891717742
22,377,1,878887116
244,51,2,880606923
166,346,1,886397596


In [15]:
movie_mean = ratings.groupby(['movie_id'])['rating'].mean()
movie_mean.head()

movie_id
1    3.878319
2    3.206107
3    3.033333
4    3.550239
5    3.302326
Name: rating, dtype: float64

In [20]:
# Popularity (best-seller)

def recommend_movie1(n_items):
    movie_sort = movie_mean.sort_values(ascending=False)[:n_items]
#     print('movie_sort.index', movie_sort.index)
    recommend_movies = movies.loc[movie_sort.index]
#     print('recommend_movies', recommend_movies)
    recommendations = recommend_movies['title']
    return recommendations

recommend_movie1(5)

movie_id
1293                                      Star Kid (1997)
1467                 Saint of Fort Washington, The (1993)
1653    Entertaining Angels: The Dorothy Day Story (1996)
814                         Great Day in Harlem, A (1994)
1122                       They Made Me a Criminal (1939)
Name: title, dtype: object

In [21]:
# Pythonic version

def recommend_movie2(n_items):
    return movies.loc[movie_mean.sort_values(ascending=False)[:n_items].index]['title']
recommend_movie2(10)

movie_id
1293                                      Star Kid (1997)
1467                 Saint of Fort Washington, The (1993)
1653    Entertaining Angels: The Dorothy Day Story (1996)
814                         Great Day in Harlem, A (1994)
1122                       They Made Me a Criminal (1939)
1599                        Someone Else's America (1995)
1201           Marlene Dietrich: Shadow and Light (1996) 
1189                                   Prefontaine (1997)
1500                            Santa with Muscles (1996)
1536                                 Aiqing wansui (1994)
Name: title, dtype: object

In [23]:
# RMSE

def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true)-np.array(y_pred))**2))

In [24]:
rmse = []
for user in set(ratings.index): 
    y_true = ratings.loc[user]['rating']
    y_pred = movie_mean[ratings.loc[user]['movie_id']]
    accuracy = RMSE(y_true, y_pred)
    rmse.append(accuracy)
print(np.mean(rmse))

0.996007224010567


In [26]:
# 사용자 집단별 추천

u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(data_dir+'u.user', sep='|', names=u_cols, encoding='latin-1')

i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 
          'unknown', 'Action', 'Adventure', 'Animation', 'Childeren\'s', 'Comedy', 'Crime', 'Documentary', 'Drama',
          'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
         ]
movies = pd.read_csv(data_dir+'u.item', sep='|', names=i_cols, encoding='latin-1')

r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(data_dir+'u.data', sep='\t', names=r_cols, encoding='latin-1')

In [27]:
# timestamp 제거
ratings = ratings.drop('timestamp', axis=1)
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [28]:
# movie ID와 title 빼고 다른 데이터 제거
movies = movies[['movie_id', 'title']]
movies.head()

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [41]:
# train, test set 분리

x = ratings.copy() # 원본.copy() : 원본 보존
y = ratings['user_id'] # stratify의 기준
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)

In [42]:
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

In [43]:
def score(model):
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    y_pred = np.array([model(user, movie) for (user, movie) in id_pairs])
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)

In [44]:
# train data로 full matrix 구하기

rating_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')
print(x_train)
print(rating_matrix)

       user_id  movie_id  rating
59218      391        48       4
80624      833        89       5
44528      670       945       4
45439      588       154       4
61741      838      1005       4
...        ...       ...     ...
88904      709       118       5
90838      721       237       3
11522      293       485       3
66668      788       227       3
59385      293        97       4

[75000 rows x 3 columns]
movie_id  1     2     3     4     5     6     7     8     9     10    ...  \
user_id                                                               ...   
1          NaN   3.0   4.0   3.0   3.0   NaN   NaN   NaN   5.0   NaN  ...   
2          NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
3          NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
4          NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
5          NaN   3.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
...        ...   ...   ...   ...   ... 

In [45]:
def best_seller(user_id, movie_id):
    try:
        rating = train_mean[movie_id]
    except:
        rating = 3.0
    return rating

train_mean = x_train.groupby(['movie_id'])['rating'].mean()
score(best_seller)

1.025820053631765

In [51]:
# Full matrix, user data merge
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(data_dir+'u.user', sep='|', names=u_cols, encoding='latin-1')
print(x_train)
print(users)
merged_ratings = pd.merge(x_train, users)
users = users.set_index('user_id')
merged_ratings

       user_id  movie_id  rating
59218      391        48       4
80624      833        89       5
44528      670       945       4
45439      588       154       4
61741      838      1005       4
...        ...       ...     ...
88904      709       118       5
90838      721       237       3
11522      293       485       3
66668      788       227       3
59385      293        97       4

[75000 rows x 3 columns]
     user_id  age sex     occupation zip_code
0          1   24   M     technician    85711
1          2   53   F          other    94043
2          3   23   M         writer    32067
3          4   24   M     technician    43537
4          5   33   F          other    15213
..       ...  ...  ..            ...      ...
938      939   26   F        student    33319
939      940   32   M  administrator    02215
940      941   20   M        student    97229
941      942   48   F      librarian    78209
942      943   22   M        student    77841

[943 rows x 5 columns]


Unnamed: 0,user_id,movie_id,rating,age,sex,occupation,zip_code
0,391,48,4,23,M,student,84604
1,391,89,3,23,M,student,84604
2,391,228,2,23,M,student,84604
3,391,603,5,23,M,student,84604
4,391,174,5,23,M,student,84604
...,...,...,...,...,...,...,...
74995,143,331,5,42,M,technician,08832
74996,143,323,3,42,M,technician,08832
74997,143,1038,3,42,M,technician,08832
74998,143,294,3,42,M,technician,08832


In [52]:
# gender별 각 영화의 평점 평균 계산
g_mean = merged_ratings[['movie_id', 'sex', 'rating']].groupby(['movie_id', 'sex'])['rating'].mean()
g_mean

movie_id  sex
1         F      3.826087
          M      3.909091
2         F      3.461538
          M      3.187500
3         F      2.785714
                   ...   
1676      M      2.000000
1678      M      1.000000
1679      M      3.000000
1680      M      2.000000
1681      M      3.000000
Name: rating, Length: 3028, dtype: float64

In [53]:
# gender별 평균을 예측치로 돌려주는 함수
def cf_gender(user_id, movie_id):
    if movie_id in rating_matrix:
        gender = users.loc[user_id]['sex']
        if gender in g_mean[movie_id]:
            gender_rating = g_mean[movie_id][gender]
        else:
            gender_rating = 3.0
    else:
        gender_rating = 3.0
    return gender_rating

score(cf_gender)

1.0325182132631032

In [54]:
# practice 2-1 : occupation

u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(data_dir+'u.user', sep='|', names=u_cols, encoding='latin-1')

merged_ratings_2 = pd.merge(x_train, users)
users = users.set_index('user_id')
merged_ratings_2

o_mean = merged_ratings_2[['movie_id', 'occupation', 'rating']].groupby(['movie_id', 'occupation'])['rating'].mean()
o_mean

def cf_occupation(user_id, movie_id):
    if movie_id in rating_matrix:
        occup = users.loc[user_id]['occupation']
        if occup in o_mean[movie_id]:
            occupation_rating = o_mean[movie_id][occup]
        else:
            occupation_rating = 3.0
    else:
        occupation_rating = 3.0
    return occupation_rating

score(cf_occupation)

1.1148927115661529

In [62]:
# practice 2-2 : gender & occupation

u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(data_dir+'u.user', sep='|', names=u_cols, encoding='latin-1')

users['occupation'].describe()

count         943
unique         21
top       student
freq          196
Name: occupation, dtype: object

In [63]:
merged_ratings_3 = pd.merge(x_train, users)
users = users.set_index('user_id')
merged_ratings_3

go_mean = merged_ratings_3[['movie_id', 'sex', 'occupation', 'rating']].groupby(['movie_id', 'sex', 'occupation'])['rating'].mean()
go_mean



movie_id  sex  occupation   
1         F    administrator    4.153846
               artist           4.250000
               educator         3.666667
               engineer         4.000000
               entertainment    4.000000
                                  ...   
1676      M    other            2.000000
1678      M    student          1.000000
1679      M    student          3.000000
1680      M    student          2.000000
1681      M    writer           3.000000
Name: rating, Length: 22599, dtype: float64

In [65]:
def go_rating(user_id, movie_id):
    if movie_id in rating_matrix:
        occup = users.loc[user_id]['occupation']
        gender = users.loc[user_id]['sex']
        if occup in go_mean[movie_id] and gender in go_mean[movie_id]:
            go_rating = go_mean[movie_id][gender][occup]
        else:
            go_rating = 3.0
    else:
        go_rating = 3.0
    return go_rating

score(go_rating)

1.247525550840543