In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import defaultdict

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

### Load Dataset

In [43]:
path = '../data'

ratings_df = pd.read_csv(os.path.join(path,'ratings.csv'), encoding='utf-8')
movies_df = pd.read_csv(os.path.join(path, 'movies.csv'), index_col='movieId', encoding='utf-8')
tags_df = pd.read_csv(os.path.join(path, 'tags.csv'), encoding='utf-8')

In [44]:
ratings_df.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247


In [45]:
movies_df.head(2)

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy


In [46]:
tags_df.head(2)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996


### Genres를 이용한 movie representation

In [48]:
total_count = len(movies_df.index)
total_genres = list(set([genre for sublist in movies_df['genres'].apply(lambda x : x.split('|')).values for genre in sublist]))

print(f"전체 영화 수: {total_count}")
print(f"전체 장르 수: {len(total_genres)}")

전체 영화 수: 9742
전체 장르 수: 20


In [62]:
genre_count = defaultdict(lambda : 0)

for genre_list in movies_df['genres']:
    for genre in genre_list.split('|'):
        genre_count[genre] += 1
        
genre_count

defaultdict(<function __main__.<lambda>()>,
            {'Adventure': 1263,
             'Animation': 611,
             'Children': 664,
             'Comedy': 3756,
             'Fantasy': 779,
             'Romance': 1596,
             'Drama': 4361,
             'Action': 1828,
             'Crime': 1199,
             'Thriller': 1894,
             'Horror': 978,
             'Mystery': 573,
             'Sci-Fi': 980,
             'War': 382,
             'Musical': 334,
             'Documentary': 440,
             'IMAX': 158,
             'Western': 167,
             'Film-Noir': 87,
             '(no genres listed)': 34})

In [63]:
# tf-idf
for genre in genre_count:
    genre_count[genre] = np.log10(total_count/genre_count[genre])

genre_count

defaultdict(<function __main__.<lambda>()>,
            {'Adventure': 0.8872447746804204,
             'Animation': 1.2026069149931968,
             'Children': 1.1664800458677336,
             'Comedy': 0.4139225416416778,
             'Fantasy': 1.0971106675631865,
             'Romance': 0.7856152382210405,
             'Drama': 0.3490620385623247,
             'Action': 0.7266719338379385,
             'Crime': 0.9098289421369025,
             'Thriller': 0.7112681505684965,
             'Horror': 0.9983092704481497,
             'Mystery': 1.2304935032683613,
             'Sci-Fi': 0.9974220495432563,
             'War': 1.4065847623240424,
             'Musical': 1.4649016584241867,
             'Documentary': 1.3451954487495636,
             'IMAX': 1.7899910382813284,
             'Western': 1.7659316540881678,
             'Film-Noir': 2.0491288726171324,
             '(no genres listed)': 2.457169208193496})

In [82]:
genre_representation = pd.DataFrame(columns = sorted(total_genres),
                                    index=movies_df.index)
genre_representation['genres'] = movies_df['genres']

def func(data):
    for genre in data['genres'].split('|'):
        data[genre] = genre_count[genre]
    return data

genre_representation.apply(func, axis=1).iloc[:,:-1]

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,,,0.887245,1.202607,1.16648,0.413923,,,,1.097111,,,,,,,,,,
2,,,0.887245,,1.16648,,,,,1.097111,,,,,,,,,,
3,,,,,,0.413923,,,,,,,,,,0.785615,,,,
4,,,,,,0.413923,,,0.349062,,,,,,,0.785615,,,,
5,,,,,,0.413923,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,,0.726672,,1.202607,,0.413923,,,,1.097111,,,,,,,,,,
193583,,,,1.202607,,0.413923,,,,1.097111,,,,,,,,,,
193585,,,,,,,,,0.349062,,,,,,,,,,,
193587,,0.726672,,1.202607,,,,,,,,,,,,,,,,


### Tag를 이용한 Movie Representation

In [101]:
# get unique tag
tag_column = list(tags_df['tag'].apply(lambda x : x.split(',')))
unique_tags = list(set(list(map(lambda x : x.strip(), [tag[0] for tag in tag_column]))))

print(len(tag_column))
print(len(unique_tags))

3683
1589


In [111]:
# Compute IDF for tag
total_movie_count = len(set(tags_df['movieId']))

# key : tag, value : number of movies with such tag
tag_count_dict = defaultdict(lambda : 0)

for movie_tag in tags_df['tag']:
    tag_count_dict[movie_tag] += 1
    
tag_idf = {}
for tag in tag_count_dict:
    tag_idf[tag] = np.log10(total_movie_count / tag_count_dict[tag])

tag_idf

{'funny': 1.8347247056857963,
 'Highly quotable': 2.7193312869837265,
 'will ferrell': 2.5943925503754266,
 'Boxing story': 3.196452541703389,
 'MMA': 3.196452541703389,
 'Tom Hardy': 2.895422546039408,
 'drugs': 2.196452541703389,
 'Leonardo DiCaprio': 2.196452541703389,
 'Martin Scorsese': 2.5943925503754266,
 'way too long': 3.196452541703389,
 'Al Pacino': 2.4974825373673704,
 'gangster': 2.895422546039408,
 'mafia': 2.7193312869837265,
 'Mafia': 2.196452541703389,
 'holocaust': 2.895422546039408,
 'true story': 2.5943925503754266,
 'twist ending': 1.9176989407505602,
 'Anthony Hopkins': 3.196452541703389,
 'courtroom drama': 2.895422546039408,
 'britpop': 3.196452541703389,
 'indie record label': 3.196452541703389,
 'music': 1.9923325590474643,
 'dumpster diving': 3.196452541703389,
 'Sustainability': 3.196452541703389,
 'romantic comedy': 3.196452541703389,
 'wedding': 2.4183012913197452,
 'painter': 3.196452541703389,
 'bloody': 2.7193312869837265,
 'black hole': 3.1964525417033

In [112]:
len(tag_idf)

1589

In [172]:
# Create movie representations
tag_representation = pd.DataFrame(columns=sorted(unique_tags),
                                  index=set(tags_df['movieId']))
tag_representation.head(2)

Unnamed: 0,"""artsy""",06 Oscar Nominated Best Movie - Animation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001-like,...,women,wonderwoman,workplace,writing,wrongful imprisonment,wry,younger men,zither,zoe kazan,zombies
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,


In [174]:
def func(data):
    for tag in tags_df[tags_df.movieId == data.name]['tag'].unique():
        data[tag] = tag_idf[tag]
    return data

tag_representation = tag_representation.apply(func, axis=1).sort_index(0)

tag_representation.head(2)

Unnamed: 0,"""artsy""",06 Oscar Nominated Best Movie - Animation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001-like,...,women,wonderwoman,workplace,writing,wrongful imprisonment,wry,younger men,zither,zoe kazan,zombies
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,


#### Example

In [179]:
movies_df.head(2)

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy


In [177]:
tag_representation.loc[1].dropna()

fun      2.497483
pixar    2.895423
Name: 1, dtype: float64

In [178]:
tag_representation.loc[2].dropna()

Robin Williams      2.719331
fantasy             2.418301
game                3.196453
magic board game    3.196453
Name: 2, dtype: float64

In [180]:
print(genre_representation.shape)
print(tag_representation.shape)

(9742, 21)
(1572, 1589)


### Final Movie Representation

In [189]:
movie_representation = pd.concat([genre_representation.iloc[:,:-1],
                                  tag_representation], axis=1).fillna(0)
print(movie_representation.shape)
movie_representation.head(3)

(9742, 1609)


Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,women,wonderwoman,workplace,writing,wrongful imprisonment,wry,younger men,zither,zoe kazan,zombies
1,0.0,0.0,0.887245,1.202607,1.16648,0.413923,0.0,0.0,0.0,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.887245,0.0,1.16648,0.0,0.0,0.0,0.0,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.413923,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Contents 유사도 평가

> - Cosine similarity 사용

In [190]:
from sklearn.metrics.pairwise import cosine_similarity

def cos_sim_matrix(a, b):
    cos_sim = cosine_similarity(a, b)
    result_df = pd.DataFrame(data=cos_sim,
                             index=[a.index])
    
    return result_df

In [191]:
cs_df = cos_sim_matrix(movie_representation, movie_representation)
cs_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9732,9733,9734,9735,9736,9737,9738,9739,9740,9741
1,1.0,0.124438,0.008403,0.040571,0.011755,0.0,0.016339,0.331122,0.0,0.131794,...,0.064466,0.260941,0.071492,0.27171,0.0,0.348295,0.379492,0.0,0.232553,0.093519
2,0.124438,1.0,0.0,0.0,0.0,0.0,0.0,0.240843,0.0,0.095861,...,0.0,0.0,0.0,0.0,0.0,0.108082,0.117763,0.0,0.0,0.0
3,0.008403,0.0,1.0,0.179391,0.011294,0.0,0.072246,0.0,0.0,0.0,...,0.00656,0.0,0.068686,0.0,0.0,0.020322,0.022142,0.0,0.0,0.089849
4,0.040571,0.0,0.179391,1.0,0.05453,0.0,0.348828,0.0,0.0,0.0,...,0.031674,0.101979,0.567487,0.0,0.0,0.098119,0.106908,0.365843,0.0,0.433821
5,0.011755,0.0,0.011294,0.05453,1.0,0.0,0.640342,0.0,0.0,0.0,...,0.009177,0.0,0.096091,0.0,0.0,0.028429,0.030976,0.0,0.0,0.125697


In [193]:
print(cs_df.shape)
cs_df[1].sort_values(ascending=False).head()

(9742, 9742)


2         1.000000
46972     0.322201
126142    0.300850
2043      0.300850
2399      0.300850
Name: 1, dtype: float64

In [194]:
print(movies_df.loc[1])
print(movies_df.loc[46972])
print(movies_df.loc[126142])
print(movies_df.loc[2043])
print(movies_df.loc[2399])

title                                Toy Story (1995)
genres    Adventure|Animation|Children|Comedy|Fantasy
Name: 1, dtype: object
title     Night at the Museum (2006)
genres    Action|Comedy|Fantasy|IMAX
Name: 46972, dtype: object
title     The Cave of the Golden Rose (1991)
genres            Adventure|Children|Fantasy
Name: 126142, dtype: object
title     Darby O'Gill and the Little People (1959)
genres                   Adventure|Children|Fantasy
Name: 2043, dtype: object
title     Santa Claus: The Movie (1985)
genres       Adventure|Children|Fantasy
Name: 2399, dtype: object


### 추천시스템 성능 평가

In [197]:
train_df, test_df = train_test_split(ratings_df,
                                     test_size=0.2,
                                     random_state=1234)

train_df.shape, test_df.shape

((80668, 4), (20168, 4))

In [202]:
test_userids = test_df.userId.unique()
test_userids.shape

(610,)

In [245]:
result_df = pd.DataFrame()

for user_id in tqdm(test_userids):
    user_record_df = train_df[train_df.userId == int(user_id)]
    
    # (n,9742) : n은 userId가 평점을 매긴 영화 수
    user_sim_df = cs_df.loc[user_record_df['movieId']]
    # (n, 1)
    user_rating_df = user_record_df[['rating']]
    
    sim_sum = np.sum(user_sim_df.T.to_numpy(), -1) # (9742, 1)
    
    prediction = np.matmul(user_sim_df.T.to_numpy(), user_rating_df.to_numpy()).flatten() / (sim_sum+1) # (9742, 1)
    
    prediction_df = pd.DataFrame(prediction,
                                 index=cs_df.index).reset_index()
    prediction_df.columns = ['movieId','pred_rating']
    prediction_df = prediction_df[['movieId', 'pred_rating']][prediction_df.movieId.isin(test_df[test_df.userId == user_id]['movieId'].values)]
    
    temp_df = prediction_df.merge(test_df[test_df.userId == user_id], on='movieId')
    result_df = pd.concat([result_df, temp_df], axis=0)
    
result_df


  0%|                                                                                          | 0/610 [00:00<?, ?it/s][A
  0%|▎                                                                                 | 2/610 [00:00<00:34, 17.79it/s][A
  1%|▋                                                                                 | 5/610 [00:00<00:32, 18.53it/s][A
  1%|█▏                                                                                | 9/610 [00:00<00:27, 22.04it/s][A
  2%|█▌                                                                               | 12/610 [00:00<00:24, 23.95it/s][A
  2%|█▉                                                                               | 15/610 [00:00<00:25, 23.61it/s][A
  3%|██▋                                                                              | 20/610 [00:00<00:21, 27.72it/s][A
  4%|███                                                                              | 23/610 [00:00<00:21, 26.86it/s][A
  4%|███▍      

Unnamed: 0,movieId,pred_rating,userId,rating,timestamp
0,32,3.518410,610,4.5,1479543331
1,47,3.466281,610,5.0,1479545853
2,50,3.564660,610,4.0,1493844757
3,95,3.489052,610,3.5,1479542004
4,303,3.459690,610,3.0,1479542688
...,...,...,...,...,...
4,58998,2.833405,506,5.0,1424486961
0,293,1.067180,549,5.0,1464282636
1,527,0.970189,549,1.0,1464282598
0,8874,0.674328,471,3.5,1496668982


In [246]:
mse = mean_squared_error(y_true=result_df['rating'].values, y_pred=result_df['pred_rating'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

1.40606646706041 1.1857767357561078
