In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
path = './movielens_data/'

ratings_df = pd.read_csv(os.path.join(path, 'ratings.csv'), encoding='utf-8')
movies_df = pd.read_csv(os.path.join(path, 'movies.csv'), index_col='movieId', encoding='utf-8')
tags_df = pd.read_csv(os.path.join(path, 'tags.csv'), encoding='utf-8')

In [3]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
movies_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [5]:
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


---


In [6]:
# Genres를 이용한 movie representation
total_count = len(movies_df.index)
total_genres = list(set([genre for sublist in list(map(lambda x: x.split('|'), movies_df['genres'])) for genre in sublist]))

In [7]:
print(f"전체 영화 수: {total_count}")
print(f"장르: {total_genres}")

전체 영화 수: 9742
장르: ['Mystery', 'Sci-Fi', 'Adventure', 'Drama', 'War', 'Documentary', 'Musical', 'Animation', 'IMAX', 'Crime', 'Thriller', 'Film-Noir', 'Fantasy', 'Children', '(no genres listed)', 'Action', 'Romance', 'Horror', 'Comedy', 'Western']


In [8]:
genre_count = dict.fromkeys(total_genres)

for each_genre_list in movies_df['genres']:
    for genre in each_genre_list.split('|'):
        if genre_count[genre] == None:
            genre_count[genre] = 1
        else:
            genre_count[genre] = genre_count[genre]+1

In [9]:
for each_genre in genre_count: # 정규화
    genre_count[each_genre] = np.log10(total_count/genre_count[each_genre])

In [10]:
genre_count

{'Mystery': 1.2304935032683613,
 'Sci-Fi': 0.9974220495432563,
 'Adventure': 0.8872447746804204,
 'Drama': 0.3490620385623247,
 'War': 1.4065847623240424,
 'Documentary': 1.3451954487495636,
 'Musical': 1.4649016584241867,
 'Animation': 1.2026069149931968,
 'IMAX': 1.7899910382813284,
 'Crime': 0.9098289421369025,
 'Thriller': 0.7112681505684965,
 'Film-Noir': 2.0491288726171324,
 'Fantasy': 1.0971106675631865,
 'Children': 1.1664800458677336,
 '(no genres listed)': 2.457169208193496,
 'Action': 0.7266719338379385,
 'Romance': 0.7856152382210405,
 'Horror': 0.9983092704481497,
 'Comedy': 0.4139225416416778,
 'Western': 1.7659316540881678}

In [11]:
# create genre representations
genre_representation = pd.DataFrame(columns=sorted(total_genres), index=movies_df.index)
for index, each_row in tqdm(movies_df.iterrows()):
    dict_temp = {i: genre_count[i] for i in each_row['genres'].split('|')}
    row_to_add = pd.DataFrame(dict_temp, index=[index])
    genre_representation.update(row_to_add)

genre_representation

9742it [00:41, 232.13it/s]


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,,,0.887245,1.202607,1.16648,0.413923,,,,1.097111,,,,,,,,,,
2,,,0.887245,,1.16648,,,,,1.097111,,,,,,,,,,
3,,,,,,0.413923,,,,,,,,,,0.785615,,,,
4,,,,,,0.413923,,,0.349062,,,,,,,0.785615,,,,
5,,,,,,0.413923,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,,0.726672,,1.202607,,0.413923,,,,1.097111,,,,,,,,,,
193583,,,,1.202607,,0.413923,,,,1.097111,,,,,,,,,,
193585,,,,,,,,,0.349062,,,,,,,,,,,
193587,,0.726672,,1.202607,,,,,,,,,,,,,,,,


In [12]:
# Tag를 이용한 Movie Representation
tags_df.head(5)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [13]:
# get unique tag
tag_column = list(map(lambda x: x.split(','), tags_df['tag']))
unique_tags = list(set(list(map(lambda x: x.strip(), list([tag for sublist in tag_column for tag in sublist])))))

In [14]:
print(len(tag_column))
print(len(unique_tags))

3683
1589


In [15]:
tag_column

[['funny'],
 ['Highly quotable'],
 ['will ferrell'],
 ['Boxing story'],
 ['MMA'],
 ['Tom Hardy'],
 ['drugs'],
 ['Leonardo DiCaprio'],
 ['Martin Scorsese'],
 ['way too long'],
 ['Al Pacino'],
 ['gangster'],
 ['mafia'],
 ['Al Pacino'],
 ['Mafia'],
 ['holocaust'],
 ['true story'],
 ['twist ending'],
 ['Anthony Hopkins'],
 ['courtroom drama'],
 ['twist ending'],
 ['britpop'],
 ['indie record label'],
 ['music'],
 ['dumpster diving'],
 ['Sustainability'],
 ['romantic comedy'],
 ['wedding'],
 ['painter'],
 ['bloody'],
 ['black hole'],
 ['sci-fi'],
 ['time-travel'],
 ['fantasy'],
 ['magic board game'],
 ['Robin Williams'],
 ['beautiful scenery'],
 ['epic'],
 ['historical'],
 ['inspirational'],
 ['Medieval'],
 ['mel gibson'],
 ['Oscar (Best Cinematography)'],
 ['revenge'],
 ['sword fight'],
 ['black comedy'],
 ['Christina Ricci'],
 ['Christopher Lloyd'],
 ['dark comedy'],
 ['family'],
 ['gothic'],
 ['Al Pacino'],
 ['Andy Garcia'],
 ['Classic'],
 ['Francis Ford Coppola'],
 ['mafia'],
 ['black c

In [16]:
unique_tags

['introspection',
 'homosexuality',
 'Tim Burton',
 'tom hardy',
 'big wave',
 'heartbreaking',
 'sofia coppola',
 'guns',
 'bittersweet',
 'harry potter',
 'McCarthy hearings',
 'rasicm',
 'psychiatrist',
 'Music',
 'small towns',
 'Denzel Washington',
 'ships',
 'Halloween',
 'singletons',
 'mirrors',
 'gun fu',
 'financial crisis',
 'music business',
 'AIDs',
 'beat poetry',
 'philosophy',
 'pigs',
 'bowling',
 'paranoia',
 'wedding',
 'John Travolta',
 'ethics',
 'adorable',
 'stand-up comedy',
 'train',
 'anime',
 'Hilary Swank',
 'lack of plot',
 'tragic',
 'prostitution',
 'magic board game',
 'TERRORISM',
 'Ichabod Crane',
 'gangs',
 'kung fu',
 'dogs',
 'tearjerking',
 'Monty Python',
 'Bittersweet',
 'austere',
 'Will Ferrell',
 'aliens',
 'weird',
 'blood',
 'film noir',
 'baseball',
 'r:graphic sexuality',
 'space adventure',
 'secret society',
 'Jennifer Lawrence',
 'video game adaptation',
 'dreams',
 'Jeff Bridges',
 'Chile',
 'directorial debut',
 'I see dead people',
 

In [17]:
# Compute IDF for tag
total_movie_count = len(set(tags_df['movieId']))
# key: tag, value: number of movies with such tag
tag_count_dict = dict.fromkeys(unique_tags)

for each_movie_tag_list in tags_df['tag']:
    for tag in each_movie_tag_list.split(","):
        if tag_count_dict[tag.strip()] == None:
            tag_count_dict[tag.strip()] = 1
        else:
            tag_count_dict[tag.strip()] += 1

tag_idf = dict()
for each_tag in tag_count_dict: # 정규화
    tag_idf[each_tag] = np.log10(total_movie_count / tag_count_dict[each_tag])

In [18]:
tag_idf

{'introspection': 3.196452541703389,
 'homosexuality': 3.196452541703389,
 'Tim Burton': 2.7193312869837265,
 'tom hardy': 3.196452541703389,
 'big wave': 3.196452541703389,
 'heartbreaking': 2.5943925503754266,
 'sofia coppola': 3.196452541703389,
 'guns': 2.895422546039408,
 'bittersweet': 2.2422100322640643,
 'harry potter': 3.196452541703389,
 'McCarthy hearings': 3.196452541703389,
 'rasicm': 2.895422546039408,
 'psychiatrist': 2.895422546039408,
 'Music': 3.196452541703389,
 'small towns': 3.196452541703389,
 'Denzel Washington': 3.196452541703389,
 'ships': 3.196452541703389,
 'Halloween': 3.196452541703389,
 'singletons': 3.196452541703389,
 'mirrors': 3.196452541703389,
 'gun fu': 3.196452541703389,
 'financial crisis': 3.196452541703389,
 'music business': 2.895422546039408,
 'AIDs': 2.895422546039408,
 'beat poetry': 3.196452541703389,
 'philosophy': 2.4183012913197452,
 'pigs': 3.196452541703389,
 'bowling': 3.196452541703389,
 'paranoia': 2.4974825373673704,
 'wedding': 2.

In [19]:
len(tag_idf.keys())

1589

In [20]:
# Create movie representations
tag_representation = pd.DataFrame(columns=sorted(unique_tags), index=list(set(tags_df['movieId'])))
for name, group in tqdm(tags_df.groupby(by='movieId')):
    temp_list = list(map(lambda x: x.split(','), list(group['tag'])))
    temp_tag_list = list(set(list(map(lambda x: x.strip(), list([tag for sublist in temp_list for tag in sublist])))))

    dict_temp = {i: tag_idf[i.strip()] for i in temp_tag_list}
    row_to_add = pd.DataFrame(dict_temp, index=[group['movieId'].values[0]])
    tag_representation.update(row_to_add)

tag_representation = tag_representation.sort_index(0)

100%|██████████████████████████████████████████████████████████████████████████████| 1572/1572 [03:54<00:00,  6.70it/s]


In [21]:
tag_representation

Unnamed: 0,"""artsy""",06 Oscar Nominated Best Movie - Animation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001-like,...,women,wonderwoman,workplace,writing,wrongful imprisonment,wry,younger men,zither,zoe kazan,zombies
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183611,,,,,,,,,,,...,,,,,,,,,,
184471,,,,,,,,,,,...,,,,,,,,,,
187593,,,,,,,,,,,...,,,,,,,,,,
187595,,,,,,,,,,,...,,,,,,,,,,


----

In [22]:
movies_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [23]:
tag_representation.loc[1].dropna()

fun      2.497483
pixar    2.895423
Name: 1, dtype: object

In [24]:
tag_representation.loc[2].dropna()

Robin Williams      2.719331
fantasy             2.418301
game                3.196453
magic board game    3.196453
Name: 2, dtype: object

In [25]:
print(genre_representation.shape)
print(tag_representation.shape)

(9742, 20)
(1572, 1589)


---

In [26]:
# genre와 tag로 만들어진 representation을 합쳐서 각 movie의 vector로 만든다
movie_representation = pd.concat([genre_representation, tag_representation], axis=1).fillna(0)

In [27]:
print(movie_representation.shape)

(9742, 1609)


In [28]:
print(movie_representation.describe())

       (no genres listed)       Action    Adventure    Animation     Children  \
count         9742.000000  9742.000000  9742.000000  9742.000000  9742.000000   
mean             0.008576     0.136354     0.115027     0.075425     0.079506   
std              0.144915     0.283726     0.298052     0.291593     0.293989   
min              0.000000     0.000000     0.000000     0.000000     0.000000   
25%              0.000000     0.000000     0.000000     0.000000     0.000000   
50%              0.000000     0.000000     0.000000     0.000000     0.000000   
75%              0.000000     0.000000     0.000000     0.000000     0.000000   
max              2.457169     0.726672     0.887245     1.202607     1.166480   

            Comedy        Crime  Documentary        Drama      Fantasy  ...  \
count  9742.000000  9742.000000  9742.000000  9742.000000  9742.000000  ...   
mean      0.159587     0.111978     0.060756     0.156257     0.087728  ...   
std       0.201476     0.298916  

---

- Cosine similarity 유사도 평가 : 유사도 지표 변경 가능

In [29]:
from sklearn.metrics.pairwise import cosine_similarity

def cos_sim_matrix(a, b):
    cos_sim = cosine_similarity(a, b)
    result_df = pd.DataFrame(data=cos_sim, index=[a.index])

    return result_df

In [30]:
print(movie_representation.head())

   (no genres listed)  Action  Adventure  Animation  Children    Comedy  \
1                 0.0     0.0   0.887245   1.202607   1.16648  0.413923   
2                 0.0     0.0   0.887245   0.000000   1.16648  0.000000   
3                 0.0     0.0   0.000000   0.000000   0.00000  0.413923   
4                 0.0     0.0   0.000000   0.000000   0.00000  0.413923   
5                 0.0     0.0   0.000000   0.000000   0.00000  0.413923   

   Crime  Documentary     Drama   Fantasy  ...  women  wonderwoman  workplace  \
1    0.0          0.0  0.000000  1.097111  ...    0.0          0.0        0.0   
2    0.0          0.0  0.000000  1.097111  ...    0.0          0.0        0.0   
3    0.0          0.0  0.000000  0.000000  ...    0.0          0.0        0.0   
4    0.0          0.0  0.349062  0.000000  ...    0.0          0.0        0.0   
5    0.0          0.0  0.000000  0.000000  ...    0.0          0.0        0.0   

   writing  wrongful imprisonment  wry  younger men  zither  z

In [31]:
cs_df = cos_sim_matrix(movie_representation, movie_representation)
cs_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9732,9733,9734,9735,9736,9737,9738,9739,9740,9741
1,1.0,0.124438,0.008403,0.040571,0.011755,0.0,0.016339,0.331122,0.0,0.131794,...,0.064466,0.260941,0.071492,0.27171,0.0,0.348295,0.379492,0.0,0.232553,0.093519
2,0.124438,1.0,0.0,0.0,0.0,0.0,0.0,0.240843,0.0,0.095861,...,0.0,0.0,0.0,0.0,0.0,0.108082,0.117763,0.0,0.0,0.0
3,0.008403,0.0,1.0,0.179391,0.011294,0.0,0.072246,0.0,0.0,0.0,...,0.00656,0.0,0.068686,0.0,0.0,0.020322,0.022142,0.0,0.0,0.089849
4,0.040571,0.0,0.179391,1.0,0.05453,0.0,0.348828,0.0,0.0,0.0,...,0.031674,0.101979,0.567487,0.0,0.0,0.098119,0.106908,0.365843,0.0,0.433821
5,0.011755,0.0,0.011294,0.05453,1.0,0.0,0.640342,0.0,0.0,0.0,...,0.009177,0.0,0.096091,0.0,0.0,0.028429,0.030976,0.0,0.0,0.125697


In [32]:
cs_df.shape

(9742, 9742)

In [33]:
print(cs_df.shape)

(9742, 9742)


In [34]:
print(cs_df[1].sort_values(ascending=False))

2         1.000000
46972     0.322201
158813    0.300850
119655    0.300850
80748     0.300850
            ...   
4921      0.000000
4920      0.000000
4919      0.000000
4917      0.000000
193609    0.000000
Name: 1, Length: 9742, dtype: float64


In [35]:
# 추천시스템의 성능 평가
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=1234)

In [36]:
print(train_df.shape)
print(test_df.shape)

(80668, 4)
(20168, 4)


In [37]:
test_userids = list(set(test_df.userId.values))
test_userids

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185

In [38]:
result_df = pd.DataFrame()

for user_id in tqdm(test_userids):
    user_record_df = train_df.loc[train_df.userId == int(user_id), :]
    
    user_sim_df = cs_df.loc[user_record_df['movieId']]  # (n, 9742); n은 userId가 평점을 매긴 영화 수
    user_rating_df = user_record_df[['rating']]  # (n, 1)
    sim_sum = np.sum(user_sim_df.T.to_numpy(), -1)  # (9742, 1)
    # print("user_id=", i, user_record_df.shape, user_sim_df.T.shape, user_rating_df.shape, sim_sum.shape)

    prediction = np.matmul(user_sim_df.T.to_numpy(), user_rating_df.to_numpy()).flatten() / (sim_sum+1) # (9742, 1)

    prediction_df = pd.DataFrame(prediction, index=cs_df.index).reset_index()
    prediction_df.columns = ['movieId', 'pred_rating']    
    prediction_df = prediction_df[['movieId', 'pred_rating']][prediction_df.movieId.isin(test_df[test_df.userId == user_id]['movieId'].values)]

    temp_df = prediction_df.merge(test_df[test_df.userId == user_id], on='movieId')
    result_df = pd.concat([result_df, temp_df], axis=0)

100%|████████████████████████████████████████████████████████████████████████████████| 610/610 [00:09<00:00, 66.40it/s]


In [39]:
result_df.head(10)

Unnamed: 0,movieId,pred_rating,userId,rating,timestamp
0,1,4.145652,1,4.0,964982703
1,50,3.650755,1,5.0,964982931
2,216,2.670124,1,5.0,964981208
3,223,2.612844,1,3.0,964980985
4,231,4.215284,1,5.0,964981179
5,235,3.61982,1,4.0,964980908
6,316,4.136756,1,3.0,964982310
7,457,3.218743,1,5.0,964981909
8,543,3.729524,1,4.0,964981179
9,592,4.024728,1,4.0,964982271


In [40]:
mse = mean_squared_error(y_true=result_df['rating'].values, y_pred=result_df['pred_rating'].values)
rmse = np.sqrt(mse)

print('mse : ', mse)
print('rmse : ', rmse)

mse :  1.40606646706041
rmse :  1.1857767357561078
