In [2]:
from pathlib import Path
import pandas as pd
import os

# The dataset is expected in a "data" directory that is a sibling of the
# current working directory; all three names are plain strings.
path = os.fspath(Path(os.getcwd()).parent / "data")
ratings, movies = (
    os.path.join(path, "ml-latest-small", csv_name)
    for csv_name in ("ratings.csv", "movies.csv")
)

In [3]:
# Ratings are loaded with the default integer index; the movie catalogue is
# keyed by its 'movieId' column so movies can be looked up by id with .loc.
ratings_df = pd.read_csv(filepath_or_buffer=ratings, encoding="utf-8")
movies_df = pd.read_csv(
    filepath_or_buffer=movies,
    index_col="movieId",
    encoding="utf-8",
)

In [5]:
# Vocabulary: every distinct label found in the '|'-separated 'genres'
# column, in sorted order.
genres = sorted(
    {
        genre
        for genre_string in movies_df['genres']
        for genre in genre_string.split('|')
    }
)
print("전체 단어: ", genres)

전체 단어:  ['(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']


In [6]:
# '(no genres listed)' is kept and treated as a genre of its own;
# preview the first few movies that carry only that label.
movies_df.loc[movies_df['genres'].eq('(no genres listed)')].head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
114335,La cravate (1957),(no genres listed)
122888,Ben-hur (2016),(no genres listed)
122896,Pirates of the Caribbean: Dead Men Tell No Tal...,(no genres listed)
129250,Superfast! (2015),(no genres listed)
132084,Let It Be Me (1995),(no genres listed)


In [11]:
def tf(word, document):
    """Return the term frequency of ``word`` in ``document``.

    The count is delegated to ``document.count``. For a list of tokens this
    is the number of elements equal to ``word``; for a string it is the
    number of non-overlapping substring occurrences, so pass tokens when
    exact-label matching is required.
    """
    occurrences = document.count(word)
    return occurrences

def idf(word, document_list):
    """Return the smoothed inverse document frequency of ``word``.

    Computes ``log(N / (df + 1))`` where ``N`` is ``len(document_list)`` and
    ``df`` is the number of documents for which ``word in document`` is true.
    The ``+ 1`` keeps the denominator non-zero for a word that appears in no
    document.

    Raises:
        ValueError: if ``document_list`` is empty, because ``log(0)`` is
            undefined.
    """
    # Imported locally so the function does not rely on a later notebook cell
    # having already executed ``from math import log``.
    from math import log

    df = sum(1 for document in document_list if word in document)
    return log(len(document_list) / (df + 1))

def tfidf(word, document, document_list):
    """Return the TF-IDF weight of ``word`` for one document.

    The weight is ``tf(word, document)`` multiplied by
    ``idf(word, document_list)``.
    """
    term_frequency = tf(word, document)
    inverse_document_frequency = idf(word, document_list)
    return term_frequency * inverse_document_frequency

### 전체 장르에 대한 IDF

In [8]:
# Document frequency: for each genre, the number of movies that list it
from tqdm import tqdm

df_dict = {genre: 0 for genre in genres}

for genre_list in tqdm(movies_df['genres']):
    # Split once per movie and test exact labels against the token list.
    listed = genre_list.split('|')
    for genre in genres:
        if genre in listed:
            df_dict[genre] += 1
df_dict

100%|██████████| 9742/9742 [00:00<00:00, 140101.59it/s]


{'(no genres listed)': 34,
 'Action': 1828,
 'Adventure': 1263,
 'Animation': 611,
 'Children': 664,
 'Comedy': 3756,
 'Crime': 1199,
 'Documentary': 440,
 'Drama': 4361,
 'Fantasy': 779,
 'Film-Noir': 87,
 'Horror': 978,
 'IMAX': 158,
 'Musical': 334,
 'Mystery': 573,
 'Romance': 1596,
 'Sci-Fi': 980,
 'Thriller': 1894,
 'War': 382,
 'Western': 167}

In [10]:
from math import log

# Smoothed IDF per genre: log(N / (df + 1)) with N = number of movies.
# Keys keep the order of df_dict.
idf_dict = {
    genre: log(len(movies_df) / (doc_freq + 1))
    for genre, doc_freq in df_dict.items()
}
idf_dict

{'(no genres listed)': 5.6288536528770745,
 'Action': 1.6726770659756223,
 'Adventure': 2.0421651396596854,
 'Animation': 2.767469431854162,
 'Children': 2.684414673710634,
 'Comedy': 0.9528256687925191,
 'Crime': 2.0941248785903963,
 'Documentary': 3.095156838919642,
 'Drama': 0.8035157676049136,
 'Fantasy': 2.5249077946828504,
 'Film-Noir': 4.706864899888282,
 'Horror': 2.2976700718359777,
 'IMAX': 4.115297512146256,
 'Musical': 3.3700711825414214,
 'Mystery': 2.8315723180469217,
 'Romance': 1.8083195661514755,
 'Sci-Fi': 2.295629254801125,
 'Thriller': 1.6372275968499612,
 'War': 3.236166725185842,
 'Western': 4.060237734963229}

### TF-IDF 

In [13]:
# Build one TF-IDF vector per movie; columns follow the order of `genres`.
result = []

for genre_string in tqdm(movies_df['genres']):
    # Tokenise on '|' before counting so tf() matches whole genre labels.
    # Counting on the raw string would be substring matching, which disagrees
    # with how the document frequencies were computed (exact tokens).
    doc_tokens = genre_string.split('|')
    result.append([tf(genre, doc_tokens) * idf_dict[genre] for genre in genres])

# Rows are appended in movies_df order, so its index labels them correctly.
tfidf_df = pd.DataFrame(result, columns=genres, index=movies_df.index).sort_index()
tfidf_df

9742it [00:00, 10325.56it/s]


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0.0,0.000000,2.042165,2.767469,2.684415,0.952826,0.0,0.0,0.000000,2.524908,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
2,0.0,0.000000,2.042165,0.000000,2.684415,0.000000,0.0,0.0,0.000000,2.524908,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.000000,0.000000,0.000000,0.952826,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,1.80832,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.000000,0.000000,0.000000,0.952826,0.0,0.0,0.803516,0.000000,0.0,0.0,0.0,0.0,0.0,1.80832,0.0,0.0,0.0,0.0
5,0.0,0.000000,0.000000,0.000000,0.000000,0.952826,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,1.672677,0.000000,2.767469,0.000000,0.952826,0.0,0.0,0.000000,2.524908,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
193583,0.0,0.000000,0.000000,2.767469,0.000000,0.952826,0.0,0.0,0.000000,2.524908,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
193585,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.803516,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
193587,0.0,1.672677,0.000000,2.767469,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0


### 아이템 간 cosine 유사도 기반 추천

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

class ItemSimilarity:
    """Item-to-item recommender based on cosine similarity of feature vectors.

    Both frames are stored as given. ``recommend`` reads the ``title`` column
    of ``movies_df`` and uses its index as the movie id; ``tfidf_df`` supplies
    one feature row per movie id.
    """

    def __init__(self, movies_df, tfidf_df):
        self.movies_df = movies_df
        self.tfidf_df = tfidf_df

    @property
    def cos_sim_matrix(self):
        """Pairwise similarity of every row of ``tfidf_df`` with every other.

        Recomputed on each access; nothing is cached.
        """
        return self.get_cos_sim(self.tfidf_df, self.tfidf_df)

    @staticmethod
    def get_cos_sim(a, b):
        """Return ``cosine_similarity(a, b)`` as a labelled DataFrame.

        Columns are labelled with ``b.index``. The row index is built from
        ``[a.index]`` (a list holding the index), so row labels come back as
        1-tuples; this shape is kept as-is for existing callers.
        """
        cos_sim = cosine_similarity(a, b)
        result_df = pd.DataFrame(data=cos_sim, index=[a.index], columns=b.index)
        return result_df

    def recommend(self, target_movie, top_k):
        """Print the ``top_k`` movies most similar to ``target_movie``.

        Each output line is the movie title followed by its similarity score,
        in descending score order. The first entry is normally the target
        itself (similarity 1.0).

        Args:
            target_movie: exact value to look up in the ``title`` column.
            top_k: number of rows to print.

        Raises:
            IndexError: if no movie has the given title.
        """
        matches = self.movies_df[self.movies_df.title == target_movie].index
        if len(matches) == 0:
            raise IndexError("no movie titled %r in movies_df" % (target_movie,))
        # When several movies share the title, the first match is used.
        movie_id = matches[0]

        ranked = self.cos_sim_matrix[movie_id].sort_values(ascending=False).iloc[:top_k]
        for label, sim_val in ranked.items():
            # get_cos_sim labels rows with 1-tuples; unwrap to the bare id.
            recommend_movie_id = label[0] if isinstance(label, tuple) else label
            # Look titles up on the instance's own frame, not a notebook global.
            print(self.movies_df.loc[recommend_movie_id, 'title'], sim_val)
            

In [20]:
# Demo: print the ten titles whose genre TF-IDF vectors are closest to the target.
recommender = ItemSimilarity(movies_df, tfidf_df)
recommender.recommend(
    target_movie='Black Butler: Book of the Atlantic (2017)',
    top_k=10,
)

Black Butler: Book of the Atlantic (2017) 1.0
Justice League: Doom (2012)  0.9740752144041196
Dante's Inferno: An Animated Epic (2010) 0.9740752144041196
Superman/Batman: Public Enemies (2009) 0.9740752144041196
Triplets of Belleville, The (Les triplettes de Belleville) (2003) 0.9177602335851786
Mickey's Once Upon a Christmas (1999) 0.9177602335851786
South Park: Imaginationland (2008) 0.9177602335851786
Monkeybone (2001) 0.9177602335851786
Anomalisa (2015) 0.9177602335851786
Daddy, I'm A Zombie (2012) 0.9177602335851786
