### TF-IDF 알고리즘

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
import os

# List the dataset directory to confirm which files are available.
movie_files = os.listdir("../data/movies/")
print(movie_files)

['movies_metadata.csv', 'ratings.csv']


In [12]:
# Directory holding the movie CSV files; it ends with a slash so a file name
# can be appended to it directly.
path = '../data/movies/'

In [13]:
# Load the movie metadata table.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-type inference within a column.
metadata_csv = path + 'movies_metadata.csv'
data = pd.read_csv(metadata_csv, low_memory=False)

# Preview the first two rows.
data.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [15]:
# Inspect the column names to locate the 'overview' field.
data.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [16]:
# Discard every row whose 'overview' is missing, then renumber the rows from 0.
data = data.dropna(subset=['overview']).reset_index(drop=True)
data.shape

(44512, 24)

In [18]:
# Work on a subset of the rows.
# NOTE(review): presumably this keeps the pairwise similarity matrix small
# enough to fit in memory — confirm.
# .loc slices by label and includes the end label, so 0:20000 keeps the rows
# labelled 0 through 20000 inclusive.
data = data.loc[0:20000].reset_index(drop=True)

In [20]:
# Build TF-IDF features from the overview text.
# stop_words='english' removes common English words that carry little meaning.
overview_text = data['overview']
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(overview_text)

# Shape is (number of documents, number of distinct word tokens).
print(tfidf_matrix.shape)

(20001, 47665)


20001개의 문서에 대해 47665개의 단어 토큰이 만들어진 것을 볼 수 있다.

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

# With a single argument, cosine_similarity compares every row of the matrix
# with every row of that same matrix, exactly as passing it twice does.
cosine_matrix = cosine_similarity(tfidf_matrix)

In [22]:
# Check the dimensions of the pairwise similarity matrix.
cosine_matrix.shape

(20001, 20001)

In [23]:
# Show the similarities rounded to four decimal places for readability.
cosine_matrix.round(4)

array([[1.    , 0.0158, 0.    , ..., 0.0083, 0.0172, 0.    ],
       [0.0158, 1.    , 0.0492, ..., 0.0057, 0.008 , 0.    ],
       [0.    , 0.0492, 1.    , ..., 0.    , 0.    , 0.    ],
       ...,
       [0.0083, 0.0057, 0.    , ..., 1.    , 0.0144, 0.    ],
       [0.0172, 0.008 , 0.    , ..., 0.0144, 1.    , 0.0183],
       [0.    , 0.    , 0.    , ..., 0.    , 0.0183, 1.    ]])

In [44]:
# enumerate yields (position, value) pairs; show only the first pair.
for position, title in enumerate(data['title']):
    # position: iteration counter, title: the value at that position
    print(position, title, sep="\n")
    break

0
Toy Story


데이터를 보면 index와 별개로 id column이 있다. 다만 cosine_matrix의 행과 열은 DataFrame의 행 index 순서를 따르므로, 여기서는 id 컬럼이 아니라 행 index와 title을 매핑해주자.

In [40]:
# Lookup table from row position to movie title.
# NOTE: despite its name, movie2id is keyed by the row position.
movie2id = dict(enumerate(data['title']))

# Reverse lookup table from movie title to row position.
# NOTE: despite its name, id2movie is keyed by the title. When a title occurs
# more than once, the last row position wins.
id2movie = {title: position for position, title in movie2id.items()}

In [73]:
# Look up the row position of 'Toy Story'.
idx = id2movie['Toy Story'] # row position of Toy Story: 0
idx

0

In [47]:
cosine_matrix # not sorted; each row still includes the movie's similarity to itself (1)

array([[1.        , 0.015775  , 0.        , ..., 0.00826973, 0.01721372,
        0.        ],
       [0.015775  , 1.        , 0.04921281, ..., 0.005673  , 0.00799   ,
        0.        ],
       [0.        , 0.04921281, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.00826973, 0.005673  , 0.        , ..., 1.        , 0.01443662,
        0.        ],
       [0.01721372, 0.00799   , 0.        , ..., 0.01443662, 1.        ,
        0.01828417],
       [0.        , 0.        , 0.        , ..., 0.        , 0.01828417,
        1.        ]])

In [56]:
# Similarity scores between the selected movie (row idx) and every movie.
cosine_matrix[idx]

array([1.        , 0.015775  , 0.        , ..., 0.00826973, 0.01721372,
       0.        ])

In [None]:
# Print (row position, similarity) for every movie except the selected one.
for position, similarity in enumerate(cosine_matrix[idx]):
    if position == idx:
        continue  # skip the movie's similarity to itself
    print(position, similarity)

In [74]:
# Pair each row position with its similarity to the selected movie, leaving
# out the movie itself, and order the pairs from most to least similar.
# The sort key is the second element of each pair, i.e. the similarity score.
sim_scores = sorted(
    (
        (position, similarity)
        for position, similarity in enumerate(cosine_matrix[idx])
        if position != idx
    ),
    key=lambda pair: pair[1],
    reverse=True,
)

# Row positions and scores of the ten most similar movies.
sim_scores[:10]

[(15282, 0.5262275451171008),
 (2979, 0.463276799830381),
 (10271, 0.2797390476075632),
 (8303, 0.20078538664316947),
 (1058, 0.18287334034120212),
 (11367, 0.15712074193481165),
 (1916, 0.15288512626542436),
 (3039, 0.1433450408051554),
 (483, 0.13765225108436677),
 (11573, 0.1337032693869044)]

In [71]:
# Replace the row position of each of the first ten entries with its movie
# title, keeping the similarity score.
# e.g. movie2id[15282] -> 'Toy Story 3'
# NOTE: this rebinds sim_scores to (title, score) pairs, so running this cell
# a second time would look up titles in movie2id, which is keyed by position.
sim_scores = [
    (movie2id[position], similarity)
    for position, similarity in sim_scores[:10]
]
sim_scores

[('Toy Story 3', 0.5262275451171008),
 ('Toy Story 2', 0.463276799830381),
 ('The 40 Year Old Virgin', 0.2797390476075632),
 ('The Champ', 0.20078538664316947),
 ('Rebel Without a Cause', 0.18287334034120212),
 ('For Your Consideration', 0.15712074193481165),
 ('Condorman', 0.15288512626542436),
 ('Man on the Moon', 0.1433450408051554),
 ('Malice', 0.13765225108436677),
 ('Factory Girl', 0.1337032693869044)]