# Content Based Filtering
- 영화의 줄거리를 이용해, TF-IDF를 계산한뒤, TF-IDF 벡터간의 유사도를 구해 비슷한 영화를 추천
- https://wikidocs.net/24603

## 1. 데이터 Load

In [1]:
import csv
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


In [38]:

movies = pd.read_csv("movie_doc.csv",sep="\t")
movies.tail()

Unnamed: 0,id,title,story,gender,link,image
398,398,스파이,규연은 선우의 가족들을 '가족 간첩단'으로 몰아가며 사건을 마무리 지으려 한다. 더...,N,https://movie.naver.com/movie/bi/mi/basic.nhn?...,https://movie-phinf.pstatic.net/20150507_77/14...
399,399,파도가 지나간 자리,1차 세계대전 참전용사였던 ‘톰’(마이클 패스벤더)은 전쟁의 상처로 사람들을 피해 ...,F,https://movie.naver.com/movie/bi/mi/basic.nhn?...,https://movie-phinf.pstatic.net/20170207_77/14...
400,400,스타 이즈 본,노래에 놀라운 재능을 가졌지만 외모에는 자신이 없는 무명가수 앨리(레이디 가가)는 ...,F,https://movie.naver.com/movie/bi/mi/basic.nhn?...,https://movie-phinf.pstatic.net/20180912_118/1...
401,401,빌리 엘리어트,영국 북부 탄광촌에 사는 11살 소년 빌리. 매일 복싱을 배우러 가는 체육관에서 우...,F,https://movie.naver.com/movie/bi/mi/basic.nhn?...,https://movie-phinf.pstatic.net/20170105_296/1...
402,402,보디가드,많은 사람들의 관심 속에 사는 최고의 팝스타 레이첼에게 의문의 협박편지가 날아든다....,F,https://movie.naver.com/movie/bi/mi/basic.nhn?...,https://movie-phinf.pstatic.net/20160711_34/14...


In [4]:
stop_words = []
with open('korean_stopwords.txt', encoding="utf-8") as f:
    for line in f.readlines():
        row = line.strip().split()[0]
        stop_words.append(row)

## 2. Tokenizer 정의

In [6]:
from konlpy.tag import Okt
okt = Okt()


In [7]:
pd.options.mode.chained_assignment = None
np.random.seed(0)

from konlpy.tag import Okt
okt = Okt()

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

# tokenizer : 문장에서 색인어 추출을 위해 명사,동사,알파벳,숫자 정도의 단어만 뽑아서 normalization, stemming 처리하도록 함
def tokenizer(raw, pos=["Noun"], stopword=[stop_words]):
    return [
        word for word, tag in okt.pos(
            raw, 
            norm=True,   # normalize 그랰ㅋㅋ -> 그래ㅋㅋ
            stem=True    # stemming 바뀌나->바뀌다
            )
            if len(word) > 1 and tag in pos and word not in stopword
        ]

# 테스트 문장
rawdata = movies['story'].tolist()

## 3. TF-IDF 행렬 계산

In [11]:
vectorize = TfidfVectorizer(
    tokenizer=tokenizer,
    min_df=0.01,    
    max_df=0.99,    
    sublinear_tf=True    # tf값에 1+log(tf)를 적용하여 tf값이 무한정 커지는 것을 막음
)
X = vectorize.fit_transform(rawdata)

print(
    'fit_transform, (sentence {}, feature {})'.format(X.shape[0], X.shape[1])
)


print(X.toarray())



# 문장에서 뽑아낸 feature 들의 배열
features = vectorize.get_feature_names()


fit_transform, (sentence 403, feature 635)
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.24616584 0.         ... 0.         0.         0.        ]]


In [12]:
tf_idf_mtx = pd.DataFrame(X.toarray(), columns = features)

In [13]:
tf_idf_mtx.head()

Unnamed: 0,가게,가까이,가득,가슴,가운데,가을,가장,가정,가족,가지,...,확인,활동,활약,황제,회사,훈련,흔적,희망,희망이,히어로
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.255697,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.172706,0.0,0.14099,0.0,0.123478,0.0,...,0.0,0.0,0.0,0.0,0.181683,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.195718,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 4. Cosine 유사도 계산

In [15]:
cosine_sim = linear_kernel(tf_idf_mtx, tf_idf_mtx)

In [16]:
cosine_sim

array([[1.        , 0.01615743, 0.02670028, ..., 0.02337501, 0.        ,
        0.03843376],
       [0.01615743, 1.        , 0.07685338, ..., 0.06754139, 0.        ,
        0.04159549],
       [0.02670028, 0.07685338, 1.        , ..., 0.02912564, 0.04469684,
        0.09572375],
       ...,
       [0.02337501, 0.06754139, 0.02912564, ..., 1.        , 0.0475114 ,
        0.08591473],
       [0.        , 0.        , 0.04469684, ..., 0.0475114 , 1.        ,
        0.        ],
       [0.03843376, 0.04159549, 0.09572375, ..., 0.08591473, 0.        ,
        1.        ]])

In [19]:
indices = pd.Series(movies.index, index=movies['title'])
print(indices.head())

title
인비저블 게스트         0
나, 다니엘 블레이크      1
국가부도의 날          2
당갈               3
스파이더맨: 파 프롬 홈    4
dtype: int64


## 5. 비슷한 영화 추천

In [20]:
def get_recommendations(title, cosine_sim=cosine_sim):
    # 선택한 영화의 타이틀로부터 해당되는 인덱스를 받아옵니다. 이제 선택한 영화를 가지고 연산할 수 있습니다.
    idx = indices[title]

    # 모든 영화에 대해서 해당 영화와의 유사도를 구합니다.
    sim_scores = list(enumerate(cosine_sim[idx]))

    # 유사도에 따라 영화들을 정렬합니다.
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # 가장 유사한 10개의 영화를 받아옵니다.
    sim_scores = sim_scores[1:11]

    # 가장 유사한 10개의 영화의 인덱스를 받아옵니다.
    movie_indices = [i[0] for i in sim_scores]

    # 가장 유사한 10개의 영화의 제목을 리턴합니다.
    return movies['title'].iloc[movie_indices]


In [23]:
get_recommendations('범죄도시')


282       죽거나 혹은 나쁘거나
264             트리플 9
335            다크 나이트
375    미션 임파서블: 로그네이션
203            살인의 추억
315        아이 인 더 스카이
359      분노의 질주: 더 세븐
46                 암살
219      미션 임파서블: 폴아웃
367             공각기동대
Name: title, dtype: object

In [27]:
type(indices)

pandas.core.series.Series

## 6. 관련 데이터 저장

In [28]:
with open('sim.npy', 'wb') as f:
    np.save(f, cosine_sim)

In [29]:
with open('sim.npy', 'rb') as f:
    cosine_sim = np.load(f)
    print(cosine_sim)

[[1.         0.01615743 0.02670028 ... 0.02337501 0.         0.03843376]
 [0.01615743 1.         0.07685338 ... 0.06754139 0.         0.04159549]
 [0.02670028 0.07685338 1.         ... 0.02912564 0.04469684 0.09572375]
 ...
 [0.02337501 0.06754139 0.02912564 ... 1.         0.0475114  0.08591473]
 [0.         0.         0.04469684 ... 0.0475114  1.         0.        ]
 [0.03843376 0.04159549 0.09572375 ... 0.08591473 0.         1.        ]]


In [30]:
indices.to_csv("index.csv")

In [34]:
indices = pd.read_csv("index.csv", header = None, index_col = 0, squeeze = True)

In [37]:
indices["암살"]

46