In [106]:
import pandas as pd
import numpy as np

# 데이터로딩

In [107]:
synop = pd.read_csv('anime_with_synopsis.csv')
synop

Unnamed: 0,MAL_ID,Name,Score,Genres,sypnopsis
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ..."
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...
...,...,...,...,...,...
16209,48481,Daomu Biji Zhi Qinling Shen Shu,Unknown,"Adventure, Mystery, Supernatural",No synopsis information has been added to this...
16210,48483,Mieruko-chan,Unknown,"Comedy, Horror, Supernatural",ko is a typical high school student whose life...
16211,48488,Higurashi no Naku Koro ni Sotsu,Unknown,"Mystery, Dementia, Horror, Psychological, Supe...",Sequel to Higurashi no Naku Koro ni Gou .
16212,48491,Yama no Susume: Next Summit,Unknown,"Adventure, Slice of Life, Comedy",New Yama no Susume anime.


## 결측치 확인
 - TF-IDF를 연산할 때 데이터에 Null 값이 들어있으면 에러가 발생합니다. 
 - TF-IDF의 대상이 되는 data의 overview 열에 결측값에 해당하는 Null 값이 있는지 확인합니다.

In [108]:
# sypnopsis가 없는 행의 위치 찾기
synop[synop['sypnopsis'].isnull()].index.tolist()
synop['sypnopsis'].isnull().sum()
synop['sypnopsis']=synop['sypnopsis'].fillna('') 
synop['sypnopsis'].isnull().sum()

0

 - 줄거리 행렬에 대한 tf-idf행렬을 생성하여 단어의 빈도수를 나타내고 array(배열)로 변경
 - stop_word :문서에서 단어장을 생성할 때, 무시할 수 있는 단어를 말한다. 보통 영어의 관사나 접속사, 한국어의 조사등이 여기에 해당된다.
 - fit_transform()은 train dataset에서만 사용
 - fit으로 텍스트 데이터를 단어사전으로 만드는 것

In [109]:
# tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words ='english') # tf-idf 선언, 불용어처리
tfidf_matrix = tfidf.fit_transform(synop['sypnopsis']) # tf-idf 적합
print('TF-IDF 행렬의 크기(shape) :',tfidf_matrix.shape)

TF-IDF 행렬의 크기(shape) : (16214, 45064)


In [153]:
print(tfidf_matrix)
# 각 행의 단어의 위치와 그 단어의 tf_idf값

  (0, 26445)	0.05061631594613398
  (0, 26145)	0.05217422049067399
  (0, 18094)	0.11863663102845437
  (0, 36585)	0.1083322793637923
  (0, 17057)	0.1093990137075953
  (0, 6975)	0.08100951567987662
  (0, 43197)	0.09011583080861152
  (0, 37018)	0.06145740047026555
  (0, 8185)	0.11323297129973957
  (0, 7365)	0.07057386182309039
  (0, 16350)	0.08751293843026238
  (0, 22691)	0.06795444487387223
  (0, 1041)	0.06856262939895942
  (0, 9391)	0.1332779080780282
  (0, 16678)	0.04850041839310851
  (0, 22860)	0.1124340292660396
  (0, 29311)	0.06066019551442682
  (0, 26256)	0.0493804907829942
  (0, 8896)	0.06636823863030278
  (0, 24546)	0.06882446620422322
  (0, 41810)	0.08985704472207419
  (0, 1208)	0.06523064240068223
  (0, 39815)	0.10194097789551518
  (0, 11344)	0.08910953677111512
  (0, 8310)	0.07573846465421283
  :	:
  (16213, 34541)	0.18731484406364496
  (16213, 36317)	0.129716613298348
  (16213, 39400)	0.13419618915898907
  (16213, 6563)	0.11191689186572135
  (16213, 4166)	0.13928101430408768
 

 - 코사인 유사도 연산 결과로는 20,000행 20,000열의 행렬을 얻습니다. 
 - 이는 20,000개의 각 문서 벡터(영화 줄거리 벡터)와 자기 자신을 포함한 20,000개의 문서 벡터 간의 유사도가 기록된 행렬입니다. 
 - 모든 20,000개 영화의 상호 유사도가 기록되어져 있습니다. 
 - 이제 기존 데이터프레임으로부터 영화의 타이틀을 key, 영화의 인덱스를 value로 하는 딕셔너리 title_to_index를 만들어둡니다.

In [110]:
# 코사인 유사도
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
#cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
print(cosine_sim.shape)
pd.DataFrame(cosine_sim)

(16214, 16214)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16204,16205,16206,16207,16208,16209,16210,16211,16212,16213
0,1.000000,0.231391,0.015967,0.023563,0.008075,0.020814,0.000000,0.016533,0.013483,0.004135,...,0.0,0.003352,0.015949,0.007409,0.000000,0.000000,0.012420,0.0,0.009882,0.045931
1,0.231391,1.000000,0.037029,0.010630,0.004027,0.017836,0.011672,0.009779,0.008074,0.013148,...,0.0,0.011873,0.003797,0.000000,0.000000,0.000000,0.014127,0.0,0.000000,0.012970
2,0.015967,0.037029,1.000000,0.000000,0.011895,0.007033,0.003193,0.005715,0.000000,0.022979,...,0.0,0.009853,0.012658,0.000000,0.014713,0.014713,0.011780,0.0,0.000000,0.000000
3,0.023563,0.010630,0.000000,1.000000,0.000000,0.013626,0.000000,0.014193,0.003583,0.005449,...,0.0,0.027605,0.006357,0.000887,0.000000,0.000000,0.014531,0.0,0.000000,0.006155
4,0.008075,0.004027,0.011895,0.000000,1.000000,0.048272,0.001977,0.006680,0.000000,0.009307,...,0.0,0.012684,0.003060,0.003087,0.000000,0.000000,0.013027,0.0,0.000000,0.014703
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16209,0.000000,0.000000,0.014713,0.000000,0.000000,0.013116,0.000000,0.000000,0.017206,0.000000,...,0.0,0.000000,0.000000,0.000000,1.000000,1.000000,0.000000,0.0,0.000000,0.000000
16210,0.012420,0.014127,0.011780,0.014531,0.013027,0.023211,0.005175,0.010088,0.009320,0.031097,...,0.0,0.000000,0.011436,0.003322,0.000000,0.000000,1.000000,0.0,0.000000,0.000000
16211,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,0.000000,0.000000
16212,0.009882,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.016239,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,1.000000,0.016552


In [111]:
# 테이블 생성
# 영화제목을 입력하면 인덱스를 반환할 수 있도록 제목과 인덱스를 갖는 테이블을 생성
indices = pd.Series(synop.index, index = synop['Name']).drop_duplicates()
print(indices)

Name
Cowboy Bebop                           0
Cowboy Bebop: Tengoku no Tobira        1
Trigun                                 2
Witch Hunter Robin                     3
Bouken Ou Beet                         4
                                   ...  
Daomu Biji Zhi Qinling Shen Shu    16209
Mieruko-chan                       16210
Higurashi no Naku Koro ni Sotsu    16211
Yama no Susume: Next Summit        16212
Scarlet Nexus                      16213
Length: 16214, dtype: int64


In [112]:
# 확인
idx = indices['Trigun']
print(idx)

2


In [154]:
# overview를 바탕으로 가장 유사도가 높은 상위 10개 영화를 추천하는 함수 선언
def get_recommendations(title, cosine_sim = cosine_sim):
    
    # 해당 영화 제목에 대응되는 인덱스를 저장
    idx = indices[title]
    #gx = gen[idx]
    
    # 모든 영화에 대하여 해당 영화와의 유사도를 계산
    sim_scores = [(i,c) for i, c in enumerate(cosine_sim[idx]) if i != idx] # 자기자신을 제외한 # (인덱스, 유사도) 를 원소로 갖는 리스트
    
    # 유사도를 기준으로 정렬 - 정렬은 key값이 아닌 value값으로 정렬
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse=True) 

    # 유사도 기준 상위 15개 영화 선택 
    sim_scores = sim_scores[:15] # (인덱스, 유사도) 를 원소로 갖는 리스트

    # 상위 10개 영화 인덱스를 저장
    movie_indices = [i[0] for i in sim_scores]
    
    # 가장 유사도가 높은 상위 10개의 영화 제목을 반환
    return synop['Name'].iloc[movie_indices]

In [155]:
synop_sim = get_recommendations('Naruto')
pd.DataFrame(synop_sim)

Unnamed: 0,Name
1508,Naruto: Shippuuden
11346,Boruto: Naruto Next Generations
6158,Naruto: Shippuuden Movie 6 - Road to Ninja
3103,"Naruto: Shippuuden - Shippuu! ""Konoha Gakuen"" Den"
8831,Boruto: Naruto the Movie
1952,Naruto: Dai Katsugeki!! Yuki Hime Shinobu Houj...
4598,Naruto: Shippuuden Movie 4 - The Lost Tower
6026,Naruto SD: Rock Lee no Seishun Full-Power Ninden
4300,Naruto: The Cross Roads
546,Naruto: Takigakure no Shitou - Ore ga Eiyuu Da...


# 장르 유사도

In [156]:
pd.DataFrame(synop['Genres'])

Unnamed: 0,Genres
0,"Action, Adventure, Comedy, Drama, Sci-Fi, Space"
1,"Action, Drama, Mystery, Sci-Fi, Space"
2,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen"
3,"Action, Mystery, Police, Supernatural, Drama, ..."
4,"Adventure, Fantasy, Shounen, Supernatural"
...,...
16209,"Adventure, Mystery, Supernatural"
16210,"Comedy, Horror, Supernatural"
16211,"Mystery, Dementia, Horror, Psychological, Supe..."
16212,"Adventure, Slice of Life, Comedy"


In [157]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
tfidf = TfidfVectorizer()
tfidf_g = tfidf.fit_transform(synop['Genres'])

genre_sim = cosine_similarity(tfidf_g, tfidf_g)
sim_df = pd.DataFrame(genre_sim,index = synop.Name, columns = synop.Name)
sim_df.head()
print(genre_sim.shape)

(16214, 16214)


In [158]:
indices_g = pd.Series(synop.index, index = synop['Genres']).drop_duplicates()

In [159]:
# 확인
idx = indices[['Chainsaw Maid']]
#print(idx)

In [160]:
indices_g[5760:]

Genres
Comedy, Horror, Supernatural, Thriller                               5760
Fantasy                                                              5761
Adventure, Drama, Historical, Mystery, Supernatural                  5762
Fantasy, Horror                                                      5763
Sports                                                               5764
                                                                    ...  
Adventure, Mystery, Supernatural                                    16209
Comedy, Horror, Supernatural                                        16210
Mystery, Dementia, Horror, Psychological, Supernatural, Thriller    16211
Adventure, Slice of Life, Comedy                                    16212
Action, Fantasy                                                     16213
Length: 10454, dtype: int64

In [161]:
movie_user_likes = "Naruto" # 장르 유사도
genre_sim = sim_df[movie_user_likes].sort_values(ascending=False)[:16]
genre_sim

Name
Naruto                                                              1.000000
Naruto: Shippuuden                                                  1.000000
Boruto: Jump Festa 2016 Special                                     1.000000
Naruto: Shippuuden Movie 6 - Road to Ninja                          0.981005
Naruto: Honoo no Chuunin Shiken! Naruto vs. Konohamaru!!            0.981005
Boruto: Naruto Next Generations                                     0.981005
Rekka no Honoo                                                      0.981005
Dragon Ball Z Movie 11: Super Senshi Gekiha!! Katsu no wa Ore da    0.969101
Dragon Ball Kai                                                     0.969101
Dragon Ball GT: Gokuu Gaiden! Yuuki no Akashi wa Suushinchuu        0.969101
Dragon Ball Super                                                   0.969101
Dragon Ball Z                                                       0.969101
Dragon Ball Z: Summer Vacation Special                              0.9

# 줄거리, 장르값 출력

In [162]:
sy_li = synop_sim.values
ge_li = genre_sim.index

In [163]:
pd.DataFrame(ge_li)

Unnamed: 0,Name
0,Naruto
1,Naruto: Shippuuden
2,Boruto: Jump Festa 2016 Special
3,Naruto: Shippuuden Movie 6 - Road to Ninja
4,Naruto: Honoo no Chuunin Shiken! Naruto vs. Ko...
5,Boruto: Naruto Next Generations
6,Rekka no Honoo
7,Dragon Ball Z Movie 11: Super Senshi Gekiha!! ...
8,Dragon Ball Kai
9,Dragon Ball GT: Gokuu Gaiden! Yuuki no Akashi ...


In [164]:
pd.DataFrame(sy_li)

Unnamed: 0,0
0,Naruto: Shippuuden
1,Boruto: Naruto Next Generations
2,Naruto: Shippuuden Movie 6 - Road to Ninja
3,"Naruto: Shippuuden - Shippuu! ""Konoha Gakuen"" Den"
4,Boruto: Naruto the Movie
5,Naruto: Dai Katsugeki!! Yuki Hime Shinobu Houj...
6,Naruto: Shippuuden Movie 4 - The Lost Tower
7,Naruto SD: Rock Lee no Seishun Full-Power Ninden
8,Naruto: The Cross Roads
9,Naruto: Takigakure no Shitou - Ore ga Eiyuu Da...


In [165]:
# 1차원 리스트의 교집합에서 파이썬 내장함수 set.intersection()를 쓸 수도 있습니다. 
# 다만 이 함수는 set타입에서 쓸 수 있어 list의 교집합을 구할 때는 list를 set으로 변환해 set.intersection한 뒤
# 그 결과를 다시 list로 변환해야 합니다.
total = list(set(sy_li).intersection(ge_li))
total

['Boruto: Naruto Next Generations',
 'Naruto: Shippuuden Movie 6 - Road to Ninja',
 'Naruto: Shippuuden']

In [167]:
pd.DataFrame(total, columns=['anime - title'],).head()

Unnamed: 0,anime - title
0,Boruto: Naruto Next Generations
1,Naruto: Shippuuden Movie 6 - Road to Ninja
2,Naruto: Shippuuden


In [171]:
# 출력한 값(fin)에 따른 장르 추출
synop = pd.read_csv('anime_with_synopsis.csv')

ak = synop[['Name','Genres']]

genre_list = []
for j in range(len(synop)):
    for i in range(len(fin)):
        if total[i] == synop['Name'][j]:
            genre_list.append(synop['Genres'][j])
genre_list

genre_list

['Action, Adventure, Comedy, Super Power, Martial Arts, Shounen',
 'Action, Adventure, Super Power, Martial Arts, Shounen',
 'Action, Adventure, Super Power, Martial Arts, Shounen']

In [181]:
# 타이틀과 장르 추출
pd.DataFrame(genre_list, total, columns=[['anime-title']])

Unnamed: 0,anime-title
Boruto: Naruto Next Generations,"Action, Adventure, Comedy, Super Power, Martia..."
Naruto: Shippuuden Movie 6 - Road to Ninja,"Action, Adventure, Super Power, Martial Arts, ..."
Naruto: Shippuuden,"Action, Adventure, Super Power, Martial Arts, ..."
