In [1]:
import pandas as pd
#구글 드라이브 내 다운로드 받은 csv 파일의 디렉토리 위치 설정
file = 'tmdb_5000_movies.csv'
df = pd.read_csv(file)

In [2]:
print(df.shape)
print(df.head(1))
print(df.columns)

(4803, 20)
      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                      homepage     id  \
0  http://www.avatarmovie.com/  19995   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   

  original_title                                           overview  \
0         Avatar  In the 22nd century, a paraplegic Marine is di...   

   popularity                               production_companies  \
0  150.437577  [{"name": "Ingenious Film Partners", "id": 289...   

                                production_countries release_date     revenue  \
0  [{"iso_3166_1": "US", "name": "United States o...   2009-12-10  2787965087   

   runtime                                   spoken_languages    status  \
0    162.0  [{"iso_639_1": "en", "name": "English"}, {"iso...  Released   

                       tagli

In [3]:
movies = df[['original_title', 'overview']]
movies.head()

Unnamed: 0,original_title,overview
0,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,Spectre,A cryptic message from Bond’s past sends him o...
3,The Dark Knight Rises,Following the death of District Attorney Harve...
4,John Carter,"John Carter is a war-weary, former military ca..."


In [4]:
print(movies.shape)
print(movies['overview'].isnull().sum())
movies = movies.dropna(axis=0)              #NaN 값이 있는 행 삭제(axis=1 >> 열 삭제)
print(movies.shape)

(4803, 2)
3
(4800, 2)


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')
vectorizer.fit(movies['overview'])
tfidf = vectorizer.transform(movies['overview']).toarray()

print(tfidf.shape)
print(tfidf.dtype)
print(pd.DataFrame(tfidf).head(5))

(4800, 20978)
float64
   0      1      2      3      4      5      6      7      8      9      ...  \
0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  ...   
1    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  ...   
2    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  ...   
3    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  ...   
4    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  ...   

   20968  20969  20970  20971  20972  20973  20974  20975  20976  20977  
0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  
1    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  
2    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  
3    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  
4    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  

[5 rows x 20978 columns]


In [7]:
idx = pd.Series(movies.index, index=movies['original_title']).drop_duplicates()
print(idx.head(5))
print(tfidf[idx['Avatar']].shape, tfidf[idx['Avatar']])

original_title
Avatar                                      0
Pirates of the Caribbean: At World's End    1
Spectre                                     2
The Dark Knight Rises                       3
John Carter                                 4
dtype: int64
(20978,) [0. 0. 0. ... 0. 0. 0.]


In [8]:
from sklearn.metrics.pairwise import linear_kernel
sim = linear_kernel(tfidf, tfidf)

In [9]:
print(sim[0:5])

[[1.         0.         0.         ... 0.         0.         0.        ]
 [0.         1.         0.         ... 0.02160368 0.         0.        ]
 [0.         0.         1.         ... 0.01488031 0.         0.        ]
 [0.02499417 0.         0.         ... 0.03386294 0.04275107 0.02268915]
 [0.         0.03336731 0.         ... 0.00612481 0.         0.        ]]


In [10]:
rank = sim[idx['The Dark Knight Rises']]
score = list(enumerate(rank))
score = sorted(score, key=lambda x: x[1], reverse=True)
score[0:11]

[(3, 0.9999999999999997),
 (65, 0.3015156223799365),
 (299, 0.2985696882009985),
 (428, 0.28785364205294767),
 (1359, 0.26445884163119304),
 (3853, 0.18545106440515935),
 (119, 0.16799506417419163),
 (2507, 0.16682666472835278),
 (9, 0.13373903093791062),
 (1181, 0.13219435075960548),
 (210, 0.1304533502821387)]

In [11]:
movie_index = [i[0] for i in score[0:11]]
print(movies.loc[movie_index, 'original_title'])

3                    The Dark Knight Rises
65                         The Dark Knight
299                         Batman Forever
428                         Batman Returns
1359                                Batman
3853                                  2:13
119                          Batman Begins
2507                             Slow Burn
9       Batman v Superman: Dawn of Justice
1181                                   JFK
210                         Batman & Robin
Name: original_title, dtype: object
