In [60]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [30]:
df = pd.read_csv("movie_dataset.csv")
df.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [31]:
df.shape

(4803, 24)

In [33]:
def split_genre(gen):
    genre = str(gen).lower()
    return genre.split()

def preprocessing_overview(data):
    data = str(data).lower()
    data = data.replace('[', '')
    data = data.replace(']', '')
    data = data.replace(',', '')
    data = data.replace("'", '')
    return data

df['genres'] = df['genres'].apply(split_genre)
df['overview'] = df['overview'].apply(preprocessing_overview)
df.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,"[['action',, 'adventure',, 'fantasy',, 'scienc...",http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,in the 22nd century a paraplegic marine is dis...,150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,"[['adventure',, 'fantasy',, 'action']]",http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,captain barbossa long believed to be dead has ...,139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,"[['action',, 'adventure',, 'crime']]",http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,a cryptic message from bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,"[['action',, 'crime',, 'drama',, 'thriller']]",http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,following the death of district attorney harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,"[['action',, 'adventure',, 'science',, 'fictio...",http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,john carter is a war-weary former military cap...,43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


## Recomendation by Overview

In [67]:
tfidf = TfidfVectorizer(stop_words='english', max_features=10000)
vector = tfidf.fit_transform(df['overview'])
pickle.dump(tfidf, open('tfidf_overview.sav', 'wb'))

arr_vector = vector.toarray()
arr_vector.shape

(4803, 10000)

In [68]:
test = 'story about best pirates'

In [69]:
tfidf = pickle.load(open('tfidf_overview.sav', 'rb'))

test_arr = tfidf.transform([test])
test_arr = test_arr.toarray()

result = {}
for id, vector in enumerate(arr_vector):
    cosine_val = cosine_similarity([arr_vector[id]], test_arr)
    result[id] = cosine_val

result_desc = dict(sorted(result.items(), key=lambda item: item[1], reverse=True))

top_n = 5

for n, movies_id in enumerate(result_desc):
    if n > 5:
        break
    print(df.iloc[movies_id]['original_title'], '\n', df.iloc[movies_id]['overview'], '\n===========================')

VeggieTales: The Pirates Who Don't Do Anything 
 set sail for adventure! a boatload of beloved veggietales pals embark on a fun and fresh pirate adventure with their trademark humor and silly songs in the pirates who dont do anything - a veggietales movie! larry the cucumber mr. lunt and pa grape find themselves on the ride of their lives when they are mysteriously whisked back to the time when pirates ruled the high seas. 
Håkon Håkonsen 
 a young norwegian boy in 1850s england goes to work as a cabin boy and discovers some of his shipmates are actually pirates. 
Captain Phillips 
 the true story of captain richard phillips and the 2009 hijacking by somali pirates of the us-flagged mv maersk alabama the first american cargo ship to be hijacked in two hundred years. 
Ice Age: Continental Drift 
 manny diego and sid embark upon another adventure after their continent is set adrift. using an iceberg as a ship they encounter sea creatures and battle pirates as they explore a new world. 
V