# Importing general libraries

In [90]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

import ast

from tqdm.notebook import tqdm
tqdm.pandas()

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics.pairwise import cosine_similarity

# Importing datasets

In [35]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [36]:
movies.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [37]:
credits.head(2)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [38]:
# Combining both the datasets

movie_data = movies.merge(credits, on = 'title')
movie_data.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [39]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [40]:
movie_data = movie_data[['movie_id','title','original_language','spoken_languages','overview','genres','keywords','cast','crew']]

In [41]:
movie_data.head(2)

Unnamed: 0,movie_id,title,original_language,spoken_languages,overview,genres,keywords,cast,crew
0,19995,Avatar,en,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...","In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,en,"[{""iso_639_1"": ""en"", ""name"": ""English""}]","Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [42]:
movie_data.isnull().sum()

movie_id             0
title                0
original_language    0
spoken_languages     0
overview             3
genres               0
keywords             0
cast                 0
crew                 0
dtype: int64

> There are 3 missing values in overview feature

In [43]:
# Dropping null values
movie_data.dropna(inplace = True)

In [44]:
#movie_data = movie_data.fillna("")

In [45]:
movie_data[movie_data['movie_id'].duplicated()]

Unnamed: 0,movie_id,title,original_language,spoken_languages,overview,genres,keywords,cast,crew
974,72710,The Host,ko,"[{""iso_639_1"": ""ko"", ""name"": ""\ud55c\uad6d\uc5...",Gang-du is a dim-witted man working at his fat...,"[{""id"": 27, ""name"": ""Horror""}, {""id"": 18, ""nam...","[{""id"": 1261, ""name"": ""river""}, {""id"": 1880, ""...","[{""cast_id"": 52, ""character"": ""Melanie Stryder...","[{""credit_id"": ""52fe487bc3a368484e0fa919"", ""de..."
975,1255,The Host,ko,"[{""iso_639_1"": ""ko"", ""name"": ""\ud55c\uad6d\uc5...",Gang-du is a dim-witted man working at his fat...,"[{""id"": 27, ""name"": ""Horror""}, {""id"": 18, ""nam...","[{""id"": 1261, ""name"": ""river""}, {""id"": 1880, ""...","[{""cast_id"": 3, ""character"": ""Park Gang-du"", ""...","[{""credit_id"": ""52fe42eac3a36847f802ca6b"", ""de..."
1364,268,Batman,en,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",The Dynamic Duo faces four super-villains who ...,"[{""id"": 10751, ""name"": ""Family""}, {""id"": 12, ""...","[{""id"": 339, ""name"": ""submarine""}, {""id"": 849,...","[{""cast_id"": 5, ""character"": ""Jack Napier/The ...","[{""credit_id"": ""52fe422fc3a36847f800aa4b"", ""de..."
1365,2661,Batman,en,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",The Dynamic Duo faces four super-villains who ...,"[{""id"": 10751, ""name"": ""Family""}, {""id"": 12, ""...","[{""id"": 339, ""name"": ""submarine""}, {""id"": 849,...","[{""cast_id"": 17, ""character"": ""Batman / Bruce ...","[{""credit_id"": ""52fe4363c3a36847f80509a7"", ""de..."
3654,39269,Out of the Blue,en,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Ordinary people find extraordinary courage in ...,"[{""id"": 18, ""name"": ""Drama""}]","[{""id"": 2658, ""name"": ""new zealand""}, {""id"": 3...","[{""cast_id"": 2, ""character"": ""Don"", ""credit_id...","[{""credit_id"": ""52fe47099251416c9106826f"", ""de..."
3655,10844,Out of the Blue,en,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Ordinary people find extraordinary courage in ...,"[{""id"": 18, ""name"": ""Drama""}]","[{""id"": 2658, ""name"": ""new zealand""}, {""id"": 3...","[{""cast_id"": 12, ""character"": ""Nick Harvey"", ""...","[{""credit_id"": ""52fe43c19251416c7501cceb"", ""de..."


In [46]:
movie_data[movie_data['movie_id'] == 72710]

Unnamed: 0,movie_id,title,original_language,spoken_languages,overview,genres,keywords,cast,crew
972,72710,The Host,en,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",A parasitic alien soul is injected into the bo...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 52, ""character"": ""Melanie Stryder...","[{""credit_id"": ""52fe487bc3a368484e0fa919"", ""de..."
974,72710,The Host,ko,"[{""iso_639_1"": ""ko"", ""name"": ""\ud55c\uad6d\uc5...",Gang-du is a dim-witted man working at his fat...,"[{""id"": 27, ""name"": ""Horror""}, {""id"": 18, ""nam...","[{""id"": 1261, ""name"": ""river""}, {""id"": 1880, ""...","[{""cast_id"": 52, ""character"": ""Melanie Stryder...","[{""credit_id"": ""52fe487bc3a368484e0fa919"", ""de..."


In [47]:
movie_data[movie_data.movie_id == 1255]

Unnamed: 0,movie_id,title,original_language,spoken_languages,overview,genres,keywords,cast,crew
973,1255,The Host,en,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",A parasitic alien soul is injected into the bo...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 3, ""character"": ""Park Gang-du"", ""...","[{""credit_id"": ""52fe42eac3a36847f802ca6b"", ""de..."
975,1255,The Host,ko,"[{""iso_639_1"": ""ko"", ""name"": ""\ud55c\uad6d\uc5...",Gang-du is a dim-witted man working at his fat...,"[{""id"": 27, ""name"": ""Horror""}, {""id"": 18, ""nam...","[{""id"": 1261, ""name"": ""river""}, {""id"": 1880, ""...","[{""cast_id"": 3, ""character"": ""Park Gang-du"", ""...","[{""credit_id"": ""52fe42eac3a36847f802ca6b"", ""de..."


> Inference : 
There are few rows which are repeating. This look like some data issue. Incorrectly same movie id has been assigned to two different movies which cannot happen in reality. In real world, movie id should always be unique. As there is no way to figure out which assignment is correct, I will drop all rows returned by duplicate() function.


In [48]:
# Dropppig duplicated rows

to_drop = movie_data[movie_data['movie_id'].duplicated()]
movie_data.drop(to_drop.index, inplace = True)

In [49]:
ast.literal_eval(movie_data['cast'][0])

[{'cast_id': 242,
  'character': 'Jake Sully',
  'credit_id': '5602a8a7c3a3685532001c9a',
  'gender': 2,
  'id': 65731,
  'name': 'Sam Worthington',
  'order': 0},
 {'cast_id': 3,
  'character': 'Neytiri',
  'credit_id': '52fe48009251416c750ac9cb',
  'gender': 1,
  'id': 8691,
  'name': 'Zoe Saldana',
  'order': 1},
 {'cast_id': 25,
  'character': 'Dr. Grace Augustine',
  'credit_id': '52fe48009251416c750aca39',
  'gender': 1,
  'id': 10205,
  'name': 'Sigourney Weaver',
  'order': 2},
 {'cast_id': 4,
  'character': 'Col. Quaritch',
  'credit_id': '52fe48009251416c750ac9cf',
  'gender': 2,
  'id': 32747,
  'name': 'Stephen Lang',
  'order': 3},
 {'cast_id': 5,
  'character': 'Trudy Chacon',
  'credit_id': '52fe48009251416c750ac9d3',
  'gender': 1,
  'id': 17647,
  'name': 'Michelle Rodriguez',
  'order': 4},
 {'cast_id': 8,
  'character': 'Selfridge',
  'credit_id': '52fe48009251416c750ac9e1',
  'gender': 2,
  'id': 1771,
  'name': 'Giovanni Ribisi',
  'order': 5},
 {'cast_id': 7,
  'c

In [50]:
# Below function will iterate over the list of dictionary and extract name from the dictionary. 
# Extracted name will be stored in a temporary list which will be returned to the calling function.
# Also I will be combining name with more than on word into a single word. Say "Science Fiction" ===> "ScienceFiction"

def extract_name(sen):
    lst = []
    for txt in ast.literal_eval(sen):
        lst.append(txt['name'].replace(' ', ''))
    return lst

In [51]:
movie_data['genres'] = movie_data['genres'].progress_apply(extract_name) # # Extracting Genre name

  0%|          | 0/4800 [00:00<?, ?it/s]

In [52]:
movie_data['keywords'] = movie_data['keywords'].progress_apply(extract_name) ## Extracing keywords

  0%|          | 0/4800 [00:00<?, ?it/s]

In [53]:
# Featching all cast names and replacing white spaces so complete name is treated single name.
# We don't want firstname and lastname to be treated as two different word. At the same time we want name to be in order
# say firstname and then last name

def convert_cast(sen):
    lst = []
    for txt in ast.literal_eval(sen):
        lst.append(txt['name'].replace(" ", ''))
    return lst

In [54]:
movie_data['cast'] = movie_data['cast'].progress_apply(convert_cast)

  0%|          | 0/4800 [00:00<?, ?it/s]

In [56]:
movie_data['cast'][0]

['SamWorthington',
 'ZoeSaldana',
 'SigourneyWeaver',
 'StephenLang',
 'MichelleRodriguez',
 'GiovanniRibisi',
 'JoelDavidMoore',
 'CCHPounder',
 'WesStudi',
 'LazAlonso',
 'DileepRao',
 'MattGerald',
 'SeanAnthonyMoran',
 'JasonWhyte',
 'ScottLawrence',
 'KellyKilgour',
 'JamesPatrickPitt',
 'SeanPatrickMurphy',
 'PeterDillon',
 'KevinDorman',
 'KelsonHenderson',
 'DavidVanHorn',
 'JacobTomuri',
 'MichaelBlain-Rozgay',
 'JonCurry',
 'LukeHawker',
 'WoodySchultz',
 'PeterMensah',
 'SoniaYee',
 'JahnelCurfman',
 'IlramChoi',
 'KylaWarren',
 'LisaRoumain',
 'DebraWilson',
 'ChrisMala',
 'TaylorKibby',
 'JodieLandau',
 'JulieLamm',
 'CullenB.Madden',
 'JosephBradyMadden',
 'FrankieTorres',
 'AustinWilson',
 'SaraWilson',
 'TamicaWashington-Miller',
 'LucyBriant',
 'NathanMeister',
 'GerryBlair',
 'MatthewChamberlain',
 'PaulYates',
 'WrayWilson',
 'JamesGaylyn',
 'MelvinLenoClarkIII',
 'CarvonFutrell',
 'BrandonJelkes',
 'MicahMoch',
 'HanniyahMuhammad',
 'ChristopherNolen',
 'ChristaOliv

In [57]:
# Extracting movie language

def extract_language(sen):
    lst = []
    for txt in ast.literal_eval(sen):
        lst.append(txt['name'])
    return lst

In [58]:
movie_data['spoken_languages'] = movie_data['spoken_languages'].progress_apply(extract_language)

  0%|          | 0/4800 [00:00<?, ?it/s]

In [59]:
def extract_dir(sen):
    lst = []
    for txt in ast.literal_eval(movie_data['crew']):
        if txt['job'].lower() == "director":
            lst.append(txt['name'].replace(" ", ''))
    return lst

In [66]:
# Extracting director name from the data
movie_data['crew'] = movie_data['crew'].progress_apply(extract_dir)

  0%|          | 0/4800 [00:00<?, ?it/s]

In [68]:
# Spliting movie review into list of words
movie_data['overview'] = movie_data['overview'].progress_apply(lambda x:x.split())

  0%|          | 0/4800 [00:00<?, ?it/s]

In [72]:
movie_data['original_language'] = movie_data['original_language'].apply(lambda x: x.split())

In [73]:
movie_data['original_language']

0       [en]
1       [en]
2       [en]
3       [en]
4       [en]
        ... 
4804    [es]
4805    [en]
4806    [en]
4807    [en]
4808    [en]
Name: original_language, Length: 4800, dtype: object

In [74]:
# Creating another feature containing combined features

movie_data['tags'] = movie_data['spoken_languages'] + movie_data['original_language'] + movie_data['overview']  +  movie_data['genres'] + movie_data['keywords'] + movie_data['cast'] + movie_data['crew']

In [75]:
# As I have already stored these values into a news feature..I will be dropping it from the data

movie_data.drop(['overview', 'original_language' ,'spoken_languages','genres', 'keywords', 'cast', 'crew'], axis = 1, inplace = True)

In [76]:
movie_data

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[English, Español, en, In, the, 22nd, century,..."
1,285,Pirates of the Caribbean: At World's End,"[English, en, Captain, Barbossa,, long, believ..."
2,206647,Spectre,"[Français, English, Español, Italiano, Deutsch..."
3,49026,The Dark Knight Rises,"[English, en, Following, the, death, of, Distr..."
4,49529,John Carter,"[English, en, John, Carter, is, a, war-weary,,..."
...,...,...,...
4804,9367,El Mariachi,"[Español, es, El, Mariachi, just, wants, to, p..."
4805,72766,Newlyweds,"[en, A, newlywed, couple's, honeymoon, is, upe..."
4806,231617,"Signed, Sealed, Delivered","[English, en, ""Signed,, Sealed,, Delivered"", i..."
4807,126186,Shanghai Calling,"[English, en, When, ambitious, New, York, atto..."


In [77]:
# Joining all words into a single sentence

movie_data['tags'] = movie_data['tags'].progress_apply(lambda x: " ".join(x))

  0%|          | 0/4800 [00:00<?, ?it/s]

In [83]:
# Performing data cleaning....removing special charaters..

import re
def remove_SpecialChar(sentence):
    sentence = sentence.lower()
    sentence = re.sub("\S*\d\S*", "", sentence)
    sentence = re.sub('[^A-Za-z0-9]+', " ", sentence)
    return sentence

In [84]:
# calling above function to remove special charaters

movie_data['tags'] = movie_data['tags'].progress_apply(remove_SpecialChar)

  0%|          | 0/4800 [00:00<?, ?it/s]

In [85]:
movie_data['tags'][1]

'english en captain barbossa long believed to be dead has come back to life and is headed to the edge of the earth with will turner and elizabeth swann but nothing is quite as it seems adventure fantasy action ocean drugabuse exoticisland eastindiatradingcompany loveofone slife traitor shipwreck strongwoman ship alliance calypso afterlife fighter pirate swashbuckler aftercreditsstinger johnnydepp orlandobloom keiraknightley stellanskarsg rd chowyun fat billnighy geoffreyrush jackdavenport kevinmcnally tomhollander naomieharris jonathanpryce keithrichards leearenberg mackenziecrook gregellis davidbailie martinklebba davidschofield laurenmaher vanessabranch angusbarnett gilesnew reggielee dominicscottkay takayofischer davidmeunier ho kwantse andybeckwith peterdonaldbadalamentiii christophers capp keithrichards hakeemkae kazim ghassanmassoud goreverbinski'

# Modeling....

In [88]:
# Initalizing count vectors

cv = CountVectorizer(max_features = 5000, stop_words = 'english')
vector = cv.fit_transform(movie_data['tags']).toarray()

In [89]:
vector.shape

(4800, 5000)

In [91]:
# Computing cosine similarity of newly created featre.

similarity = cosine_similarity(vector)

In [92]:
similarity

array([[1.        , 0.11111111, 0.13483997, ..., 0.06741999, 0.07254763,
        0.05039526],
       [0.11111111, 1.        , 0.11236664, ..., 0.06741999, 0.04836508,
        0.05039526],
       [0.13483997, 0.11236664, 1.        , ..., 0.06818182, 0.0489116 ,
        0.05096472],
       ...,
       [0.06741999, 0.06741999, 0.06818182, ..., 1.        , 0.122279  ,
        0.07644708],
       [0.07254763, 0.04836508, 0.0489116 , ..., 0.122279  , 1.        ,
        0.08226127],
       [0.05039526, 0.05039526, 0.05096472, ..., 0.07644708, 0.08226127,
        1.        ]])

In [376]:
def recommend(movie):
    index = movie_data[movie_data['title'] == movie].index[0]   # Extracting index number of movvir
    distance = sorted(list(enumerate(similarity[index])), reverse = True, key = lambda x: x[1]) # Fetc
    for i in distance[1:6]:
        print(movie_data.iloc[i[0]].title) # ubdex #  predicting top 5 levels present te datset

In [377]:
recommend("Pirates of the Caribbean: At World's End")

Pirates of the Caribbean: Dead Man's Chest
Pirates of the Caribbean: The Curse of the Black Pearl
Pirates of the Caribbean: On Stranger Tides
20,000 Leagues Under the Sea
Life of Pi


In [94]:
sorted(list(enumerate(similarity[0])), reverse = True, key = lambda x: x[1])

[(0, 0.9999999999999999),
 (1916, 0.3103164454170875),
 (1440, 0.26943012562182533),
 (4399, 0.26666666666666666),
 (1202, 0.2609312292213769),
 (539, 0.2592724864350674),
 (1192, 0.2560911084488454),
 (582, 0.25446209512303813),
 (1214, 0.2528242470079879),
 (260, 0.24845199749997662),
 (507, 0.24812594486934292),
 (4186, 0.24343224778007388),
 (74, 0.2417728402451219),
 (2329, 0.2412090756622109),
 (3604, 0.2412090756622109),
 (3158, 0.2390457218668788),
 (3618, 0.2387049580131443),
 (3724, 0.2385139175999776),
 (83, 0.2366823156015644),
 (3624, 0.23354968324845696),
 (3323, 0.23333333333333334),
 (2995, 0.23094010767585033),
 (2967, 0.22961557339788816),
 (973, 0.22677868380553634),
 (1327, 0.2253744679276044),
 (2782, 0.2253744679276044),
 (36, 0.22473328748774735),
 (2405, 0.22473328748774735),
 (1533, 0.223606797749979),
 (1319, 0.22247460415730486),
 (3534, 0.22056438662814237),
 (972, 0.22056438662814234),
 (2200, 0.22056438662814234),
 (466, 0.21773242158072695),
 (2727, 0.217

In [None]:
 movie_data[movie_data['title'] == movie].index[0]

In [378]:
import pickle

In [379]:
pickle.dump(movie_data, open('movie_list.pkl', 'wb'))
pickle.dump(similarity, open('similarity.pkl', 'wb'))