In [153]:
import numpy as np
import pandas as pd

In [154]:
x = pd.read_csv('tmdb_5000_movies.csv')
y = pd.read_csv('tmdb_5000_credits.csv')

In [155]:
print(x.shape, y.shape)

(4803, 20) (4803, 4)


In [156]:
print(x.columns)
print(y.columns)

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')
Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')


In [157]:
df = x.merge(y, on = 'title')

In [158]:
df.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [159]:
df = df[['movie_id','title','overview','genres','keywords','cast','crew']]

In [160]:
df.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [161]:
df.isna().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [162]:
# as only 3 values are na so lets delete the 3 rows
df.dropna(inplace = True)
df.isna().sum()

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [163]:
df.duplicated().sum()

0

In [164]:
# x = '[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}]'
# x = ['Action', 'Adventure']

# ast.literal_eval: Safely evaluate an expression node or a string containing
# a Python literal or container display. The string or node provided may
# only consist of the following Python literal structures: strings, bytes,
# numbers, tuples, lists, dicts, sets, booleans, None, bytes and sets.

# taking a backup is a good practice
backup = df.copy()

In [165]:
import ast
def convert(text):
    L = []
    for d in ast.literal_eval(text):
        L.append(d['name'])
    return L

def fetch_director(text):
    L = []
    for d in ast.literal_eval(text):
        if d['job'] == 'Director':
            L.append(d['name'])
    return L

In [166]:
df['overview'] = df['overview'].apply(lambda x:x.split())
df['genres'] = df['genres'].apply(convert)
df['keywords'] = df['keywords'].apply(convert)
df['cast'] = df['cast'].apply(convert)
df['cast'] = df['cast'].apply(lambda x:x[0:3])
df['crew'] = df['crew'].apply(fetch_director)

In [170]:
df.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [171]:
# [Sam Worthington, Zoe Saldana, Sigourney Weaver]
# to 
# [SamWorthington, ZoeSaldana, SigourneyWeaver]

In [174]:
df['cast'] = df['cast'].apply(lambda words : [w.replace(" ","") for w in words])
df['crew'] = df['crew'].apply(lambda words : [w.replace(" ","") for w in words])
df['genres'] = df['genres'].apply(lambda words : [w.replace(" ","") for w in words])
df['keywords'] = df['keywords'].apply(lambda words : [w.replace(" ","") for w in words])

In [177]:
df.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[Gore Verbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[Christopher Nolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[Andrew Stanton]


In [178]:
df['tags'] = df['overview'] + df['genres'] + df['keywords'] + df['cast'] + df['crew']

In [210]:
train = df.drop(columns=['overview','genres','keywords','cast','crew'])
train['tags'] = train['tags'].apply(lambda x : " ".join(x))
train.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [211]:
train['tags'] = train['tags'].apply(lambda x:x.lower())
train.iloc[0]['tags']

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver james cameron'

In [212]:
train.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [213]:
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [214]:
def stem(text):
    res = []
    for w in text.split():
        res.append(ps.stem(w))
    return " ".join(res)

In [215]:
train['tags'] = train['tags'].apply(stem)

In [217]:
train.iloc[0]['tags']

'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav jame cameron'

In [218]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')

In [219]:
vector = cv.fit_transform(train['tags']).toarray()

In [220]:
vector.shape

(4806, 5000)

In [221]:
from sklearn.metrics.pairwise import cosine_similarity

In [222]:
similarity = cosine_similarity(vector)

In [223]:
similarity

array([[1.        , 0.08119979, 0.0836242 , ..., 0.04358136, 0.        ,
        0.        ],
       [0.08119979, 1.        , 0.05884899, ..., 0.02300219, 0.        ,
        0.02390457],
       [0.0836242 , 0.05884899, 1.        , ..., 0.02368897, 0.0492366 ,
        0.        ],
       ...,
       [0.04358136, 0.02300219, 0.02368897, ..., 1.        , 0.03849002,
        0.03849002],
       [0.        , 0.        , 0.0492366 , ..., 0.03849002, 1.        ,
        0.08      ],
       [0.        , 0.02390457, 0.        , ..., 0.03849002, 0.08      ,
        1.        ]])

In [230]:
def recommend(movie):
    index = train[train['title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    for i in distances[1:6]:
        print(train.iloc[i[0]].title)

In [231]:
recommend('Gandhi')

Gandhi, My Father
Guiana 1838
The Wind That Shakes the Barley
The Rocket: The Legend of Rocket Richard
Mr. Turner


In [232]:
import pickle

In [234]:
pickle.dump(train, open('movie_list.pkl','wb'))
pickle.dump(similarity, open('similarity.pkl','wb'))