### Import Libraries

In [54]:
import numpy as np
import pandas as pd
import ast
import warnings
warnings.filterwarnings('ignore')

### Loading Dataset

In [55]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

### Joining two Dataset

In [56]:
movies = movies.merge(credits, on='title')

In [57]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


### Fetching only required column for recommendation system 

In [58]:
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [59]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4809 non-null   int64 
 1   title     4809 non-null   object
 2   overview  4806 non-null   object
 3   genres    4809 non-null   object
 4   keywords  4809 non-null   object
 5   cast      4809 non-null   object
 6   crew      4809 non-null   object
dtypes: int64(1), object(6)
memory usage: 263.1+ KB


In [60]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


### Checking missing values

In [61]:
movies.isnull().sum().to_frame().rename(columns={0:"Total No. of Missing Values"})

Unnamed: 0,Total No. of Missing Values
movie_id,0
title,0
overview,3
genres,0
keywords,0
cast,0
crew,0


### Dropping missing values

In [62]:
movies.dropna(inplace=True)

### Checking duplidate value

In [63]:
print("Duplicate Values =",movies.duplicated().sum())

Duplicate Values = 0


### Convert json like details to list

In [64]:
## This is for genres and keywords column
def convert_to_list(json_like_format):
    list_append = []
    for object in ast.literal_eval(json_like_format):
        list_append.append(object['name'])
    return list_append

In [65]:
 movies['genres'] = movies['genres'].apply(convert_to_list)

In [66]:
 movies['keywords'] = movies['keywords'].apply(convert_to_list)

In [67]:
## This is for cast column (fetching first 3 three actors name)
def convert_to_list(json_like_format):
    counter = 0
    list_append = []
    for object in ast.literal_eval(json_like_format):
        if counter != 3:
            list_append.append(object['name'])
            counter = counter + 1
        else:
            break
    return list_append

In [68]:
movies['cast'] = movies['cast'].apply(convert_to_list)

In [69]:
## This is for crew column (fetching only Director Name)
def convert_to_list(json_like_format):
    list_append = []
    for object in ast.literal_eval(json_like_format):
        if object['job'] == 'Director':
            list_append.append(object['name'])
            break
    return list_append

In [70]:
movies['crew'] = movies['crew'].apply(convert_to_list)

### Convert String to list for overview column

In [71]:
movies['overview'] = movies['overview'].apply(lambda x : x.split())

In [72]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


### If there is space in comma seprated list then merge it to single word

In [73]:
movies['genres']= movies['genres'].apply(lambda x: [i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ","") for i in x])

In [74]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton]


### Combine all the list into single list

In [75]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

### Create a new dataframe with required columns

In [76]:
new_df = movies[['movie_id','title','tags']]

### convert list to string and lower case for all the tags

In [77]:
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))

In [78]:
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

### The Porter Stemmer is an algorithm used for stemming, a natural language processing (NLP) technique that reduces words to their base or root form (known as the "stem")

In [79]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [80]:
def stem(texts):
    list_append = []
    for text in texts.split():
        list_append.append(ps.stem(text))
    return " ".join(list_append)

In [81]:
new_df['tags'] = new_df['tags'].apply(stem)

### The CountVectorizer is a feature extraction technique in text processing. It converts a collection of text documents into a matrix of token counts, commonly referred to as the bag-of-words (BoW) representation.

In [83]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [84]:
vectors = cv.fit_transform(new_df['tags']).toarray()

### Cosine similarity is a measure of similarity between two non-zero vectors. It calculates the cosine of the angle between the vectors in an n-dimensional space. The value ranges from -1 to 1

In [85]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)

### Movie recommendation function

In [87]:
def recommand(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movie_list = sorted(list(enumerate(distances)),reverse=True, key= lambda x: x[1])[1:11]
    for movie in movie_list:
        print(new_df.iloc[movie[0]].title)

In [90]:
recommand('Spider-Man')

Spider-Man 3
Spider-Man 2
The Amazing Spider-Man 2
Arachnophobia
Kick-Ass
The Amazing Spider-Man
21 Jump Street
X-Men
Eight Legged Freaks
Light It Up


### Dump the movies details and similarity matrix to use it in the web deployment

In [91]:
import pickle

In [92]:
pickle.dump(new_df,open('movies.pkl','wb'))

In [93]:
pickle.dump(similarity,open('similarity.pkl','wb'))