###### Import required Library

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import numpy as np
import pandas as pd
import pickle

###### Import Dataset

In [2]:
movies = pd.read_csv('/Users/Dataset/movie/TMDB movie/tmdb_5000_movies.csv')
credits = pd.read_csv('/Users/Dataset/movie/TMDB movie/tmdb_5000_credits.csv')

###### Merging two dataset into same file

In [3]:
movie = movies.merge(credits, on='title')

###### Feature Engineering

In [4]:
movie.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [5]:
movie = movie[['movie_id','title','overview', 'genres', 'keywords', 'cast', 'crew']]

In [6]:
movie.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


###### Checke null and duplicate values

In [7]:
movie.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [8]:
movie = movie.drop_duplicates()
movie = movie.dropna()

In [9]:
movie.isnull().sum()

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [10]:
movie.shape

(4806, 7)

###### Build Feature for search criteria
* Movie recommendation is done on the basis of 'genre' , 'keywork' , 'cast', and 'crew' data.

In [11]:
movie.iloc[0]

movie_id                                                19995
title                                                  Avatar
overview    In the 22nd century, a paraplegic Marine is di...
genres      [{"id": 28, "name": "Action"}, {"id": 12, "nam...
keywords    [{"id": 1463, "name": "culture clash"}, {"id":...
cast        [{"cast_id": 242, "character": "Jake Sully", "...
crew        [{"credit_id": "52fe48009251416c750aca23", "de...
Name: 0, dtype: object

In [12]:
import ast
def name_extract1(items):
    name=[]
    for i in ast.literal_eval(items): # convert string to list
        name.append(i['name'])
    return name

def name_extract2(items):
    name=[]
    count = 0
    for i in ast.literal_eval(items): # convert string to int
        if count < 3:
            name.append(i['name'])
            count +=1
        else:
            break
    return name

def name_extract3(items):
    director_name=[]
    for i in ast.literal_eval(items): 
        if i['job']=='Director':
            director_name.append(i['name'])
    return director_name

In [13]:
movie['genres'] = movie['genres'].apply(name_extract1)
movie['keywords'] = movie['keywords'].apply(name_extract1)
movie['cast'] = movie['cast'].apply(name_extract2)
movie['crew'] = movie['crew'].apply(name_extract3)
movie['overview']= movie['overview'].apply(lambda x: x.split())

In [14]:
movie['genres'] = movie['genres'].apply(lambda x:[i.replace(" ","")for i in x])
movie['keywords'] = movie['keywords'].apply(lambda x:[i.replace(" ","")for i in x])
movie['cast'] = movie['cast'].apply(lambda x:[i.replace(" ","")for i in x])
movie['crew'] = movie['crew'].apply(lambda x:[i.replace(" ","")for i in x])

In [18]:
movie['features'] = movie['overview']+ movie['genres']+movie['keywords']+movie['cast']+movie['crew']

In [19]:
movie = movie[['movie_id','title','features']]


In [20]:
movie.head()

Unnamed: 0,movie_id,title,features
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."


In [21]:
movie['features'] = movie['features'].apply(lambda x:" ".join(x) ) # convert list into string
movie['features'] = movie['features'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie['features'] = movie['features'].apply(lambda x:" ".join(x) ) # convert list into string
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie['features'] = movie['features'].apply(lambda x: x.lower())


In [22]:
movie.head()

Unnamed: 0,movie_id,title,features
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [19]:
po = PorterStemmer()

In [20]:
def stem(item):
    word =[]
    for i in item.split():
        word.append(po.stem(i))
    return " ".join(word)
    

In [21]:
movie['features'] = movie['features'].apply(stem)

###### Converting feature description into vecto form using 'Bag of words' technique

In [23]:
cv = CountVectorizer(max_features=5000, stop_words='english')

In [24]:
feature_vector = cv.fit_transform(movie['features'])
feature_vector = feature_vector.toarray() # convertin metrics into array

In [25]:
feature_vector.shape

(4806, 5000)

In [26]:
feature_vector

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [27]:
feature_vector[0]

array([0, 0, 0, ..., 0, 0, 0])

###### Calculation distance between each vectors

In [28]:
similarity = cosine_similarity(feature_vector)

In [28]:
def recommend(film):
    movie_index = movie[movie['title']==movie].index[0]
    distance = similarity[movie_index]
    recom_movie_list = sorted(list(enumerate(distance)),reverse=True,key = lambda x: x[1])[1:6]
    
    for i in recom_movie_list:
        print (movie.iloc[i[0]].title)
    return

###### Checking 5 recommended movies for people who like 'Avatar'

In [29]:
recommend('Avatar')

  movie_index = movie[movie['title']==movie].index[0]


Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Titan A.E.
