In [252]:
import ast
import nltk
import pickle
import pandas as pd
import numpy as np
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [196]:
# load dataset 
movies = pd.read_csv('dataset\\tmdb_5000_movies.csv')
credits = pd.read_csv('dataset\\tmdb_5000_credits.csv')

In [197]:
movies = movies.merge(credits,on='title')

In [198]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [199]:
# take only those columns having strong relation to the output feature
movies = movies[['id','title','overview','genres','keywords','cast','crew']]

In [200]:
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [201]:
# check for the null value
movies.isnull().sum()

id          0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [202]:
# check for the duplicaed value
movies.duplicated().sum()

0

In [203]:
# drop nan values
movies.dropna(inplace=True)

In [204]:
# after drop nan value check for the nan values
movies.isnull().sum()

id          0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [205]:
def convert(obj):
    """
    this fucntion convert dict into list with some use full info
    take only genre name value from the dict
    """
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [206]:
# apply convert funtion 
movies['genres']=movies.genres.apply(convert)

In [207]:
# apply convert funtion 
movies['keywords']=movies.keywords.apply(convert)

In [208]:
def convert_cast(obj):
    """
    this fucntion convert dict into list with some use full info 
    take only top 3 actor work into the movie
    """
    L = []
    convert = 0
    for i in ast.literal_eval(obj):
        if convert != 3:
            L.append(i['name'])
            convert+=1
        else:
            break
    return L

In [209]:
# apply convert_cast funtion 
movies['cast']=movies.cast.apply(convert_cast)

In [210]:
def fetch_director(obj):
    """
    this fucntion convert dict into list with some use full info
    take director work into the movie
    """
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

In [211]:
# apply fetch_director funtion
movies['crew'] = movies['crew'].apply(fetch_director)

In [212]:
#convert overview feature values into list
movies['overview']=movies['overview'].apply(lambda x: x.split())

In [233]:
# replace space from the word
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ","") for i in x])
movies['overview'] = movies['overview'].apply(lambda x: [i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ","") for i in x])

In [234]:
movies.head(1)

Unnamed: 0,id,title,overview,genres,keywords,cast,crew,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."


In [235]:
# combine all feature and convert into a new feature tag
movies['tags'] = movies['overview']+movies['genres']+movies['keywords']+movies['cast']+movies['crew']

In [236]:
# create a new DataFrame with new feature
new_df  = movies[['id','title','tags']]

In [237]:
# convert tags feature values list into string
new_df['tags']= new_df['tags'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']= new_df['tags'].apply(lambda x: " ".join(x))


In [238]:
new_df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [239]:
#convert tags feature into lower case
new_df['tags']=new_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(lambda x:x.lower())


In [240]:
# remove meaning less word 
cv = CountVectorizer(max_features=5000,stop_words='english')

In [241]:
# fit and transfrom data and convert into array
vector = cv.fit_transform(new_df['tags']).toarray()

In [243]:
ps = PorterStemmer()

In [244]:
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [245]:
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [247]:
# find vector distance of each movie 
similarty = cosine_similarity(vector)

In [248]:
def recommend(movie):
    """
    this function give us recommendation of the movie .
    this take movie name as input
    recommend 10 movies base on search

    """
    movie_index = new_df[new_df['title']==movie].index[0]
    distances = similarty[movie_index]
    movie_list = sorted(list(enumerate(distances)), reverse=True,key=lambda x:x[1])[1:10]
    for i in movie_list:
        print(new_df.iloc[i[0]].title)

In [249]:
# convert dataset into dict and after create pickle file dataset  
pickle.dump(new_df.to_dict(),open('movie_list.pkl','wb'))

In [250]:
# this file have vector distance of each movie 
pickle.dump(similarty,open('similarity.pkl','wb'))