In [174]:
# importing libraries

import os
import ast

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from nltk.stem.porter import PorterStemmer

import pickle

In [175]:
# import data
movies = pd.read_csv(r"data\tmdb_5000_movies.csv")
credits = pd.read_csv(r"data\tmdb_5000_credits.csv")

In [176]:
# explore data

movies.head(5)
credits.head(5)

# combine datasetes using title
movies = movies.merge(credits , on='title')

In [177]:
# id
# title
# genres
# keywords
# overview
# cast 
# crew

movies = movies[['id','title','genres','keywords','overview','cast','crew']]

In [178]:
# data cleaning
movies.isna().sum()
movies.dropna(inplace = True)

In [179]:
# check duplicated values and if any delete them
movies.duplicated().sum()

0

In [180]:
# preprocess data
def preprocess(obj):
    obj = ast.literal_eval(obj)
    proper_list = []
    for i in obj:
        proper_list.append(i['name'])
    return proper_list

In [181]:
movies['genres'] = movies['genres'].apply(preprocess)

In [182]:
movies['keywords']=movies['keywords'].apply(preprocess)

In [183]:
# preprocess cast 
# preprocessing include considering three cast member names
def preprocessCast(obj):
    obj = ast.literal_eval(obj)
    proper_list = []
    count = 0 
    for i in obj:
        if count < 3:
            proper_list.append(i['name'])
            count+= 1
        else:
            break
    return proper_list

In [184]:
movies['cast'] = movies['cast'].apply(preprocessCast)

In [185]:
# preprocess crew
# preprocessing include considering extraction of director name for the movie
def preprocessCrew(obj):
    obj = ast.literal_eval(obj)
    proper_list = []
    for i in obj:
        if(i['job'] == 'Director'):
            proper_list.append(i['name'])
            break
    return proper_list

In [186]:
movies['crew'] = movies['crew'].apply(preprocessCrew)

In [187]:
movies['overview'] = movies['overview'].apply(lambda x : x.split())

In [188]:
# spilit each word 
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])
movies.head()

Unnamed: 0,id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]
2,206647,Spectre,"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[A, cryptic, message, from, Bond’s, past, send...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes]
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[Following, the, death, of, District, Attorney...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan]
4,49529,John Carter,"[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[John, Carter, is, a, war-weary,, former, mili...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton]


In [189]:
movies['crew']

0           [JamesCameron]
1          [GoreVerbinski]
2              [SamMendes]
3       [ChristopherNolan]
4          [AndrewStanton]
               ...        
4804     [RobertRodriguez]
4805         [EdwardBurns]
4806          [ScottSmith]
4807          [DanielHsia]
4808     [BrianHerzlinger]
Name: crew, Length: 4806, dtype: object

In [190]:
# concatenate overview,genres,keywords,cast and crew 
# so that tags has all the key information about the movie
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [191]:
# initialize a new dataframe
actual_data = movies[['id','title','tags']]

In [192]:

actual_data['tags'] = actual_data['tags'].apply(lambda x: " ".join(x)) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  actual_data['tags'] = actual_data['tags'].apply(lambda x: " ".join(x))


In [193]:
actual_data['tags'] = actual_data['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  actual_data['tags'] = actual_data['tags'].apply(lambda x:x.lower())


In [194]:
# PorterStemmer method helps to find dominant word
# ex : [recommended , recommending ,recommend]
# on apply porterstemmer on above example gives [recommend,recommend,recommend]
# which makes sense because the intention of above list of words sums up to "recommend"
ps = PorterStemmer()

In [195]:
def stem(text):
    l = []

    for i in text.split():
        l.append(ps.stem(i))

    return " ".join(l)

actual_data['tags'] = actual_data['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  actual_data['tags'] = actual_data['tags'].apply(stem)


In [196]:
# Finding similarity between movies can be done by finding cosine similarity
# inorder to find cosine similarity the features should be represented 
# in the form of vectors. CountVectorizer method converts in the form of vectors
# This process is called Vectorization
cv = CountVectorizer(max_features= 5000,stop_words='english')

In [197]:
# transformation of tags into vectors
tags_vectors = cv.fit_transform(actual_data['tags']).toarray()

In [198]:
# calculate cosine similarity between vectors
cos_sim = cosine_similarity(tags_vectors)

In [199]:
# Pickle movies details and movie similarity 
pickle.dump(actual_data.to_dict(),open( "movies.pkl", "wb" ))
pickle.dump(cos_sim,open( "similarity.pkl", "wb" ))

AttributeError: 'numpy.ndarray' object has no attribute 'to_dict'