In [1]:
# importing libraries

import os
import ast

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from nltk.stem.porter import PorterStemmer

import joblib as jlib

In [2]:
# import data
movies = pd.read_csv(r"data\tmdb_5000_movies.csv")
credits = pd.read_csv(r"data\tmdb_5000_credits.csv")

In [3]:
# explore data

movies.head(5)
credits.head(5)

# combine datasetes using title
movies = movies.merge(credits , on='title')

In [4]:
# id
# title
# genres
# keywords
# overview
# cast 
# crew

movies = movies[['id','title','genres','keywords','overview','cast','crew']]

In [5]:
# data cleaning
movies.isna().sum()
movies.dropna(inplace = True)

In [6]:
# check duplicated values and if any delete them
movies.duplicated().sum()

0

In [7]:
# preprocess data
def preprocess(obj):
    obj = ast.literal_eval(obj)
    proper_list = []
    for i in obj:
        proper_list.append(i['name'])
    return proper_list

In [8]:
movies['genres'] = movies['genres'].apply(preprocess)

In [9]:
movies['keywords']=movies['keywords'].apply(preprocess)

In [10]:
# preprocess cast 
# preprocessing include considering three cast member names
def preprocessCast(obj):
    obj = ast.literal_eval(obj)
    proper_list = []
    count = 0 
    for i in obj:
        if count < 3:
            proper_list.append(i['name'])
            count+= 1
        else:
            break
    return proper_list

In [11]:
movies['cast'] = movies['cast'].apply(preprocessCast)

In [12]:
# preprocess crew
# preprocessing include considering extraction of director name for the movie
def preprocessCrew(obj):
    obj = ast.literal_eval(obj)
    proper_list = []
    for i in obj:
        if(i['job'] == 'Director'):
            proper_list.append(i['name'])
            break
    return proper_list

In [13]:
movies['crew'] = movies['crew'].apply(preprocessCrew)

In [14]:
movies['overview'] = movies['overview'].apply(lambda x : x.split())

In [15]:
# spilit each word 
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])
movies.head()

Unnamed: 0,id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]
2,206647,Spectre,"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[A, cryptic, message, from, Bond’s, past, send...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes]
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[Following, the, death, of, District, Attorney...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan]
4,49529,John Carter,"[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[John, Carter, is, a, war-weary,, former, mili...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton]


In [16]:
movies['crew']

0           [JamesCameron]
1          [GoreVerbinski]
2              [SamMendes]
3       [ChristopherNolan]
4          [AndrewStanton]
               ...        
4804     [RobertRodriguez]
4805         [EdwardBurns]
4806          [ScottSmith]
4807          [DanielHsia]
4808     [BrianHerzlinger]
Name: crew, Length: 4806, dtype: object

In [17]:
# concatenate overview,genres,keywords,cast and crew 
# so that tags has all the key information about the movie
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [18]:
# initialize a new dataframe
actual_data = movies[['id','title','tags']]

In [19]:

actual_data['tags'] = actual_data['tags'].apply(lambda x: " ".join(x)) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  actual_data['tags'] = actual_data['tags'].apply(lambda x: " ".join(x))


In [20]:
actual_data['tags'] = actual_data['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  actual_data['tags'] = actual_data['tags'].apply(lambda x:x.lower())


In [21]:
# PorterStemmer method helps to find dominant word
# ex : [recommended , recommending ,recommend]
# on apply porterstemmer on above example gives [recommend,recommend,recommend]
# which makes sense because the intention of above list of words sums up to "recommend"
ps = PorterStemmer()

In [22]:
def stem(text):
    l = []

    for i in text.split():
        l.append(ps.stem(i))

    return " ".join(l)

actual_data['tags'] = actual_data['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  actual_data['tags'] = actual_data['tags'].apply(stem)


In [23]:
# Finding similarity between movies can be done by finding cosine similarity
# inorder to find cosine similarity the features should be represented 
# in the form of vectors. CountVectorizer method converts in the form of vectors
# This process is called Vectorization
cv = CountVectorizer(max_features= 5000,stop_words='english')

In [24]:
# transformation of tags into vectors
tags_vectors = cv.fit_transform(actual_data['tags']).toarray()

In [25]:
# calculate cosine similarity between vectors
cos_sim = cosine_similarity(tags_vectors)

In [26]:
cos_sim = pd.DataFrame(cos_sim)
cos_sim

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4796,4797,4798,4799,4800,4801,4802,4803,4804,4805
0,1.000000,0.083462,0.086031,0.073472,0.189299,0.108389,0.040242,0.146735,0.059235,0.096730,...,0.000000,0.000000,0.042239,0.052632,0.000000,0.019252,0.046829,0.044992,0.000000,0.000000
1,0.083462,1.000000,0.060634,0.038837,0.075047,0.114587,0.021272,0.129272,0.062622,0.102262,...,0.000000,0.000000,0.022327,0.027821,0.000000,0.040706,0.000000,0.023783,0.000000,0.026153
2,0.086031,0.060634,1.000000,0.060048,0.077357,0.070868,0.021926,0.133250,0.064550,0.105409,...,0.085749,0.000000,0.000000,0.000000,0.017590,0.041959,0.000000,0.024515,0.000000,0.000000
3,0.073472,0.038837,0.060048,1.000000,0.033032,0.060523,0.056177,0.068279,0.041345,0.202548,...,0.027462,0.027462,0.058964,0.055104,0.022533,0.067188,0.000000,0.031404,0.048526,0.086335
4,0.189299,0.075047,0.077357,0.033032,1.000000,0.097460,0.054277,0.197910,0.079894,0.108721,...,0.035377,0.000000,0.075960,0.023662,0.145141,0.155799,0.000000,0.020228,0.083351,0.044488
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4801,0.019252,0.040706,0.041959,0.067188,0.155799,0.079295,0.029440,0.143131,0.130005,0.035383,...,0.000000,0.057567,0.139055,0.057756,0.259796,1.000000,0.000000,0.000000,0.152586,0.126688
4802,0.046829,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.070014,0.000000,0.000000,0.028724,0.000000,1.000000,0.120096,0.000000,0.000000
4803,0.044992,0.023783,0.024515,0.031404,0.020228,0.018531,0.000000,0.000000,0.050637,0.020672,...,0.067267,0.033634,0.018054,0.044992,0.013799,0.000000,0.120096,1.000000,0.039621,0.042295
4804,0.000000,0.000000,0.000000,0.048526,0.083351,0.057270,0.035438,0.043073,0.104328,0.021296,...,0.000000,0.034648,0.092992,0.000000,0.142148,0.152586,0.000000,0.039621,1.000000,0.087142


In [27]:
# Pickle movies details and movie similarity 
jlib.dump(actual_data,"movies.csv")
jlib.dump(cos_sim,"similarity.csv")

['similarity.csv']

In [7]:
import joblib as jlib



ModuleNotFoundError: No module named 'joblib'