In [1]:
import numpy as np
import pandas as pd
import json
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the merged movie metadata from a CSV in the current working directory (relative path).
# NOTE(review): the preview below shows "Unnamed: 0" / "Unnamed: 0.1" columns, which look
# like index columns saved by earlier to_csv calls — confirm before relying on them.
data1 = pd.read_csv("merged_data.csv")

In [3]:
# Preview the first rows to see which columns were loaded and how they are encoded.
data1.head()

Unnamed: 0.1,Unnamed: 0,title,id,language,imdb_id,adult,genres,keywords,cast,crew
0,0,Toy Story,862,en,tt0114709,False,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,1,Jumanji,8844,en,tt0113497,False,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,2,Grumpier Old Men,15602,en,tt0113228,False,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,3,Waiting to Exhale,31357,en,tt0114885,False,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","[{'id': 818, 'name': 'based on novel'}, {'id':...","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,4,Father of the Bride Part II,11862,en,tt0113041,False,"[{'id': 35, 'name': 'Comedy'}]","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


In [4]:
# Inspect one raw genres cell: it is a single string holding the text of a list of
# {'id': ..., 'name': ...} dicts, not an actual Python list.
data1.genres[0]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

### Extract the important keywords from the stringified JSON text

In [5]:
# Prototype of the clean-up on a single row: keep letters only, lower-case, drop the
# "id" / "name" key labels, then tokenize into words.
# NOTE: str.replace removes those substrings wherever they occur, including inside
# longer words.
a = (
    re.sub("[^a-zA-Z]", " ", data1.genres[0])
    .lower()
    .replace("id", "")
    .replace("name", "")
)
a = nltk.word_tokenize(a)
a

['animation', 'comedy', 'family']

In [6]:
## for keywords and genres columns
def keyword_extraction(text) :
    """Flatten a stringified list of ``{'id': ..., 'name': ...}`` records into words.

    The ``name`` value of every record is collected, every non-letter character
    is replaced by a space, the result is lower-cased and the words are joined
    with single spaces.

    Only the ``name`` values are used, so words that merely contain the letters
    "id" or "name" (e.g. "spider", "kidnapping") are kept intact instead of
    being mangled by a blanket substring replacement.

    Parameters
    ----------
    text : str
        Text of a Python-literal list of dicts, e.g.
        ``"[{'id': 16, 'name': 'Animation'}]"``.  Any non-string value
        (such as a NaN cell) is treated as "no keywords".

    Returns
    -------
    str
        Space-separated lower-case words; ``""`` when nothing can be extracted.
    """
    import ast  # local import so this cell does not depend on editing the import cell

    if not isinstance(text, str):
        # Missing cells are not strings and carry no keywords.
        return ""

    try:
        records = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        records = None

    if isinstance(records, (list, tuple)) and all(isinstance(r, dict) for r in records):
        # Well-formed input: use only the 'name' values, ignoring ids and key labels.
        raw = " ".join(str(r.get("name") or "") for r in records)
        cleaned = re.sub("[^a-zA-Z]", " ", raw).lower()
    else:
        # Fallback for text that is not a parseable list of dicts: strip non-letters
        # and drop "id" / "name" only when they stand alone as whole words.
        cleaned = re.sub("[^a-zA-Z]", " ", text).lower()
        cleaned = re.sub(r"\b(?:id|name)\b", " ", cleaned)

    # Only letters and spaces remain, so whitespace splitting is enough to tokenize
    # and collapse repeated spaces.
    return " ".join(cleaned.split())

In [7]:
# Sanity check: run the helper on the first row's genres string.
keyword_extraction(data1["genres"][0])

'animation comedy family'

In [8]:
# Sanity check: the same helper applied to the first row's keywords string.
keyword_extraction(data1["keywords"][0])

'jealousy toy boy friendship friends rivalry boy next door new toy toy comes to life'

In [9]:
# Derive plain-text feature columns by running keyword_extraction over every row
# of the stringified genres / keywords columns.
data1["genres_data"] = data1["genres"].map(keyword_extraction)
data1["keywords_data"] = data1["keywords"].map(keyword_extraction)

### Remove genres and keywords columns

In [10]:
# The raw stringified columns are no longer needed once the *_data columns exist.
data1 = data1.drop(columns=["genres", "keywords"])

In [11]:
# Confirm the raw columns are gone and the new genres_data / keywords_data columns are present.
data1.head(3)

Unnamed: 0.1,Unnamed: 0,title,id,language,imdb_id,adult,cast,crew,genres_data,keywords_data
0,0,Toy Story,862,en,tt0114709,False,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",animation comedy family,jealousy toy boy friendship friends rivalry bo...
1,1,Jumanji,8844,en,tt0113497,False,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",adventure fantasy family,board game disappearance based on children s b...
2,2,Grumpier Old Men,15602,en,tt0113228,False,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",romance comedy,fishing best friend duringcreditsstinger old men


### Extract the top 3 cast names

In [12]:
def top_3_crew(text) :
    """Return up to the first three person names from a stringified credits list.

    Each name is lower-cased and has its spaces removed so that it forms a
    single token (``'Tom Hanks'`` -> ``'tomhanks'``); the tokens are joined with
    single spaces.  Repeated names are kept, in their original order.

    Rows with fewer than three entries yield the names that are available
    instead of raising ``IndexError``, and names stored with double quotes
    (because they contain an apostrophe) are handled as well.

    Parameters
    ----------
    text : str
        Text of a Python-literal list of dicts that each carry a ``'name'``
        key.  Any non-string value (such as a NaN cell) yields ``""``.

    Returns
    -------
    str
        Space-separated name tokens; ``""`` when no name can be found.
    """
    import ast  # local import so this cell does not depend on editing the import cell

    if not isinstance(text, str):
        # Missing cells are not strings and contain no credits.
        return ""

    try:
        records = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        records = None

    if isinstance(records, (list, tuple)):
        names = [
            str(entry["name"])
            for entry in records
            if isinstance(entry, dict) and entry.get("name")
        ]
    else:
        # Fallback for text that cannot be parsed as a literal (e.g. truncated):
        # pull out the quoted value that follows each 'name' key.
        found = re.findall(r"'name':\s*(?:'([^']*)'|\"([^\"]*)\")", text)
        names = [single or double for single, double in found]

    return " ".join(name.replace(" ", "").lower() for name in names[:3])

In [13]:
# Sanity check on the first row's cast string.
top_3_crew(data1["cast"][0])

'tomhanks timallen donrickles'

In [14]:
# The same helper also works on the crew column; the output for this row repeats a name.
# NOTE(review): presumably one person is credited in several crew roles — confirm
# whether duplicates should be kept.
top_3_crew(data1["crew"][456])

'rosetroche rosetroche guinevereturner'

In [15]:
# Create the two output columns up front, filled with empty strings, then show the
# first row to confirm they were added.
data1["cast_data"] = data1["crew_data"] = ""
data1.head(1)

Unnamed: 0.1,Unnamed: 0,title,id,language,imdb_id,adult,cast,crew,genres_data,keywords_data,cast_data,crew_data
0,0,Toy Story,862,en,tt0114709,False,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",animation comedy family,jealousy toy boy friendship friends rivalry bo...,,


In [16]:
def _top_3_or_blank(text):
    """Return ``top_3_crew(text)``, or ``""`` when it raises ``IndexError``."""
    try:
        return top_3_crew(text)
    except IndexError:
        return ""


# Fill the whole column with one assignment instead of writing cell by cell through
# `data1["cast_data"].iloc[index] = ...`.  That chained form may write to a temporary
# object rather than to data1, uses the index label as a position, and is very slow
# on a large frame.
data1["cast_data"] = data1["cast"].apply(_top_3_or_blank)

KeyboardInterrupt: 