In [43]:
import pandas as pd
import numpy as np

# Import data from clean file
df = pd.read_csv('../data/metadata_clean1.csv')

# Plot Description Based Recommender

In [44]:
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"['animation', 'comedy', 'family']",81.0,7.7,5415.0,1995
1,Jumanji,"['adventure', 'fantasy', 'family']",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"['romance', 'comedy']",101.0,6.5,92.0,1995
3,Waiting to Exhale,"['comedy', 'drama', 'romance']",127.0,6.1,34.0,1995
4,Father of the Bride Part II,['comedy'],106.0,5.7,173.0,1995


In [45]:
# Import the original file 
orig_df = pd.read_csv('../data/movies_metadata.csv', low_memory=False)


In [46]:
# Add the useful features (overview and id) to the cleaned dataframe.
# NOTE(review): assigning a Series to a column aligns rows by index label, so this
# assumes row i of metadata_clean1.csv is the same movie as row i of
# movies_metadata.csv — TODO confirm that cleaning did not drop or reorder rows.
df['overview'], df['id'] = orig_df['overview'], orig_df['id']

In [47]:
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year,overview,id
0,Toy Story,"['animation', 'comedy', 'family']",81.0,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ...",862
1,Jumanji,"['adventure', 'fantasy', 'family']",104.0,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...,8844
2,Grumpier Old Men,"['romance', 'comedy']",101.0,6.5,92.0,1995,A family wedding reignites the ancient feud be...,15602
3,Waiting to Exhale,"['comedy', 'drama', 'romance']",127.0,6.1,34.0,1995,"Cheated on, mistreated and stepped on, the wom...",31357
4,Father of the Bride Part II,['comedy'],106.0,5.7,173.0,1995,Just when George Banks has recovered from his ...,11862


In [48]:
# Work on the first 101 rows only. Take an explicit copy so that the column
# assignments made in later cells modify this frame directly instead of a
# slice of the original (which raises SettingWithCopyWarning).
df = df[:101].copy()

In [49]:
df.shape

(101, 8)

In [23]:
# Import TfidfVectorizer from the scikit-learn library
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
# Define a TF-IDF Vectorizer object. Remove all English stop words
tfidf = TfidfVectorizer(stop_words='english')

In [25]:
# Replace NaN with an empty string
df['overview'] = df['overview'].fillna('')

In [26]:
# Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature
tfidf_matrix = tfidf.fit_transform(df['overview'])


In [10]:
print(tfidf_matrix)

  (0, 38088)	0.10739705953465474
  (0, 73468)	0.4809827114790238
  (0, 3159)	0.4117836571172595
  (0, 67874)	0.14878284660693247
  (0, 39012)	0.0871868917895906
  (0, 28729)	0.13311522181618415
  (0, 56872)	0.11124851086523602
  (0, 7491)	0.12380553184830105
  (0, 9087)	0.10635375129287979
  (0, 9874)	0.502803868613561
  (0, 38693)	0.2062792468281062
  (0, 58571)	0.11355918868736861
  (0, 1847)	0.140911774178889
  (0, 39423)	0.11907123344715954
  (0, 50914)	0.09190797940163037
  (0, 29238)	0.10093917370354447
  (0, 51108)	0.1343481728311918
  (0, 12490)	0.12544427954397822
  (0, 59519)	0.1300801610445509
  (0, 48558)	0.10339358185033236
  (0, 19641)	0.13281884272823927
  (0, 21887)	0.10438761058719499
  (0, 38030)	0.10142919482788752
  (0, 4388)	0.14748820342184052
  (0, 17764)	0.1348314953863925
  :	:
  (45464, 63811)	0.12338183251109078
  (45464, 19945)	0.13612977846000052
  (45464, 15394)	0.14870056078358065
  (45464, 20273)	0.16775054113971025
  (45465, 74030)	0.10752037301100263
 

In [27]:
# Output the shape of tfidf_matrix
tfidf_matrix.shape

(100, 1895)

In [28]:
# Import linear_kernel to compute the dot product
from sklearn.metrics.pairwise import linear_kernel

In [29]:
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [30]:
# Construct a reverse mapping from movie title to row index.
indices = pd.Series(df.index, index=df['title'])
# Drop duplicate titles, keeping the first occurrence. Series.drop_duplicates()
# would compare the values (the row indices, which are already unique) and so
# would leave repeated titles in place.
indices = indices[~indices.index.duplicated(keep='first')]

In [36]:
# Function that takes a movie title as input and returns recommendations
def content_recommender(title, cosine_sim=cosine_sim, df=df, indices=indices, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title of the movie to find recommendations for.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` holds the scores of movie ``i``
        against every movie.
    df : pandas.DataFrame
        Frame with a 'title' column, in the same row order as ``cosine_sim``.
    indices : pandas.Series
        Mapping from title (index) to row index (value).
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the ``top_n`` most similar movies, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Obtain the index of the movie that matches the title
    idx = indices[title]

    # A title that occurs more than once yields a Series of indices;
    # use the first match so the lookup below selects a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every other movie's position with its similarity score. The query
    # movie is excluded explicitly rather than by assuming it sorts first,
    # which does not hold for ties or for an all-zero (empty) feature vector.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Sort the movies based on the similarity scores, highest first
    sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)

    # Keep the positions of the top_n most similar movies
    movie_indices = [position for position, _ in sim_scores[:top_n]]

    # Return the titles of the most similar movies
    return df['title'].iloc[movie_indices]


In [37]:
# Get recommendations for Persuasion
content_recommender('Persuasion')

83    Last Summer in the Hamptons
56          Home for the Holidays
36         Across the Sea of Time
54                        Georgia
42                    Restoration
29                 Shanghai Triad
79              The White Balloon
73                   Bed of Roses
38                       Clueless
47                     Pocahontas
Name: title, dtype: object

In [33]:
df.head(50)

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year,overview,id
0,Toy Story,"['animation', 'comedy', 'family']",81.0,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ...",862
1,Jumanji,"['adventure', 'fantasy', 'family']",104.0,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...,8844
2,Grumpier Old Men,"['romance', 'comedy']",101.0,6.5,92.0,1995,A family wedding reignites the ancient feud be...,15602
3,Waiting to Exhale,"['comedy', 'drama', 'romance']",127.0,6.1,34.0,1995,"Cheated on, mistreated and stepped on, the wom...",31357
4,Father of the Bride Part II,['comedy'],106.0,5.7,173.0,1995,Just when George Banks has recovered from his ...,11862
5,Heat,"['action', 'crime', 'drama', 'thriller']",170.0,7.7,1886.0,1995,"Obsessive master thief, Neil McCauley leads a ...",949
6,Sabrina,"['comedy', 'romance']",127.0,6.2,141.0,1995,An ugly duckling having undergone a remarkable...,11860
7,Tom and Huck,"['action', 'adventure', 'drama', 'family']",97.0,5.4,45.0,1995,"A mischievous young boy, Tom Sawyer, witnesses...",45325
8,Sudden Death,"['action', 'adventure', 'thriller']",106.0,5.5,174.0,1995,International action superstar Jean Claude Van...,9091
9,GoldenEye,"['adventure', 'action', 'thriller']",130.0,6.6,1194.0,1995,James Bond must unmask the mysterious head of ...,710


# Meta Data Based Recommender

In [50]:
# Load the keywords and credits files
cred_df = pd.read_csv('../data/credits.csv')
key_df = pd.read_csv('../data/keywords.csv')

In [51]:
# Print the head of the credit dataframe
cred_df.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [52]:
# Print the head of the keywords dataframe
key_df.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [54]:
# Function to convert all non-integer ids to NaN 
def clean_ids(x):
    """Convert ``x`` to an int, or return NaN when it cannot be converted.

    Parameters
    ----------
    x : object
        Raw id value (e.g. a numeric string, a number, or malformed data).

    Returns
    -------
    int or float
        ``int(x)`` on success, otherwise ``np.nan``.
    """
    try:
        return int(x)
    # Catch only conversion failures: ValueError (non-numeric string, NaN),
    # TypeError (None and other unsupported types), OverflowError (infinity).
    # A bare ``except`` would also swallow KeyboardInterrupt/SystemExit.
    except (ValueError, TypeError, OverflowError):
        return np.nan

In [56]:
# Clean the ids of df
df['id'] = df['id'].apply(clean_ids)

In [57]:
# Keep only the rows that have a non-null id. Take an explicit copy so the
# column assignment in the next cell modifies this frame directly instead of
# a filtered view of the original (which raises SettingWithCopyWarning).
df = df[df['id'].notnull()].copy()

In [59]:
# Convert ids into integer 
df['id'] = df['id'].astype('int')
key_df['id'] = key_df['id'].astype('int')
cred_df['id'] = cred_df['id'].astype('int')

In [60]:
key_df = key_df[:101]
cred_df = cred_df[:101]

In [61]:
# Merge credits and keywords into the main metadata dataframe.
# merge() defaults to an inner join, so only movies whose id appears in all
# three frames are kept.
# NOTE(review): a repeated id in cred_df or key_df would duplicate movie rows
# here — TODO confirm the ids are unique in the truncated frames.
df = df.merge(cred_df, on='id')
df = df.merge(key_df, on='id')

In [62]:
# Display the merge head of the df
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year,overview,id,cast,crew,keywords
0,Toy Story,"['animation', 'comedy', 'family']",81.0,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ...",862,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,Jumanji,"['adventure', 'fantasy', 'family']",104.0,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...,8844,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,Grumpier Old Men,"['romance', 'comedy']",101.0,6.5,92.0,1995,A family wedding reignites the ancient feud be...,15602,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,Waiting to Exhale,"['comedy', 'drama', 'romance']",127.0,6.1,34.0,1995,"Cheated on, mistreated and stepped on, the wom...",31357,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,Father of the Bride Part II,['comedy'],106.0,5.7,173.0,1995,Just when George Banks has recovered from his ...,11862,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [63]:
df.shape

(101, 11)

In [64]:
# Import literal_eval to convert the stringified objects into native Python objects
from ast import literal_eval

In [67]:
features = ['cast', 'crew', 'keywords', 'genres']

In [69]:
for feature in features:
    df[feature] = df[feature].apply(literal_eval)

In [71]:
df.columns

Index(['title', 'genres', 'runtime', 'vote_average', 'vote_count', 'year',
       'overview', 'id', 'cast', 'crew', 'keywords'],
      dtype='object')

In [73]:
# Print the first crew member of the first movie in df
df.iloc[0]['crew'][0]

{'credit_id': '52fe4284c3a36847f8024f49',
 'department': 'Directing',
 'gender': 2,
 'id': 7879,
 'job': 'Director',
 'name': 'John Lasseter',
 'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'}

In [74]:
# Extract the director's name. If no director is listed, return NaN
def get_director(x):
    """Return the name of the first crew member whose job is 'Director'.

    Parameters
    ----------
    x : iterable of dict
        Crew entries; each entry is looked up by 'job' and, when it matches,
        by 'name'.

    Returns
    -------
    The matching entry's 'name', or ``np.nan`` when no entry has the job
    'Director'.
    """
    # Lazily scan the crew in order; next() stops at the first director found
    # and falls back to NaN when the scan is exhausted.
    director_names = (member['name'] for member in x if member['job'] == 'Director')
    return next(director_names, np.nan)
        
        
        

In [75]:
# Define the new director feature
df['director'] = df['crew'].apply(get_director)

In [76]:
# Print the directors of the first five movies
df['director'].head()

0      John Lasseter
1       Joe Johnston
2      Howard Deutch
3    Forest Whitaker
4      Charles Shyer
Name: director, dtype: object

In [77]:
# Returns the top 3 elements of the list or the entire list, whichever is shorter
def generate_list(x):
    """Return the 'name' of up to the first three entries of ``x``.

    Parameters
    ----------
    x : list of dict, or any other object
        Entries that are each looked up by 'name'.

    Returns
    -------
    list
        At most three names, in their original order; an empty list when
        ``x`` is not a list (missing/malformed data).
    """
    # Anything that is not a list is treated as missing/malformed data
    if not isinstance(x, list):
        return []

    # Collect every name first, then keep at most the first three
    return [entry['name'] for entry in x][:3]
    
        
        
        


In [78]:
# Apply the generate_list function to cast and keywords
df['cast'] = df['cast'].apply(generate_list)
df['keywords'] = df['keywords'].apply(generate_list)


In [79]:
# Only consider a maximum of 3 genres
df['genres'] = df['genres'].apply(lambda x: x[:3])

In [81]:
# Print the new features of the first five movies along with title
df[['title', 'cast', 'director', 'keywords', 'genres']].head()

Unnamed: 0,title,cast,director,keywords,genres
0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[animation, comedy, family]"
1,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,"[board game, disappearance, based on children'...","[adventure, fantasy, family]"
2,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,"[fishing, best friend, duringcreditsstinger]","[romance, comedy]"
3,Waiting to Exhale,"[Whitney Houston, Angela Bassett, Loretta Devine]",Forest Whitaker,"[based on novel, interracial relationship, sin...","[comedy, drama, romance]"
4,Father of the Bride Part II,"[Steve Martin, Diane Keaton, Martin Short]",Charles Shyer,"[baby, midlife crisis, confidence]",[comedy]


In [82]:
# Function to sanitize data to prevent ambiguity
# Removes spaces and converts to lower case

def sanitize(x):
    """Remove spaces and lowercase a string or every string in a list.

    Parameters
    ----------
    x : list of str, str, or any other object

    Returns
    -------
    list of str for a list input, str for a string input, and an empty
    string for anything else (e.g. a missing director).
    """
    # List input: normalise each element
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]

    # Single string input (the director name)
    if isinstance(x, str):
        return x.replace(" ", "").lower()

    # Neither a list nor a string: treat as missing
    return ''
            
        
        
        
        
    


In [83]:
# Apply the sanitize function to cast, director, genres and keywords
for feature in ['cast', 'director', 'genres', 'keywords']:
    df[feature] = df[feature].apply(sanitize)

In [84]:
# Function that creates a soup out of the desired metadata
def create_soup(x):
    """Build one space-separated string from a row's metadata.

    Parameters
    ----------
    x : mapping
        Looked up by 'keywords', 'cast' and 'genres' (iterables of str) and
        by 'director' (str).

    Returns
    -------
    str
        Keywords, cast, director and genres, in that order, separated by
        single spaces.
    """
    sections = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(sections)

In [85]:
# Create a new soup feature
df['soup'] = df.apply(create_soup, axis=1)

In [86]:
# Display the soup of the first movie
df.iloc[0]['soup']

'jealousy toy boy tomhanks timallen donrickles johnlasseter animation comedy family'

In [88]:
# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Define a CountVectorizer object and create vectors for the soup
count = CountVectorizer(stop_words = 'english')
count_matrix = count.fit_transform(df['soup'])

In [89]:
# Import Cosine Similarity Function
from sklearn.metrics.pairwise import cosine_similarity

In [90]:
# Compute the cosine similarity matrix (raw count vectors are not L2-normalized, so cosine_similarity is used here instead of linear_kernel)
cosin_sim2 = cosine_similarity(count_matrix, count_matrix)

In [91]:
# Reset the index of df so that its labels match the row positions of the
# similarity matrix, then construct the reverse mapping (title -> row index) again
df = df.reset_index(drop=True)
indices2 = pd.Series(df.index, index=df['title'])
# Keep only the first occurrence of each title so that a title lookup
# returns a single row index
indices2 = indices2[~indices2.index.duplicated(keep='first')]

In [92]:
content_recommender('Persuasion', cosin_sim2, df, indices2)

73                     Bed of Roses
92                  Beautiful Girls
16            Sense and Sensibility
24                Leaving Las Vegas
42                      Restoration
45    How To Make An American Quilt
48            When Night Is Falling
71            Kicking and Screaming
3                 Waiting to Exhale
34                       Carrington
Name: title, dtype: object

In [93]:
count_matrix.shape

(101, 638)

In [95]:
print(count_matrix[0:1])

  (0, 31)	1
  (0, 63)	1
  (0, 105)	1
  (0, 144)	1
  (0, 169)	1
  (0, 264)	1
  (0, 286)	1
  (0, 588)	1
  (0, 594)	1
  (0, 596)	1


In [100]:
cosin_sim2[27:28, 92:93]

array([[0.23570226]])