<a href="https://colab.research.google.com/github/thesyisyi/google_colab/blob/main/movie_recommender_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import json
from pprint import PrettyPrinter
import ast
import nltk
import collections
import operator

In [None]:
# import data file here - run this line then you will be able to upload the file from whereever it is saved
from google.colab import files
uploaded = files.upload()

Saving tmdb_5000_credits.csv to tmdb_5000_credits.csv
Saving tmdb_5000_movies.csv to tmdb_5000_movies.csv


# Data Cleaning and Prep

In [None]:
# Reading in the Movies dataset as df1
df1 = pd.read_csv('tmdb_5000_movies.csv')

In [None]:
# Observing the dataset so we know what we are working with.
df1.head(5)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


Here we can see that the Overview column has 31 missing values. Since we are not using tagline and homepage we will ignore those values. We will create a dictionary of all movies with missing overviews and replace the null values with that data. Overview data will all come from IMDB.

In [None]:
df1.isna().sum()

budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                  31
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
dtype: int64

In [None]:
# Only collecting null values so we can see which titles (and date in case some films have the same title) are missing an overview.

nan_values = df1[df1['overview'].isna()]

# Selecting the release date column so I can make sure I am referencing the correct title as lots of movies have the same title.

nan_values[['original_title','title','release_date', 'overview' ]]

Unnamed: 0,original_title,title,release_date,overview
65,The Dark Knight,The Dark Knight,2008-07-16,
77,Inside Out,Inside Out,2015-06-09,
94,Guardians of the Galaxy,Guardians of the Galaxy,2014-07-30,
95,Interstellar,Interstellar,2014-11-05,
96,Inception,Inception,2010-07-14,
262,The Lord of the Rings: The Fellowship of the Ring,The Lord of the Rings: The Fellowship of the Ring,2001-12-18,
287,Django Unchained,Django Unchained,2012-12-25,
298,The Wolf of Wall Street,The Wolf of Wall Street,2013-12-25,
329,The Lord of the Rings: The Return of the King,The Lord of the Rings: The Return of the King,2003-12-01,
330,The Lord of the Rings: The Two Towers,The Lord of the Rings: The Two Towers,2002-12-18,


In [None]:
# Creating a dictionary of all the missing values so we can replace them in the dataset

missing_val_dict = {
    'The Dark Knight':'When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice.',
    'Inside Out' : 'After young Riley is uprooted from her Midwest life and moved to San Francisco, her emotions - Joy, Fear, Anger, Disgust and Sadness - conflict on how best to navigate a new city, house, and school.',
    'Guardians of the Galaxy': 'A group of intergalactic criminals must pull together to stop a fanatical warrior with plans to purge the universe.',
    'Interstellar' : 'A team of explorers travel through a wormhole in space in an attempt to ensure humanitys survival.',
    'Inception' : 'A thief who steals corporate secrets through the use of dream-sharing technology is given the inverse task of planting an idea into the mind of a C.E.O., but his tragic past may doom the project and his team to disaster.',
    'The Lord of the Rings: The Fellowship of the Ring' : 'A meek Hobbit from the Shire and eight companions set out on a journey to destroy the powerful One Ring and save Middle-earth from the Dark Lord Sauron.',
    'Django Unchained' : 'With the help of a German bounty-hunter, a freed slave sets out to rescue his wife from a brutal plantation owner in Mississippi.',
    'The Wolf of Wall Street' : 'Based on the true story of Jordan Belfort, from his rise to a wealthy stock-broker living the high life to his fall involving crime, corruption and the federal government.',
    'The Lord of the Rings: The Return of the King' : 'Gandalf and Aragorn lead the World of Men against Sauron\'s army to draw his gaze from Frodo and Sam as they approach Mount Doom with the One Ring.',
    'The Lord of the Rings: The Two Towers' : 'While Frodo and Sam edge closer to Mordor with the help of the shifty Gollum, the divided fellowship makes a stand against Sauron\'s new ally, Saruman, and his hordes of Isengard.',
    'The Lion King' : 'Lion prince Simba and his father are targeted by his bitter uncle, who wants to ascend the throne himself.',
    'The Matrix' : 'When a beautiful stranger leads computer hacker Neo to a forbidding underworld, he discovers the shocking truth--the life he knows is the elaborate deception of an evil cyber-intelligence.',
    'Fight Club' : 'An insomniac office worker and a devil-may-care soap maker form an underground fight club that evolves into much more.',
    'The Green Mile' : 'The lives of guards on Death Row are affected by one of their charges: a black man accused of child murder and rape, yet who has a mysterious gift.',
    'Forrest Gump' : 'The presidencies of Kennedy and Johnson, the Vietnam War, the Watergate scandal and other historical events unfold from the perspective of an Alabama man with an IQ of 75, whose only desire is to be reunited with his childhood sweetheart.',
    'Se7en' : 'Two detectives, a rookie and a veteran, hunt a serial killer who uses the seven deadly sins as his motives.',
    'Schindler\'s List' : 'In German-occupied Poland during World War II, industrialist Oskar Schindler gradually becomes concerned for his Jewish workforce after witnessing their persecution by the Nazis.',
    'The Shawshank Redemption' : 'Over the course of several years, two convicts form a friendship, seeking consolation and, eventually, redemption through basic compassion.',
    'The Empire Strikes Back' : 'After the Rebels are overpowered by the Empire, Luke Skywalker begins his Jedi training with Yoda, while his friends are pursued across the galaxy by Darth Vader and bounty hunter Boba Fett.',
    'The Silence of the Lambs' : 'A young F.B.I. cadet must receive the help of an incarcerated and manipulative cannibal killer to help catch another serial killer, a madman who skins his victims.',
    'Back to the Future' : 'Marty McFly, a 17-year-old high school student, is accidentally sent 30 years into the past in a time-traveling DeLorean invented by his close friend, the maverick scientist Doc Brown.',
    '千と千尋の神隠し' : 'During her family\'s move to the suburbs, a sullen 10-year-old girl wanders into a world ruled by gods, witches and spirits, a world where humans are changed into beasts.',
    'The Imitation Game' : 'During World War II, the English mathematical genius Alan Turing tries to crack the German Enigma code with help from fellow mathematicians while attempting to come to terms with his troubled private life.',
    'Chiamatemi Francesco - Il Papa della gente' : 'Following the rise of father Jorge Mario Bergoglio from his early life as a teacher in a Jesuit High School in Argentina, to archbishop and cardinal of Buenos Aires, until he was elected Pope of the Roman Catholic Church',
    'The Godfather: Part II' : 'The early life and career of Vito Corleone in 1920s New York City is portrayed, while his son, Michael, expands and tightens his grip on the family crime syndicate.',
    'Star Wars' : 'Luke Skywalker joins forces with a Jedi Knight, a cocky pilot, a Wookiee and two droids to save the galaxy from the Empire\'s world-destroying battle station, while also attempting to rescue Princess Leia from the mysterious Darth Vader.',
    'Pulp Fiction' : 'The lives of two mob hitmen, a boxer, a gangster and his wife, and a pair of diner bandits intertwine in four tales of violence and redemption.',
    'The Godfather' : 'The aging patriarch of an organized crime dynasty in postwar New York City transfers control of his clandestine empire to his reluctant youngest son.',
    'Whiplash' : 'A promising young drummer enrolls at a cut-throat music conservatory where his dreams of greatness are mentored by an instructor who will stop at nothing to realize a student\'s potential.',
    'To Be Frank, Sinatra at 100' : 'The life of Frank Sinatra, as an actor and singer and the steps along the way that led him to become such an icon.',
    'Food Chains' : 'To protest their working conditions and poor wages, farmworkers in Immokalee, Florida, start a hunger strike outside the headquarters of Publix supermarkets'
}

In [None]:
# Creating a function to replace the missing values and apply it to the df

def replace_dictionary(row):
  if pd.isnull(row['overview']):
        return missing_val_dict.get(row['original_title'], row['overview'])
  else:
        return row['overview']

# applying the function
df1['overview'] = df1.apply(replace_dictionary, axis=1)

In [None]:
# Checking to make sure it worked
df1.isna().sum()

budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   0
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
dtype: int64

Unpacking the genre column here.


In [None]:
df1['genres'][0]
type(df1['genres'][0])
ast.literal_eval(df1['genres'][0])

# function to extract values from a dict
def get_names(lst):
    feat = []
    for i in ast.literal_eval(lst):
        feat.append(i['name'])
    return feat

df1['genres_names'] = df1['genres'].apply(get_names)

Unpacking the keywords columns here.

In [None]:
def get_keyword_names(row):
    keywords = json.loads(row['keywords'])
    names = []
    for keyword in keywords:
        names.append(keyword['name'])
    return names

df1['keyword_names'] = df1.apply(get_keyword_names, axis=1)

Unpacking Production Companies here.

In [None]:
def get_keyword_names(row):
    keywords = json.loads(row['production_companies'])
    names = []
    for keyword in keywords:
        names.append(keyword['name'])
    return names

df1['production_names'] = df1.apply(get_keyword_names, axis=1)

In [None]:
def list_to_string(lst):
    return ', '.join(map(str, lst))

df1['genres_names'] = df1['genres_names'].apply(list_to_string)
df1['keyword_names'] = df1['keyword_names'].apply(list_to_string)
df1['production_names'] = df1['production_names'].apply(list_to_string)

Double checking that keywords and genres are unpacked and there are no missing values.

In [None]:
df1.isna().sum()

budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   0
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
genres_names               0
keyword_names              0
production_names           0
dtype: int64

In [None]:
# Taking a look at the two new columns.
df1[['genres_names', 'keyword_names']].head()

Unnamed: 0,genres_names,keyword_names
0,"Action, Adventure, Fantasy, Science Fiction","culture clash, future, space war, space colony..."
1,"Adventure, Fantasy, Action","ocean, drug abuse, exotic island, east india t..."
2,"Action, Adventure, Crime","spy, based on novel, secret agent, sequel, mi6..."
3,"Action, Crime, Drama, Thriller","dc comics, crime fighter, terrorist, secret id..."
4,"Action, Adventure, Science Fiction","based on novel, mars, medallion, space travel,..."


Cleaning the credits csv

In [None]:
# Reading in the credits dataframe
df2 = pd.read_csv('tmdb_5000_credits.csv', encoding = "ISO-8859-1")

In [None]:
# Double checking there is no missing data.
df2.isna().sum()

movie_id    0
title       0
cast        0
crew        0
dtype: int64

Since there is no missing data, we are going to unpack the cast and crew column to extract the top actors and the director.

In [None]:
all_casts = []

for index, row in df2.iterrows():
  casts = json.loads(row["cast"])
  for cast in casts:
    cast["movie_id"] = row["movie_id"]
    cast["title"] = row["title"]
    all_casts.append(cast)
cast_df = pd.DataFrame(all_casts)

# Keep top three cast and store the names in new field "lead_cast", drop the rest
cast_df = cast_df.loc[cast_df['order'].isin([0, 1, 2])].copy()
cast_df.drop(['cast_id', 'character', 'credit_id', 'gender','id'], axis=1, inplace=True)
leading_cast= cast_df.groupby('movie_id')['name'].agg(', '.join).reset_index()
cast_df =cast_df.drop(['name', 'order'],axis=1).merge(leading_cast, on='movie_id', how='left')
cast_df.rename(columns={'name': 'lead_cast'}, inplace=True)
cast_df = cast_df.drop_duplicates('movie_id')

cast_df

Unnamed: 0,movie_id,title,lead_cast
0,19995,Avatar,"Sam Worthington, Zoe Saldana, Sigourney Weaver"
3,285,Pirates of the Caribbean: At World's End,"Johnny Depp, Orlando Bloom, Keira Knightley"
6,206647,Spectre,"Daniel Craig, Christoph Waltz, Léa Seydoux"
9,49026,The Dark Knight Rises,"Christian Bale, Michael Caine, Gary Oldman"
12,49529,John Carter,"Taylor Kitsch, Lynn Collins, Samantha Morton"
...,...,...,...
13983,9367,El Mariachi,"Carlos Gallardo, Jaime de Hoyos"
13985,72766,Newlyweds,"Edward Burns, Kerry Bishé, Marsha Dietlein"
13988,231617,"Signed, Sealed, Delivered","Eric Mabius, Kristin Booth, Crystal Lowe"
13991,126186,Shanghai Calling,"Daniel Henney, Eliza Coupe, Bill Paxton"


In [None]:
# UNPACK THE CREW COLUMN HERE

import json
all_crew = []
for index, row in df2.iterrows():
  crews = json.loads(row["crew"])
  for crew in crews:
    crew["movie_id"] = row["movie_id"]
    crew["title"] = row["title"]
    all_crew.append(crew)
crew_df = pd.DataFrame(all_crew)

# Keep only director to be used as a meaningful field. Drop other roles in the crew.

crew_df=crew_df.loc[crew_df['job'] == 'Director'].copy()
crew_df.drop(['credit_id', 'department', 'gender', 'id','job'], axis=1, inplace=True)
crew_df.rename(columns={'name': 'director'}, inplace=True)

# adding a new field "director" with concatenated values of directors for each movie_id

director= crew_df.groupby('movie_id')['director'].agg(', '.join).reset_index()
crew_df =crew_df.drop('director',axis=1).merge(director, on='movie_id', how='left')
crew_df = crew_df.drop_duplicates('movie_id')

# merge clean crew_df and cast_df into df2 based on "movie_id", remove duplicated titles

df2 = pd.merge(crew_df, cast_df, on='movie_id', how='inner')
df2 = df2.drop('title_y', axis=1)
df2=df2.rename(columns={'title_x': 'title'})
df2

Unnamed: 0,movie_id,title,director,lead_cast
0,19995,Avatar,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver"
1,285,Pirates of the Caribbean: At World's End,Gore Verbinski,"Johnny Depp, Orlando Bloom, Keira Knightley"
2,206647,Spectre,Sam Mendes,"Daniel Craig, Christoph Waltz, Léa Seydoux"
3,49026,The Dark Knight Rises,Christopher Nolan,"Christian Bale, Michael Caine, Gary Oldman"
4,49529,John Carter,Andrew Stanton,"Taylor Kitsch, Lynn Collins, Samantha Morton"
...,...,...,...,...
4743,9367,El Mariachi,Robert Rodriguez,"Carlos Gallardo, Jaime de Hoyos"
4744,72766,Newlyweds,Edward Burns,"Edward Burns, Kerry Bishé, Marsha Dietlein"
4745,231617,"Signed, Sealed, Delivered",Scott Smith,"Eric Mabius, Kristin Booth, Crystal Lowe"
4746,126186,Shanghai Calling,Daniel Hsia,"Daniel Henney, Eliza Coupe, Bill Paxton"


Now that we extracted everything, we are going to merge the two datasets together and double check that there are no missing values.

In [None]:
merged_df = pd.merge(df1, df2[['movie_id', 'director', 'lead_cast']], left_on='id', right_on='movie_id')
df1['director'] = merged_df['director']
df1['lead_cast'] = merged_df['lead_cast']

In [None]:
df1.isna().sum()

budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   0
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
genres_names               0
keyword_names              0
production_names           0
director                  55
lead_cast                 55
dtype: int64

Since the cast and crew dataset had 55 less values that the title dataset, we are going to use OMDb to replace the missing values.

In [None]:
# Create a dataframe with the missing values

nan_values = df1[df1['director'].isna()]
nan_values.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,status,tagline,title,vote_average,vote_count,genres_names,keyword_names,production_names,director,lead_cast
4748,70000,"[{""id"": 9648, ""name"": ""Mystery""}, {""id"": 27, ""...",,74777,"[{""id"": 13149, ""name"": ""pregnancy""}, {""id"": 20...",en,Absentia,Tricia's husband Daniel has been missing for s...,6.328665,"[{""name"": ""Blue Dot Productions"", ""id"": 24562}...",...,Released,There are fates worse than death.,Absentia,5.8,121,"Mystery, Horror, Thriller","pregnancy, declared dead, returned alive","Blue Dot Productions, FallBack Plan Productions",,
4749,0,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""nam...",,16388,"[{""id"": 9673, ""name"": ""love""}, {""id"": 10183, ""...",en,The Brothers McMullen,Deals with the lives of the three Irish Cathol...,1.578903,"[{""name"": ""Fox Searchlight Pictures"", ""id"": 43}]",...,Released,Sometimes the Best Friends Are the Ones You've...,The Brothers McMullen,6.3,23,"Comedy, Drama, Romance","love, independent film, best friend, true love...",Fox Searchlight Pictures,,
4750,0,"[{""id"": 18, ""name"": ""Drama""}]",http://www.thedirtiesthemovie.com/,159770,[],en,The Dirties,Two best friends are filming a comedy about ge...,0.833937,"[{""name"": ""XYZ Films"", ""id"": 12142}, {""name"": ...",...,Released,,The Dirties,6.0,42,Drama,,"XYZ Films, Zapruder Films",,
4751,0,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 10749, ""n...",,42109,[],pt,"Gabriela, Cravo e Canela","In 1925, Gabriela becomes cook, mistress, and ...",0.557602,"[{""name"": ""United Artists"", ""id"": 60}, {""name""...",...,Released,,Gabriela,6.0,2,"Drama, Romance",,"United Artists, Metro-Goldwyn-Mayer (MGM), Sul...",,
4752,65000,"[{""id"": 10749, ""name"": ""Romance""}, {""id"": 35, ...",http://tinyfurniture.com/,47607,"[{""id"": 1156, ""name"": ""sister sister relations...",en,Tiny Furniture,"After graduating from film school, Aura return...",2.380332,"[{""name"": ""Tiny Ponies"", ""id"": 65873}]",...,Released,Aura would like you to know that she is having...,Tiny Furniture,5.6,59,"Romance, Comedy, Drama","sister sister relationship, male female relati...",Tiny Ponies,,
4753,60000,"[{""id"": 53, ""name"": ""Thriller""}, {""id"": 27, ""n...",,193603,"[{""id"": 3358, ""name"": ""haunted house""}, {""id"":...",en,Hayride,A college student returning home for Halloween...,0.412342,[],...,Released,Southern Fried Horror,Hayride,5.1,6,"Thriller, Horror","haunted house, slasher",,,
4754,0,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 35, ""name...",,84659,[],en,The Naked Ape,The Naked Ape is a coming-of-age film followin...,0.077577,[],...,Rumored,,The Naked Ape,5.0,1,"Drama, Comedy, Family",,,,
4755,50000,"[{""id"": 99, ""name"": ""Documentary""}]",,322745,[],en,Counting,An associative collection of visual impression...,0.293587,[],...,Released,,Counting,8.3,3,Documentary,,,,
4756,50000,"[{""id"": 27, ""name"": ""Horror""}, {""id"": 53, ""nam...",http://www.cthulhulives.org/cocmovie/index.html,20981,"[{""id"": 1523, ""name"": ""obsession""}, {""id"": 303...",en,The Call of Cthulhu,A dying professor leaves his great-nephew a co...,1.777148,"[{""name"": ""HPLHS"", ""id"": 17827}]",...,Released,,The Call of Cthulhu,6.9,41,"Horror, Thriller, Fantasy","obsession, nightmare, notebook, cult, h.p. lov...",HPLHS,,
4757,0,"[{""id"": 99, ""name"": ""Documentary""}]",http://www.bendingsteelmovie.com/,174362,[],en,Bending Steel,"The Cyclone, The Freakshow, The Mermaid Parade...",0.048726,[],...,Released,The Limiting Factor Is The Mind,Bending Steel,5.0,1,Documentary,,,,


In [None]:
# creating a list of missing movie titles so we can replace them

movie_titles = nan_values['title'].tolist()

In [None]:
movie_titles

['Absentia',
 'The Brothers McMullen',
 'The Dirties',
 'Gabriela',
 'Tiny Furniture',
 'Hayride',
 'The Naked Ape',
 'Counting',
 'The Call of Cthulhu',
 'Bending Steel',
 'The Signal',
 'The Image Revolution',
 'This Is Martin Bonner',
 'A True Story',
 'George Washington',
 'Smiling Fish & Goat On Fire',
 'Dawn of the Crescent Moon',
 'Raymond Did It',
 'The Last Waltz',
 'Run, Hide, Die',
 'The Exploding Girl',
 "The Legend of God's Gun",
 'Mutual Appreciation',
 'Her Cry: La Llorona Investigation',
 'Down Terrace',
 'Clerks',
 'Pink Narcissus',
 'Funny Ha Ha',
 'In the Company of Men',
 'Manito',
 'Rampage',
 'Slacker',
 'Dutch Kills',
 'Dry Spell',
 'Flywheel',
 'Backmask',
 'The Puffy Chair',
 'Stories of Our Lives',
 'Breaking Upwards',
 'All Superheroes Must Die',
 'Pink Flamingos',
 'Clean',
 'The Circle',
 'Tin Can Man',
 'Cure',
 'On The Downlow',
 'Sanctuary: Quite a Conundrum',
 'Bang',
 'Primer',
 'Cavite',
 'El Mariachi',
 'Newlyweds',
 'Signed, Sealed, Delivered',
 'Sh

In [None]:
missing_director_dict = {
    'Absentia': 'Mike Flanagan',
    'The Brothers McMullen': 'Edward Burns',
    'The Dirties': 'Matt Johnson',
    'Gabriela': 'Vincente Minnelli',
    'Tiny Furniture': 'Lena Dunham',
    'Hayride': 'Terron R. Parsons',
    'The Naked Ape': 'Donald Driver',
    'Counting': 'Jem Cohen',
    'The Call of Cthulhu': 'Andrew Leman',
    'Bending Steel': 'Dave Carroll',
    'The Signal': 'William Eubank',
    'The Image Revolution': 'Patrick Meaney',
    'This Is Martin Bonner': 'Chad Hartigan',
    'A True Story': 'Rupert Goold',
    'George Washington': 'David Gordon Green',
    'Smiling Fish & Goat On Fire': 'Kevin Jordan',
    'Dawn of the Crescent Moon': 'Kirk Loudon',
    'Raymond Did It': 'Travis Legge',
    'The Last Waltz': 'Martin Scorsese',
    'Run, Hide, Die': 'Collin Joseph Neal',
    'The Exploding Girl': 'Bradley Rust Gray',
    'The Legend of God\'s Gun': 'Mike Bruce',
    'Mutual Appreciation': 'Andrew Bujalski',
    'Her Cry: La Llorona Investigation': 'Damian Romay',
    'Down Terrace': 'Ben Wheatley',
    'Clerks': 'Kevin Smith',
    'Pink Narcissus': 'James Bidgood',
    'Funny Ha Ha': 'Andrew Bujalski',
    'In the Company of Men': 'Neil LaBute',
    'Manito': 'Eric Eason',
    'Rampage': 'Uwe Boll',
    'Slacker': 'Richard Linklater',
    'Dutch Kills': 'Joseph Mazzella',
    'Dry Spell': 'Travis Legge',
    'Flywheel': 'Alex Kendrick',
    'Backmask': 'Marcus Nispel',
    'The Puffy Chair': 'Jay Duplass and Mark Duplass',
    'Stories of Our Lives': 'Jim Chuchu',
    'Breaking Upwards': 'Daryl Wein',
    'All Superheroes Must Die': 'Jason Trost',
    'Pink Flamingos': 'John Waters',
    'Clean': 'Olivier Assayas',
    'The Circle': 'Jafar Panahi',
    'Tin Can Man': 'Ivan Kavanagh',
    'Cure': 'Kiyoshi Kurosawa',
    'On The Downlow': 'Tadeo Garcia',
    'Sanctuary: Quite a Conundrum': 'Thomas L. Phillips',
    'Bang': 'Ash Baron-Cohen',
    'Primer': 'Shane Carruth',
    'Cavite': 'Neill Dela Llana and Ian Gamazon',
    'El Mariachi': 'Robert Rodriguez',
    'Newlyweds': 'Edward Burns',
    'Signed, Sealed, Delivered': 'Scott Smith',
    'Shanghai Calling': 'Daniel Hsia',
    'My Date with Drew': 'Brian Herzlinger, Jon Gunn, Brett Winn'
}

In [None]:
missing_cast_dict = {
    'Absentia': 'Katie Parker, Courtney Bell, Dave Levine',
    'The Brothers McMullen': 'Jack Mulcahy, Mike McGlone, Edward Burns',
    'The Dirties': 'Matthew Johnson, Owen Williams, Krista Madison',
    'Gabriela': 'Zulma Faiad, Ricardo Bauleo, Miguel Ligero',
    'Tiny Furniture': 'Lena Dunham, Laurie Simmons, Grace Dunham',
    'Hayride': 'Richard Tyson, Sherri Eakin, Jeremy Sande',
    'The Naked Ape': 'Andrew Sachs, Paul Danquah, John Hamill',
    'Counting': 'Jem Cohen, Tal Gur, Avi Belleli',
    'The Call of Cthulhu': 'Matt Foyer, John Bolen, Ralph Lucas',
    'Bending Steel': 'Chris Schoeck, Adele Schoeck, Maurice Lapp',
    'The Signal': 'Brenton Thwaites, Olivia Cooke, Beau Knapp',
    'The Image Revolution': 'Jim Lee, Rob Liefeld, Todd McFarlane',
    'This Is Martin Bonner': 'Paul Eenhoorn, Richmond Arquette, Sam Buchanan',
    'A True Story': 'Jonah Hill, James Franco, Felipe Dieppa',
    'George Washington': 'Candace Evanofski, Donald Holden, Damian Jewan Lee',
    'Smiling Fish & Goat On Fire': 'Derick Martini, Bill Henderson, Pia Glenn',
    'Dawn of the Crescent Moon': 'Justin Ament, Tara Buck, Torey Adkins',
    'Raymond Did It': 'Linda Cieslik, Elissa Dowling, Steven Lee Edwards',
    'The Last Waltz': 'Robbie Robertson, Muddy Waters, Neil Young',
    'Run, Hide, Die': 'Alicia Mendez, Lisseth Chavez, Sarah Jannett Parish',
    'The Exploding Girl': 'Zoe Kazan, Mark Rendall, Maryann Urbano',
    "The Legend of God's Gun": 'Robert Bones, Kirpatrick Thomas, Michael Madsen',
    'Mutual Appreciation': 'Justin Rice, Rachel Clift, Andrew Bujalski',
    'Her Cry: La Llorona Investigation': 'Nicholas Barrera, James Ezrin, Everardo Guzman',
    'Down Terrace': 'Robin Hill, Robert Hill, Julia Deakin',
    'Clerks': "Brian O'Halloran, Jeff Anderson, Marilyn Ghigliotti",
    'Pink Narcissus': 'Don Brooks, Bobby Kendall, Charles Ludlam',
    'Funny Ha Ha': 'Kate Dollenmayer, Christian Rudder, Jennifer L. Schaper',
    'In the Company of Men': 'Aaron Eckhart, Matt Malloy, Stacy Edwards',
    'Manito': 'Franky G, Leo Minaya, Manuel Cabral',
    'Rampage': 'Brendan Fletcher, Shaun Sipos, Michael Paré',
    'Slacker': 'Richard Linklater, Rudy Basquez, Jean Caffeine',
    'Dutch Kills': 'R.L. Mann, Tama Filianga, Maurice Ripke',
    'Dry Spell': 'Suzi Lorraine, Kyle Jason, Jeffrey Alan Solomon',
    'Flywheel': 'Rosetta Harris Armstrong, Lisa Arnold, Blake Bailey',
    'Backmask': 'Gergely Polgár, Zsolt Végh, Tamás Fodor',
    'The Puffy Chair': 'Mark Duplass, Kathryn Aselton, Rhett Wilkins',
    'Stories of Our Lives': 'Kelly Gichohi, Paul Ogola, Tim Mutungi',
    'Breaking Upwards': 'Zoe Lister-Jones, Daryl Wein, Julie White',
    'All Superheroes Must Die': 'Jason Trost, Lucas Till, James Remar',
    'Pink Flamingos': 'Divine, David Lochary, Mary Vivian Pearce',
    'Clean': 'Maggie Cheung, Nick Nolte, Béatrice Dalle',
    'The Circle': 'Maryiam Palvin Almani, Nargess Mamizadeh, Maryam Shayegan',
    'Tin Can Man': 'Michael Parle, Emma Eliza Regan, Patrick ODonnell',
    'Cure': 'Kôji Yakusho, Masato Hagiwara, Tsuyoshi Ujiki',
    'On The Downlow': 'Mark L. Young, Elijah C. Nealey, Lauren C. Mayhew',
    'Sanctuary: Quite a Conundrum': 'Sasha Ramos, Erin Nicole Cline, Emily Rogers',
    'Bang': 'Darling Narita, Peter Greene, Michael Newland',
    'Primer': 'Shane Carruth, David Sullivan, Casey Gooden',
    'Cavite': 'Ian Gamazon, Jasmine Trinca, Carlo Alban',
    'El Mariachi': 'Carlos Gallardo, Consuelo Gómez, Peter Marquardt',
    'Newlyweds': 'Edward Burns, Caitlin FitzGerald, Kerry Bishé',
    'Signed, Sealed, Delivered': 'Eric Mabius, Kristin Booth, Crystal Lowe',
    'Shanghai Calling': 'Le Geng, Daniel Henney, Sean Gallagher',
    'My Date with Drew': 'Drew Barrymore, Brian Herzlinger, Corey Feldman'
}

In [None]:
# Creating a function to replace the missing directors and apply it to the df

def replace_dictionary(row):
  if pd.isnull(row['director']):
        return missing_director_dict.get(row['title'], row['director'])
  else:
        return row['director']
# applying the function

df1['director'] = df1.apply(replace_dictionary, axis=1)

In [None]:
# Creating a function to replace the missing cast values from the dataset

def replace_cast(row):
  if pd.isnull(row['lead_cast']):
        return missing_cast_dict.get(row['title'], row['lead_cast'])
  else:
        return row['lead_cast']

# applying the function
df1['lead_cast'] = df1.apply(replace_cast, axis=1)

In [None]:
# checking the data to make sure values were replaced

df1.isnull().sum()

budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   0
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
genres_names               0
keyword_names              0
production_names           0
director                   0
lead_cast                  0
dtype: int64

There are still some missing values from runtime and release date so we will manually update those. Using the completed dataset so I can refrence director and cast for the movies.

In [None]:
nan_values = df1[df1['runtime'].isna()]

# Selecting the release date column so I can make sure I am refrencing the correct title as lots of movies have the same title.

nan_values[['original_title','director','release_date']]

Unnamed: 0,original_title,director,release_date
2656,Chiamatemi Francesco - Il Papa della gente,Julien Temple,2015-12-03
4140,"To Be Frank, Sinatra at 100",Richard Lester,2015-12-12


In [None]:
runtime_dict = {
    'To Be Frank, Sinatra at 100':'72.6',
    'Chiamatemi Francesco - Il Papa della gente' : '82.8'
}

In [None]:
# Creating a function to replace the missing runtime info

def runtime(row):
  if pd.isnull(row['runtime']):
        return runtime_dict.get(row['original_title'], row['runtime'])
  else:
        return row['runtime']

#applying the function

df1['runtime'] = df1.apply(runtime, axis=1)

In [None]:
nan_values = df1[df1['release_date'].isna()]

# Selecting the release date column so I can make sure I am refrencing the correct title as lots of movies have the same title.
nan_values[['original_title','title','director']]

Unnamed: 0,original_title,title,director
4553,America Is Still the Place,America Is Still the Place,Paul Fox


In [None]:
release_dict = {'America Is Still the Place':'2022'}

# Creating a function to replace the missing release_date info

def releasedatefunction(row):
  if pd.isnull(row['release_date']):
        return release_dict.get(row['original_title'], row['release_date'])
  else:
        return row['release_date']

# applying the function
df1['release_date'] = df1.apply(releasedatefunction, axis=1)

In [None]:
df1.isnull().sum()

budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   0
popularity                 0
production_companies       0
production_countries       0
release_date               0
revenue                    0
runtime                    0
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
genres_names               0
keyword_names              0
production_names           0
director                   0
lead_cast                  0
dtype: int64

In [None]:
df = df1

Create a weighted rating. According to articles on Google (https://math.stackexchange.com/questions/169032/understanding-the-imdb-weighted-rating-function-for-usage-on-my-own-website) the Formula is : (WR)=(v/(v+m))R+(m/(v+m))C

- R = average for the movie (mean) = (Rating)
- v = number of votes for the movie = (votes)
- m = minimum votes required to be listed
- C = the mean vote across the whole report (currently 6.1)

In [None]:
df.describe()

Unnamed: 0,budget,id,popularity,revenue,vote_average,vote_count
count,4803.0,4803.0,4803.0,4803.0,4803.0,4803.0
mean,29045040.0,57165.484281,21.492301,82260640.0,6.092172,690.217989
std,40722390.0,88694.614033,31.81665,162857100.0,1.194612,1234.585891
min,0.0,5.0,0.0,0.0,0.0,0.0
25%,790000.0,9014.5,4.66807,0.0,5.6,54.0
50%,15000000.0,14629.0,12.921594,19170000.0,6.2,235.0
75%,40000000.0,58610.5,28.313505,92917190.0,6.8,737.0
max,380000000.0,459488.0,875.581305,2787965000.0,10.0,13752.0


In [None]:
# Calculating m based on a 75% percentile (we're going to be generous at first to see what we get in the model)

m = df[df['vote_count'].notnull()]['vote_count'].astype('int').quantile(0.75)
m

737.0

In [None]:
C = df[df['vote_average'].notnull()]['vote_average'].astype('int').mean()
C

5.6529252550489275

In [None]:
# Creating a weighted rating function

def weighted_rating(movie):
  R= movie['vote_average']
  v = movie['vote_count']
  return (v/(v + m))*R + (m/(v+m))*C

In [None]:
df['weighted_rating'] = df.apply(weighted_rating, axis=1)

In [None]:
df

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,title,vote_average,vote_count,genres_names,keyword_names,production_names,director,lead_cast,classification,weighted_rating
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,Avatar,7.2,11800,"Action, Adventure, Fantasy, Science Fiction","culture clash, future, space war, space colony...","Ingenious Film Partners, Twentieth Century Fox...",James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver",Y2k,7.109054
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,Pirates of the Caribbean: At World's End,6.9,4500,"Adventure, Fantasy, Action","ocean, drug abuse, exotic island, east india t...","Walt Disney Pictures, Jerry Bruckheimer Films,...",Gore Verbinski,"Johnny Depp, Orlando Bloom, Keira Knightley",Y2k,6.724500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,Spectre,6.3,4466,"Action, Adventure, Crime","spy, based on novel, secret agent, sequel, mi6...","Columbia Pictures, Danjaq, B24",Sam Mendes,"Daniel Craig, Christoph Waltz, Léa Seydoux",Newer Movie,6.208342
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.312950,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,The Dark Knight Rises,7.6,9106,"Action, Crime, Drama, Thriller","dc comics, crime fighter, terrorist, secret id...","Legendary Pictures, Warner Bros., DC Entertain...",Christopher Nolan,"Christian Bale, Michael Caine, Gary Oldman",Newer Movie,7.454212
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,John Carter,6.1,2124,"Action, Adventure, Science Fiction","based on novel, mars, medallion, space travel,...",Walt Disney Pictures,Andrew Stanton,"Taylor Kitsch, Lynn Collins, Samantha Morton",Y2k,5.984833
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4798,220000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",,9367,"[{""id"": 5616, ""name"": ""united states\u2013mexi...",es,El Mariachi,El Mariachi just wants to play his guitar and ...,14.269792,"[{""name"": ""Columbia Pictures"", ""id"": 5}]",...,El Mariachi,6.6,238,"Action, Crime, Thriller","united states–mexico barrier, legs, arms, pape...",Columbia Pictures,Robert Rodriguez,"Carlos Gallardo, Consuelo Gómez, Peter Marquardt",Disco Era,5.884109
4799,9000,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 10749, ""...",,72766,[],en,Newlyweds,A newlywed couple's honeymoon is upended by th...,0.642552,[],...,Newlyweds,5.9,5,"Comedy, Romance",,,Edward Burns,"Edward Burns, Caitlin FitzGerald, Kerry Bishé",Y2k,5.654590
4800,0,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""nam...",http://www.hallmarkchannel.com/signedsealeddel...,231617,"[{""id"": 248, ""name"": ""date""}, {""id"": 699, ""nam...",en,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...",1.444476,"[{""name"": ""Front Street Pictures"", ""id"": 3958}...",...,"Signed, Sealed, Delivered",7.0,6,"Comedy, Drama, Romance, TV Movie","date, love at first sight, narration, investig...","Front Street Pictures, Muse Entertainment Ente...",Scott Smith,"Eric Mabius, Kristin Booth, Crystal Lowe",Newer Movie,5.663803
4801,0,[],http://shanghaicalling.com/,126186,[],en,Shanghai Calling,When ambitious New York attorney Sam is sent t...,0.857008,[],...,Shanghai Calling,5.7,7,,,,Daniel Hsia,"Le Geng, Daniel Henney, Sean Gallagher",Y2k,5.653368


# Pre Process Data


First we are going to clean up the overview. The overview will require the most pre processing as it has the most text of all our features. We are going to use tokenization, remove stop words, lemmatization, and replace punctuation with spaces and make all text lower case. We will create a function and apply this to the overview column.

In [None]:
from nltk.corpus import stopwords
import unicodedata
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string
import unicodedata

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

***Here is where we pre process the overview. ***

In [None]:
#we will use tokenization, lowercasing, stopwords, and lemmatization. We will also repalce punctuation marks with spaces and convert accented characters.
def preprocess_overview(text):
    # Lowercasing
    text = text.lower()

    # replace punctuations with spaces
    text = text.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))

    # convert accented characters (turn é into e, etc)
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # Tokenization
    tokens = word_tokenize(text)

    # Removing stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatizatpreprocess_overviewion
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

In [None]:
# apply the text cleaing function to "overview"
df['overview_clean'] = df['overview'].apply(preprocess_overview)

In [None]:
df.iloc[2]['overview_clean']

'cryptic message bond past sends trail uncover sinister organization battle political force keep secret service alive bond peel back layer deceit reveal terrible truth behind spectre'

Next, we're going to create a function that cleans the remaining variables in our model: genre, director, lead cast, keywords, and production names. For these variables, we are going to remove accented characters, make all words lower case, remove spaces and replace commas with spaces.

In [None]:
# we will convert the text into lower case, remove the spaces in names, and replace commas in key features with space.
def preprocess_feature(text):
    # convert accented characters (turn é into e, etc)
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # lowercase the text
    text = text.lower()
    # remove spaces
    text = text.replace(" ", "")
    # replace commas with spaces
    text = text.replace(",", " ")
    return text

In [None]:
# apply the function to the columns we identify as relevant features for recommendation.
key_features = ['genres_names', 'director', 'lead_cast', 'production_names', 'keyword_names']
for column in key_features:
    df[column + 'clean_text'] = df[column].apply(preprocess_feature)

# combine the pre-processed text into a new column named 'key_features'
df['key_features'] = df[[column + 'clean_text' for column in key_features]].apply(lambda x: ' '.join(x), axis=1)

In [None]:
df.iloc[2]['key_features']

'action adventure crime sammendes danielcraig christophwaltz leaseydoux columbiapictures danjaq b24 spy basedonnovel secretagent sequel mi6 britishsecretservice unitedkingdom'

Now that the columns are clean, we are going to combine our key features together into one long string. We will use the following features for our model:

- Overview
- Genre
- Director
- Lead Cast
- Production
- Keywords

In [None]:
df['combined_features'] = df['overview_clean'] + ' ' + df['key_features']

In [None]:
df = df[['original_title', 'combined_features', 'weighted_rating']]

In [None]:
df.head()

Unnamed: 0,original_title,combined_features,weighted_rating
0,Avatar,22nd century paraplegic marine dispatched moon...,7.109054
1,Pirates of the Caribbean: At World's End,captain barbossa long believed dead come back ...,6.7245
2,Spectre,cryptic message bond past sends trail uncover ...,6.208342
3,The Dark Knight Rises,following death district attorney harvey dent ...,7.454212
4,John Carter,john carter war weary former military captain ...,5.984833


# Model 1

First model will use CountVectorizer and we will get the cosine similarity. We are using the Count Vectorizer first (also known as Bag of Words) since this focuses more on the frequency of the words. This will give equal importance to all words represented in the data.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Apply CountVectorizer using Bag of Words
vectorizer = CountVectorizer()

# Fit the vectorizer to key_features and transform them into BoW representation
BOW= vectorizer.fit_transform(df['combined_features'])

# Compute the cosine similarity between the BoW vectors
cosine_sim_BoW = cosine_similarity(BOW)

In [None]:
def get_movie_recommendation(movie_title):
    # get the index of the movie title in the dataframe
    idx = df[df['original_title'] == movie_title].index[0]

    # get the pairwise similarity scores for all movies
    sim_scores = list(enumerate(cosine_sim_BoW[idx]))

    # sort the movies based on their similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # get the top 5 most similar movies (excluding the input movie)
    sim_scores = sim_scores[1:6]

    # get the titles of the recommended movies
    movie_indices = [i[0] for i in sim_scores]
    movie_titles = df['original_title'].iloc[movie_indices].tolist()

    return movie_titles

In [None]:
# The Dark Knight" recommend value
get_movie_recommendation('The Dark Knight')

['Batman Begins',
 'The Dark Knight Rises',
 'Batman Returns',
 'Batman v Superman: Dawn of Justice',
 'Batman']

In [None]:
get_movie_recommendation('The Shawshank Redemption')

['Atonement', 'Bronson', 'Prison', "Buffalo '66", 'This Is Martin Bonner']

In [None]:
get_movie_recommendation('Frozen')

['Aladdin',
 'Spirit: Stallion of the Cimarron',
 'Delgo',
 'Curious George',
 'The Book of Life']

# Model 2
Since the model using the Count Vectorizer did not give us the results we were looking for, we are going to try the TfidfVectorizer. Additionally, we are going to utilize the weighted rating column we created earlier and only provide movies above a certain threshold. Since 75% of the movies are above a 6, we will use that as our threshold. Since the Tfidf focuses on frequency and the weight of the word, we assume we will achieve better results with this model. I am also going to take the similarity score and multiply it by the weighted rating. Because the similarity score only takes into account the similarity based on textual features, I wanted to create a combined score that incorporates the weighted rating, essentially creating a quality and similarity score.

In [None]:
df['weighted_rating'].describe()

count    4803.000000
mean        5.907397
std         0.445601
min         4.592165
25%         5.652041
50%         5.732613
75%         6.030258
max         8.265344
Name: weighted_rating, dtype: float64

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# apply TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit the vectorizer to combined_features and transform them
TFIDF = vectorizer.fit_transform(df['combined_features'])

# Compute the cosine similarity between the TF-IDF vectors
cosine_sim_tfidf = cosine_similarity(TFIDF)


In [None]:
def get_movie_recommendation2(movie_title, threshold=6):
    # get the index of the movie title in the dataframe
    idx = df[df['original_title'] == movie_title].index[0]

    # get the pairwise similarity scores for all movies
    sim_scores = list(enumerate(cosine_sim_tfidf[idx]))

    # filter movies by weighted_rating threshold
    filtered_sim_scores = [score for score in sim_scores if df.iloc[score[0]]['weighted_rating'] >= threshold]

    # multiply weighted rating and similarity score
    combined_scores = [(score[0], score[1] * df.iloc[score[0]]['weighted_rating']) for score in filtered_sim_scores]

    # sort the movies based on their combined scores
    combined_scores = sorted(combined_scores, key=lambda x: x[1], reverse=True)

    # get the top 5 most similar movies (excluding the input movie and considering the threshold)
    combined_scores = combined_scores[1:6]

    # get the titles of the recommended movies
    movie_indices = [i[0] for i in combined_scores]
    movie_titles = df['original_title'].iloc[movie_indices].tolist()

    return movie_titles

In [None]:
get_movie_recommendation2('The Dark Knight')

['The Dark Knight Rises',
 'Batman Begins',
 'Batman Returns',
 'Batman: The Dark Knight Returns, Part 2',
 'Batman']

In [None]:
get_movie_recommendation2('The Shawshank Redemption')

['Atonement',
 'Malcolm X',
 'Pulp Fiction',
 'Mean Streets',
 'Escape from Alcatraz']

In [None]:
get_movie_recommendation2('Frozen')

['Aladdin',
 'Snow White and the Seven Dwarfs',
 'The Princess and the Frog',
 'Brave',
 'Enchanted']