# Film Recommendation Engine for Classic & International Films

In [54]:
import pandas as pd

import string
import re

from ast import literal_eval
from unidecode import unidecode

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

In [55]:
## Reading CSV Files
## Forward slashes resolve on Windows as well as on POSIX systems, whereas a
## backslash is only a path separator on Windows.
## nrows stops parsing after exactly 5 million rows instead of loading the whole
## file and slicing it afterwards (the inclusive .loc[0:5000001] slice also kept
## two rows more than the intended 5 million).
ratings = pd.read_csv('archive/ratings.csv', nrows=5_000_000)
films = pd.read_csv('archive/movies_metadata.csv')
links = pd.read_csv('archive/links.csv')
keywords = pd.read_csv('archive/keywords.csv')
## Untouched snapshot of the metadata exactly as it was read from disk
films_copy = films.copy()

  films = pd.read_csv('archive\\movies_metadata.csv')


### Data Cleaning
---

In [56]:
# discard every film that is missing its vote count or its release date
films = films.dropna(subset=['vote_count', 'release_date'])

In [57]:
# keep only the first row seen for each id, in both tables
films = films[~films.duplicated(subset=['id'])]
keywords = keywords[~keywords.duplicated(subset=['id'])]

In [58]:
## cast every numeric column to its integer/float type in one call
films = films.astype({
    'budget': int,
    'id': int,
    'popularity': float,
    'revenue': int,
    'vote_average': float,
    'vote_count': int,
})

In [59]:
## discard films without a runtime, then store the runtime as an integer
films = films.dropna(subset=['runtime']).astype({'runtime': int})

In [60]:
## the first four characters of release_date are the year; store them as an integer
films['year'] = films['release_date'].str.slice(0, 4).astype(int)

In [61]:
## Keep a film only when it passes all three checks:
##   - it is not an adult film ('adult' holds the text 'False')
##   - it has been released
##   - it is not a Direct-to-Video film
films = films[
    (films['adult'] == 'False')
    & (films['status'] == 'Released')
    & (films['video'] == False)
]

In [62]:
## Discard films whose original language is unknown
films = films.dropna(subset=['original_language'])

In [63]:
## Expose the film id under the name used by the links table
films = films.assign(tmdbId=films['id'])

In [64]:
## I will use the Vote Count and Vote Average Columns to calculate a weighted rating.
## I can do this using the formula used by IMDB.

## C: mean of every rating in the ratings dataframe; adjust_rating blends each
## film's own mean with this value.
C = ratings['rating'].mean()
## m: weight, in votes, that adjust_rating gives to C; the larger it is, the more
## votes a film needs before its own mean dominates.
m = 20

def adjust_rating(vote_mean, vote_count, prior=None, min_votes=None):
    """Calculate the weighted rating of a film with IMDB's formula.

    The film's own mean is blended with a prior mean; the more votes the film
    has, the closer the result is to its own mean.

    Input:
        vote_mean: mean rating of the film
        vote_count: total number of votes behind that mean
        prior: mean rating to pull towards; defaults to the module-level C
        min_votes: weight of the prior, in votes; defaults to the module-level m
    Output: the weighted rating, or 0 when the film has no votes
    """
    ### If there are no ratings, give output as zero
    if not vote_count > 0:
        return 0

    ## Fall back to the notebook-wide constants so existing two-argument calls
    ## keep giving the same result.
    if prior is None:
        prior = C
    if min_votes is None:
        min_votes = m

    total = vote_count + min_votes
    return (vote_mean*vote_count/total) + (prior*min_votes/total)

In [65]:
## Weighted rating of every film, using the same formula as adjust_rating.
## The prior is the mean of vote_average itself (over films that have votes), so
## it is always on the same scale as the values it is blended with.
## NOTE(review): the shared constant C is the mean of ratings['rating'], which
## looks like a 5-star scale (recommend() treats >= 4.5 as a like), while
## vote_average is presumably on a 0-10 scale - confirm against the dataset.
vote_prior = films.loc[films['vote_count'] > 0, 'vote_average'].mean()
films['adj_rating'] = (
    (films['vote_average']*films['vote_count'] + vote_prior*m)
    / (films['vote_count'] + m)
).where(films['vote_count'] > 0, 0)  ## films without votes score zero

In [66]:
## Keep only the columns used from here on, in display order
films = films.loc[:, [
    'tmdbId',
    'title',
    'year',
    'genres',
    'original_language',
    'original_title',
    'overview',
    'popularity',
    'poster_path',
    'production_companies',
    'production_countries',
    'runtime',
    'spoken_languages',
    'tagline',
    'vote_average',
    'vote_count',
    'adj_rating',
]]

In [67]:
## A film without an overview is discarded;
## a missing tagline simply becomes an empty string
films = films.dropna(subset=['overview']).fillna({'tagline': ''})

In [68]:
## Keep films with a runtime strictly between 60 and 240 and more than 25 votes
films = films[
    (films['runtime'] > 60)
    & (films['runtime'] < 240)
    & (films['vote_count'] > 25)
]

In [70]:
## Parse each genre string and keep only the genre names
films['genres'] = films['genres'].apply(lambda raw: [entry['name'] for entry in literal_eval(raw)])

### Drop every film tagged as a TV Movie or as a Documentary
films = films[films['genres'].apply(lambda names: not ('TV Movie' in names or 'Documentary' in names))]

## Strip the 'Foreign' tag from the remaining genre lists
films['genres'] = films['genres'].apply(lambda names: [name for name in names if name != 'Foreign'])

In [73]:
## Parse Production Companies, Production Countries and Spoken Languages,
## keeping only the list of names for each

films['production_companies'] = films['production_companies'].apply(
    lambda raw: [entry['name'] for entry in literal_eval(raw)]
)
## Country names are shortened to USA / UK while the list is built
films['production_countries'] = films['production_countries'].apply(
    lambda raw: [
        entry['name'].replace('United States of America','USA').replace('United Kingdom','UK')
        for entry in literal_eval(raw)
    ]
)
films['spoken_languages'] = films['spoken_languages'].apply(
    lambda raw: [entry['name'] for entry in literal_eval(raw)]
)

### Cleaning Ratings Dataframe
---

In [75]:
## Adding tmdbId to ratings dataframe so I can join with the films dataframe
## (links is looked up by movieId; every ratings row is kept)
ratings = ratings.join(links[['movieId','tmdbId']].set_index('movieId'), on='movieId')

## Adding Title and Year to the ratings dataframe
## (production_countries is brought along as well; ratings whose tmdbId is not
## in films get nulls in these columns)
ratings = ratings.join(films[['tmdbId','title','year','production_countries']].set_index('tmdbId'), on='tmdbId')

## Removing ratings for films that are no longer in the films dataframe
## (those are the rows left without a title by the join above)
ratings.dropna(subset='title', inplace=True)
## year can only be stored as an integer once the null rows are gone
ratings['year'] = ratings['year'].astype(int)

In [78]:
## Adding keywords to films dataframe
films = films.join(keywords.set_index('id'), on='tmdbId')

## Converting keywords to lists of keyword names
## The join keeps every film, so a film with no row in keywords ends up with a
## null here; literal_eval raises ValueError on a null, so such films are given
## an empty keyword list instead.
films['keywords'] = films['keywords'].fillna('[]').apply(
    lambda raw: [key['name'] for key in literal_eval(raw)]
)

In [80]:
def label_international(countries, languages):
    """Label a film as international (1) or English (0).

    Input: the list of production countries and the list of spoken languages of a film
    Output: 1 for international films, 0 for English films

    A film is labelled 0 when any of the following holds:
      - English is its only spoken language;
      - it was produced in an English-speaking country (USA, UK, Australia,
        New Zealand) and its spoken languages include English or 'No Language';
      - its country list or its language list is empty, since the data may be incomplete.
    Every other film is labelled 1. So a film from an English-speaking country
    counts as international when English is not spoken in it, and a film with
    spoken English counts as international when no English-speaking country
    produced it and English is not its only language.
    """
    english_countries = ('USA', 'UK', 'Australia', 'New Zealand')
    english_or_silent = ('English', 'No Language')

    from_english_country = any(country in english_countries for country in countries)
    speaks_english_or_nothing = any(lang in english_or_silent for lang in languages)

    ## Films with English as the only spoken language
    if languages == ['English']:
        return 0
    ## Films from an English-speaking country, with spoken English
    if from_english_country and speaks_english_or_nothing:
        return 0
    ## Do not label any rows where country/language data is potentially incomplete
    if languages == [] or countries == []:
        return 0
    return 1

In [81]:
## Label every film from its production countries and spoken languages
films['international'] = [
    label_international(countries, languages)
    for countries, languages in zip(films['production_countries'], films['spoken_languages'])
]

In [82]:
## Normalised title used for lookups: lower case, punctuation pattern removed,
## then transliterated with unidecode
films['title_lower'] = (
    films['title']
    .str.lower()
    .str.replace('[{}]'.format(string.punctuation),'',regex=True)
    .map(unidecode)
)

In [83]:
## Work on an independent copy so the cleaned films dataframe stays as it is
df_films = films.copy(deep=True)

### Preparing Summary
---

In [86]:
p_stemmer = PorterStemmer()
## English stop words plus every punctuation character.
## Kept as a set: membership is tested twice for every token of every summary,
## and a list would be scanned from start to end on each test.
stpwrd = set(nltk.corpus.stopwords.words('english'))
stpwrd.update(string.punctuation)

def prep_summary(summary):
    """Function to prepare the Summary
    Input: single string of text
    Output: string of text in lower case, stop words removed, and stemmed.
    The words 'film' and 'movie' and any stem of one or two characters are
    removed as well.
    """
    tokens = word_tokenize(summary.lower())
    ## Stop words are dropped before stemming ...
    stems = [p_stemmer.stem(word) for word in tokens if word not in stpwrd]
    ## ... and checked again afterwards, because a stem can itself be a stop word
    kept = [
        stem for stem in stems
        if stem not in stpwrd and stem not in ('film', 'movie') and len(stem) > 2
    ]
    return ' '.join(kept)

In [87]:
## Building the summary text, with the keywords counted twice

def concatenate(x):
    """Join the text-based features of one film into a single string.
    Input: a row holding the strings 'overview' and 'tagline' and the list 'keywords'
    Output: overview, tagline and the keywords (listed twice), separated by single spaces
    """
    keyword_text = ' '.join(x['keywords'])
    return ' '.join([x['overview'], x['tagline'], keyword_text, keyword_text])

## Build the raw summary for every film, then clean it with prep_summary
df_films['summary'] = df_films.apply(concatenate, axis=1).apply(prep_summary)

### Calculating Similarity Scores
---

In [88]:
## Join each genre list into one string, writing 'Science Fiction' as 'Sci-Fi'
## so it does not affect similarity scores
df_films['genres'] = df_films['genres'].apply(
    lambda names: ' '.join(names).replace('Science Fiction','Sci-Fi')
)

In [89]:
## Order the films from oldest to newest and number the rows again from zero.
## With the oldest films first, the older films occupy the leading rows, which
## makes it easier to return only older films in the function.
df_films = df_films.sort_values('year').reset_index()

In [90]:
## Show the production countries as one comma-separated string, for better display
df_films['country'] = df_films['production_countries'].map(', '.join)
ratings['country'] = ratings['production_countries'].map(', '.join)

In [91]:
## Creating a separate dataframe for international films by reordering (films
## labelled international come first) and reindexing
df_international = df_films.sort_values('international', ascending=False).copy().reset_index()

In [92]:
### Cosine Similarities for Classic films

## df_films is sorted by year, so its first n rows are the films released before
## 1967; only those rows are kept as columns of the similarity matrices.
n = len(df_films[df_films['year']<1967]) # number of rows to be considered

## min_df=1 keeps every term that occurs in at least one film. The previous
## integer 0 meant the same thing but is rejected by scikit-learn's parameter
## validation, which requires an integer min_df to be at least 1.
### Summary (single words and pairs of adjacent words)
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
summary_matrix = tf.fit_transform(df_films['summary'])
## Rows: every film; columns: the first n films
summary_sim = linear_kernel(summary_matrix, summary_matrix[0:n])

### Genres (no stop-word removal)
tf = TfidfVectorizer(analyzer='word', min_df=1, stop_words=None)
genre_matrix = tf.fit_transform(df_films['genres'])
genre_sim = linear_kernel(genre_matrix, genre_matrix[0:n])

In [94]:
### Cosine Similarities for International films

## df_international lists the films labelled international first, so its first
## n rows are exactly those films; only they are kept as columns of the
## similarity matrices.
n = len(df_films[df_films['international']==1])

## min_df=1 keeps every term that occurs in at least one film. The previous
## integer 0 meant the same thing but is rejected by scikit-learn's parameter
## validation, which requires an integer min_df to be at least 1.
### Summary (single words and pairs of adjacent words)
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
inter_summary_matrix = tf.fit_transform(df_international['summary'])
## Rows: every film; columns: the first n films
inter_summary_sim = linear_kernel(inter_summary_matrix, inter_summary_matrix[0:n])

### Genres (no stop-word removal)
tf = TfidfVectorizer(analyzer='word', min_df=1, stop_words=None)
inter_genre_matrix = tf.fit_transform(df_international['genres'])
inter_genre_sim = linear_kernel(inter_genre_matrix, inter_genre_matrix[0:n])

### Final Function
---

In [95]:
def recommend(film, international=False, year=None):
    """Recommend up to five films similar to the given film.

    The similarity of two films is the product of their summary similarity
    (summary text with keywords counted twice) and their genre similarity.
    The 10 most similar candidates are picked, then ranked by rating and cut
    down to five.

    Input:
        film: title of the film; case, punctuation and accents are ignored
        international: if True, recommend from the international films
            (df_international); otherwise from the classic films (df_films)
        year: release year used to choose between films sharing the same
            title; when it is needed and not given, it is asked for with input()
    Output: a dataframe with the columns title, year and country, or the
        string 'This film could not be found.' when no title matches
    Raises: ValueError when the year typed or passed in is not a whole number
    """

    if international:
        df = df_international
        sum_sims = inter_summary_sim
        gen_sims = inter_genre_sim
    else:
        df = df_films
        sum_sims = summary_sim
        gen_sims = genre_sim

    film = unidecode(re.sub(r'[{}]'.format(string.punctuation),'',film.lower()))

    ## retrieve film id
    ## (index labels are used as row positions below, which relies on the
    ## dataframes having been reindexed from zero)
    matches = df.index[df['title_lower']==film]
    if len(matches) == 0:
        return 'This film could not be found.'
    elif len(matches) > 1:
        ## several films share this title: take the one released closest to the given year
        if year is None:
            year = input("What year was this film released?")
        year = int(year)
        input_id = (df.loc[matches, 'year'] - year).abs().idxmin()
    else:
        input_id = matches[0]
    print('Giving recommendations for ' + str(df['title'].iloc[input_id]) + ' (' + str(df['year'].iloc[input_id]) + ')')

    ## combine the two metrics into final scores; position i in scores is row i of df
    scores = sum_sims[input_id] * gen_sims[input_id]

    ## retrieve 10 highest scores
    ## The film being looked up can be among the candidates, where it would
    ## always rank first against itself, so it is left out of the results.
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    indices = [idx for idx, _ in ranked if idx != input_id][0:10]
    output = df[['title','year','genres','tmdbId','adj_rating','country']].iloc[indices]

    ## reorder output using collaborative filtering
    ## collect relevant user ratings: users who rated the input film 4.5 or
    ## higher, and the ratings those users gave to the candidate films
    tmdb_id = df['tmdbId'].iloc[input_id]
    similar_users = set(ratings[(ratings['tmdbId']==tmdb_id) & (ratings['rating']>=4.5)]['userId'])
    mean_ratings = ratings[(ratings['userId'].isin(similar_users)) & (ratings['tmdbId'].isin(output['tmdbId']))]

    if mean_ratings['title'].nunique() < 8:
        ## too few candidates were rated by those users: rank by overall weighted rating instead
        output = output.sort_values('adj_rating',ascending=False)[0:5]
        return output.reset_index()[['title','year','country']]
    else:
        ## rank the candidates by the weighted mean rating given by those users
        mean_ratings = mean_ratings.groupby('tmdbId')[['title','rating','year','country']].agg(title=('title','first'), year=('year','first'), country=('country','first'), mean=('rating','mean'), count=('rating','count'))
        mean_ratings['adj_rating'] = mean_ratings.apply(lambda x: adjust_rating(x['mean'],x['count']), axis=1)
        mean_ratings = mean_ratings.sort_values(by='adj_rating', ascending=False).iloc[0:5]
        return mean_ratings.reset_index()[['title','year','country']]