# IMDB Movie Recommender (Content Based)

#### Import Library

In [205]:
# Library
import pandas as pd
import re
import nltk

from requests import get
from bs4 import BeautifulSoup
from time import time, sleep
from random import randint
from IPython.core.display import clear_output
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

#### Define Additional Function

In [2]:
def short(x):
    lst = []
    for i in x:
        y = i.replace(' ','')
        z = y.lower()
        lst.append(z)
    return lst

In [3]:
class Preprocessor:
    def __init__(self):
        self.stop_words = set(nltk.corpus.stopwords.words('english'))
        self.ps = nltk.stem.PorterStemmer()

    # word tokenize text using nltk lib
    def tokenize(self, text):
        return nltk.word_tokenize(text)

    # stem word using provided stemmer
    def stem(self, word, stemmer):
        return stemmer.stem(word)

    # check if word is appropriate - not a stop word and isalpha, 
    # i.e consists of letters, not punctuation, numbers, dates
    def is_apt_word(self, word):
        return word not in self.stop_words and word.isalpha()

    # combines all previous methods together
    # tokenizes lowercased text and stems it, ignoring not appropriate words
    def preprocess(self, text):
        tokenized = self.tokenize(text.lower())
        return [self.stem(w, self.ps) for w in tokenized if self.is_apt_word(w)]

In [4]:
def build_inverted_index_orig_forms(documents):
    inverted_index = {}
    for no, strings in enumerate(documents):
        s = re.sub(r'([^\w\s])','',strings)
        tokens = nltk.word_tokenize(s.lower())
        file_index = Counter(tokens)
        # update global index
        for term in file_index.keys():
            file_freq = file_index[term]
            if term not in inverted_index:                
                inverted_index[term] = [file_freq, (no, file_freq)]
            else:
                inverted_index[term][0] += file_freq
                inverted_index[term].append((no, file_freq))
    return inverted_index

def generate_wildcard_options(wildcard, k, inverted_index): 
    list_word = []
    for term in inverted_index.keys():
        string = wildcard.replace('*','$')
        pad = '$' + string + '$'
        result = nltk.ngrams(pad, k)
        for i in list(result):
            tri = "".join(i)
            if re.search(tri, term):
                list_word.append(term)
    
    s_wildcard = wildcard.replace('*', '.+')
    return [string for string in list_word if re.match(s_wildcard, string)]

def search_wildcard(wildcard, k, index, docs):
    wildcard_options = generate_wildcard_options(wildcard, k, index)
    list_fact = []
    for term in wildcard_options:
        for line in docs:
            if re.search(term, line, flags=re.I):
                list_fact.append(line)      
    return list_fact

#### Read CSV

In [322]:
movie_rating = pd.read_csv('movie_data_5k.csv')

In [324]:
movie_rating.head()

Unnamed: 0.1,Unnamed: 0,movie_img,movie,year,certificate,duration,genre,description,director,stars,imdb,votes
0,0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,R,142 min,Drama,Two imprisoned men bond over a number of years...,['Frank Darabont'],"['Tim Robbins, Morgan Freeman, Bob Gunton, Wil...",9.3,2235501
1,1,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,PG-13,152 min,"Action, Crime, Drama",When the menace known as the Joker wreaks havo...,['Christopher Nolan'],"['Christian Bale, Heath Ledger, Aaron Eckhart,...",9.0,2206475
2,2,https://m.media-amazon.com/images/M/MV5BMjAxMz...,Inception,2010,PG-13,148 min,"Action, Adventure, Sci-Fi",A thief who steals corporate secrets through t...,['Christopher Nolan'],"['Leonardo DiCaprio, Joseph Gordon-Levitt, Ell...",8.8,1959988
3,3,https://m.media-amazon.com/images/M/MV5BMmEzNT...,Fight Club,1999,R,139 min,Drama,An insomniac office worker and a devil-may-car...,['David Fincher'],"['Brad Pitt, Edward Norton, Meat Loaf, Zach Gr...",8.8,1779638
4,4,https://m.media-amazon.com/images/M/MV5BNGNhMD...,Pulp Fiction,1994,R,154 min,"Crime, Drama","The lives of two mob hitmen, a boxer, a gangst...",['Quentin Tarantino'],"['John Travolta, Uma Thurman, Samuel L. Jackso...",8.9,1752001


In [325]:
movies = movie_rating
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   5000 non-null   int64  
 1   movie_img    5000 non-null   object 
 2   movie        5000 non-null   object 
 3   year         5000 non-null   object 
 4   certificate  5000 non-null   object 
 5   duration     5000 non-null   object 
 6   genre        5000 non-null   object 
 7   description  5000 non-null   object 
 8   director     4445 non-null   object 
 9   stars        4998 non-null   object 
 10  imdb         5000 non-null   float64
 11  votes        5000 non-null   int64  
dtypes: float64(1), int64(2), object(9)
memory usage: 468.9+ KB


#### Data Handling

In [327]:
movies['director'] = movies['director'].fillna('')
movies['director'].isnull().any()

False

In [328]:
movies['stars'] = movies['stars'].fillna('')
movies['stars'].isnull().any()

False

In [329]:
movies['stars'] = movies['stars'].str.split(',')
movies['n_stars'] = movies['stars'].apply(len)
movies['stars_3'] = movies['stars'].apply(lambda x: x[:3] if len(x) >=3 else x)
movies['stars_short'] = movies['stars_3'].apply(short)

movies['director_2'] = movies['director'].apply(lambda x: [x, x])
movies['director_short']=movies['director_2'].apply(short)

movies['genre'] = movies['genre'].str.split(',')
movies['genre_short'] = movies['genre'].apply(short)

movies['keywords'] = movies['description'].apply(Preprocessor().preprocess)

In [330]:
movies['bag'] = (movies['stars_short'] + movies['director_short'] + movies['genre_short'] + movies['keywords']).apply(lambda x: [s for s in x if s])
#movies['bag'] = (movies['stars_short'] + movies['director_short'] + movies['genre_short']).apply(lambda x: [s for s in x if s])
movies['bag_len'] = movies['bag'].apply(len)
movies['list_bag'] = movies['bag'].apply(lambda x: ' '.join(x))

#### Input Wildcard

In [331]:
movie_name = []
movie_dict = {}
for name, vote in zip(movies['movie'], movies['votes']):
    movie_name.append(name)
    movie_dict[name] = vote

index_orig_forms = build_inverted_index_orig_forms(movie_name)

In [332]:
wildcard = "dark"
top_k = 10

wildcard_results = search_wildcard(wildcard, 3, index_orig_forms, movie_name)
print(wildcard_results)

['The Dark Knight', 'The Dark Knight Rises', 'Donnie Darko', 'Thor: The Dark World', 'Star Trek Into Darkness', 'Transformers: Dark of the Moon', 'Zero Dark Thirty', 'Dark Shadows', 'Dark City', 'Dark', 'Darkest Hour', 'Army of Darkness', 'X-Men: Dark Phoenix', 'The Dark Tower', 'Terminator: Dark Fate', 'A Scanner Darkly', 'Dancer in the Dark', 'Edge of Darkness', 'Fifty Shades Darker', 'Dark Skies', 'Dark Water', 'The Darkest Hour', 'Darkman', 'The Ghost and the Darkness', 'Scary Stories to Tell in the Dark', "Don't Be Afraid of the Dark", 'You Will Meet a Tall Dark Stranger', 'Under Siege 2: Dark Territory', 'Alone in the Dark', 'Dark Places', 'Dark Matter', 'Dark Waters', 'His Dark Materials', 'Hold the Dark', 'Dark Angel', 'Darkness Falls', 'Dark Water', 'The Darkest Minds', 'Poldark', 'The Dark Knight', 'The Dark Knight Rises', 'Donnie Darko', 'Thor: The Dark World', 'Star Trek Into Darkness', 'Transformers: Dark of the Moon', 'Zero Dark Thirty', 'Dark Shadows', 'Dark City', 'Dark

In [333]:
if len(wildcard_results) != 0:
    movie_result = []
    for r in wildcard_results:
        if r not in movie_result: 
            movie_result.append(r)

    movie_res = {}
    for name in movie_result:
        movie_res[name] = movie_dict[name]

    for i in sorted(movie_res.items(), key = lambda kv:(kv[1], kv[0]), reverse=True)[:]:
        print(i[0])
else:
    print("Sorry, nothing to suggest! \nType the movie title again!")

The Dark Knight
The Dark Knight Rises
Donnie Darko
Thor: The Dark World
Star Trek Into Darkness
Transformers: Dark of the Moon
Zero Dark Thirty
Dark Shadows
Dark City
Dark
Darkest Hour
Army of Darkness
X-Men: Dark Phoenix
The Dark Tower
Terminator: Dark Fate
A Scanner Darkly
Dancer in the Dark
Edge of Darkness
Fifty Shades Darker
Dark Skies
The Darkest Hour
Darkman
The Ghost and the Darkness
Scary Stories to Tell in the Dark
Don't Be Afraid of the Dark
You Will Meet a Tall Dark Stranger
Under Siege 2: Dark Territory
Alone in the Dark
Dark Places
Dark Matter
Dark Waters
His Dark Materials
Hold the Dark
Dark Angel
Darkness Falls
Dark Water
The Darkest Minds
Poldark


#### Calculate Cosine Similarity

In [334]:
count = CountVectorizer(analyzer='word', stop_words='english')
#count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
movies_matrix = count.fit_transform(movies['list_bag'])

In [345]:
cosine_sim = cosine_similarity(movies_matrix)
#cosine_sim = linear_kernel(movies_matrix, movies_matrix)

In [336]:
def get_title_from_index(index):
    return movies[movies['Unnamed: 0'] == index]["movie"].values[0]
def get_index_from_title(title):
    return movies[movies["movie"] == title]["Unnamed: 0"].values[0]

In [346]:
movie_user_likes = "The Dark Knight"
movie_index = get_index_from_title(movie_user_likes)
similar_movies = list(enumerate(cosine_sim[movie_index]))

In [347]:
sorted_similar_movies = sorted(similar_movies,key=lambda x:x[1],reverse=True)[0:11]

In [339]:
print(sorted_similar_movies)

[(1, 1.0000000000000007), (17, 0.4082482904638631), (10, 0.3396831102433788), (24, 0.24077170617153845), (558, 0.24077170617153845), (813, 0.2333800140046683), (179, 0.23094010767585035), (2083, 0.22680460581325726), (4495, 0.22075539284417395), (12, 0.21516574145596762), (2634, 0.2151657414559676)]


In [348]:
print(sorted_similar_movies)

[(1, 27.0), (10, 9.0), (17, 9.0), (24, 6.0), (179, 6.0), (558, 6.0), (2, 5.0), (12, 5.0), (64, 5.0), (366, 5.0), (562, 5.0)]


In [341]:
i=0
movie_recommended = []
print("Top k similar movies to "+movie_user_likes+" are:\n")
for element in sorted_similar_movies:
    movie_recommended.append(get_title_from_index(element[0]))
    print(element[0], get_title_from_index(element[0]))
    i=i+1
    if i>10:
        break

Top 5 similar movies to The Dark Knight are:

1 The Dark Knight
17 Batman Begins
10 The Dark Knight Rises
24 The Prestige
558 Batman Returns
813 Gotham
179 Dunkirk
2083 Following
4495 Alphas
12 Interstellar
2634 Harsh Times


In [349]:
i=0
movie_recommended = []
print("Top k similar movies to "+movie_user_likes+" are:\n")
for element in sorted_similar_movies:
    movie_recommended.append(get_title_from_index(element[0]))
    print(element[0], get_title_from_index(element[0]))
    i=i+1
    if i>10:
        break

Top 5 similar movies to The Dark Knight are:

1 The Dark Knight
10 The Dark Knight Rises
17 Batman Begins
24 The Prestige
179 Dunkirk
558 Batman Returns
2 Inception
12 Interstellar
64 Joker
366 Daredevil
562 Insomnia


In [350]:
movies['description'].values[64]

'In Gotham City, mentally troubled comedian Arthur Fleck is disregarded and mistreated by society. He then embarks on a downward spiral of revolution and bloody crime. This path brings him face-to-face with his alter-ego: the Joker.'

In [343]:
# map that gives the movie index in dataframe
indices = pd.Series(movies.index, index = movies['movie'])

# function that provides a list of similar movies
def recommendations(title):
    
    # get index of the movie in the dataframe
    idx = indices[title]
    
    # get movie matrix for the selected movie
    chosen_movie_matrix = movies_matrix[idx]
    
    #get cosine similarity of all the movies with the selected movie
    cos_sim = cosine_similarity(chosen_movie_matrix, movies_matrix)
    
    # get a list of tuple where first is movie index and second is cos similarity score for all 
    #the movies with the provided movie. Similarity score is penalised based on the number of contents of the bag. 
    scores = [(i, sim - 1/(movies.iloc[i]['bag_len']+.0001)) for i, sim in enumerate(cos_sim[0])]

    # sorting all the movies based on the similarity scores in an ascending order
    scores.sort(key=lambda x: x[1], reverse=True)

    # get the indices of the top 10 similar movies
    #movie_indices = [i[0] for i in scores[1:11]]
    movie_indices = []
    similarity = []
    for i in scores[1:21]:
        similarity.append(i[1])
        movie_indices.append(i[0])
    
    # get the names of the most similar movies
    name = movies.iloc[movie_indices]['movie']
    year = movies.iloc[movie_indices]['year']
    genre = movies.iloc[movie_indices]['genre']
    description = movies.iloc[movie_indices]['description']
    director = movies.iloc[movie_indices]['director']
    stars = movies.iloc[movie_indices]['stars']
    imdb = movies.iloc[movie_indices]['imdb']
    result = pd.concat([name, year, description, genre, director, stars, imdb], axis=1)
    result['cos_sim'] = similarity
    return result

In [351]:
recommendations('Gotham')

Unnamed: 0,movie,year,description,genre,director,stars,imdb,cos_sim
3395,CSI: NY,2004-2013,CSI head Detective Mac Taylor and his team sol...,"[Action, Crime, Drama]",,"[['Gary Sinise, Carmine Giovinazzo, Hill Har...",6.9,0.275026
2571,A Most Violent Year,2014,"In New York City 1981, an ambitious immigrant ...","[Action, Crime, Drama]",['J.C. Chandor'],"[['Oscar Isaac, Jessica Chastain, David Oyel...",7.0,0.243424
2605,Dead Man Down,2013,"In New York City, a crime lord's right-hand ma...","[Action, Crime, Drama]",['Niels Arden Oplev'],"[['Colin Farrell, Noomi Rapace, Dominic Coop...",6.5,0.202989
3158,15 Minutes,2001,A homicide detective and a fire marshal must s...,"[Action, Crime, Drama]",['John Herzfeld'],"[['Robert De Niro, Edward Burns, Kelsey Gram...",6.1,0.202083
4246,New Jack City,1991,A crime lord ascends to power and becomes mega...,"[Action, Crime, Drama]",['Mario Van Peebles'],"[['Wesley Snipes, Ice-T, Allen Payne, Chris...",6.7,0.20023
10,The Dark Knight Rises,2012,Eight years after the Joker's reign of anarchy...,"[Action, Adventure]",['Christopher Nolan'],"[['Christian Bale, Tom Hardy, Anne Hathaway,...",8.4,0.197826
1,The Dark Knight,2008,When the menace known as the Joker wreaks havo...,"[Action, Crime, Drama]",['Christopher Nolan'],"[['Christian Bale, Heath Ledger, Aaron Eckha...",9.0,0.196343
667,The Other Guys,2010,Two mismatched New York City detectives seize ...,"[Action, Comedy, Crime]",['Adam McKay'],"[['Will Ferrell, Mark Wahlberg, Derek Jeter,...",6.6,0.194919
4658,Batwoman,2019,Kate Kane seeks justice for Gotham City as Bat...,"[Action, Adventure, Crime]",,"[['Ruby Rose, Camrus Johnson, Rachel Skarste...",3.5,0.192147
1768,Elementary,2012-2019,"A modern take on the cases of Sherlock Holmes,...","[Crime, Drama, Mystery]",,"[['Jonny Lee Miller, Lucy Liu, Aidan Quinn, ...",7.9,0.18799
