In [1]:
# import dependencies

import pandas as pd
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.decomposition import TruncatedSVD
import regex
import joblib
import unidecode
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [2]:
# Load the movie dataset from CSV and preview its first rows.

df = pd.read_csv(filepath_or_buffer='2000-2021.csv')
df.head()

Unnamed: 0,id,title,genre,director,stars,overview,rating,votes
0,tt0468569,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart",When the menace known as the Joker wreaks havo...,9.0,2465320
1,tt1375666,Inception,"Action, Adventure, Sci-Fi",Christopher Nolan,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ellio...",A thief who steals corporate secrets through t...,8.8,2210913
2,tt0120737,The Lord of the Rings: The Fellowship of the Ring,"Action, Adventure, Drama",Peter Jackson,"Elijah Wood, Ian McKellen, Orlando Bloom",A meek Hobbit from the Shire and eight compani...,8.8,1757730
3,tt0167260,The Lord of the Rings: The Return of the King,"Action, Adventure, Drama",Peter Jackson,"Elijah Wood, Viggo Mortensen, Ian McKellen",Gandalf and Aragorn lead the World of Men agai...,8.9,1736701
4,tt0816692,Interstellar,"Adventure, Drama, Sci-Fi",Christopher Nolan,"Matthew McConaughey, Anne Hathaway, Jessica Ch...",A team of explorers travel through a wormhole ...,8.6,1661792


In [3]:
# Replace accented characters with their closest ASCII equivalents.
# Each text column is transliterated exactly once (the original processed
# 'director' twice, which only repeated work on already-ASCII text).

for column in ('title', 'genre', 'director', 'stars', 'overview'):
    df[column] = df[column].apply(unidecode.unidecode)

In [4]:
# removing unwanted characters

def clean(x):
    """Normalise one text value for vectorisation.

    Steps, in order: lower-case the text, strip scraped anchor-tag
    fragments, drop the pagination label, remove double quotes, closing
    anchor tags, '>' characters, full stops and commas, and finally
    collapse runs of whitespace into a single space.

    Hyphens, colons and digits are kept as they are (e.g. 'sci-fi').

    Parameters
    ----------
    x : str
        Raw text value.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    x = x.lower()

    # The specific '<a ...' patterns must run before the generic 'href='
    # pattern; otherwise the generic one consumes the attribute first and
    # leaves a stray '<a ' behind.
    x = regex.sub(r'<a class=[\'"]?([^\'" >]+)', '', x)
    x = regex.sub(r'<a href=[\'"]?([^\'" >]+)', '', x)
    x = regex.sub(r'href=[\'"]?([^\'" >]+)', '', x)

    # The text is already lower-cased, so the label has to be matched in
    # lower case as well.
    x = x.replace('prev-page  previous', '')

    for token in ('"', '</a>', '>', '.', ','):
        x = x.replace(token, '')

    # Whitespace collapsing needs a real regex substitution (str.replace
    # treats its argument literally); it runs last so that gaps left by the
    # removals above are collapsed too.
    x = regex.sub(r'\s\s+', ' ', x)

    return x

# Run clean() over each descriptive text column.
for column in ('genre', 'director', 'overview', 'stars'):
    df[column] = df[column].apply(clean)

In [5]:
# combining different columns

def join(data):
    """Concatenate the descriptive fields of ``data`` into one text value.

    The fields are joined with single spaces in the order title, director,
    genre, stars, overview. ``data`` only needs to support ``data[key]``
    lookups whose results can be combined with ``+`` and a string.
    """
    fields = ('title', 'director', 'genre', 'stars', 'overview')

    combine = data[fields[0]]
    for field in fields[1:]:
        combine = combine + " " + data[field]

    return combine


# Build the combined text column and normalise it in a single step.
df['combined'] = join(df).apply(clean)

In [6]:
# Lemmatize the combined column and drop English stop words.

lem = WordNetLemmatizer()

# Build the stop-word set once; the original rebuilt it for every single
# token, which repeats the same lookup work on each iteration.
stop_words = set(stopwords.words('english'))

corpus = [
    ' '.join(
        lem.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )
    for text in df['combined']
]

df['combined'] = corpus

In [7]:
# Preview the first rows, which now include the 'combined' column.
df.head()

Unnamed: 0,id,title,genre,director,stars,overview,rating,votes,combined
0,tt0468569,The Dark Knight,action crime drama,christopher nolan,christian bale heath ledger aaron eckhart,when the menace known as the joker wreaks havo...,9.0,2465320,dark knight christopher nolan action crime dra...
1,tt1375666,Inception,action adventure sci-fi,christopher nolan,leonardo dicaprio joseph gordon-levitt elliot ...,a thief who steals corporate secrets through t...,8.8,2210913,inception christopher nolan action adventure s...
2,tt0120737,The Lord of the Rings: The Fellowship of the Ring,action adventure drama,peter jackson,elijah wood ian mckellen orlando bloom,a meek hobbit from the shire and eight compani...,8.8,1757730,lord rings: fellowship ring peter jackson acti...
3,tt0167260,The Lord of the Rings: The Return of the King,action adventure drama,peter jackson,elijah wood viggo mortensen ian mckellen,gandalf and aragorn lead the world of men agai...,8.9,1736701,lord rings: return king peter jackson action a...
4,tt0816692,Interstellar,adventure drama sci-fi,christopher nolan,matthew mcconaughey anne hathaway jessica chas...,a team of explorers travel through a wormhole ...,8.6,1661792,interstellar christopher nolan adventure drama...


In [9]:
# TF-IDF vectorizer to convert the combined text into numeric vectors.

tf = TfidfVectorizer(stop_words='english',
                     tokenizer=word_tokenize,
                     max_features=6000)

# Keep the sparse matrix returned by fit_transform instead of densifying it
# with .toarray(): TruncatedSVD accepts sparse input directly, so the dense
# copy only costs memory.
vectors = tf.fit_transform(df['combined'])

In [10]:
# (number of rows in df['combined'], number of TF-IDF features kept)
vectors.shape

(5998, 6000)

In [11]:
# Reduce the TF-IDF vectors to 2500 components with truncated SVD
# (fixed random_state so repeated runs give the same projection).

svd = TruncatedSVD(
    n_components=2500,
    random_state=2,
)
vector_reduced = svd.fit_transform(vectors)

In [12]:
# Pairwise cosine similarity between all reduced movie vectors.
cos = cosine_similarity(vector_reduced)

# Lookup from title to row label. Series.drop_duplicates() compares the
# *values* (the row labels), so it never removed repeated titles; a repeated
# title would then make indices[title] return a Series instead of a single
# label. De-duplicate on the index instead, keeping the first occurrence.
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [13]:
def get_recommendations(title, cos=cos, top_n=5):
    """Return the titles most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in ``indices``.
    cos : array-like, optional
        Square similarity matrix; row ``i`` holds the similarity of movie
        ``i`` to every movie. Defaults to the matrix that exists when this
        function is defined, so re-run this cell after recomputing ``cos``.
    top_n : int, optional
        Number of recommendations to return (default 5).

    Returns
    -------
    pandas.Series
        Titles of the ``top_n`` most similar movies, best match first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once yields several rows; use the first.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Exclude the query movie explicitly rather than assuming it sorts first:
    # another movie with an identical score and a lower position would
    # otherwise be dropped in its place.
    candidates = [
        (position, score)
        for position, score in enumerate(cos[idx])
        if position != idx
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [position for position, _ in candidates[:top_n]]
    return df['title'].iloc[movie_indices]

In [14]:
# Example query: titles ranked most similar to 'Spider-Man'.
get_recommendations('Spider-Man')

123               Spider-Man 2
152               Spider-Man 3
119     The Amazing Spider-Man
2095           Superhero Movie
137     Spider-Man: Homecoming
Name: title, dtype: object

In [15]:
# Persist the movie table and the similarity matrix (the latter with
# compression level 3).
joblib.dump(value=df, filename='movies_list.sav')
joblib.dump(value=cos, filename='similarity.sav', compress=3)

['similarity.sav']