# recommender system
1. Load the imbd movie dataset
2. Data preprocessing
    1. Remove the missing values
    2. Remove the duplicates
    3. Remove the special characters
    4. Tokenization - split the sentence into words
    5. Remove the stopwords - words that do not add any meaning to the sentence
    6. Remove the short words - words that are less than 3 characters
    7. Remove the numbers - words that are numbers
    8. Lemmatization - words that are in different forms
    9. Stemming - words that are in different forms
    10. CountVectorization - convert the words into vectors
    11. TF-IDF - term frequency - inverse document frequency
    12. Cosine similarity - measure the similarity between two vectors
3. Build the recommender system

In [1]:
import pandas as pd
import numpy as np
from textblob import TextBlob
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv('imdb_top_1000.csv')
df

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,https://m.media-amazon.com/images/M/MV5BNGEwMT...,Breakfast at Tiffany's,1961,A,115 min,"Comedy, Drama, Romance",7.6,A young New York socialite becomes interested ...,76.0,Blake Edwards,Audrey Hepburn,George Peppard,Patricia Neal,Buddy Ebsen,166544,
996,https://m.media-amazon.com/images/M/MV5BODk3Yj...,Giant,1956,G,201 min,"Drama, Western",7.6,Sprawling epic covering the life of a Texas ca...,84.0,George Stevens,Elizabeth Taylor,Rock Hudson,James Dean,Carroll Baker,34075,
997,https://m.media-amazon.com/images/M/MV5BM2U3Yz...,From Here to Eternity,1953,Passed,118 min,"Drama, Romance, War",7.6,"In Hawaii in 1941, a private is cruelly punish...",85.0,Fred Zinnemann,Burt Lancaster,Montgomery Clift,Deborah Kerr,Donna Reed,43374,30500000
998,https://m.media-amazon.com/images/M/MV5BZTBmMj...,Lifeboat,1944,,97 min,"Drama, War",7.6,Several survivors of a torpedoed merchant ship...,78.0,Alfred Hitchcock,Tallulah Bankhead,John Hodiak,Walter Slezak,William Bendix,26471,


In [3]:
df.isnull().sum()

Poster_Link        0
Series_Title       0
Released_Year      0
Certificate      101
Runtime            0
Genre              0
IMDB_Rating        0
Overview           0
Meta_score       157
Director           0
Star1              0
Star2              0
Star3              0
Star4              0
No_of_Votes        0
Gross            169
dtype: int64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   object 
 1   Series_Title   1000 non-null   object 
 2   Released_Year  1000 non-null   object 
 3   Certificate    899 non-null    object 
 4   Runtime        1000 non-null   object 
 5   Genre          1000 non-null   object 
 6   IMDB_Rating    1000 non-null   float64
 7   Overview       1000 non-null   object 
 8   Meta_score     843 non-null    float64
 9   Director       1000 non-null   object 
 10  Star1          1000 non-null   object 
 11  Star2          1000 non-null   object 
 12  Star3          1000 non-null   object 
 13  Star4          1000 non-null   object 
 14  No_of_Votes    1000 non-null   int64  
 15  Gross          831 non-null    object 
dtypes: float64(2), int64(1), object(13)
memory usage: 125.1+ KB


Series_Title, Released_Year, Genre, Director, Star1, Star2, Star3, Star4, Overview

In [5]:
df['data'] = df['Series_Title'] + ' ' + df['Released_Year'] + ' ' + df['Genre'] + ' ' + df['Director'] + ' ' + df['Star1'] + ' ' + df['Star2'] + ' ' + df['Star3'] + ' ' + df['Star4']+ ' ' + df['Overview']

In [6]:
df

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross,data
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469,The Shawshank Redemption 1994 Drama Frank Dara...
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411,"The Godfather 1972 Crime, Drama Francis Ford C..."
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444,"The Dark Knight 2008 Action, Crime, Drama Chri..."
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000,"The Godfather: Part II 1974 Crime, Drama Franc..."
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000,"12 Angry Men 1957 Crime, Drama Sidney Lumet He..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,https://m.media-amazon.com/images/M/MV5BNGEwMT...,Breakfast at Tiffany's,1961,A,115 min,"Comedy, Drama, Romance",7.6,A young New York socialite becomes interested ...,76.0,Blake Edwards,Audrey Hepburn,George Peppard,Patricia Neal,Buddy Ebsen,166544,,"Breakfast at Tiffany's 1961 Comedy, Drama, Rom..."
996,https://m.media-amazon.com/images/M/MV5BODk3Yj...,Giant,1956,G,201 min,"Drama, Western",7.6,Sprawling epic covering the life of a Texas ca...,84.0,George Stevens,Elizabeth Taylor,Rock Hudson,James Dean,Carroll Baker,34075,,"Giant 1956 Drama, Western George Stevens Eliza..."
997,https://m.media-amazon.com/images/M/MV5BM2U3Yz...,From Here to Eternity,1953,Passed,118 min,"Drama, Romance, War",7.6,"In Hawaii in 1941, a private is cruelly punish...",85.0,Fred Zinnemann,Burt Lancaster,Montgomery Clift,Deborah Kerr,Donna Reed,43374,30500000,"From Here to Eternity 1953 Drama, Romance, War..."
998,https://m.media-amazon.com/images/M/MV5BZTBmMj...,Lifeboat,1944,,97 min,"Drama, War",7.6,Several survivors of a torpedoed merchant ship...,78.0,Alfred Hitchcock,Tallulah Bankhead,John Hodiak,Walter Slezak,William Bendix,26471,,"Lifeboat 1944 Drama, War Alfred Hitchcock Tall..."


In [7]:
# Data Cleaning
df['data'] = df['data'].str.replace('[^\w\s]','') # remove punctuation
df['data'] = df['data'].str.lower() # convert to lowercase
df['data']

  df['data'] = df['data'].str.replace('[^\w\s]','') # remove punctuation


0      the shawshank redemption 1994 drama frank dara...
1      the godfather 1972 crime drama francis ford co...
2      the dark knight 2008 action crime drama christ...
3      the godfather part ii 1974 crime drama francis...
4      12 angry men 1957 crime drama sidney lumet hen...
                             ...                        
995    breakfast at tiffanys 1961 comedy drama romanc...
996    giant 1956 drama western george stevens elizab...
997    from here to eternity 1953 drama romance war f...
998    lifeboat 1944 drama war alfred hitchcock tallu...
999    the 39 steps 1935 crime mystery thriller alfre...
Name: data, Length: 1000, dtype: object

In [8]:
from nltk.corpus import stopwords
def remove_stopwords(text):
    blob = TextBlob(text)
    words = blob.words
    filtered_words = [word for word in words if word not in stopwords.words('english')]
    filtered_text = ' '.join(filtered_words)
    return filtered_text

In [9]:
df['clean_data'] = df['data'].apply(remove_stopwords)

In [10]:
df['clean_data']

0      shawshank redemption 1994 drama frank darabont...
1      godfather 1972 crime drama francis ford coppol...
2      dark knight 2008 action crime drama christophe...
3      godfather part ii 1974 crime drama francis for...
4      12 angry men 1957 crime drama sidney lumet hen...
                             ...                        
995    breakfast tiffanys 1961 comedy drama romance b...
996    giant 1956 drama western george stevens elizab...
997    eternity 1953 drama romance war fred zinnemann...
998    lifeboat 1944 drama war alfred hitchcock tallu...
999    39 steps 1935 crime mystery thriller alfred hi...
Name: clean_data, Length: 1000, dtype: object

In [11]:
vec = TfidfVectorizer()
vec_matrix = vec.fit_transform(df['clean_data'])
vec_matrix

<1000x10670 sparse matrix of type '<class 'numpy.float64'>'
	with 29389 stored elements in Compressed Sparse Row format>

In [12]:
sim = cosine_similarity(vec_matrix, vec_matrix)
print(f"shape of similarity matrix:{sim.shape}")

shape of similarity matrix:(1000, 1000)


In [13]:
def get_index_by_movie(title):
    temp = pd.read_csv('imdb_top_1000.csv', index_col='Series_Title')
    # make index to lower case
    temp.index = temp.index.str.lower()
    title = title.lower()
    if title in temp.index :
        return temp.index.get_loc(title)
    else:
        return -1

In [14]:
def recommend_movie(title, size=10):
    idx = get_index_by_movie(title)
    if idx == -1:
        return "No recommendation for this movie"
    else:
        sim_scores = list(enumerate(sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[-1], reverse=True)
        sim_scores = sim_scores[1:size+1]
        movie_indices = [i[0] for i in sim_scores]
        return df['Series_Title'].iloc[movie_indices].to_list()

In [15]:
recommend_movie('Top Gun')

'No recommendation for this movie'

In [16]:
df['Series_Title'].tolist()

['The Shawshank Redemption',
 'The Godfather',
 'The Dark Knight',
 'The Godfather: Part II',
 '12 Angry Men',
 'The Lord of the Rings: The Return of the King',
 'Pulp Fiction',
 "Schindler's List",
 'Inception',
 'Fight Club',
 'The Lord of the Rings: The Fellowship of the Ring',
 'Forrest Gump',
 'Il buono, il brutto, il cattivo',
 'The Lord of the Rings: The Two Towers',
 'The Matrix',
 'Goodfellas',
 'Star Wars: Episode V - The Empire Strikes Back',
 "One Flew Over the Cuckoo's Nest",
 'Hamilton',
 'Gisaengchung',
 'Soorarai Pottru',
 'Interstellar',
 'Cidade de Deus',
 'Sen to Chihiro no kamikakushi',
 'Saving Private Ryan',
 'The Green Mile',
 'La vita è bella',
 'Se7en',
 'The Silence of the Lambs',
 'Star Wars',
 'Seppuku',
 'Shichinin no samurai',
 "It's a Wonderful Life",
 'Joker',
 'Whiplash',
 'The Intouchables',
 'The Prestige',
 'The Departed',
 'The Pianist',
 'Gladiator',
 'American History X',
 'The Usual Suspects',
 'Léon',
 'The Lion King',
 'Terminator 2: Judgment D

final version


In [32]:
import pandas as pd
import numpy as np
from textblob import TextBlob
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from joblib import dump, load



def remove_stopwords(text):
    blob = TextBlob(text)
    words = blob.words
    filtered_words = [word for word in words if word not in stopwords.words('english')]
    filtered_text = ' '.join(filtered_words)
    return filtered_text

def recommend_movie(title, size=10):
    df = pd.read_csv('datasets/imdb_top_1000.csv', index_col='Series_Title')
    vec = load('models/tfidf.joblib')
    sim = load('models/similarity.joblib')
    idx = get_index_by_movie(df, title)
    if idx == -1:
        return "No recommendation for this movie"
    else:
        sim_scores = list(enumerate(sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[-1], reverse=True)
        sim_scores = sim_scores[1:size+1]
        movie_indices = [i[0] for i in sim_scores]
        # df.info()
        return df.iloc[movie_indices].index.to_list()

def get_index_by_movie(temp, title):
    # make index to lower case
    temp.index = temp.index.str.lower()
    title = title.lower()
    if title in temp.index :
        return temp.index.get_loc(title)
    else:
        return -1

def main():
    df = pd.read_csv('datasets/imdb_top_1000.csv')
    # Data Cleaning
    df['data'] = df['Series_Title'] + ' ' + df['Released_Year'] + ' ' + df['Genre'] + ' ' + df['Director'] + ' ' + df['Star1'] + ' ' + df['Star2'] + ' ' + df['Star3'] + ' ' + df['Star4']+ ' ' + df['Overview']
    df['data'] = df['data'].str.replace('[^\w\s]','', regex=True) # remove punctuation
    df['data'] = df['data'].str.lower() # convert to lowercase
    df['clean_data'] = df['data'].apply(remove_stopwords)
    vec = TfidfVectorizer()
    vec_matrix = vec.fit_transform(df['clean_data'])
    sim = cosine_similarity(vec_matrix, vec_matrix)
    dump(vec, 'models/tfidf.joblib')
    dump(sim, 'models/similarity.joblib')

In [33]:
recommend_movie('The Dark Knight')

['batman begins',
 'the dark knight rises',
 'the prestige',
 'joker',
 'brokeback mountain',
 'batman: mask of the phantasm',
 'kill bill: vol. 1',
 'empire of the sun',
 'the machinist',
 'the man who would be king']