# Content

- The purpose of the above code is to build a simple movie recommendation system based on content similarity. 
- The code aims to build a content-based movie recommendation system that leverages textual information (such as plots, genres, actors, directors) to suggest similar movies to users. The recommendation system allows users to input a movie title and receive personalized recommendations based on the content of the movies.

In [1]:
# Ignore warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

In [2]:
#install rake function
!pip install rake_nltk



In [3]:
# Imports
import pandas as pd
from rake_nltk import Rake
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('punkt')  # Download necessary nltk resources

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mohdz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Read the dataset
df = pd.read_csv('IMDB_Top250Engmovies2_OMDB_Detailed.csv')

In [5]:
print(df)

     Unnamed: 0                     Title  Year      Rated     Released  \
0             1  The Shawshank Redemption  1994          R  14 Oct 1994   
1             2             The Godfather  1972          R  24 Mar 1972   
2             3    The Godfather: Part II  1974          R  20 Dec 1974   
3             4           The Dark Knight  2008      PG-13  18 Jul 2008   
4             5              12 Angry Men  1957   APPROVED  01 Apr 1957   
..          ...                       ...   ...        ...          ...   
245         246          The Lost Weekend  1945  NOT RATED  01 Jan 1946   
246         247             Short Term 12  2013          R  23 Aug 2013   
247         248           His Girl Friday  1940   APPROVED  18 Jan 1940   
248         249        The Straight Story  1999          G  03 Nov 1999   
249         250       Slumdog Millionaire  2008          R  25 Dec 2008   

     Runtime                   Genre                      Director  \
0    142 min            Crime

In [6]:
# Remove punctuations from Plot
df['Plot'] = df['Plot'].str.replace('[^ws]','')

In [7]:
print(df)

     Unnamed: 0                     Title  Year      Rated     Released  \
0             1  The Shawshank Redemption  1994          R  14 Oct 1994   
1             2             The Godfather  1972          R  24 Mar 1972   
2             3    The Godfather: Part II  1974          R  20 Dec 1974   
3             4           The Dark Knight  2008      PG-13  18 Jul 2008   
4             5              12 Angry Men  1957   APPROVED  01 Apr 1957   
..          ...                       ...   ...        ...          ...   
245         246          The Lost Weekend  1945  NOT RATED  01 Jan 1946   
246         247             Short Term 12  2013          R  23 Aug 2013   
247         248           His Girl Friday  1940   APPROVED  18 Jan 1940   
248         249        The Straight Story  1999          G  03 Nov 1999   
249         250       Slumdog Millionaire  2008          R  25 Dec 2008   

     Runtime                   Genre                      Director  \
0    142 min            Crime

In [8]:
# Extract key words from Plot to a list using Rake
df['Key_words'] = ''   # Initializing a new column
r = Rake()   # Using Rake to remove stop words

for index, row in df.iterrows():
    r.extract_keywords_from_text(row['Plot'])   # Extract key words 
    key_words_dict_scores = r.get_word_degrees()    # Get dictionary with key words and their similarity scores
    row['Key_words'] = list(key_words_dict_scores.keys())   # Assign it to new column

In [9]:
df['Key_words']

0       
1       
2       
3       
4       
      ..
245     
246     
247     
248     
249     
Name: Key_words, Length: 250, dtype: object

In [10]:
# Extract all genre into a list, only the first three actors into a list, and all directors into a list
df['Genre'] = df['Genre'].map(lambda x: x.split(','))
df['Actors'] = df['Actors'].map(lambda x: x.split(',')[:3])
df['Director'] = df['Director'].map(lambda x: x.split(','))

# Create unique names by merging firstname & surname into one word, & convert to lowercase 
for index, row in df.iterrows():
    row['Genre'] = [x.lower().replace(' ','') for x in row['Genre']]
    row['Actors'] = [x.lower().replace(' ','') for x in row['Actors']]
    row['Director'] = [x.lower().replace(' ','') for x in row['Director']]


In [11]:
df['Director']

0                     [Frank Darabont]
1               [Francis Ford Coppola]
2               [Francis Ford Coppola]
3                  [Christopher Nolan]
4                       [Sidney Lumet]
                    ...               
245                     [Billy Wilder]
246            [Destin Daniel Cretton]
247                     [Howard Hawks]
248                      [David Lynch]
249    [Danny Boyle,  Loveleen Tandan]
Name: Director, Length: 250, dtype: object

In [12]:
# Combine 4 lists (4 columns) of key words into 1 sentence under Bag_of_words column
df['Bag_of_words'] = ''
columns = ['Genre', 'Director', 'Actors', 'Key_words']

for index, row in df.iterrows():
    words = ''
    for col in columns:
        words += ' '.join(row[col]) + ' '
    df.at[index, 'Bag_of_words'] = words
    

In [13]:
df['Bag_of_words']

0      Crime  Drama Frank Darabont Tim Robbins  Morga...
1      Crime  Drama Francis Ford Coppola Marlon Brand...
2      Crime  Drama Francis Ford Coppola Al Pacino  R...
3      Action  Crime  Drama Christopher Nolan Christi...
4      Crime  Drama Sidney Lumet Martin Balsam  John ...
                             ...                        
245    Drama  Film-Noir Billy Wilder Ray Milland  Jan...
246    Drama Destin Daniel Cretton Brie Larson  John ...
247    Comedy  Drama  Romance Howard Hawks Cary Grant...
248    Biography  Drama David Lynch Sissy Spacek  Jan...
249    Drama Danny Boyle  Loveleen Tandan Dev Patel  ...
Name: Bag_of_words, Length: 250, dtype: object

In [14]:
# Strip white spaces in front and behind, replace multiple whitespaces (if any)
df['Bag_of_words'] = df['Bag_of_words'].str.strip().str.replace('   ', ' ').str.replace('  ', ' ')

df = df[['Title','Bag_of_words']]


In [15]:
df

Unnamed: 0,Title,Bag_of_words
0,The Shawshank Redemption,Crime Drama Frank Darabont Tim Robbins Morgan ...
1,The Godfather,Crime Drama Francis Ford Coppola Marlon Brando...
2,The Godfather: Part II,Crime Drama Francis Ford Coppola Al Pacino Rob...
3,The Dark Knight,Action Crime Drama Christopher Nolan Christian...
4,12 Angry Men,Crime Drama Sidney Lumet Martin Balsam John Fi...
...,...,...
245,The Lost Weekend,Drama Film-Noir Billy Wilder Ray Milland Jane ...
246,Short Term 12,Drama Destin Daniel Cretton Brie Larson John G...
247,His Girl Friday,Comedy Drama Romance Howard Hawks Cary Grant R...
248,The Straight Story,Biography Drama David Lynch Sissy Spacek Jane ...


In [16]:
# Generate the count matrix
count = CountVectorizer()
count_matrix = count.fit_transform(df['Bag_of_words'])

# Compute cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)


In [17]:
cosine_sim

array([[1.        , 0.19069252, 0.19069252, ..., 0.09534626, 0.09534626,
        0.09534626],
       [0.19069252, 1.        , 0.63636364, ..., 0.09090909, 0.09090909,
        0.09090909],
       [0.19069252, 0.63636364, 1.        , ..., 0.09090909, 0.09090909,
        0.09090909],
       ...,
       [0.09534626, 0.09090909, 0.09090909, ..., 1.        , 0.09090909,
        0.09090909],
       [0.09534626, 0.09090909, 0.09090909, ..., 0.09090909, 1.        ,
        0.09090909],
       [0.09534626, 0.09090909, 0.09090909, ..., 0.09090909, 0.09090909,
        1.        ]])

In [18]:
indices = pd.Series(df['Title'])

In [19]:
# Function to recommend movies based on cosine similarity
def recommend(title, cosine_sim = cosine_sim):
    recommended_movies = []
    idx = indices[indices == title].index[0]   # Get the index of the movie title matching the input movie
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)   # Similarity scores in descending order
    top_5_indices = list(score_series.iloc[1:6].index)   # Get the indices of top 5 most similar movies
    # [1:6] to exclude 0 (index 0 is the input movie itself)
    
    for i in top_5_indices:   # Append the titles of top 5 similar movies to the recommended_movies list
        recommended_movies.append(list(df['Title'])[i])
        
    return recommended_movies


In [20]:
# Example usage
recommendations = recommend('The Avengers')
print("Recommended movies:")
print(recommendations)


Recommended movies:
['Spider-Man: Homecoming', 'Guardians of the Galaxy Vol. 2', 'Guardians of the Galaxy', 'The Terminator', 'Aliens']


In [21]:
# User input for movie title
user_input = input("Enter the movie title you want recommendations for: ")

# Check if the movie title exists in the dataset
if user_input in df['Title'].values:
    recommendations = recommend(user_input)
    print("Recommended movies:")
    print(recommendations)
else:
    print("Movie not found in dataset.")

Enter the movie title you want recommendations for: Forrest Gump
Recommended movies:
['Dead Poets Society', 'Saving Private Ryan', 'The Sting', 'Catch Me If You Can', 'Toy Story 3']


In [22]:
# User input for movie title
user_input = input("Enter the movie title you want recommendations for: ")

# Check if the movie title exists in the dataset
if user_input in df['Title'].values:
    recommendations = recommend(user_input)
    print("Recommended movies:")
    print(recommendations)
else:
    print("Movie not found in dataset.")

Enter the movie title you want recommendations for: 12 Angry Men
Recommended movies:
['Dog Day Afternoon', 'On the Waterfront', 'Network', 'To Kill a Mockingbird', 'Taxi Driver']
