In [141]:
# Import python libraries
import pandas as pd
import pickle
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [142]:
# Read the raw movie metadata CSV into a pandas DataFrame
movies = pd.read_csv(filepath_or_buffer='../datasets/movies_meta.csv')

DATA EXPLORATION AND CLEANING

In [143]:
#movies.head(5)

In [144]:
#movies.shape

In [145]:
#movies.info()

In [146]:
# Check for null values in each column
#movies.isnull().sum()

In [147]:
# Discard rows that are exact duplicates across every column
movies = movies.drop_duplicates()

In [148]:
# Number of repeated movies
#movies[['title','release_date']].duplicated().sum()

In [149]:
# Treat rows sharing both title and release date as the same movie and
# keep a single row for each such pair
movies = movies.drop_duplicates(subset=['title', 'release_date'])

In [150]:
# Genres, overview, credits, production companies, keywords and tagline are
# the text inputs used later on, so their missing values become empty strings
for text_col in ('genres', 'overview', 'credits', 'production_companies', 'keywords', 'tagline'):
    movies[text_col] = movies[text_col].fillna('')

# Flag rows that have no genres or no overview. A row whose genres, overview,
# credits and keywords are all empty is necessarily among them, so it needs
# no separate check.
no_genres = movies['genres'] == ''
no_overview = movies['overview'] == ''

# Flag rows marked as released that carry no release date
released_undated = movies['release_date'].isnull() & (movies['status'] == 'Released')

# Drop every flagged row in one pass
movies = movies.drop(index=movies.index[no_genres | no_overview | released_undated])

# Check the remaining null counts per column
#movies.isnull().sum()

In [151]:
# Normalise separators in the text columns.
# Genres, credits and production companies: first strip the spaces inside
# each entry, then turn every '-' into a space
for name_col in ('genres', 'credits', 'production_companies'):
    movies[name_col] = movies[name_col].str.replace(' ', '').str.replace('-', ' ')

# Overview and keywords: only turn every '-' into a space
for text_col in ('overview', 'keywords'):
    movies[text_col] = movies[text_col].str.replace('-', ' ')


In [152]:
# Build one free-text "tags" field per movie by joining the key descriptive
# columns with single spaces
movies['tags'] = (
    movies['genres'] + " " + movies['overview'] + " " + movies['credits'] + " "
    + movies['keywords'] + " " + movies['production_companies'] + " " + movies['tagline']
)

# Strip everything that is neither a word character nor whitespace.
# regex=True must be explicit: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave punctuation intact.
movies['tags'] = movies['tags'].str.replace(r'[^\w\s]', '', regex=True)


In [153]:
#movies.head(5)

In [154]:
#movies.info()

In [155]:
# Fetch the NLTK resources: the 'punkt_tab' tokenizer data plus the
# 'wordnet' and 'omw-1.4' corpora
for nltk_resource in ('punkt_tab', "wordnet", "omw-1.4"):
    nltk.download(nltk_resource)

# Shared WordNet lemmatizer instance used by lemm()
wnl = WordNetLemmatizer()

def lemm(text):
    """Return ``text`` with every token lemmatized as a verb.

    The text is split with ``word_tokenize``; each token is passed through
    ``wnl.lemmatize`` with ``pos="v"`` and the results are joined back
    together with single spaces.
    """
    return " ".join(wnl.lemmatize(token, pos="v") for token in word_tokenize(text))

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\zaval/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\zaval/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\zaval/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [156]:
# Lemmatizing every tag string is slow, so announce it before starting
print("Running... (may take a couple of minutes)")
movies['tags'] = movies['tags'].map(lemm)

In [157]:
# Minimum number of votes and minimum average rating a movie needs to be kept
MIN_VOTE_COUNT = 150
MIN_VOTE_AVERAGE = 4.5

# Coerce 'vote_count' to integers; blank, missing or non-numeric entries
# count as 0. The earlier .replace('', 0).astype(int) raised on NaN, which is
# how read_csv represents empty cells by default.
movies['vote_count'] = pd.to_numeric(movies['vote_count'], errors='coerce').fillna(0).astype(int)

# Keep movies that meet both thresholds. drop=True discards the old row
# labels instead of inserting them as stray 'index' / 'level_0' columns.
popular_enough = movies['vote_count'] >= MIN_VOTE_COUNT
rated_enough = movies['vote_average'] >= MIN_VOTE_AVERAGE
movies = movies[popular_enough & rated_enough].reset_index(drop=True)

In [158]:
# Copy the id, title, tags, poster path and release date columns into a
# separate frame that is independent of `movies`
movies_copy = movies[['id', 'title', 'tags', 'poster_path', 'release_date']].copy()

#movies_copy.head(5)

Unnamed: 0,id,title,tags,poster_path,release_date
0,615656,Meg 2: The Trench,Action ScienceFiction Horror An exploratory di...,/4m1Au3YkjqsxF8iwQy0fPYSxE0h.jpg,2023-08-02
1,758323,The Pope's Exorcist,Horror Mystery Thriller Father Gabriele Amorth...,/9JBEPLTPSm0d1mbEcLxULjJq9Eh.jpg,2023-04-05
2,533535,Deadpool & Wolverine,Action Comedy ScienceFiction A listless Wade W...,/8cdWjvZQUExUUTzyp4t6EDMubfO.jpg,2024-07-24
3,667538,Transformers: Rise of the Beasts,Action Adventure ScienceFiction When a new thr...,/gPbM0MK8CP8A174rmUwGsADNYKD.jpg,2023-06-06
4,693134,Dune: Part Two,ScienceFiction Adventure Follow the mythic jou...,/czembW0Rk1Ke7lCJGahbOhdCuhV.jpg,2024-02-27


In [159]:
#movies_copy.shape

In [160]:
# Fit a TF-IDF vectorizer (English stop words removed) on the movie tags and
# encode every movie's tags as one row of the resulting TF-IDF matrix
tfidf = TfidfVectorizer(stop_words='english')
tag_corpus = movies_copy['tags']
tfidf_matrix = tfidf.fit_transform(tag_corpus)

In [161]:
# Calculate the cosine similarity matrix between the movies
cosine_similarity = cosine_similarity(tfidf_matrix)

In [162]:
# Create a dataframe with the cosine similarity scores
similarity = pd.DataFrame(cosine_similarity, index=movies_copy['id'], columns=movies_copy['id'])

In [163]:
# Take the pickle dump of the results for later use.
# Each file is opened in a `with` block so its handle is flushed and closed
# as soon as the dump finishes, rather than being left to the garbage collector.
artifacts = {
    '../pickle/movies_meta.pkl': movies_copy,
    '../pickle/tfidf_model.pkl': tfidf,
    '../pickle/tfidf_matrix.pkl': tfidf_matrix,
    '../pickle/similarity.pkl': similarity,
}
for pickle_path, artifact in artifacts.items():
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(artifact, pickle_file)

print("Done")