In [1]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
import streamlit as st

In [2]:
# Load the movie dataset from a CSV in the working directory.
# Besides MovieName and StoryLine, the file carries an 'Unnamed: 0' column
# (presumably a row index written by an earlier to_csv call — TODO confirm).
movies_df = pd.read_csv("movie_recommendations_2024_final.csv")
# Preview the first rows to check the load.
movies_df.head()

Unnamed: 0.1,Unnamed: 0,MovieName,StoryLine
0,0,Carry-On,A mysterious traveler blackmails a young TSA a...
1,1,Red One,"After Santa Claus is kidnapped, the North Pole..."
2,2,Pushpa: The Rule - Part 2,Pushpa struggles to sustain his sandalwood smu...
3,3,Black Doves,Helen embarks on a passionate affair with a ma...
4,4,No Good Deed,It tells the story of three very different fam...


In [3]:
# NOTE: nothing is removed in this cell — it only previews the frame again.
# The 'Unnamed: 0' column is still present here and is dropped later in the
# notebook, right before the CSV exports.
movies_df.head()

Unnamed: 0.1,Unnamed: 0,MovieName,StoryLine
0,0,Carry-On,A mysterious traveler blackmails a young TSA a...
1,1,Red One,"After Santa Claus is kidnapped, the North Pole..."
2,2,Pushpa: The Rule - Part 2,Pushpa struggles to sustain his sandalwood smu...
3,3,Black Doves,Helen embarks on a passionate affair with a ma...
4,4,No Good Deed,It tells the story of three very different fam...


In [4]:
# Count missing values per column (isna is the canonical spelling of isnull).
movies_df.isna().sum()

Unnamed: 0    0
MovieName     0
StoryLine     0
dtype: int64

# Data Preprocessing and Analysis:

# Text Cleaning (NLP):

In [11]:
# Download the NLTK resources used below (each call is a no-op when the package is already present).
nltk.download('punkt')      # Tokenizer models: split text into sentences/words based on language rules.
nltk.download('punkt_tab')  # Tokenizer tables that word_tokenize looks up on recent NLTK releases (3.9+);
                            # without it a fresh environment raises LookupError. On older releases that do
                            # not know this package the call just reports an error and returns False.
nltk.download('stopwords')  # Stop-word lists, removed from the text to reduce noise.
nltk.download('wordnet')    # WordNet data used by WordNetLemmatizer.

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [13]:
# Initialize stopwords and lemmatizer
# Both objects are created once here and reused by preprocess_text for every storyline.
stop_words = set(stopwords.words('english'))  # a set, so the per-token `not in stop_words` check is a hash lookup
lemmatizer = WordNetLemmatizer()

In [15]:
# Function to preprocess storylines
def preprocess_text(text):
    """Normalise one storyline into a space-separated string of cleaned tokens.

    Pipeline: lowercase -> remove every character that is not a-z or
    whitespace -> tokenize -> drop English stop words -> lemmatize.

    Args:
        text: Raw storyline. Missing values (None / NaN) produce an empty
            string; any other non-string value is converted with ``str()``.

    Returns:
        str: The surviving tokens joined by single spaces ('' if none remain).
    """
    # Empty CSV cells arrive as NaN (a float); calling .lower() on them would
    # raise AttributeError, so treat missing values as "no text".
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove punctuation, numbers, and special characters using regex.
    # Characters are deleted (not replaced by a space), so "carry-on" becomes "carryon".
    text = re.sub(r'[^a-z\s]', '', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords and apply lemmatization.
    # WordNetLemmatizer.lemmatize() defaults to pos='n' (noun), so plural nouns are
    # reduced to singular ("agents" -> "agent") while verb and adjective forms such as
    # "running" or "better" are left unchanged unless a POS tag is passed.
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Join the tokens back into a single string
    return ' '.join(tokens)

In [17]:
# Clean every entry of the 'StoryLine' column and keep the result next to the raw text.
movies_df['cleaned_storyline'] = movies_df['StoryLine'].map(preprocess_text)

In [21]:
movies_df.head()

Unnamed: 0.1,Unnamed: 0,MovieName,StoryLine,cleaned_storyline
0,0,Carry-On,A mysterious traveler blackmails a young TSA a...,mysterious traveler blackmail young tsa agent ...
1,1,Red One,"After Santa Claus is kidnapped, the North Pole...",santa claus kidnapped north pole head security...
2,2,Pushpa: The Rule - Part 2,Pushpa struggles to sustain his sandalwood smu...,pushpa struggle sustain sandalwood smuggling b...
3,3,Black Doves,Helen embarks on a passionate affair with a ma...,helen embarks passionate affair man idea secre...
4,4,No Good Deed,It tells the story of three very different fam...,tell story three different family vying buy sp...


In [23]:
# Make sure the three text columns hold Python strings.
for _col in ('MovieName', 'StoryLine', 'cleaned_storyline'):
    movies_df[_col] = movies_df[_col].astype(str)

In [25]:
movies_df.dtypes

Unnamed: 0            int64
MovieName            object
StoryLine            object
cleaned_storyline    object
dtype: object

# Text Representation:

In [28]:
# In Natural Language Processing (NLP), 
# text data needs to be converted into a numerical format because most machine learning models 
# and algorithms can only work with numbers. This process is known as text representation.

In [30]:
# Count Vectorizer: Converts each document (in this case, movie storyline) into a vector of word counts. 
# It creates a matrix where each row represents a document, 
# and each column represents a word. The value in the matrix is the frequency of the word in the document.

In [32]:
# Initialize the Count Vectorizer
# max_features=1000 caps the vocabulary at the 1000 most frequent terms in the corpus,
# so every storyline becomes a vector with at most 1000 entries.
count_vectorizer = CountVectorizer(max_features=1000)

In [34]:
# Fit and transform the cleaned storylines into a Count Vectorizer matrix
# (learns the vocabulary and counts terms in one pass; one row per movie, one column per term).
# The result is converted to a dense array with .toarray() in the next step.
count_matrix = count_vectorizer.fit_transform(movies_df['cleaned_storyline'])

In [36]:
# Densify the count matrix and label each column with its vocabulary term for easier inspection.
count_df = pd.DataFrame(
    data=count_matrix.toarray(),
    columns=count_vectorizer.get_feature_names_out(),
)

In [38]:
# Store each movie's count vector (one row of count_df) as a single array in a new column of movies_df.
movies_df['Count_Vector'] = [row_vector for row_vector in count_df.values]

In [40]:
movies_df.head()

Unnamed: 0.1,Unnamed: 0,MovieName,StoryLine,cleaned_storyline,Count_Vector
0,0,Carry-On,A mysterious traveler blackmails a young TSA a...,mysterious traveler blackmail young tsa agent ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,Red One,"After Santa Claus is kidnapped, the North Pole...",santa claus kidnapped north pole head security...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,Pushpa: The Rule - Part 2,Pushpa struggles to sustain his sandalwood smu...,pushpa struggle sustain sandalwood smuggling b...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3,Black Doves,Helen embarks on a passionate affair with a ma...,helen embarks passionate affair man idea secre...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,No Good Deed,It tells the story of three very different fam...,tell story three different family vying buy sp...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


# Cosine Similarity:

In [43]:
from sklearn.metrics.pairwise import cosine_similarity

In [45]:
# Stack the per-movie arrays from the Count_Vector column back into one 2-D matrix.
# Note: this rebinds count_matrix, replacing the fit_transform result with a dense array.
count_matrix = np.array(list(movies_df['Count_Vector']))

In [47]:
count_matrix

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [49]:
# Calculate the cosine similarity matrix
# Called with a single matrix, cosine_similarity compares every row with every other row,
# so entry [i, j] is the similarity between the storylines of movie i and movie j.
cosine_sim_matrix = cosine_similarity(count_matrix)

In [51]:
cosine_sim_matrix

array([[1.        , 0.1754116 , 0.        , ..., 0.08451543, 0.        ,
        0.07254763],
       [0.1754116 , 1.        , 0.        , ..., 0.        , 0.        ,
        0.06362848],
       [0.        , 0.        , 1.        , ..., 0.08908708, 0.        ,
        0.        ],
       ...,
       [0.08451543, 0.        , 0.08908708, ..., 1.        , 0.        ,
        0.06131393],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.07254763, 0.06362848, 0.        , ..., 0.06131393, 0.        ,
        1.        ]])

In [53]:
# Function to get ranked movie names for a given movie
def get_ranked_movies(index, cosine_sim_matrix, movies_df, top_n=5):
    """Return the names of the ``top_n`` movies most similar to the movie at ``index``.

    Args:
        index: Row position of the query movie; used for both the similarity
            matrix and ``movies_df.iloc``.
        cosine_sim_matrix: Square, row-indexable similarity matrix where
            ``cosine_sim_matrix[i][j]`` is the similarity between movies i and j.
        movies_df: DataFrame with a 'MovieName' column, in the same row order
            as the matrix.
        top_n: Maximum number of movie names to return.

    Returns:
        list: Movie names ordered from most to least similar. Ties keep their
        original row order. The query movie itself is never included.
    """
    sim_row = cosine_sim_matrix[index]
    n_movies = len(sim_row)
    own_pos = index if index >= 0 else index + n_movies

    # Exclude the query movie by position instead of assuming it sorts first:
    # a duplicate storyline at a lower row (tie at 1.0) or an all-zero count
    # vector (self-similarity 0) would otherwise make a movie recommend itself.
    candidates = [pos for pos in range(n_movies) if pos != own_pos]

    # list.sort is stable, so equally similar movies keep ascending row order.
    candidates.sort(key=lambda pos: sim_row[pos], reverse=True)

    # Retrieve the movie names for the best-scoring candidates
    return [movies_df.iloc[pos]['MovieName'] for pos in candidates[:max(top_n, 0)]]

In [55]:
# For every row position, compute its most similar movies and store the list of names in a new column.
movies_df['Ranked_Movies'] = list(
    map(
        lambda row_pos: get_ranked_movies(row_pos, cosine_sim_matrix, movies_df),
        range(len(movies_df)),
    )
)

In [57]:
movies_df.head()

Unnamed: 0.1,Unnamed: 0,MovieName,StoryLine,cleaned_storyline,Count_Vector,Ranked_Movies
0,0,Carry-On,A mysterious traveler blackmails a young TSA a...,mysterious traveler blackmail young tsa agent ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Terrifier 3, No Time to Die, Skeleton Crew, D..."
1,1,Red One,"After Santa Claus is kidnapped, the North Pole...",santa claus kidnapped north pole head security...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Violent Night, The Santa Class, The Santa Cla..."
2,2,Pushpa: The Rule - Part 2,Pushpa struggles to sustain his sandalwood smu...,pushpa struggle sustain sandalwood smuggling b...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Pushpa: The Rise - Part 1, Alien: Romulus, Ma..."
3,3,Black Doves,Helen embarks on a passionate affair with a ma...,helen embarks passionate affair man idea secre...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Ghosts, Moana 2, A Man on the Inside, Spider-..."
4,4,No Good Deed,It tells the story of three very different fam...,tell story three different family vying buy sp...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Here, Beetlejuice Beetlejuice, Thukra Ke Mera..."


In [59]:
# Take a real copy: plain assignment would only create a second name for the same
# DataFrame, so any in-place change made through the "clone" would also hit movies_df.
movies_df_clone = movies_df.copy()

In [61]:
movies_df_clone.head()

Unnamed: 0.1,Unnamed: 0,MovieName,StoryLine,cleaned_storyline,Count_Vector,Ranked_Movies
0,0,Carry-On,A mysterious traveler blackmails a young TSA a...,mysterious traveler blackmail young tsa agent ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Terrifier 3, No Time to Die, Skeleton Crew, D..."
1,1,Red One,"After Santa Claus is kidnapped, the North Pole...",santa claus kidnapped north pole head security...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Violent Night, The Santa Class, The Santa Cla..."
2,2,Pushpa: The Rule - Part 2,Pushpa struggles to sustain his sandalwood smu...,pushpa struggle sustain sandalwood smuggling b...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Pushpa: The Rise - Part 1, Alien: Romulus, Ma..."
3,3,Black Doves,Helen embarks on a passionate affair with a ma...,helen embarks passionate affair man idea secre...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Ghosts, Moana 2, A Man on the Inside, Spider-..."
4,4,No Good Deed,It tells the story of three very different fam...,tell story three different family vying buy sp...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Here, Beetlejuice Beetlejuice, Thukra Ke Mera..."


In [63]:
# Keep only the text columns in the clone: drop the saved index and the two derived columns.
movies_df_clone = movies_df_clone.drop(
    ['Unnamed: 0', 'Count_Vector', 'Ranked_Movies'],
    axis=1,
)

In [65]:
movies_df_clone.head()

Unnamed: 0,MovieName,StoryLine,cleaned_storyline
0,Carry-On,A mysterious traveler blackmails a young TSA a...,mysterious traveler blackmail young tsa agent ...
1,Red One,"After Santa Claus is kidnapped, the North Pole...",santa claus kidnapped north pole head security...
2,Pushpa: The Rule - Part 2,Pushpa struggles to sustain his sandalwood smu...,pushpa struggle sustain sandalwood smuggling b...
3,Black Doves,Helen embarks on a passionate affair with a ma...,helen embarks passionate affair man idea secre...
4,No Good Deed,It tells the story of three very different fam...,tell story three different family vying buy sp...


In [69]:
# Make sure the three text columns of the clone hold Python strings before exporting.
for _col in ('MovieName', 'StoryLine', 'cleaned_storyline'):
    movies_df_clone[_col] = movies_df_clone[_col].astype(str)

In [71]:
movies_df_clone.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   MovieName          250 non-null    object
 1   StoryLine          250 non-null    object
 2   cleaned_storyline  250 non-null    object
dtypes: object(3)
memory usage: 6.0+ KB


In [73]:
# index=False: the default RangeIndex carries no information, and writing it is what
# creates a stray 'Unnamed: 0' column when the file is read back with pd.read_csv.
movies_df_clone.to_csv("imdb_movies_datas_002.csv", index=False)

In [77]:
movies_df.head()

Unnamed: 0.1,Unnamed: 0,MovieName,StoryLine,cleaned_storyline,Count_Vector,Ranked_Movies
0,0,Carry-On,A mysterious traveler blackmails a young TSA a...,mysterious traveler blackmail young tsa agent ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Terrifier 3, No Time to Die, Skeleton Crew, D..."
1,1,Red One,"After Santa Claus is kidnapped, the North Pole...",santa claus kidnapped north pole head security...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Violent Night, The Santa Class, The Santa Cla..."
2,2,Pushpa: The Rule - Part 2,Pushpa struggles to sustain his sandalwood smu...,pushpa struggle sustain sandalwood smuggling b...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Pushpa: The Rise - Part 1, Alien: Romulus, Ma..."
3,3,Black Doves,Helen embarks on a passionate affair with a ma...,helen embarks passionate affair man idea secre...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Ghosts, Moana 2, A Man on the Inside, Spider-..."
4,4,No Good Deed,It tells the story of three very different fam...,tell story three different family vying buy sp...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Here, Beetlejuice Beetlejuice, Thukra Ke Mera..."


In [79]:
# Remove the saved index column from the main frame before the final export.
movies_df = movies_df.drop('Unnamed: 0', axis=1)

In [81]:
# Make sure the three text columns hold Python strings before the final export.
for _col in ('MovieName', 'StoryLine', 'cleaned_storyline'):
    movies_df[_col] = movies_df[_col].astype(str)

In [83]:
movies_df.head()

Unnamed: 0,MovieName,StoryLine,cleaned_storyline,Count_Vector,Ranked_Movies
0,Carry-On,A mysterious traveler blackmails a young TSA a...,mysterious traveler blackmail young tsa agent ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Terrifier 3, No Time to Die, Skeleton Crew, D..."
1,Red One,"After Santa Claus is kidnapped, the North Pole...",santa claus kidnapped north pole head security...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Violent Night, The Santa Class, The Santa Cla..."
2,Pushpa: The Rule - Part 2,Pushpa struggles to sustain his sandalwood smu...,pushpa struggle sustain sandalwood smuggling b...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Pushpa: The Rise - Part 1, Alien: Romulus, Ma..."
3,Black Doves,Helen embarks on a passionate affair with a ma...,helen embarks passionate affair man idea secre...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Ghosts, Moana 2, A Man on the Inside, Spider-..."
4,No Good Deed,It tells the story of three very different fam...,tell story three different family vying buy sp...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Here, Beetlejuice Beetlejuice, Thukra Ke Mera..."


In [85]:
# index=False: do not write the RangeIndex, otherwise the exported file gains an
# 'Unnamed: 0' column on the next pd.read_csv.
# NOTE(review): Count_Vector (arrays) and Ranked_Movies (lists) are written as text in a
# CSV and must be parsed back by whoever reads this file — confirm the consumer does that.
movies_df.to_csv("imdb_movies_data_final_003.csv", index=False)