In [None]:
# Acquiring Data
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from tkinter import messagebox, ttk
from tkinter import *
from tkinter.scrolledtext import ScrolledText
import matplotlib.pyplot as plt
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
import seaborn as sns
import ipywidgets as widgets
from IPython.display import display


# Load the movie dataset from a CSV file located in the notebook's working directory.
movie_data = pd.read_csv("movie_data.csv")
# Preview the first rows; only comments follow, so this is the cell's displayed output.
movie_data.head()

# Purpose: read the CSV into `movie_data` and preview it.

# Library Imports:

# import pandas as pd: Imports Pandas for data manipulation.
# from sklearn.feature_extraction.text import CountVectorizer: Imports CountVectorizer for text data processing.
# from sklearn.metrics.pairwise import cosine_similarity: Imports cosine_similarity for calculating similarity.
# from sklearn.cluster import KMeans: Imports KMeans for clustering.
# from tkinter import messagebox, ttk: Imports components for GUI (message boxes and themed widgets).
# from tkinter import *: Imports all Tkinter components.
# from tkinter.scrolledtext import ScrolledText: Imports ScrolledText for a scrollable text box.
# import matplotlib.pyplot as plt: Imports Matplotlib for plotting.
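# from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg: Imports a Matplotlib backend for Tkinter.
# import seaborn as sns: Imports Seaborn, used for the count plots.
# import ipywidgets as widgets: Imports ipywidgets, used for the interactive dropdown controls.
# from IPython.display import display: Imports display for rendering objects in the notebook.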


In [None]:
movie_data.info()

# Purpose: print summary information about the DataFrame.

In [None]:
movie_data.isnull().sum()

# Purpose: count the null values in each column of the DataFrame.

In [None]:
movie_data.isnull()

# Purpose: show the null mask of the DataFrame (True where a value is null, False otherwise).
# NOTE(review): this displays a mask for the whole frame, not a count; the per-column
# count is `movie_data.isnull().sum()`.

In [None]:
# Replace missing values with an empty string in every column that is later
# combined into the recommendation text.
features = ['director_name', 'duration', 'actor_2_name', 'genres', 'actor_1_name']
movie_data = movie_data.assign(
    **{column: movie_data[column].fillna('') for column in features}
)

# Purpose: handle null values in the recommendation features (no whitespace stripping happens here).
# NOTE(review): 'duration' is filled with '' like the text columns; if it is numeric
# in the CSV this produces a mixed-type column - confirm that is intended.

In [None]:
movie_data.select_dtypes(include='object').nunique()

# Purpose: count the distinct values in each object-dtype column.

In [None]:
# The 30 most frequent values of the 'genres' column, drawn on an explicit axes.
genre_counts = movie_data['genres'].value_counts().head(30)
fig, ax = plt.subplots(figsize=(10, 5))
genre_counts.plot(kind='bar', ax=ax)
plt.show()

# Purpose: bar chart of the most common values in the genres column.

In [None]:
# Value written in place of a missing release year.
default_value = 0  # Replace with your desired default value

# Fill the gaps and convert to int64 in a single chained step.
movie_data['title_year'] = (
    movie_data['title_year']
    .fillna(default_value)
    .astype('int64')
)

# Purpose: 'title_year' is read as float64; after filling NaN it is converted to int64.

In [None]:
# The 15 most frequent values of the 'title_year' column, drawn on an explicit axes.
year_counts = movie_data['title_year'].value_counts().head(15)
fig, ax = plt.subplots(figsize=(10, 5))
year_counts.plot(kind='bar', ax=ax)
plt.show()

# Purpose: bar chart of the most common values in the title_year column.

In [None]:
# Summary statistics table produced by DataFrame.describe().
summary_stat = movie_data.describe()

# Pull the extremes of 'imdb_score' and 'num_voted_users' out of the summary table.
imdb_score_stats = summary_stat['imdb_score']
max_imdb_score = imdb_score_stats['max']
min_imdb_score = imdb_score_stats['min']

voted_users_stats = summary_stat['num_voted_users']
max_num_voted_users = voted_users_stats['max']
min_num_voted_users = voted_users_stats['min']

# Display the results, one line per statistic.
for report_line in (
    f"Maximum IMDb Score: {max_imdb_score}",
    f"Minimum IMDb Score: {min_imdb_score}",
    f"Maximum Number of Voted Users: {max_num_voted_users}",
    f"Minimum Number of Voted Users: {min_num_voted_users}",
):
    print(report_line)

# Purpose: minimum and maximum of imdb_score and num_voted_users.

In [None]:

# Count the occurrences of each country and language and keep the five most common.
# (Computed once and shared by the bar and pie figures.)
country_counts = movie_data['country'].value_counts().sort_values(ascending=False).head(5)
language_counts = movie_data['language'].value_counts().sort_values(ascending=False).head(5)

# One entry per panel: (counts, x-axis label, title, bar colour).
top5_panels = [
    (country_counts, 'Country', 'Top 5 Countries with Most Movies', 'skyblue'),
    (language_counts, 'Language', 'Top 5 Languages of Movies', 'lightcoral'),
]

# Figure 1: bar charts, one axes per panel.
bar_fig, bar_axes = plt.subplots(1, 2, figsize=(15, 6))
for ax, (counts, axis_label, title, color) in zip(bar_axes, top5_panels):
    counts.plot(kind='bar', color=color, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(axis_label)
    ax.set_ylabel('Number of Movies')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
# Each figure gets its own tight_layout so the rotated tick labels are not clipped.
bar_fig.tight_layout()

# Figure 2: pie charts with the counts shown in the legend instead of on the wedges.
pie_fig, pie_axes = plt.subplots(1, 2, figsize=(12, 6))
for ax, (counts, _, title, _) in zip(pie_axes, top5_panels):
    wedges = ax.pie(counts, labels=None, startangle=90, colors=plt.cm.Paired.colors)[0]
    ax.set_title(title)
    ax.legend(
        wedges,
        [f'{name} ({count})' for name, count in zip(counts.index, counts)],
        loc='upper right',
    )
pie_fig.tight_layout()

plt.show()


# Purpose: bar and pie charts visualising the top 5 countries and languages.

In [None]:
# Map an IMDb score to a movie status label
def determine_movie_status(score):
    """Return the status label for an IMDb score.

    8-10 -> 'Blockbuster', 6-<8 -> 'Super Hit', 4-<6 -> 'Hit', 0-<4 -> 'Flop'.
    Anything outside 0-10 (including NaN) -> 'Unknown'.
    """
    # Guard clause: out-of-range and NaN scores fail this comparison.
    if not (0 <= score <= 10):
        return 'Unknown'
    if score >= 8:
        return 'Blockbuster'
    if score >= 6:
        return 'Super Hit'
    if score >= 4:
        return 'Hit'
    return 'Flop'

# Label every movie with the status derived from its IMDb score.
movie_data['movie_status'] = movie_data['imdb_score'].apply(determine_movie_status)

cat_vars = ['movie_status']
num_cols = len(cat_vars)

# 2x2 grid of axes, flattened so it can be paired with the variables.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
axs = axs.flatten()

# One count plot per categorical variable.
for ax, var in zip(axs, cat_vars):
    sns.countplot(x=var, data=movie_data, ax=ax)
    ax.set_title(var)

# Remove the grid cells that received no plot.
for unused_ax in axs[num_cols:]:
    fig.delaxes(unused_ax)

fig.tight_layout()
plt.show()


# Purpose: count how many movies fall into each status (Blockbuster, Super Hit, Hit, Flop).


In [None]:

# Define a function to determine the movie status (optionally called with a year)
def determine_movie_status(score, year=None):
    """Return the status label for an IMDb score.

    This definition replaces the earlier one-argument version when the cell
    runs, so `year` is optional: both `determine_movie_status(score)` and
    `determine_movie_status(score, year)` keep working regardless of which
    cell is re-run.

    Parameters:
        score: IMDb score to classify.
        year: accepted for call-site compatibility; it does not influence
            the result.

    Returns:
        'Blockbuster' (8-10), 'Super Hit' (6-<8), 'Hit' (4-<6), 'Flop' (0-<4),
        or 'Unknown' for anything outside 0-10 (including NaN).
    """
    # Highest threshold first: the first lower bound the score reaches wins.
    for lower_bound, status in ((8, 'Blockbuster'), (6, 'Super Hit'), (4, 'Hit'), (0, 'Flop')):
        if lower_bound <= score <= 10:
            return status
    return 'Unknown'

# Build the 'movie_status' column for the movies of one release year
def get_movie_status_for_year(movie_data, year):
    """Return the rows of `movie_data` released in `year`, with a 'movie_status' column.

    Parameters:
        movie_data: DataFrame with 'title_year' and 'imdb_score' columns.
        year: release year to select.

    Returns:
        A new DataFrame holding only the matching rows; the input is not modified.

    Raises:
        ValueError: if no row has `title_year == year`.
    """
    movies_of_year = movie_data.loc[movie_data['title_year'] == year]
    if movies_of_year.empty:
        raise ValueError(f"No movies found for the year {year}")

    statuses = movies_of_year['imdb_score'].apply(lambda x: determine_movie_status(x, year))
    # assign() returns a fresh frame, so the caller's DataFrame stays untouched.
    return movies_of_year.assign(movie_status=statuses)

try:
    # Both the int() conversion and the year lookup signal problems with ValueError.
    year_to_check = int(input("Enter the movie release year to display movie status: "))
    filtered_data = get_movie_status_for_year(movie_data, year_to_check)

    cat_vars = ['movie_status']
    num_cols = len(cat_vars)

    # 2x2 grid of axes, flattened so it can be paired with the variables.
    fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
    axs = axs.flatten()

    # One count plot per categorical variable.
    for ax, var in zip(axs, cat_vars):
        sns.countplot(x=var, data=filtered_data, ax=ax)
        ax.set_title(var)

    # Remove the grid cells that received no plot.
    for unused_ax in axs[num_cols:]:
        fig.delaxes(unused_ax)

    fig.tight_layout()
    plt.show()

except ValueError as e:
    print(e)


# Purpose: same status count plot, restricted to a release year entered by the user.

In [None]:

# Keep only rows that really have a director, a release year and a score.
# Earlier cleaning cells replace missing director names with '' and missing years
# with `default_value`, so dropna() alone no longer removes those rows.
# The frame has its own name because `filtered_data` is rebound by other cells,
# which would break the dropdown callback below.
director_scores = movie_data[['director_name', 'title_year', 'imdb_score']].dropna()
director_scores = director_scores[
    (director_scores['director_name'] != '')
    & (director_scores['title_year'] != default_value)
]

# Group the data by director and count the number of IMDb scores for each director
director_scores_count = director_scores.groupby('director_name')['imdb_score'].count()

# Get a list of directors with more than 5 IMDb scores
directors_with_enough_scores = director_scores_count[director_scores_count > 5].index
director_options = sorted(directors_with_enough_scores)

# Preferred initial selection. Dropdown rejects a value that is not one of its
# options, so fall back to the first available director (or no selection).
default_director = "Ang Lee"
if default_director in director_options:
    initial_director = default_director
elif director_options:
    initial_director = director_options[0]
else:
    initial_director = None

# Create a dropdown widget for director selection
director_dropdown = widgets.Dropdown(
    options=director_options,
    value=initial_director,
    description='Select Director:'
)

# Function to update the plot based on the selected director
def update_plot(selected_director):
    """Scatter-plot the IMDb scores of `selected_director` against release year.

    Parameters:
        selected_director: director name, as listed in the dropdown.
    """
    # Filter data for the selected director
    director_data = director_scores[director_scores['director_name'] == selected_director]

    fig, ax = plt.subplots(figsize=(12, 8))
    # Use scatter plot to show only dots
    ax.scatter(director_data['title_year'], director_data['imdb_score'],
               label=selected_director, marker='o', color='blue')
    ax.set_title(f'Director Progress: {selected_director}')
    ax.set_xlabel('Year')
    ax.set_ylabel('IMDb Score')
    ax.legend()
    ax.grid(True)
    plt.show()

# Re-run update_plot whenever the dropdown selection changes
widgets.interactive(update_plot, selected_director=director_dropdown)


# Purpose: show the IMDb scores over time for each director with more than 5 scored movies.

In [None]:

# Keep only rows with a known release year and a title.
# Missing years were replaced with `default_value` by an earlier cell, so they are
# excluded explicitly. The frame has its own name so that other cells rebinding
# `filtered_data` cannot break the dropdown callback below.
titles_by_year = movie_data[['title_year', 'movie_title']].dropna()
titles_by_year = titles_by_year[titles_by_year['title_year'] != default_value]

year_options = sorted(titles_by_year['title_year'].unique())

# Create a dropdown widget for title_year selection (earliest year preselected)
title_year_dropdown = widgets.Dropdown(
    options=year_options,
    value=year_options[0] if year_options else None,
    description='Select Title Year:'
)

# Function to update the bar graph based on the selected title_year
def update_bar_graph(selected_title_year):
    """Plot one bar per movie title released in `selected_title_year`.

    Parameters:
        selected_title_year: release year, as listed in the dropdown.
    """
    # Filter data for the selected title_year
    year_data = titles_by_year[titles_by_year['title_year'] == selected_title_year]

    # Count the number of movies for the selected year
    movie_count = len(year_data)

    fig, ax = plt.subplots(figsize=(12, 8))
    # Bar height is the number of rows sharing the same title
    year_data['movie_title'].value_counts().plot(kind='bar', color='purple', ax=ax)

    ax.set_title(f'Movie Count for Title Year: {selected_title_year} (Total: {movie_count} movies)')
    ax.set_xlabel('Movie Title')
    ax.set_ylabel('Count')
    ax.grid(axis='y')
    plt.show()

# Re-run update_bar_graph whenever the dropdown selection changes
widgets.interactive(update_bar_graph, selected_title_year=title_year_dropdown)

# Purpose: show the movie titles released in the selected year, with the total count in the chart title.

In [None]:
# Function to clean movie titles
def clean_movie_title(title):
    """Return `title` without leading or trailing whitespace.

    Non-string input (for example None or NaN from a missing title cell) is
    cleaned to an empty string, so the function can be applied to a whole
    column without raising.

    Parameters:
        title: raw movie title.

    Returns:
        The stripped title, or '' when `title` is not a string.
    """
    if not isinstance(title, str):
        return ''
    return title.strip()


In [None]:
# Exploring Data
# combine_movie_features builds a single space-separated string of the relevant movie features from one row of the DataFrame.

def combine_movie_features(row):
    """Join the recommendation features of one movie row into a single string.

    The result is "director duration actor_2 genres actor_1", separated by
    single spaces. Text fields that are not strings contribute an empty
    string; the duration is always converted with str().

    Parameters:
        row: mapping-like object indexed by column name (a DataFrame row).

    Returns:
        The combined string, or '' if reading the row fails (the error is printed).
    """
    def text_or_blank(column):
        value = row[column]
        return value if isinstance(value, str) else ''

    try:
        pieces = [
            text_or_blank('director_name'),
            str(row['duration']),
            text_or_blank('actor_2_name'),
            text_or_blank("genres"),
            text_or_blank("actor_1_name"),
        ]
    except Exception as e:
        print("Error:", e)
        return ''
    return " ".join(pieces)

#     Feature Extraction:

# The function extracts specific movie features from the given row of the DataFrame:
# director: The director's name, or an empty string if it is not a string.
# duration: The duration of the movie, converted to a string.
# actor_2: The name of the second actor, or an empty string if it is not a string.
# genres: The genres of the movie, or an empty string if it is not a string.
# actor_1: The name of the first actor, or an empty string if it is not a string.
    
# Concatenation:
# The extracted features are concatenated into a single string using spaces as separators.

# Return Value:
# The final output of the function is the combined string of movie features.

# Purpose:
# The purpose of this function is to create a consolidated string representation of relevant movie features. This combined string is often used in natural language processing (NLP) tasks, such as text vectorization, where a collection of text data needs to be converted into a format suitable for machine learning algorithms.
    
# Function to get the index from the movie title
def get_index_from_movie_title(title):
    """Return the index label of the first movie whose stripped title equals `title`.

    Parameters:
        title: movie title typed by the user; it is cleaned with
            clean_movie_title before the comparison.

    Returns:
        The index label of the first match in `movie_data`, or -1 (after
        printing a message) when the title cannot be cleaned or matches nothing.
    """
    try:
        cleaned_title = clean_movie_title(title)
        is_match = movie_data["movie_title"].str.strip() == cleaned_title
        matching_index = movie_data[is_match].index.values
    except (AttributeError, KeyError, TypeError):
        # Only lookup problems (unusable title, missing or non-text column) count
        # as "no match"; anything else, e.g. KeyboardInterrupt, propagates.
        matching_index = []

    if len(matching_index) == 0:
        print("Invalid Choice: Please enter a valid movie name")
        return -1
    return matching_index[0]

    
# movie_data["movie_title"].str.strip() == title: This condition checks for movies in the DataFrame (movie_data) where the cleaned movie title matches the title in the "movie_title" column after stripping whitespaces.
# movie_data[...]: This selects the rows in the DataFrame where the condition is true.
# .index.values[0]: This retrieves the index of the first (and presumably only) movie that satisfies the condition.

# Return Value:
# The function returns the index of the movie with the specified title if it is found. If not found or if there is an error, it returns -1.



In [None]:
def get_movie_list(user_selected_movie):
    """Return up to 20 recommendations for `user_selected_movie`.

    Steps: build a combined feature string per movie, vectorise it with
    CountVectorizer, compute pairwise cosine similarity, cluster the movies
    with K-Means, then rank the movies of the selected movie's cluster by
    their similarity to it.

    Side effects: writes the 'combined_features' and 'cluster' columns of
    the global `movie_data` (only when the title is found).

    Parameters:
        user_selected_movie: movie title entered by the user.

    Returns:
        A list of (title, similarity_percentage) tuples sorted from most to
        least similar, or [] when the title is not found. Percentages are
        relative to the highest similarity in the cluster.
    """
    # Validate the title first so an unknown title returns immediately
    # instead of after the expensive similarity and clustering work.
    movie_index = get_index_from_movie_title(user_selected_movie)
    if movie_index == -1:
        return []

    # One combined feature string per movie.
    movie_data["combined_features"] = movie_data.apply(combine_movie_features, axis=1)

    # Convert the combined text into a matrix of token counts.
    cv = CountVectorizer()
    count_matrix_value = cv.fit_transform(movie_data["combined_features"])

    # Pairwise cosine similarity between all movies.
    cosine_sim = cosine_similarity(count_matrix_value)

    # Cluster the movies on their similarity rows (fixed seed for repeatable clusters).
    number_of_clusters = 5
    kmeans = KMeans(n_clusters=number_of_clusters, n_init=10, random_state=42)
    movie_data['cluster'] = kmeans.fit_predict(cosine_sim)

    # Movies that share the cluster of the selected movie; copy so the new
    # columns below do not touch movie_data.
    selected_cluster = movie_data.loc[movie_index, 'cluster']
    similar_movies = movie_data[movie_data['cluster'] == selected_cluster].copy()

    # NOTE(review): index labels are used as positions into cosine_sim, which
    # assumes movie_data still has its default 0..n-1 index - confirm if rows
    # are ever filtered or re-indexed before this call.
    similar_movies['cosine_similarity'] = cosine_sim[movie_index][similar_movies.index]
    similar_movies = similar_movies.sort_values(by='cosine_similarity', ascending=False)

    # Cleaning movie titles
    similar_movies["movie_title"] = similar_movies["movie_title"].apply(clean_movie_title)

    # Express similarity as a percentage of the best score in the cluster.
    # A zero maximum would otherwise give NaN from 0/0.
    max_similarity = similar_movies['cosine_similarity'].max()
    if max_similarity > 0:
        similar_movies['cosine_similarity_percentage'] = (similar_movies['cosine_similarity'] / max_similarity) * 100
    else:
        similar_movies['cosine_similarity_percentage'] = 0.0

    top_movies = similar_movies.head(20)
    movie_list = list(zip(top_movies["movie_title"], top_movies['cosine_similarity_percentage']))
    return movie_list



In [None]:
# Machine Learning
def recommend_movies(movie_title):
    """Show recommendations for `movie_title` in the bottom frame of the GUI.

    Clears the bottom frame, then embeds a horizontal bar chart of all
    recommendations and a pie chart of the top five. If the title is blank
    or yields no recommendations, a message is written to the text box.

    Parameters:
        movie_title: text taken from the entry widget.
    """
    # Figures are built with Figure rather than plt.subplots so they are not
    # registered with pyplot: pyplot would keep every figure alive across
    # button clicks until explicitly closed.
    from matplotlib.figure import Figure

    # Clear whatever the previous request left in the bottom frame.
    for widget in user_interface_bottom_frame.winfo_children():
        widget.destroy()

    # Blank input: report it in the GUI, like any other invalid title.
    if not movie_title.strip():
        user_interface_text_box.insert(END, "Invalid Choice: Please enter a valid movie name\n")
        return

    movie_list = get_movie_list(movie_title)
    if not movie_list:
        user_interface_text_box.insert(END, "Invalid Choice: Please enter a valid movie name\n")
        return

    # Unzip [(title, percentage), ...] into two parallel tuples.
    titles, similarities = zip(*movie_list)

    # First graph: horizontal bar chart of every recommendation.
    fig1 = Figure(figsize=(8, 5))
    ax1 = fig1.add_subplot(111)
    ax1.barh(titles, similarities, color='skyblue')
    ax1.set_xlabel('Cosine Similarity (%)')
    ax1.set_title('Movie Recommendations - Cosine Similarity')
    _embed_figure_with_scrollbar(fig1, user_interface_bottom_frame)

    # Second graph: pie chart of the top 5 recommendations.
    fig2 = Figure(figsize=(8, 5))
    ax2 = fig2.add_subplot(111)
    ax2.pie(similarities[:5], labels=titles[:5], autopct='%1.1f%%', startangle=90, colors=['lightcoral', 'lightgreen', 'lightblue', 'lightyellow', 'lightskyblue'])
    ax2.set_title('Top 5 Recommendations')
    _embed_figure_with_scrollbar(fig2, user_interface_bottom_frame)


def _embed_figure_with_scrollbar(figure, parent):
    """Pack `figure` into `parent` as a Tk canvas followed by a vertical scrollbar."""
    canvas = FigureCanvasTkAgg(figure, master=parent)
    canvas.draw()
    figure_widget = canvas.get_tk_widget()
    figure_widget.pack(side=TOP, fill=BOTH, expand=True)

    # Scrollbar wired to the canvas widget's yview, as in the original layout.
    scrollbar = ttk.Scrollbar(parent, orient="vertical", command=figure_widget.yview)
    scrollbar.pack(side='right', fill='y')
    figure_widget.configure(yscrollcommand=scrollbar.set)
    return canvas



In [None]:
# Setting up the graphical user interface (Data Mining)
# NOTE: widgets are packed in statement order; reordering the pack() calls changes the layout.

# Create the main Tkinter window and set its title.
root = Tk()
root.title("Movie Recommendation System")

# Top frame: container for the header label, stretched horizontally.
user_interface_top_frame = Frame(root)
user_interface_top_frame.pack(fill=X)
# Header label with custom foreground/background colours and font.
label_header = Label(user_interface_top_frame, text="Movie Recommendation System", fg="#DC5B21", bg="#E4DBBF")
# The parentheses do not create a tuple; the font is passed as the string "Didot 24 bold".
label_header.config(font=("Didot 24 bold"))
label_header.pack(fill=X)

# Bottom frame: container that recommend_movies clears and fills with charts.
# fill=BOTH with expand=True lets it grow in both directions with the window.
user_interface_bottom_frame = Frame(root)
user_interface_bottom_frame.pack(side=BOTTOM, fill=BOTH, expand=True)

# Input section: prompt label, entry field and the button that triggers the recommendation.
label_instruction = Label(root, text="Enter a movie of your choice:-  ")
entry_user_input = Entry(root, width=30)
# The lambda defers the call: the entry text is read when the button is clicked, not when it is created.
button_recommend = Button(root, text="Get recommendations", command=lambda: recommend_movies(entry_user_input.get()))

# Label and entry are packed against the left side, the button against the bottom.
label_instruction.pack(side=LEFT)
entry_user_input.pack(side=LEFT)
button_recommend.pack(side=BOTTOM)


# Output box: ScrolledText is a Text widget with a vertical scrollbar; wrap=WORD wraps long lines at word boundaries.
user_interface_text_box = ScrolledText(root, height=10, width=10, wrap=WORD)
# Allow the text box to expand with the window.
user_interface_text_box.pack(side=TOP, pady=10, fill=BOTH, expand=True)

# Start the Tk event loop; this call returns only when the window is closed.
root.mainloop()

# Purpose: build the window (header, input section, scrollable message box, chart area) and run the GUI.

