Mounting the drive and importing libraries

In [None]:
# =========================
# STEP 1: MOUNT GOOGLE DRIVE
# =========================
# Colab-only step: attaches Google Drive under /content/drive so that the
# dataset path used further down in this notebook can be read like a local file.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Visualization settings: switch matplotlib to its "ggplot" style sheet and
# make "Set2" the seaborn colour palette.
plt.style.use("ggplot")
sns.set_palette("Set2")

In [None]:
# =========================
# MODULE 1: DATA LOADING (FROM DRIVE)
# =========================

import pandas as pd

def load_data_from_drive(filepath):
    """
    Reads a CSV dataset stored on Google Drive into memory.

    Parameters:
    filepath (str): Full path to the CSV file

    Returns:
    pandas DataFrame holding the parsed file contents
    """
    return pd.read_csv(filepath)

In [None]:
def inspect_data(df):
    """
    Prints a quick overview of the dataset: its shape, column labels,
    the first and last five rows, and the missing-value count per column.
    """
    report = (
        ("Dataset Shape:", df.shape),
        ("\nColumns:\n", df.columns),
        ("\nFirst 5 rows:\n", df.head()),
        ("\nLast 5 rows:\n", df.tail()),
        ("\nMissing values:\n", df.isnull().sum()),
    )
    for heading, content in report:
        print(heading, content)

In [None]:
# Full path to the dataset CSV inside the mounted Google Drive
DATA_PATH = "/content/drive/MyDrive/datasets/netflix_titles.csv"

# Load the CSV into `df`, the DataFrame the remaining cells operate on
df = load_data_from_drive(DATA_PATH)

# Print an overview of the freshly loaded data
inspect_data(df)



In [None]:
def inspect_data(df):
    """
    Prints a structural summary of the dataset.

    Reports the shape, the column labels, the dtype of every column, the
    number of missing values per column and the number of fully duplicated
    rows.

    Parameters:
    df (DataFrame): Dataset to summarise

    Returns:
    None
    """
    # The bullet is written as the actual emoji so the notebook output shows
    # a diamond rather than a run of mis-decoded characters.
    bullet = "🔹"
    print(f"{bullet} Shape:", df.shape)
    print(f"\n{bullet} Columns:\n", df.columns)
    print(f"\n{bullet} Data Types:\n", df.dtypes)
    print(f"\n{bullet} Missing Values:\n", df.isnull().sum())
    print(f"\n{bullet} Duplicate Rows:", df.duplicated().sum())

# Run the inspection again, now with the inspect_data defined just above.
inspect_data(df)

In [None]:
# Placeholder written into each column wherever the value is missing.
missing_value_fill = {
    'director': "Unknown",
    'cast': "Unknown",
    'country': "Unknown",
    'rating': "Not Rated",
    'date_added': pd.NaT,
}

for column_name, fill_value in missing_value_fill.items():
    df[column_name] = df[column_name].fillna(fill_value)

In [None]:
# Parse 'date_added' into datetimes and derive the year each title was added.
raw_date_added = df['date_added']

# Stray leading/trailing whitespace makes otherwise valid dates fail to parse,
# and errors='coerce' would then silently turn them into NaT. Strip first.
# The dtype check keeps this cell safe to re-run once the column has already
# been converted (the .str accessor only works on text).
if not pd.api.types.is_datetime64_any_dtype(raw_date_added):
    raw_date_added = raw_date_added.str.strip()

df['date_added'] = pd.to_datetime(raw_date_added, errors='coerce')
df['year_added'] = df['date_added'].dt.year

In [None]:
# Bar chart: how many titles there are of each content type.
fig, ax = plt.subplots(figsize=(6, 4))
df['type'].value_counts().plot.bar(ax=ax)
ax.set_title("Movies vs TV Shows on Netflix")
ax.set_xlabel("Type")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Horizontal count plot of ratings, most frequent rating first.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(
    y=df['rating'],
    order=df['rating'].value_counts().index,
    ax=ax,
)
ax.set_title("Content Rating Distribution")
ax.set_xlabel("Count")
ax.set_ylabel("Rating")
plt.show()

In [None]:
# A title can list several comma-separated countries; count each one separately.
# Splitting on "," and stripping afterwards (instead of splitting on ", ")
# also copes with entries that have irregular spacing or a dangling comma.
country_names = (
    df['country']
    .str.split(',')
    .explode()
    .str.strip()
)

# "Unknown" is the placeholder used for titles without a country; it is not a
# country and must not take a slot in the top 10. Blank fragments are dropped too.
is_real_country = country_names.notna() & ~country_names.isin(["", "Unknown"])

country_counts = (
    country_names[is_real_country]
    .value_counts()
    .head(10)
)

plt.figure(figsize=(8,5))
country_counts.plot(kind='bar')
plt.title("Top 10 Countries by Netflix Content")
plt.xlabel("Country")
plt.ylabel("Number of Titles")
plt.show()

In [None]:
# Number of titles per 'year_added', ordered chronologically.
yearly_content = df['year_added'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(8, 5))
yearly_content.plot(ax=ax)
ax.set_title("Netflix Content Added Over Years")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Titles")
plt.show()

In [None]:
# One entry per (title, genre) pair: 'listed_in' holds ", "-separated genres.
genres = df['listed_in'].str.split(', ').explode()

fig, ax = plt.subplots(figsize=(8, 5))
genres.value_counts().head(10).plot.bar(ax=ax)
ax.set_title("Top 10 Genres on Netflix")
ax.set_xlabel("Genre")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Work on an independent copy of the movie rows so the new column does not
# touch `df`.
movies = df.loc[df['type'] == 'Movie'].copy()

# Drop the " min" suffix from the duration text and store the number as float.
movies['duration_minutes'] = (
    movies['duration']
    .str.replace(' min', '')
    .astype(float)
)

fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(movies['duration_minutes'], bins=30, kde=True, ax=ax)
ax.set_title("Movie Duration Distribution")
ax.set_xlabel("Duration (minutes)")
plt.show()

In [None]:
# Box plot of release years, one box per content type.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=df, x='type', y='release_year', ax=ax)
ax.set_title("Release Year vs Content Type")
plt.show()

In [None]:
# =========================
# MODULE 2: PREPROCESSING
# =========================

def preprocess_data(df):
    """
    Cleans and prepares text data for recommendation.

    Parameters:
    df (DataFrame): Dataset containing the columns title, type, listed_in,
        description, cast and director

    Returns:
    DataFrame: A new frame holding those six columns, with missing text
        replaced by empty strings, plus a 'combined_features' column that
        joins listed_in, description, cast and director with single spaces.
        The input frame is left untouched.
    """
    features = ['title', 'type', 'listed_in', 'description', 'cast', 'director']
    text_columns = ['listed_in', 'description', 'cast', 'director']

    # Take an explicit copy: assigning columns on a bare selection of the
    # caller's frame is chained assignment (SettingWithCopyWarning).
    prepared = df[features].copy()

    # Missing text becomes "" so the string concatenation below never yields NaN.
    for col in text_columns:
        prepared[col] = prepared[col].fillna("")

    # Combine text features into one document per title.
    prepared['combined_features'] = (
        prepared['listed_in'] + " " +
        prepared['description'] + " " +
        prepared['cast'] + " " +
        prepared['director']
    )

    return prepared


In [None]:
def preprocess_data(df):
    """
    Prepares the dataset for title matching and text-based recommendation.

    Parameters:
    df (DataFrame): Dataset with at least 'title' and 'description' columns

    Returns:
    DataFrame: A copy of df in which
        - missing descriptions are replaced by "",
        - 'title_clean' holds the lower-cased, whitespace-stripped title,
        - 'combined_features' joins whichever of listed_in, description,
          cast and director are present (missing values as "") with spaces.
    """
    df = df.copy()

    df['description'] = df['description'].fillna("")

    # Clean title for matching
    df['title_clean'] = (
        df['title']
        .str.lower()
        .str.strip()
    )

    # create_tfidf_matrix reads df['combined_features'], so it has to be built
    # here as well. Only columns that exist are used, so a frame carrying just
    # title/description still works.
    text_columns = [
        col for col in ('listed_in', 'description', 'cast', 'director')
        if col in df.columns
    ]
    combined = df[text_columns[0]].fillna("").astype(str)
    for col in text_columns[1:]:
        combined = combined + " " + df[col].fillna("").astype(str)
    df['combined_features'] = combined

    return df

In [None]:
# =========================
# MODULE 3: FEATURE ENGINEERING (TF-IDF)
# =========================

from sklearn.feature_extraction.text import TfidfVectorizer

def create_tfidf_matrix(df, text_column='combined_features', max_features=5000):
    """
    Converts combined text features into a TF-IDF matrix.

    Parameters:
    df (DataFrame): Dataset holding one text document per row
    text_column (str): Column containing the documents
    max_features (int): Upper bound on the vocabulary size

    Returns:
    Sparse TF-IDF matrix as returned by TfidfVectorizer.fit_transform

    Raises:
    KeyError: If text_column is not a column of df
    """
    # Fail early with an actionable message instead of a bare KeyError from
    # the column lookup.
    if text_column not in df.columns:
        raise KeyError(
            f"'{text_column}' column is missing; build the combined text "
            "column before vectorising."
        )

    # The vectorizer rejects NaN documents, so treat missing text as empty.
    documents = df[text_column].fillna("").astype(str)

    tfidf = TfidfVectorizer(
        stop_words='english',
        max_features=max_features
    )

    return tfidf.fit_transform(documents)

In [None]:
# NOTE(review): this prints the function object itself (its repr);
# create_tfidf_matrix is not called here, so no matrix is built or shown.
print(create_tfidf_matrix)

In [None]:
# =========================
# MODULE 4: SIMILARITY COMPUTATION
# =========================

from sklearn.metrics.pairwise import cosine_similarity

def compute_similarity(tfidf_matrix):
    """
    Builds the pairwise cosine-similarity matrix for the given features.

    Parameters:
    tfidf_matrix : sparse matrix
        TF-IDF feature matrix

    Returns:
    ndarray
        Cosine similarity between all items
    """
    return cosine_similarity(tfidf_matrix)

In [None]:
# =========================
# MODULE 5: RECOMMENDATION ENGINE
# =========================

def recommend(title, df, similarity_matrix, top_n=10):
    """
    Recommends top N similar movies/shows for a given title.

    Parameters:
    title (str): Movie or TV show title (exact match against df['title'])
    df (DataFrame): Preprocessed Netflix dataset; row i must correspond to
        row i of similarity_matrix
    similarity_matrix (ndarray): Cosine similarity matrix
    top_n (int): Number of recommendations

    Returns:
    list of recommended titles, most similar first, or a
    "'<title>' not found in dataset." message string when the title is absent
    """
    # Work with row positions, not index labels: the similarity matrix is
    # addressed positionally, so labels are only correct for a default index.
    matching_rows = np.flatnonzero((df['title'] == title).to_numpy())
    if matching_rows.size == 0:
        return f"'{title}' not found in dataset."
    position = int(matching_rows[0])

    scores = np.asarray(similarity_matrix[position], dtype=float).ravel()

    # Stable sort keeps dataset order among equal scores.
    ranked = np.argsort(-scores, kind="stable")

    # Remove the query row explicitly. Dropping "the first result" is wrong
    # when another row ties with it, because the query itself could survive.
    ranked = ranked[ranked != position][:max(top_n, 0)]

    return df['title'].iloc[ranked].tolist()



In [None]:
def recommend(title, df, similarity_matrix, top_n=10):
    """
    Recommends the top N titles most similar to the given one.

    The title lookup ignores letter case and surrounding whitespace.

    Parameters:
    title (str): Movie or TV show title
    df (DataFrame): Dataset with a 'title' column; row i must correspond to
        row i of similarity_matrix
    similarity_matrix (ndarray): Cosine similarity matrix
    top_n (int): Number of recommendations

    Returns:
    DataFrame with columns 'title' and 'similarity_score', most similar
    first, or None (after printing a message) when the title is not found
    """
    query = title.lower().strip()

    # Normalise both sides: lower-casing only the query can never match a
    # stored title that contains capital letters.
    normalized_titles = df['title'].str.lower().str.strip()
    matching_rows = np.flatnonzero((normalized_titles == query).to_numpy())

    if matching_rows.size == 0:
        print("❌ Title not found in dataset")
        return None

    # Row position (not index label) addresses the similarity matrix.
    position = int(matching_rows[0])
    scores = np.asarray(similarity_matrix[position], dtype=float).ravel()

    # Rank by descending score, then drop the query row explicitly so a tie
    # cannot leave the query inside its own recommendations.
    ranked = np.argsort(-scores, kind="stable")
    ranked = ranked[ranked != position][:max(top_n, 0)]

    # .copy() so adding the score column does not write into a slice of df.
    recommendations = df.iloc[ranked][['title']].copy()
    recommendations['similarity_score'] = scores[ranked]

    return recommendations


In [None]:
def recommend(title, df, similarity_matrix, top_n=10):
    """
    Recommends the top N titles most similar to the given one.

    The query is matched case-insensitively: first as an exact title, then as
    a literal substring of a title. When nothing matches, a few random titles
    are printed as suggestions.

    Parameters:
    title (str): Movie or TV show title, or a fragment of one
    df (DataFrame): Dataset with a 'title' column and, optionally, a
        'title_clean' column (lower-cased, stripped title); row i must
        correspond to row i of similarity_matrix
    similarity_matrix (ndarray): Cosine similarity matrix
    top_n (int): Number of recommendations

    Returns:
    DataFrame with columns 'title' and 'similarity_score', most similar
    first, or None when no title matches
    """
    query = title.lower().strip()

    # Fall back to normalising on the fly when the frame has no title_clean.
    if 'title_clean' in df.columns:
        clean_titles = df['title_clean']
    else:
        clean_titles = df['title'].str.lower().str.strip()

    position = None
    if query:  # an empty query would "contain-match" every title
        # ---------- Exact match ----------
        exact_rows = np.flatnonzero((clean_titles == query).to_numpy())
        if exact_rows.size:
            position = int(exact_rows[0])
        else:
            # ---------- Partial match ----------
            # regex=False: the query is user text, not a pattern, so "(" or
            # "." must be taken literally. na=False: skip missing titles.
            partial_mask = clean_titles.str.contains(query, regex=False, na=False)
            partial_rows = np.flatnonzero(partial_mask.to_numpy())
            if partial_rows.size:
                print("⚠ Exact title not found. Using closest match:")
                print(df['title'].iloc[partial_rows[:3]].values)
                position = int(partial_rows[0])

    if position is None:
        print("❌ Movie / TV show not found.")
        print("🔍 Try one of these popular titles:")
        # Never ask for more samples than there are rows.
        print(df['title'].sample(min(5, len(df))).values)
        return None

    # ---------- Similarity computation ----------
    scores = np.asarray(similarity_matrix[position], dtype=float).ravel()

    # Rank by descending score, then drop the query row explicitly so a tie
    # cannot leave the query inside its own recommendations.
    ranked = np.argsort(-scores, kind="stable")
    ranked = ranked[ranked != position][:max(top_n, 0)]

    # .copy() so adding the score column does not write into a slice of df.
    recommendations = df.iloc[ranked][['title']].copy()
    recommendations['similarity_score'] = scores[ranked]

    return recommendations


In [None]:
def user_recommendation_system(df, similarity_matrix):
    """
    Interactive front end: asks for a title, prints the recommendations and
    draws them as a horizontal bar chart of similarity scores.

    Parameters:
    df (DataFrame): Dataset passed through to recommend()
    similarity_matrix (ndarray): Similarity matrix passed through to recommend()

    Returns:
    None
    """
    user_input = input("🎥 Enter a movie or TV show title: ").strip()

    # Nothing typed: there is no title to look up, so stop here instead of
    # handing an empty query to the search.
    if not user_input:
        print("❌ No title entered.")
        return

    result = recommend(user_input, df, similarity_matrix)

    # recommend() has already reported the problem when it returns None.
    if result is None:
        return

    print("\n✅ Recommended Titles:\n")
    print(result)

    # ---- Graph ----
    plt.figure(figsize=(10,5))
    plt.barh(
        result['title'],
        result['similarity_score']
    )
    plt.xlabel("Similarity Score")
    plt.ylabel("Title")
    plt.title(f"Recommendations Similar to '{user_input.title()}'")
    # Put the best match at the top of the chart.
    plt.gca().invert_yaxis()
    plt.show()


In [None]:
# Build the recommender inputs before starting the interactive prompt:
# cleaned frame -> TF-IDF matrix -> cosine-similarity matrix.
# reset_index keeps row labels equal to row positions, which is how the
# similarity matrix is addressed.
model_df = preprocess_data(df).reset_index(drop=True)

# create_tfidf_matrix reads 'combined_features'; make sure the column exists
# even if preprocess_data did not build it.
if 'combined_features' not in model_df.columns:
    text_columns = [
        name for name in ('listed_in', 'description', 'cast', 'director')
        if name in model_df.columns
    ]
    model_df['combined_features'] = (
        model_df[text_columns].fillna("").astype(str).agg(" ".join, axis=1)
    )

tfidf_matrix = create_tfidf_matrix(model_df)
similarity_matrix = compute_similarity(tfidf_matrix)

user_recommendation_system(model_df, similarity_matrix)