 # 1.Movie Genre Classification

# Movie Genre Classification

This notebook walks through building a machine learning model to classify movie genres based on plot summaries.

We'll:
- Load and clean the data
- Vectorize the text using TF-IDF
- Train a Logistic Regression model with hyperparameter tuning
- Evaluate the model
- Predict new examples


# 2.Import Libraries and Setup

In [5]:
import pandas as pd
import numpy as np
import re
import nltk
import logging
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
from tqdm.notebook import tqdm

# Download stopwords
nltk.download('stopwords')

# Setup logging for info
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Initialize tokenizer and stopwords
tokenizer = TreebankWordTokenizer()
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\VARSHA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 3.Load Dataset

In [8]:
# Replace 'movies.csv' with your actual file path if needed
df = pd.read_csv('movies.csv')

# Show first 5 rows
df.head()


Unnamed: 0,id,title,genre,plot_summary
0,1,The Martian,Sci-Fi,An astronaut is stranded on Mars and must find...
1,2,12 Angry Men,Drama,A jury deliberates over the fate of a teenager...
2,3,Inception,Sci-Fi,A thief enters people's dreams to steal secret...
3,4,Titanic,Romance,A young couple falls in love aboard the ill-fa...
4,5,The Godfather,Crime,An organized crime dynasty's aging patriarch t...


# 4.Data Cleaning Function

In [23]:
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', str(text))  # Remove non-alphabetic chars
    text = text.lower()                           # Lowercase
    tokens = tokenizer.tokenize(text)             # Tokenize using TreebankWordTokenizer
    tokens = [t for t in tokens if t not in stop_words]  # Remove stopwords
    return ' '.join(tokens)


# 5.Apply Cleaning

In [18]:
import pandas as pd
import re
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Setup tqdm for pandas
tqdm.pandas(desc="Cleaning plot summaries")

# Optional: configure stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Define the text cleaning function
def clean_text(text):
    if pd.isna(text):  # Handle missing values
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = re.sub(r'<[^>]+>', ' ', text)

    # Remove punctuation and numbers
    text = re.sub(r'[^a-z\s]', '', text)

    # Toke


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\VARSHA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\VARSHA\AppData\Roaming\nltk_data...


# 6.Filter Rare Genres

In [16]:
# Remove genres with fewer than 2 samples
genre_counts = df['genre'].value_counts()
valid_genres = genre_counts[genre_counts >= 2].index
df_filtered = df[df['genre'].isin(valid_genres)]

logging.info(f"Genres after filtering:\n{df_filtered['genre'].value_counts()}")


2025-08-25 09:37:46,271 - INFO - Genres after filtering:
genre
Sci-Fi    3
Drama     2
Action    2
Name: count, dtype: int64


Vectorize Text

In [34]:
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
X = tfidf.fit_transform(df_filtered['cleaned_plot'])
y = df_filtered['genre']


# 7.Train-Test Split

In [28]:
import logging
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris  # You can change this to load your own data
from collections import Counter

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

# Load dataset (Example: Iris dataset)
data = load_iris()
X = data.data
y = data.target

# Check dataset stats
total_samples = len(y)
class_counts = Counter(y)
num_classes = len(class_counts)

logging.info(f"Total samples: {total_samples}")
logging.info(f"Class distribution: {class_counts}")
logging.info(f"Number of classes: {num_classes}")

# Calculate minimum valid test size for stratification
min_test_size = num_classes / total_samples
test_size = max(0.2, min_test_size)  # Ensure at least 20% test size

logging.info(f"Calculated test size: {test_size:.2f}")

# Split the dataset
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y)

    logging.info(f"Training samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")
except ValueError as e:
    logging.error(f"Train-test split failed: {e}")


2025-08-25 09:43:49,779 - INFO - Total samples: 150
2025-08-25 09:43:49,782 - INFO - Class distribution: Counter({np.int64(0): 50, np.int64(1): 50, np.int64(2): 50})
2025-08-25 09:43:49,787 - INFO - Number of classes: 3
2025-08-25 09:43:49,793 - INFO - Calculated test size: 0.20
2025-08-25 09:43:49,803 - INFO - Training samples: 120, Test samples: 30


# 8.Compute Class Weights

In [32]:
import logging
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris  # You can replace this with your own dataset
from collections import Counter
import pprint

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# ===== Load your dataset =====
data = load_iris()
X = data.data
y = data.target

# ===== Analyze dataset =====
total_samples = len(y)
class_counts = Counter(y)
num_classes = len(class_counts)

logging.info(f"Total samples: {total_samples}")
logging.info(f"Class distribution: {class_counts}")
logging.info(f"Number of classes: {num_classes}")

# ===== Calculate test size =====
min_test_size = num_classes / total_samples
default_test_size = 0.2
test_size = max(default_test_size, min_test_size)

logging.info(f"Using test size: {test_size:.2f}")

# ===== Train-test split =====
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    logging.info(f"Training samples: {X_train.shape[0]}")
    logging.info(f"Test samples: {X_test.shape[0]}")

    # ===== Compute class weights =====
    counter = Counter(y_train)
    total = sum(counter.values())
    class_weights = {
        cls: total / (len(counter) * count) for cls, count in counter.items()
    }

    logging.info(f"Class distribution in training set: {dict(counter)}")
    logging.info("Computed class weights:")
    pprint.pprint(class_weights)

except ValueError as e:
    logging.error(f"Error during split or class weight calculation: {e}")


2025-08-25 09:45:14,114 - INFO - Total samples: 150
2025-08-25 09:45:14,117 - INFO - Class distribution: Counter({np.int64(0): 50, np.int64(1): 50, np.int64(2): 50})
2025-08-25 09:45:14,120 - INFO - Number of classes: 3
2025-08-25 09:45:14,123 - INFO - Using test size: 0.20
2025-08-25 09:45:14,127 - INFO - Training samples: 120
2025-08-25 09:45:14,129 - INFO - Test samples: 30
2025-08-25 09:45:14,131 - INFO - Class distribution in training set: {np.int64(0): 40, np.int64(2): 40, np.int64(1): 40}
2025-08-25 09:45:14,132 - INFO - Computed class weights:


{np.int64(0): 1.0, np.int64(1): 1.0, np.int64(2): 1.0}


# 9.Train Logistic Regression with Hyperparameter Tuning

In [69]:
import logging
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Assuming X_train, y_train, and class_weights are already defined before this

# Setup logging (if not already set)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

try:
    # Log the size of the training set
    n_train_samples = X_train.shape[0]
    logging.info(f"Number of training samples: {n_train_samples}")

    # Initialize LogisticRegression with computed class weights
    base_model = LogisticRegression(max_iter=1000, class_weight=class_weights)

    # Define hyperparameter grid
    param_grid = {
        'C': [0.1, 1, 10],
        'solver': ['lbfgs', 'liblinear']
    }

    # Adjust number of CV folds to be at most the number of training samples
    cv_folds = min(5, n_train_samples)
    if cv_folds < 2:
        raise ValueError("Not enough training samples to perform cross-validation.")

    logging.info(f"Using {cv_folds}-fold cross-validation")

    # Setup GridSearchCV
    grid = GridSearchCV(base_model, param_grid, cv=cv_folds, scoring='accuracy', n_jobs=-1)

    logging.info("Starting GridSearchCV...")
    grid.fit(X_train, y_train)

    # Retrieve and log best parameters
    best_model = grid.best_estimator_
    logging.info(f"Best Parameters found: {grid.best_params_}")

except Exception as e:
    logging.error(f"Error during model training or hyperparameter tuning: {e}")


2025-08-24 18:14:05,684 - INFO - Number of training samples: 4
2025-08-24 18:14:05,687 - INFO - Using 4-fold cross-validation
2025-08-24 18:14:05,688 - INFO - Starting GridSearchCV...
2025-08-24 18:14:05,701 - ERROR - Error during model training or hyperparameter tuning: n_splits=4 cannot be greater than the number of members in each class.


# 10.Evaluate the Model

In [76]:
import logging
from collections import Counter
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def main():
    try:
        # Load dataset (replace with your data)
        data = load_iris()
        X, y = data.data, data.target

        # Split dataset into train and test sets with stratification
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        logging.info(f"Training samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")

        # Calculate class weights to handle imbalanced classes
        counter = Counter(y_train)
        total = sum(counter.values())
        class_weights = {cls: total / (len(counter) * count) for cls, count in counter.items()}
        logging.info(f"Class weights: {class_weights}")

        # Define base Logistic Regression model with class weights
        base_model = LogisticRegression(max_iter=1000, class_weight=class_weights)

        # Hyperparameter grid for tuning
        param_grid = {
            'C': [0.1, 1, 10],
            'solver': ['lbfgs', 'liblinear']
        }

        # Setup GridSearchCV
        grid = GridSearchCV(base_model, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
        logging.info("Starting GridSearchCV...")
        grid.fit(X_train, y_train)

        # Best model and parameters
        best_model = grid.best_estimator_
        logging.info(f"Best Parameters found: {grid.best_params_}")

        # Prediction on test set
        y_pred = best_model.predict(X_test)

        # Evaluation
        test_accuracy = accuracy_score(y_test, y_pred)
        logging.info(f"Test Accuracy: {test_accuracy:.4f}")

        print("Classification Report:\n", classification_report(y_test, y_pred))
        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    except Exception as e:
        logging.error(f"An error occurred: {e}")

if __name__ == "__main__":
    main()


2025-08-24 18:17:48,249 - INFO - Training samples: 120, Test samples: 30
2025-08-24 18:17:48,251 - INFO - Class weights: {np.int64(0): 1.0, np.int64(2): 1.0, np.int64(1): 1.0}
2025-08-24 18:17:48,253 - INFO - Starting GridSearchCV...
2025-08-24 18:18:02,205 - INFO - Best Parameters found: {'C': 10, 'solver': 'lbfgs'}
2025-08-24 18:18:02,211 - INFO - Test Accuracy: 1.0000


Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        10
           2       1.00      1.00      1.00        10

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

Confusion Matrix:
 [[10  0  0]
 [ 0 10  0]
 [ 0  0 10]]


# 11.Save Model and Vectorizer

In [103]:
import joblib
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def save_model_and_vectorizer(best_model=None, tfidf=None):
    try:
        if best_model is not None:
            logging.info("Saving trained model...")
            joblib.dump(best_model, 'movie_genre_clf.joblib')
            logging.info("Model saved successfully as 'movie_genre_clf.joblib'.")
        else:
            logging.error("best_model is not defined. Please train the model before saving.")
            # Optionally, raise an exception or call your training function here
            # raise ValueError("Model not trained")

        if tfidf is not None:
            logging.info("Saving TF-IDF vectorizer...")
            joblib.dump(tfidf, 'movie_genre_tfidf.joblib')
            logging.info("Vectorizer saved successfully as 'movie_genre_tfidf.joblib'.")
        else:
            logging.warning("tfidf vectorizer not found. Skipping saving vectorizer.")

    except Exception as e:
        logging.error(f"Unexpected error while saving model or vectorizer: {type(e).__name__}: {e}")
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Dummy example objects
best_model = LogisticRegression()
tfidf = TfidfVectorizer()

# Now call the function with these objects
save_model_and_vectorizer(best_model=best_model, tfidf=tfidf)



2025-08-24 18:28:51,568 - INFO - Saving trained model...
2025-08-24 18:28:51,573 - INFO - Model saved successfully as 'movie_genre_clf.joblib'.
2025-08-24 18:28:51,574 - INFO - Saving TF-IDF vectorizer...
2025-08-24 18:28:51,578 - INFO - Vectorizer saved successfully as 'movie_genre_tfidf.joblib'.


# 12.Prediction Function

In [118]:
import logging
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import joblib

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def clean_text(text):
    return text.lower().strip()

def train_and_save_model():
    plots = [
        "A team of explorers travel through a wormhole in space to save humanity.",
        "A detective investigates a murder in a small town.",
        "A love story between two young people in New York.",
        "A spaceship crew tries to survive after an alien attack."
    ]
    genres = ['Sci-Fi', 'Mystery', 'Romance', 'Sci-Fi']

    plots_cleaned = [clean_text(p) for p in plots]

    # Remove stratify for small dataset
    X_train, X_test, y_train, y_test = train_test_split(
        plots_cleaned, genres, test_size=0.25, random_state=42, shuffle=True
    )

    tfidf = TfidfVectorizer()
    X_train_tfidf = tfidf.fit_transform(X_train)

    best_model = LogisticRegression(max_iter=1000)
    best_model.fit(X_train_tfidf, y_train)

    joblib.dump(tfidf, 'movie_genre_tfidf.joblib')
    joblib.dump(best_model, 'movie_genre_clf.joblib')

    logging.info("Model and vectorizer saved successfully.")

# Call the function to see the output
train_and_save_model()


2025-08-24 18:35:35,918 - INFO - Model and vectorizer saved successfully.
