In [None]:
# Task 1 - Movie Genre Classification

# Step 1: Importing Libraries
# ---------------------------------------------------
# In this step, I am importing all the required libraries for data loading, cleaning, and model building.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Step 2: Load the Dataset
# ---------------------------------------------------
# The dataset is taken from Kaggle (IMDB Genre Classification Dataset).
# I have already downloaded the CSV file and placed it in my working directory.

df = pd.read_csv("Genre Classification Dataset/train_data.txt", sep=' ::: ', engine='python', names=['id', 'movie', 'genre', 'plot'])
# change filename as per your file
df.head()

# Step 3: Basic Data Info
# ---------------------------------------------------
print("Shape of dataset:", df.shape)
print("Columns:", df.columns)
print(df['genre'].value_counts())

# Step 4: Data Cleaning
# ---------------------------------------------------
# Here I am cleaning the plot text by removing punctuations, special characters, and stopwords.

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special chars
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return " ".join(words)

df['clean_plot'] = df['plot'].apply(clean_text)

# Step 5: Splitting Data
# ---------------------------------------------------
# I am using 80% data for training and 20% for testing.

X = df['clean_plot']
y = df['genre']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Text Vectorization
# ---------------------------------------------------
# Converting text into numerical form using TF-IDF (Term Frequency â€“ Inverse Document Frequency).

tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print("Shape of TF-IDF Matrix:", X_train_tfidf.shape)

# Step 7: Model Training
# ---------------------------------------------------
# I chose Logistic Regression since it performs well on text classification tasks.

model = LogisticRegression(max_iter=200)
model.fit(X_train_tfidf, y_train)

# Step 8: Model Evaluation
# ---------------------------------------------------
# Checking model performance using accuracy and classification report.

y_pred = model.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Step 9: Sample Prediction
# ---------------------------------------------------
# Let's test with a sample movie plot.

sample_plot = ["A young boy discovers he has magical powers and attends a school for wizards."]
sample_clean = [clean_text(sample_plot[0])]
sample_tfidf = tfidf.transform(sample_clean)
prediction = model.predict(sample_tfidf)

print("\nSample Prediction:", prediction[0])


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/syedmadni/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Shape of dataset: (54214, 4)
Columns: Index(['id', 'movie', 'genre', 'plot'], dtype='object')
genre
drama          13613
documentary    13096
comedy          7447
short           5073
horror          2204
thriller        1591
action          1315
western         1032
reality-tv       884
family           784
adventure        775
music            731
romance          672
sci-fi           647
adult            590
crime            505
animation        498
sport            432
talk-show        391
fantasy          323
mystery          319
musical          277
biography        265
history          243
game-show        194
news             181
war              132
Name: count, dtype: int64
Shape of TF-IDF Matrix: (43371, 5000)
Accuracy: 0.580282209720557

Classification Report:
               precision    recall  f1-score   support

      action       0.51      0.26      0.35       263
       adult       0.77      0.21      0.34       112
   adventure       0.43      0.14      0.21       139

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
