# Project: Movie Genre Classification

## Problem Statement
### Build a machine learning model that predicts the genre of a movie based on its plot description

# Step 1: Import required Libraries

In [2]:
import pandas as pd
import numpy as np
import re
import string

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score, classification_report

# Step 2: Load Dataset

In [4]:
def _read_genre_file(path, columns):
    """Read one " ::: "-delimited dataset file into a DataFrame.

    The files carry no header row, so the column names are supplied by the
    caller via ``columns``.
    """
    return pd.read_csv(path, sep=" ::: ", engine='python', names=columns)


# Labelled plots used for fitting the models.
train_data = _read_genre_file("train_data.txt", ["ID", "TITLE", "GENRE", "DESCRIPTION"])

# Plots to predict on; this file is read without a GENRE column.
test_data = _read_genre_file("test_data.txt", ["ID", "TITLE", "DESCRIPTION"])

# Same layout as the training file; its GENRE column is used later for scoring.
test_solution = _read_genre_file("test_data_solution.txt", ["ID", "TITLE", "GENRE", "DESCRIPTION"])


# Step 3: Text Preprocessing

In [8]:
# Built once rather than on every call: clean_text runs once per row.
_DIGIT_RUN = re.compile(r'\d+')
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise a plot description before vectorisation.

    The text is lower-cased, runs of digits are removed, every character in
    ``string.punctuation`` is deleted, and leading/trailing whitespace is
    trimmed.

    Parameters
    ----------
    text : str
        Raw description. Any non-string value (e.g. ``None`` or the float
        NaN that pandas uses for a missing cell) is treated as an empty
        description instead of raising ``AttributeError``.

    Returns
    -------
    str
        The cleaned text; ``""`` for empty or non-string input.
    """
    if not isinstance(text, str):
        # Missing descriptions arrive as None/NaN; there is nothing to clean.
        return ""
    text = text.lower()
    text = _DIGIT_RUN.sub('', text)
    text = text.translate(_PUNCT_TABLE)
    return text.strip()

# Add a cleaned copy of each description next to the raw column, for both
# the training and the test frame.
for _frame in (train_data, test_data):
    _frame["clean_desc"] = _frame["DESCRIPTION"].map(clean_text)

# Step 4: Convert Text to TF-IDF Features

In [9]:
# TF-IDF vectoriser configured with the built-in English stop-word list and
# max_features=5000.
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

# Target labels for the training rows.
y_train = train_data["GENRE"]

# The vectoriser is fitted on the training text only; the test text is then
# transformed with that already-fitted vectoriser.
X_train = tfidf.fit_transform(train_data["clean_desc"])
X_test = tfidf.transform(test_data["clean_desc"])

# Step 5: Train Model

## 1. Naive Bayes (Fast + Good for Text)

In [10]:
# Multinomial Naive Bayes on the TF-IDF features; fit() returns the fitted
# estimator, so construction and training are chained.
nb_model = MultinomialNB().fit(X_train, y_train)

# Predicted genre for every test description.
y_pred_nb = nb_model.predict(X_test)

## 2. Logistic Regression (Better Accuracy)

In [11]:
# Logistic regression with the iteration limit set to 200; fit() returns the
# fitted estimator, so construction and training are chained.
lr_model = LogisticRegression(max_iter=200).fit(X_train, y_train)

# Predicted genre for every test description; these are the predictions
# that get scored and saved further down.
y_pred_lr = lr_model.predict(X_test)

## 3. Support Vector Machine (High Performance)

In [13]:
# Linear support-vector classifier on the TF-IDF features.
# random_state is pinned so that a Restart & Run All reproduces the same
# model: LinearSVC can shuffle the data with a pseudo-random generator while
# optimising, and without a seed the fitted coefficients may differ between
# runs.
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train, y_train)

# Predicted genre for every test description.
y_pred_svm = svm_model.predict(X_test)

# Step 6: Evaluate Model

In [14]:
# Ground-truth genres for the test descriptions.
y_test_actual = test_solution["GENRE"]

# The predictions were produced row-by-row from test_data, while the labels
# come from a separately loaded file. Positional scoring is only meaningful
# if both frames list the same movies in the same order, so check that
# explicitly instead of silently reporting a wrong accuracy.
if not np.array_equal(test_data["ID"].to_numpy(), test_solution["ID"].to_numpy()):
    raise ValueError(
        "test_data and test_solution are not aligned by ID; "
        "predictions cannot be scored positionally"
    )

print("Accuracy:", accuracy_score(y_test_actual, y_pred_lr))

# Some genres are never predicted, which makes their precision undefined.
# zero_division=0 reports those as 0.0 (the same value shown by default)
# without emitting an UndefinedMetricWarning for each of them.
print(classification_report(y_test_actual, y_pred_lr, zero_division=0))

Accuracy: 0.5825461254612546


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

      action       0.48      0.28      0.36      1314
       adult       0.59      0.23      0.33       590
   adventure       0.59      0.16      0.25       775
   animation       0.49      0.06      0.11       498
   biography       0.00      0.00      0.00       264
      comedy       0.53      0.58      0.55      7446
       crime       0.36      0.04      0.07       505
 documentary       0.67      0.85      0.75     13096
       drama       0.54      0.77      0.64     13612
      family       0.50      0.09      0.15       783
     fantasy       0.57      0.06      0.11       322
   game-show       0.91      0.51      0.66       193
     history       0.00      0.00      0.00       243
      horror       0.64      0.57      0.60      2204
       music       0.68      0.45      0.54       731
     musical       0.25      0.02      0.03       276
     mystery       0.30      0.01      0.02       318
        news       0.71    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Step 7: Save Predictions

In [15]:
# Build the submission as a new frame: every column of test_data plus the
# logistic-regression predictions appended as the last column. test_data
# itself is left unmodified.
submission = test_data.assign(predicted_Genre=y_pred_lr)

# Write without the row index so the CSV holds only the data columns.
submission.to_csv("submission.csv", index=False)