In [1]:
#Import Required Libraries

import re
import string
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score

In [4]:
# 2. Load the labelled dataset.
print("\n==========================================")
print("STEP 1:LOADING DATASET")
print("==========================================")

# Column names are supplied through `names` (no header row is read from the
# file). The delimiter ":::" is longer than one character, so the Python
# parsing engine is requested explicitly.
df = pd.read_csv(
    "train_data.csv",
    sep=":::",
    engine="python",
    names=["id", "title", "genre", "text"],
)

# Quick confirmation of what was read.
print("Dataset Loaded Successfully.")
print("Columns Available:", list(df.columns))
print(f"Total records found:{len(df)}\n")


STEP 1:LOADING DATASET
Dataset Loaded Successfully.
Columns Available: ['id', 'title', 'genre', 'text']
Total records found:54214



In [5]:
# 3. Basic data cleaning.
print("\n==========================================")
print("STEP 2:Data Cleaning (Basic)")
print("==========================================\n")

# Keep only the model input ("text") and the target ("genre"), drop rows where
# either value is missing, then trim leading/trailing whitespace from both
# columns. Written as one chain so no frame is mutated in place.
df = (
    df[["text", "genre"]]
    .dropna(subset=["text", "genre"])
    .assign(
        text=lambda frame: frame["text"].str.strip(),
        genre=lambda frame: frame["genre"].str.strip(),
    )
)

print("2A.TEXT AND COLUMNS SUCCESSFULLY CLEANED.")
print("SAMPLE DATA AFTER BASIC CLEANING:-")
print(df.head(5))
print("------------------------------------------\n")


STEP 2:Data Cleaning (Basic)

2A.TEXT AND COLUMNS SUCCESSFULLY CLEANED.
SAMPLE DATA AFTER BASIC CLEANING:-
                                                text     genre
0  Listening in to a conversation between his doc...     drama
1  A brother and sister with a past incestuous re...  thriller
2  As the bus empties the students for their fiel...     adult
3  To help their unemployed father make ends meet...     drama
4  The film's title refers not only to the un-rec...     drama
------------------------------------------



In [6]:
#4.Text preprocessing function
def clean_sentence(sentence):
    """Normalise one piece of raw text before vectorisation.

    Steps, in order:
      1. lower-case the text;
      2. remove every run of digits matched by ``\\d+``;
      3. delete every character listed in ``string.punctuation``
         (characters are removed, not replaced by a space, so e.g.
         "un-recovered" becomes "unrecovered");
      4. collapse all whitespace runs to single spaces and trim the ends.

    Args:
        sentence: The raw text as a ``str``.

    Returns:
        The normalised text as a ``str`` (possibly empty).
    """
    punctuation_table = str.maketrans("", "", string.punctuation)

    lowered = sentence.lower()
    without_digits = re.sub(r"\d+", "", lowered)
    without_punctuation = without_digits.translate(punctuation_table)

    # split() with no argument discards leading/trailing/repeated whitespace.
    words = without_punctuation.split()
    return " ".join(words)

# Store the normalised text next to the raw text so both stay available.
df["clean_text"] = df["text"].apply(clean_sentence)

print("2B.TEXT PROCESSING SUCCESSFULLY COMPLETED.")
print("SAMPLE TEXT BEFORE AND AFTER PREPROCESSING:-")

# Side-by-side preview of the first five rows: raw vs. cleaned.
preview_columns = ["text", "clean_text"]
sample_preview = df[preview_columns].head(5)
print(sample_preview)
print("------------------------------------------\n")


2B.TEXT PROCESSING SUCCESSFULLY COMPLETED.
SAMPLE TEXT BEFORE AND AFTER PREPROCESSING:-
                                                text  \
0  Listening in to a conversation between his doc...   
1  A brother and sister with a past incestuous re...   
2  As the bus empties the students for their fiel...   
3  To help their unemployed father make ends meet...   
4  The film's title refers not only to the un-rec...   

                                          clean_text  
0  listening in to a conversation between his doc...  
1  a brother and sister with a past incestuous re...  
2  as the bus empties the students for their fiel...  
3  to help their unemployed father make ends meet...  
4  the films title refers not only to the unrecov...  
------------------------------------------



In [7]:
# 5. Separate features from labels, then hold out a test set.

X = df["clean_text"]  # model input: preprocessed text
y = df["genre"]       # prediction target: genre label

# Shuffled 80/20 split; the fixed random_state makes the partition repeatable.
# NOTE(review): the split is not stratified. If the genres are imbalanced,
# stratify=y may be worth considering - confirm every genre has at least two
# rows before enabling it.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

print("\n==========================================")
print(" STEP 3: DATA SPLITTING ")
print("==========================================\n")

n_train = len(X_train)
n_test = len(X_test)
print(f"Data used to TRAIN (teach) the model : {n_train} samples")
print(f"Data used to TEST  (check) the model : {n_test} samples\n")



 STEP 3: DATA SPLITTING 

Data used to TRAIN (teach) the model : 43371 samples
Data used to TEST  (check) the model : 10843 samples



In [8]:
# 6. Convert the text into numeric TF-IDF features.
print("\n==========================================")
print(" STEP 4: TF-IDF VECTORIZATION ")
print("==========================================")

divider = "-" * 70

# One training sample as plain text, before vectorisation.
print(divider)
print("TEXT BEFORE CONVERSION (HUMAN-READABLE):-")
print(divider)
print(X_train.iloc[2], "\n")

# English stop words are removed and the vocabulary is capped at 5000 terms.
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)

# The vectorizer is fitted on the training split only; the test split is
# transformed with that already-fitted vectorizer.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# The same training sample after vectorisation (one sparse row).
print(divider)
print("TEXT AFTER CONVERSION (MACHINE READABLE NUMBERS):-")
print(divider)
print(X_train_vec[2])


 STEP 4: TF-IDF VECTORIZATION 
----------------------------------------------------------------------
TEXT BEFORE CONVERSION (HUMAN-READABLE):-
----------------------------------------------------------------------
the onehour special examines this socially ethnically religiously and economically diverse state that just like in could determine the outcome of the presidential election wilmore is intent on exploring how this unique state that produced two revered american institutions nascar and hooters will choose the next leader of the free world 

----------------------------------------------------------------------
TEXT AFTER CONVERSION (MACHINE READABLE NUMBERS):-
----------------------------------------------------------------------
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 22 stored elements and shape (1, 5000)>
  Coords	Values
  (0, 3129)	0.25344944408414305
  (0, 4194)	0.162163112922227
  (0, 1580)	0.20131220727851937
  (0, 4147)	0.25941959407053256
  (0, 1

In [9]:

# 7. Train the model (STEP 5): multinomial Naive Bayes on the TF-IDF features.

rule = "-" * 70
print(f"\n{rule}")
print("STEP 5: MODEL TRAINING (NAIVE BAYES CLASSIFIER)")
print(f"{rule}\n")

# Default hyper-parameters; fitted on the vectorised training split.
model = MultinomialNB()
model.fit(X_train_vec, y_train)

# Short status report for the reader.
for status_line in (
    "Status : Model training completed",
    "Input  : TF-IDF numeric vectors",
    "Output : Learned word–genre patterns\n",
):
    print(status_line)


----------------------------------------------------------------------
STEP 5: MODEL TRAINING (NAIVE BAYES CLASSIFIER)
----------------------------------------------------------------------

Status : Model training completed
Input  : TF-IDF numeric vectors
Output : Learned word–genre patterns



In [10]:
# 8. Model evaluation
#
# Accuracy:
#   Fraction of test samples whose predicted genre equals the true genre.
#
# F1 score:
#   Balances precision and recall. With average='weighted' it is computed per
#   genre and averaged with weights proportional to each genre's number of
#   true samples, which is useful when classes are imbalanced.
#
# Confusion matrix:
#   Entry [i, j] counts test samples whose true genre is label i and whose
#   predicted genre is label j. The label order is printed after the matrix
#   so every row/column can be mapped back to a genre.
print("==========================================")
print(" STEP 6: MODEL EVALUATION ")
print("==========================================")
y_pred = model.predict(X_test_vec)

# Fix the class order explicitly: the sorted union of true and predicted
# labels. This is the same order confusion_matrix uses when `labels` is
# omitted, so the matrix values are unchanged - but the order is now known
# and can be reported alongside it.
genre_labels = sorted(set(y_test) | set(y_pred))

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
cm = confusion_matrix(y_test, y_pred, labels=genre_labels)

print(f"Accuracy Score : {accuracy:.4f}")
print(f"F1 Score       : {f1:.4f}")
print("Confusion Matrix:")
print(cm)
print("Row/column order (rows = true genre, columns = predicted genre):")
print([str(label) for label in genre_labels])


 STEP 6: MODEL EVALUATION 
Accuracy Score : 0.5211
F1 Score       : 0.4415
Confusion Matrix:
[[  20    0    0    0    0   18    0   55  154    0    0    0    0    8
     0    0    0    0    0    0    0    2    3    0    2    0    1]
 [   0    6    7    0    0   37    0    9   49    0    0    0    0    0
     0    0    0    0    0    0    0    3    0    0    1    0    0]
 [   3    0   10    0    0   10    0   38   66    0    0    0    0    7
     0    0    0    0    0    0    1    2    0    0    0    0    2]
 [   0    0    0    0    0   18    0   38   45    0    0    0    0    0
     0    0    0    0    0    0    0    3    0    0    0    0    0]
 [   0    0    0    0    0    2    0   39   20    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0]
 [   2    0    1    0    0  630    0  156  641    0    0    0    0    6
     1    0    0    0    1    0    0    5    0    0    0    0    0]
 [   1    0    0    0    0    9    0   10   83    0    0    0    0 