<a href="https://colab.research.google.com/github/vineethkunar/LargeLanguageModels/blob/main/LLM_Code_2ndAssgn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required NLP and evaluation libraries from Hugging Face
!pip install -q transformers datasets evaluate
# Suppress deprecation warnings to keep the notebook output clean
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Import core data manipulation libraries
import pandas as pd
import numpy as np
# Import text processing libraries for NLP
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Import visualization libraries
from wordcloud import WordCloud
import seaborn as sns
import matplotlib.pyplot as plt
# Import scikit-learn utilities for feature extraction, evaluation, and data preparation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Import system utilities
import os
# Import Hugging Face Transformers for BERT model
from transformers import BertForSequenceClassification
# Import PyTorch for tensor operations and GPU support
import torch
# Import Hugging Face Datasets for preparing inputs
from datasets import Dataset
# Import tokenizer, model, and training tools from Transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
# Import evaluation tools for model performance
import evaluate

In [None]:
# Read the train and test splits (CSV files hosted on GitHub) into DataFrames.
# The first URL feeds Sub_Train_Df, the second feeds Sub_Test_Df.
Sub_Train_Df, Sub_Test_Df = map(
    pd.read_csv,
    (
        "https://raw.githubusercontent.com/vineethkunar/LargeLanguageModels/refs/heads/main/train.csv",
        "https://raw.githubusercontent.com/vineethkunar/LargeLanguageModels/refs/heads/main/test.csv",
    ),
)

In [None]:
# Preview the first five rows of the training DataFrame.
# Leaving the frame as the cell's last expression lets the notebook render it
# as a formatted table; print() would fall back to wrapped plain text.
Sub_Train_Df.head()

In [None]:
# Generate summary statistics for the training dataset using describe() with its
# default arguments; the result is shown as the cell's output (last expression)
Sub_Train_Df.describe()

In [None]:
# Record the training DataFrame's dimensions: shape[0] is the row count,
# shape[1] is the column count
Sub_Train_rows = Sub_Train_Df.shape[0]
Sub_Train_Cols = Sub_Train_Df.shape[1]
# Report both numbers in one readable sentence
print(f"The shape of the original dataset is {Sub_Train_rows} reviews with {Sub_Train_Cols} columns.")

In [None]:
# Remove the 'Id' column from both the training and the test DataFrame.
# errors="ignore" keeps this cell idempotent: running it again after 'Id' is
# already gone is a no-op instead of raising KeyError.
Sub_Train_Df = Sub_Train_Df.drop(columns=['Id'], errors="ignore")
Sub_Test_Df = Sub_Test_Df.drop(columns=['Id'], errors="ignore")

In [None]:
# Count the missing (null) values in each column of the training dataset;
# the resulting Series is displayed as the cell's output (last expression)
Sub_Train_Df.isna().sum()

In [None]:
# Print a summary of the training DataFrame via DataFrame.info()
# (per-column data types and non-null counts)
Sub_Train_Df.info()

In [None]:
# Fetch the NLTK resources used by the text-cleaning step:
# the stopword lists and the WordNet data
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# English stopwords, stored in a set for fast membership checks
filtered_stopwords = set(stopwords.words('english'))

# Lemmatizer used to normalize words during cleaning
subject_lemmatizer = WordNetLemmatizer()

# Function to clean and prepare text for subject classification
def prepare_subject_text(raw_input):
    """
    Clean and normalize a piece of text for subject classification.

    The text is lower-cased, every character other than a-z and whitespace is
    removed, the remainder is split on whitespace, tokens found in
    `filtered_stopwords` are dropped, and each remaining token is passed
    through `subject_lemmatizer.lemmatize`. The surviving tokens are joined
    with single spaces.

    Parameters:
    raw_input (str): Raw comment text. Non-string values (for example None or
        the float NaN that pandas uses for missing cells) are treated as
        empty text.

    Returns:
    str: The cleaned, lemmatized text; an empty string for non-string input.
    """
    # Missing cells reach this function as None/NaN rather than str; calling
    # .lower() on them would raise AttributeError, so short-circuit here.
    if not isinstance(raw_input, str):
        return ''

    lowered = raw_input.lower()
    # Characters are deleted, not replaced by a space, so "e.g." becomes "eg"
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    return ' '.join(
        subject_lemmatizer.lemmatize(token)
        for token in letters_only.split()
        if token not in filtered_stopwords
    )

# Clean every training comment and keep the result in a new column
Sub_Train_Df['Processed_Subject_Text'] = Sub_Train_Df['Comment'].map(prepare_subject_text)

# Display the raw and the cleaned text next to each other for a visual check
Sub_Train_Df.loc[:, ['Comment', 'Processed_Subject_Text']]


In [None]:
# Character count of each original (uncleaned) comment, stored as a new column
Sub_Train_Df['len_orgcomment'] = Sub_Train_Df['Comment'].map(len)

In [None]:
# Flag rows that exactly repeat an earlier row, and report how many there are
duplicate_mask = Sub_Train_Df.duplicated()
print(f"Number of duplicate rows: {duplicate_mask.sum()}")
# Keep only the first occurrence of each row
Sub_Train_Df = Sub_Train_Df.drop_duplicates()
# Report the DataFrame's shape after de-duplication
print(f"Shape after removing duplicates: {Sub_Train_Df.shape}")

In [None]:
# Display the first five rows of the training DataFrame as it stands after the
# preceding cleaning, length-column and de-duplication cells
Sub_Train_Df.head()

In [None]:
# Show the training DataFrame's column labels after the preprocessing steps
# (displayed as the cell's last expression rather than printed)
Sub_Train_Df.columns

In [None]:
# Count how many rows carry each value of the target 'Topic' column
Sub_count = Sub_Train_Df['Topic'].value_counts()
# Print the per-topic counts under a heading
print("Topic Distribution:\n", Sub_count)

In [None]:
# Bar chart of the number of comments per topic.
# The explicit Figure/Axes interface is used so labels are set on this plot's
# own axes instead of relying on pyplot's implicit "current axes" state.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Topic', data=Sub_Train_Df, ax=ax)
ax.set_title("Topic Distribution")
ax.set_xlabel("Topic")
ax.set_ylabel("Count")
plt.show()


In [None]:
# Word count of each cleaned comment: split on whitespace, then take the
# length of the resulting token list
Sub_Train_Df['len_ppcomment'] = Sub_Train_Df['Processed_Subject_Text'].str.split().str.len()

In [None]:
# Histogram (30 bins, with a KDE overlay) of the per-comment word counts
# computed from the cleaned text
hist_fig, hist_ax = plt.subplots(figsize=(8, 5))
sns.histplot(Sub_Train_Df['len_ppcomment'], bins=30, kde=True, ax=hist_ax)
hist_ax.set(
    title="Word Count Distribution in Subject Classification Comments",
    xlabel="Number of Words in Comment",
    ylabel="Number of Comments",
)
plt.show()

In [None]:
# Encode the topic names as integers. The encoder is fitted on the training
# topics only and then reused for the test split, so both share one mapping.
subject_label_encoder = LabelEncoder()
Sub_Train_Df["Encoded_Label"] = subject_label_encoder.fit_transform(Sub_Train_Df["Topic"])
Sub_Test_Df["Encoded_Label"] = subject_label_encoder.transform(Sub_Test_Df["Topic"])
# Additional names for the same DataFrame objects (no copy is made)
Subject_Train_Data = Sub_Train_Df
Subject_Test_Data = Sub_Test_Df
# Convert the DataFrames into Hugging Face Dataset objects.
# preserve_index=False: once drop_duplicates has removed rows, the training
# index is no longer a plain RangeIndex and would otherwise be carried into
# the Dataset as an extra "__index_level_0__" column.
Subject_Train_Set = Dataset.from_pandas(Subject_Train_Data, preserve_index=False)
Subject_Test_Set = Dataset.from_pandas(Subject_Test_Data, preserve_index=False)

In [None]:
# Name of the pre-trained checkpoint used for subject classification
subject_model_name = "bert-base-uncased"
# Load the tokenizer that belongs to that checkpoint
subject_tokenizer = AutoTokenizer.from_pretrained(subject_model_name)

In [None]:
# Define a function to tokenize and encode the input comments for the model
def encode_subject_batch(batch):
    """
    Tokenize a batch of comments and attach their integer labels.

    Parameters:
    batch: Mapping that provides the raw texts under "Comment" and the
        encoded targets under "Encoded_Label".

    Returns:
    The tokenizer's output for the texts, with an added "labels" entry
    holding the batch's "Encoded_Label" values.
    """
    # Every sequence is padded to, and truncated at, 128 tokens
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    encoded = subject_tokenizer(batch["Comment"], **tokenizer_options)
    encoded["labels"] = batch["Encoded_Label"]
    return encoded

# Tokenize both splits, passing whole batches to encode_subject_batch
Subject_Train_Set, Subject_Test_Set = [
    split.map(encode_subject_batch, batched=True)
    for split in (Subject_Train_Set, Subject_Test_Set)
]


In [None]:
# Load the pre-trained BERT model with a sequence-classification head.
# The checkpoint name is the same variable the tokenizer was loaded from, so
# model and tokenizer cannot drift apart. The number of output labels is read
# from the fitted label encoder instead of being hard-coded.
subject_model = BertForSequenceClassification.from_pretrained(
    subject_model_name,
    num_labels=len(subject_label_encoder.classes_),
)

In [None]:
# Helper values for subject-classification training
subject_batch_size = 64
# Number of complete batches of that size in the training set (floor division)
subject_logging_steps = Subject_Train_Set.num_rows // subject_batch_size
# Last "/"-separated component of the checkpoint name
subject_model_id = subject_model_name.rsplit("/", 1)[-1]

In [None]:
# Disable Weights & Biases logging during training
os.environ["WANDB_DISABLED"] = "true"
# Training configuration for the subject classification model.
# It is bound to `subject_training_args`, the name the Trainer cell reads.
subject_training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Gradients are accumulated over 2 steps: 4 * 2 = 8 examples per update per device
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    # Mixed precision only when a CUDA GPU is present; requesting fp16 on a
    # CPU-only runtime makes TrainingArguments raise
    fp16=torch.cuda.is_available(),
    logging_dir="./logs",
    logging_steps=10,
    report_to="none"
)
# Keep the previous variable name bound as well, for any cell that still uses it
training_args = subject_training_args

In [None]:
# Load the "accuracy" metric through the evaluate library; it is used by
# compute_subject_metrics to score predictions
subject_accuracy_metric = evaluate.load("accuracy")

def compute_subject_metrics(prediction_batch):
    """
    Score a batch of subject predictions with the accuracy metric.

    The raw per-class scores are reduced to predicted class indices by taking
    the arg-max over the last axis; those indices are then compared against
    the reference labels.

    Parameters:
    prediction_batch: A two-item iterable that unpacks into
        (model output scores, true subject labels).

    Returns:
    dict: The result of `subject_accuracy_metric.compute` for this batch.
    """
    logits, reference_labels = prediction_batch
    # Highest-scoring class per example
    predicted_subjects = torch.tensor(logits).argmax(dim=-1)
    return subject_accuracy_metric.compute(
        references=reference_labels,
        predictions=predicted_subjects,
    )

In [None]:
# Assemble the Trainer from the model, the training configuration, the two
# tokenized splits and the metric callback.
# NOTE(review): this cell expects the TrainingArguments object to be bound to
# the name `subject_training_args` — confirm the configuration cell defines it.
subject_trainer = Trainer(
    compute_metrics=compute_subject_metrics,
    eval_dataset=Subject_Test_Set,
    train_dataset=Subject_Train_Set,
    args=subject_training_args,
    model=subject_model,
)
# Run training; the value returned by train() is shown as the cell's output
subject_trainer.train()

In [None]:
# Print the original subject class names stored on the fitted label encoder
# (its classes_ attribute)
print("Class Names:", subject_label_encoder.classes_)

In [None]:
# Run evaluation through the trainer and keep the returned metrics mapping
subject_eval_results = subject_trainer.evaluate()
# Print one "name: value" line per metric, values formatted to four decimals
print("Evaluation Results:")
for metric_name in subject_eval_results:
    metric_value = subject_eval_results[metric_name]
    print(f"{metric_name}: {metric_value:.4f}")

In [None]:
# Generate predictions on the test set using the trained model
subject_predictions = subject_trainer.predict(Subject_Test_Set)
# True labels, and predicted labels taken as the arg-max of the class scores
true_subject_labels = subject_predictions.label_ids
predicted_subject_labels = np.argmax(subject_predictions.predictions, axis=1)
# Fix the label order to 0..n_classes-1 so the matrix always has one row and
# column per known class (even if a class is absent from the test
# predictions), which keeps it aligned with the class names used as tick labels.
subject_class_ids = np.arange(len(subject_label_encoder.classes_))
conf_matrix = confusion_matrix(
    true_subject_labels,
    predicted_subject_labels,
    labels=subject_class_ids,
)
# Show the original topic names on the axes instead of bare integer ids
conf_matrix_display = ConfusionMatrixDisplay(
    confusion_matrix=conf_matrix,
    display_labels=subject_label_encoder.classes_,
)
conf_matrix_display.plot(cmap="Blues", values_format="d")
plt.title("Confusion Matrix for Subject Classification")
plt.grid(False)
plt.show()

In [None]:
# Build the classification report with rows labelled by the original topic
# names rather than integer ids. `labels` pins the row order to the encoder's
# class order so each name lines up with the right class.
subject_class_names = [str(name) for name in subject_label_encoder.classes_]
subject_report = classification_report(
    true_subject_labels,
    predicted_subject_labels,
    labels=list(range(len(subject_class_names))),
    target_names=subject_class_names,
    digits=4,
)
print("Classification Report:\n")
print(subject_report)

In [None]:
# Original comment texts from the test dataset
sample_subject_texts = Subject_Test_Set["Comment"]
# Show up to 5 sample predictions with their predicted and true labels.
# The count is capped at the test-set size so a test set with fewer than
# five rows does not raise IndexError.
sample_count = min(5, len(Subject_Test_Set))
print("\nSample Predictions on Test Set :\n")
for i in range(sample_count):
    print(f"Text: {sample_subject_texts[i]}")
    print(f"Predicted Label: {predicted_subject_labels[i]}, True Label: {true_subject_labels[i]}")
    print("-" * 60)