<a href="https://colab.research.google.com/github/thevedantt/SvaraAI-Internship-Assignment/blob/main/PART_A_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Load and preprocess both columns

import pandas as pd
import re

# Step 1: Load dataset
df = pd.read_csv("/content/reply_classification_dataset.csv")

# Step 2: Normalize column names
df.columns = df.columns.str.strip().str.lower()

# Step 3: Preprocess labels
df['label'] = df['label'].astype(str).str.strip().str.lower()

# Step 3a: Convert labels to numeric
label_mapping = {'positive': 1, 'negative': 0, 'neutral': 2}
df['label'] = df['label'].map(label_mapping)

# Optional: Drop rows with labels that were not in the mapping
df = df.dropna(subset=['label'])
df['label'] = df['label'].astype(int)  # convert to integer


# Step 4: Preprocess reply text
def clean_reply(text):
    text = str(text).lower()  # lowercase
    # Normalize common special characters
    text = text.replace("â€™", "'").replace("â€œ", '"').replace("â€", '"')
    text = re.sub(r"http\S+|www\S+", "", text)  # remove URLs
    text = re.sub(r"[^a-z\s']", "", text)       # keep letters and apostrophes
    text = re.sub(r"\s+", " ", text).strip()   # remove extra spaces
    return text

df['reply'] = df['reply'].apply(clean_reply)

# Step 5: Check for missing values
print("Missing values per column:\n", df.isnull().sum())

# Drop rows with missing labels
df = df.dropna(subset=['label'])

# Step 6: Show unique labels
print("\nUnique labels after cleaning:", df['label'].unique())

# Step 7: Display first 5 and last 5 cleaned rows
print("\nFirst 5 rows:\n")
print(df.head(5))

print("\nLast 5 rows:\n")
print(df.tail(5))

# Step 8: Save cleaned data to a new CSV
df.to_csv("data.csv", index=False)
print("\nCleaned data with numeric labels saved to data.csv")



Missing values per column:
 reply    0
label    0
dtype: int64

Unique labels after cleaning: [2 1 0]

First 5 rows:

                                               reply  label
0                             can we discuss pricing      2
1  im excited to explore this further plz send co...      1
2                   we not looking for new solutions      0
3                  could u clarify features included      2
4             lets schedule a meeting to dive deeper      1

Last 5 rows:

                                     reply  label
2124        ill forward this to my manager      2
2125        can you share more information      2
2126    send me the details and ill review      2
2127     what exactly does your product do      2
2128  i am not the right person to contact      0

Cleaned data with numeric labels saved to data.csv


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Load dataset
df = pd.read_csv("data.csv")

# Drop duplicate replies
df = df.drop_duplicates(subset=["reply"])

# Train / Test split (hold out true unseen test set)
train_df, test_df = train_test_split(
    df, test_size=0.2, stratify=df['label'], random_state=42
)

# Train / Validation split (for early stopping)
train_df, val_df = train_test_split(
    train_df, test_size=0.2, stratify=train_df['label'], random_state=42
)

print("Train size:", len(train_df))
print("Validation size:", len(val_df))
print("Test size:", len(test_df))

# Vectorizer
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=1500,
    min_df=5,
    max_df=0.6,
    stop_words="english"
)

X_train = vectorizer.fit_transform(train_df["reply"])
y_train = train_df["label"]

X_val = vectorizer.transform(val_df["reply"])
y_val = val_df["label"]

X_test = vectorizer.transform(test_df["reply"])
y_test = test_df["label"]

# Compute class weights manually
classes = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=y_train
)
class_weight_dict = dict(zip(classes, class_weights))

# Map sample weights for y_train
sample_weight = np.array([class_weight_dict[label] for label in y_train])

# Model
clf = SGDClassifier(
    loss="log_loss",
    max_iter=1,
    penalty="elasticnet",
    alpha=1e-3,       # stronger regularization
    l1_ratio=0.3,
    random_state=42,
    warm_start=True
)

# Cross-validation (sanity check)
scores = cross_val_score(clf, X_train, y_train, cv=5, scoring="f1_weighted")
print("CV F1 scores:", scores)
print("Mean F1:", scores.mean())

# Training with early stopping
best_val_f1 = 0
patience = 2
wait = 0
epochs = 10

for epoch in range(epochs):
    clf.partial_fit(X_train, y_train, classes=classes, sample_weight=sample_weight)

    val_pred = clf.predict(X_val)
    val_report = classification_report(y_val, val_pred, output_dict=True, zero_division=0)
    val_f1 = val_report["weighted avg"]["f1-score"]

    print(f"Epoch {epoch+1} - Val F1: {val_f1:.4f}")

    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print("Early stopping triggered")
            break

# Final test evaluation
test_pred = clf.predict(X_test)
print("\nTest Set Performance:")
print(classification_report(y_test, test_pred, zero_division=0))


Train size: 202
Validation size: 51
Test size: 64
CV F1 scores: [0.94957056 0.90326217 0.97511244 0.9024106  0.92533333]
Mean F1: 0.9311378206162899
Epoch 1 - Val F1: 0.9408
Epoch 2 - Val F1: 0.9608
Epoch 3 - Val F1: 0.9608




Epoch 4 - Val F1: 0.9608
Early stopping triggered

Test Set Performance:
              precision    recall  f1-score   support

           0       0.77      1.00      0.87        17
           1       0.91      0.95      0.93        22
           2       1.00      0.76      0.86        25

    accuracy                           0.89        64
   macro avg       0.90      0.90      0.89        64
weighted avg       0.91      0.89      0.89        64



In [24]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.calibration import CalibratedClassifierCV
import numpy as np
import joblib
import os

# Load dataset
df = pd.read_csv("data.csv")
print("Original dataset size:", len(df))

# Train/Test split
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
print("Train size:", len(train_df), " | Test size:", len(test_df))

# Train/Validation split
train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['label'], random_state=42)
print("Train size:", len(train_df), " | Validation size:", len(val_df), " | Test size:", len(test_df))

# Vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,3), max_features=2000, min_df=5, max_df=0.6, stop_words="english")
X_train = vectorizer.fit_transform(train_df["reply"])
y_train = train_df["label"]

X_val = vectorizer.transform(val_df["reply"])
y_val = val_df["label"]

X_test = vectorizer.transform(test_df["reply"])
y_test = test_df["label"]

# Compute class weights
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
class_weight_dict = dict(zip(classes, class_weights))
sample_weight = np.array([class_weight_dict[label] for label in y_train])

# Model
clf = SGDClassifier(loss="log_loss", max_iter=1, penalty="elasticnet", alpha=1e-3, l1_ratio=0.3, random_state=42, warm_start=True)

# Cross-validation
scores = cross_val_score(clf, X_train, y_train, cv=5, scoring="f1_weighted")
print("CV F1 scores:", scores)
print("Mean CV F1:", scores.mean())

# Training with early stopping
best_val_f1 = 0
patience = 2
wait = 0
epochs = 10

for epoch in range(epochs):
    clf.partial_fit(X_train, y_train, classes=classes, sample_weight=sample_weight)
    val_pred = clf.predict(X_val)
    val_f1 = classification_report(y_val, val_pred, output_dict=True, zero_division=0)["weighted avg"]["f1-score"]
    val_acc = accuracy_score(y_val, val_pred)
    print(f"Epoch {epoch+1} - Val Accuracy: {val_acc:.4f}, Val F1: {val_f1:.4f}")
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print("Early stopping triggered")
            break

# Calibrate model
calibrated_clf = CalibratedClassifierCV(estimator=clf, cv=5, method="sigmoid")
calibrated_clf.fit(X_train, y_train, sample_weight=sample_weight)

# Final test evaluation
test_pred = calibrated_clf.predict(X_test)
test_acc = accuracy_score(y_test, test_pred)
print("\nTest Accuracy:", round(test_acc, 4))
print("\nTest Set Performance:")
print(classification_report(y_test, test_pred, zero_division=0))


# Part 5: Example Predictions
print("\n--- Example Predictions with Final Model ---")
example_replies = [
    "Looking forward to the demo tomorrow.",
    "I'll need to check with my team.",
    "Can we discuss pricing??",
    "no thanks"
]

X_example = vectorizer.transform(example_replies)
example_probs = calibrated_clf.predict_proba(X_example)
example_pred = calibrated_clf.predict(X_example)

label_mapping_rev = {1: "positive", 0: "negative", 2: "neutral"}

for reply, pred, probs in zip(example_replies, example_pred, example_probs):
    conf_dict = {label_mapping_rev[i]: np.float64(prob) for i, prob in enumerate(probs)}
    print(f"\nReply: '{reply}'")
    print(f"  Predicted Label: '{label_mapping_rev[pred]}'")
    print(f"  Confidence per class: {conf_dict}")


# Save the baseline model
os.makedirs("./baseline_model", exist_ok=True)
joblib.dump(calibrated_clf, "./baseline_model/calibrated_sgd_model.pkl")
joblib.dump(vectorizer, "./baseline_model/vectorizer.pkl")
print("Model and vectorizer saved in './baseline_model'")


Original dataset size: 2129
Train size: 1703  | Test size: 426
Train size: 1362  | Validation size: 341  | Test size: 426
CV F1 scores: [0.98540333 0.98534622 0.99264617 0.99632353 0.97792096]
Mean CV F1: 0.9875280411333414
Epoch 1 - Val Accuracy: 0.9971, Val F1: 0.9971
Epoch 2 - Val Accuracy: 0.9971, Val F1: 0.9971
Epoch 3 - Val Accuracy: 0.9971, Val F1: 0.9971
Early stopping triggered

Test Accuracy: 0.9883

Test Set Performance:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       142
           1       0.99      0.99      0.99       142
           2       0.99      0.98      0.99       142

    accuracy                           0.99       426
   macro avg       0.99      0.99      0.99       426
weighted avg       0.99      0.99      0.99       426


--- Example Predictions with Final Model ---

Reply: 'Looking forward to the demo tomorrow.'
  Predicted Label: 'positive'
  Confidence per class: {'negative': np.float64(0.062927677

In [29]:
# --- IMPORTANT CELL: Fine-tuning and saving DistilBERT Reply Classifier ---
import os
import torch
import pandas as pd
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
from datasets import Dataset

# --- 1. Load Dataset ---
df = pd.read_csv("data.csv")
print("Original dataset size:", len(df))

# Split into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")

# Convert to HuggingFace Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# --- 2. Preprocessing & Tokenization ---
MODEL_NAME = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    return tokenizer(examples["reply"], padding="max_length", truncation=True, max_length=10)

print("\nTokenizing datasets...")
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
print("Tokenization complete.")

# --- 3. Model Setup ---
print("\nLoading DistilBERT model...")
model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)

# --- 4. Metrics ---
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    return {"accuracy": accuracy}

# --- 5. Training ---
print("Setting up training arguments...")
training_args = TrainingArguments(
    output_dir="./results_max_length_10",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs_max_length_10",
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("\nStarting fine-tuning with max_length=10...")
trainer.train()

# --- 6. Evaluation ---
print("\nFinal evaluation on the test set:")
eval_results = trainer.evaluate()
print(f"Final validation accuracy: {eval_results['eval_accuracy']:.4f}")
print(f"Final validation loss: {eval_results['eval_loss']:.4f}")

# --- 7. Save model and tokenizer ---
SAVE_PATH = "/content/distilbert_reply_classifier"
os.makedirs(SAVE_PATH, exist_ok=True)
model.save_pretrained(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)
print(f"Model and tokenizer saved in '{SAVE_PATH}'")

print("\nTraining complete.")



Original dataset size: 2129
Train size: 1703, Test size: 426

Tokenizing datasets...


Map:   0%|          | 0/1703 [00:00<?, ? examples/s]

Map:   0%|          | 0/426 [00:00<?, ? examples/s]

Tokenization complete.

Loading DistilBERT model...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Setting up training arguments...

Starting fine-tuning with max_length=10...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8513,0.216793,0.969484
2,0.0586,0.03512,0.992958
3,0.0046,0.035609,0.992958


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8513,0.216793,0.969484
2,0.0586,0.03512,0.992958
3,0.0046,0.035609,0.992958
4,0.0016,0.039718,0.992958
5,0.0008,0.044188,0.992958



Final evaluation on the test set:


Final validation accuracy: 0.9930
Final validation loss: 0.0351
Model and tokenizer saved in '/content/distilbert_reply_classifier'

Training complete.


In [30]:
import shutil

# Path to the model folder
model_folder = "/content/distilbert_reply_classifier"

# Path for the zip file
zip_path = "/content/distilbert_reply_classifier.zip"

# Create a zip of the folder
shutil.make_archive(base_name=zip_path.replace(".zip", ""), format='zip', root_dir=model_folder)

print(f"Model folder zipped successfully at: {zip_path}")


Model folder zipped successfully at: /content/distilbert_reply_classifier.zip


In [34]:
# Imports
import pandas as pd
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import torch
from torch.utils.data import DataLoader, Dataset

#  Define your example sentences
sentences = [
    "I really like your idea.",
    "Great job on the presentation!",
    "I don’t agree with your suggestion.",
    "This approach won’t work.",
    "Please send me the report by tomorrow.",
    "The document has been updated.",
    "Although there are some flaws, I think your idea has potential.",
    "While I appreciate your effort, this plan might not be suitable for our team.",
    "I’m happy with the progress, but we need to fix these critical issues first.",
    "You did a great job, yet some points still require attention."
    "Can you provide the latest figures?",
    "Let me know if you need any assistance."
]

# 2. Load tokenizer and model
model_path = "/content/distilbert_reply_classifier"  # adjust if needed
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)
model.eval()  # evaluation mode

# 3. Create a Dataset
class ReplyDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=128):
        self.encodings = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

dataset = ReplyDataset(sentences, tokenizer)
loader = DataLoader(dataset, batch_size=8)

# 4. Make predictions
label_map = {0: "negative", 1: "positive", 2: "neutral"}  # adjust according to your model

all_preds = []
with torch.no_grad():
    for batch in loader:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.tolist())

# 5. Create DataFrame with results
results_df = pd.DataFrame({
    "reply": sentences,
    "predicted_label": [label_map[p] for p in all_preds]
})

print(results_df)


                                                reply predicted_label
0                            I really like your idea.        positive
1                      Great job on the presentation!        positive
2                 I don’t agree with your suggestion.        negative
3                           This approach won’t work.        negative
4              Please send me the report by tomorrow.         neutral
5                      The document has been updated.        negative
6   Although there are some flaws, I think your id...        negative
7   While I appreciate your effort, this plan migh...        negative
8   I’m happy with the progress, but we need to fi...        negative
9   You did a great job, yet some points still req...         neutral
10            Let me know if you need any assistance.        negative
