In [None]:
import pandas as pd

# Read the ticket dataset from disk; point data_path somewhere else if the CSV
# lives in a different location.
data_path = '/content/augmented_dataset.csv'
df_augmented = pd.read_csv(data_path)

# Preview the leading rows of the frame.
preview_rows = df_augmented.head()
print(preview_rows)

# Tally how many rows carry each 'queue' value and print the tally under a
# heading.
queue_counts = df_augmented['queue'].value_counts()
print("\nClass Distribution:", queue_counts, sep="\n")


        queue  priority language                                subject  \
0    Hardware       2.0       en  Wireless Mouse suddenly stops working   
1    Hardware       2.0       fr          Problème de connexions IP PBX   
2    Hardware       2.0       de        Problem mit meinem SFX-Netzteil   
3  Accounting       2.0       en             Invoice Adjustment Request   
4    Software       2.0       en    Issue with Arbitrum: UI not loading   

                                                text  \
0  Dear Support Team, I've been using the Wireles...   
1  Bonjour, nous rencontrons un problème avec not...   
2  Sehr geehrte Damen und Herren, mein SFX-Netzte...   
3  Dear Customer Support,\nI recently received my...   
4  Hello Support Team,\nI've been experiencing an...   

                      translated_subject  \
0  Wireless Mouse suddenly stops working   
1             PBX IP Connections Problem   
2          Problem with my SFX interface   
3             Invoice Adjustment Req

In [None]:
from sklearn.model_selection import train_test_split

# Model input is the 'combined_text' column; the target is the 'queue' column.
X = df_augmented['combined_text']
y = df_augmented['queue']

# Turn the queue names into integer codes through a pandas categorical, and
# keep the code -> name lookup so predictions can be mapped back to names.
queue_categorical = y.astype('category')
y_encoded = queue_categorical.cat.codes
label_map = dict(enumerate(queue_categorical.cat.categories))

# Hold out 20% of the rows for testing, stratified on the encoded label and
# with a fixed random_state so the split repeats between runs.
# NOTE(review): the frame is called "augmented" -- if augmentation produced
# near-copies of the same ticket, copies can land on both sides of this split
# and make test scores optimistic. Confirm how the CSV was built.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

print("Data split into training and test sets.")


Data split into training and test sets.


In [None]:
from transformers import BertTokenizer
import torch

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_data(texts, labels, max_length=128):
    """
    Tokenize a collection of texts and attach their labels.

    Uses the notebook-level `tokenizer`.

    Args:
        texts: Object with a `.tolist()` method yielding the input strings
            (a pandas Series in this notebook).
        labels: Object with a `.values` attribute holding one integer class
            code per text (a pandas Series in this notebook).
        max_length (int): Longest token sequence kept; longer texts are
            truncated. Defaults to 128.
    Returns:
        The tokenizer output (PyTorch tensors) with an extra 'labels' entry
        holding the class codes as an int64 tensor.
    """
    inputs = tokenizer(
        texts.tolist(),
        padding=True,  # pad to the longest sequence of this call
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )
    # Store the labels as int64 up front. The codes come from `.cat.codes`,
    # which may be a narrow integer dtype, while the training loop needs long
    # class indices (it currently casts with `.long()` on every batch).
    inputs['labels'] = torch.tensor(labels.values, dtype=torch.long)
    return inputs


# Tokenize training and test data
train_encodings = tokenize_data(X_train, y_train)
test_encodings = tokenize_data(X_test, y_test)

print("Data tokenization complete.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Data tokenization complete.


In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Bundle token ids, attention masks and labels of each split into a
# TensorDataset, always in this order -- the training and evaluation loops read
# batches by position (batch[0], batch[1], batch[2]).
tensor_fields = ('input_ids', 'attention_mask', 'labels')

train_dataset = TensorDataset(*[train_encodings[field] for field in tensor_fields])
test_dataset = TensorDataset(*[test_encodings[field] for field in tensor_fields])

print("Datasets created.")


Datasets created.


In [None]:
batch_size = 16

# Training batches are drawn in random order; test batches keep dataset order.
train_sampler = RandomSampler(train_dataset)
test_sampler = SequentialSampler(test_dataset)

train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)
test_loader = DataLoader(test_dataset, batch_size=batch_size, sampler=test_sampler)

print("DataLoaders initialized.")


DataLoaders initialized.


In [None]:
from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

# Seed PyTorch's global random generator before the model is built. The
# classification head is newly initialised (see the warning printed on load),
# so without a seed its starting weights -- and any later draws from the
# global generator -- differ on every Restart & Run All. 42 matches the
# random_state used for the train/test split.
torch.manual_seed(42)

# Load BERT with a sequence-classification head sized to the number of queues
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_map),
    output_attentions=False,
    output_hidden_states=False
)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print("Model loaded and moved to device:", device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded and moved to device: cpu


In [None]:
# Define optimizer.
# PyTorch's own AdamW is used here: the AdamW class exported by `transformers`
# is deprecated in favour of it. weight_decay=0.0 is passed explicitly because
# torch.optim.AdamW defaults to 0.01, whereas the transformers class defaulted
# to 0.0 -- this keeps the update rule the same as before.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-8,
    weight_decay=0.0,
)

# Total number of optimizer steps: one per training batch, per epoch
epochs = 3
total_steps = len(train_loader) * epochs

# Learning rate scheduler: linear schedule over all training steps, no warmup
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

print("Optimizer and scheduler set up.")


Optimizer and scheduler set up.




In [None]:
import numpy as np

# Batch-level accuracy helper
def flat_accuracy(preds, labels):
    """
    Fraction of rows whose highest-scoring class equals the true label.

    Args:
        preds: Per-class scores, one row per example (2-D numpy array).
        labels: True class indices (numpy array); flattened before comparing.
    Returns:
        Number of matching rows divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    true_classes = labels.ravel()
    hits = predicted_classes == true_classes
    return np.sum(hits) / len(true_classes)


In [None]:
from tqdm.notebook import tqdm

# Training loop: every epoch runs one optimisation pass over train_loader and
# then scores the model on test_loader.
#
# Loss and accuracy are accumulated per *sample* (batch loss x batch size, and
# the count of correct predictions) and divided by the number of samples seen.
# Averaging the per-batch means instead gives the examples of a short final
# batch more weight than the others whenever the dataset size is not a
# multiple of batch_size.
# NOTE(review): this treats outputs.loss as the mean over the batch, which is
# the default cross-entropy reduction -- confirm if the loss is customised.
for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")
    print("-------------------------------")

    # ---- Training phase ----
    model.train()
    total_train_loss = 0.0  # sum of per-sample losses
    train_correct = 0
    train_samples = 0

    for batch in tqdm(train_loader, desc="Training"):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device).long()  # Ensure labels are int64 class indices
        n_batch = b_labels.size(0)

        # Zero the gradients left over from the previous step
        model.zero_grad()

        # Forward pass; passing labels makes the model return the loss as well
        outputs = model(
            b_input_ids,
            attention_mask=b_input_mask,
            labels=b_labels
        )
        loss = outputs.loss

        # Backward pass
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # Update model weights, then advance the learning-rate schedule
        optimizer.step()
        scheduler.step()

        # Per-sample bookkeeping for the epoch metrics
        total_train_loss += loss.item() * n_batch
        train_correct += (outputs.logits.argmax(dim=1) == b_labels).sum().item()
        train_samples += n_batch

    # max(..., 1) avoids a ZeroDivisionError if the loader is empty
    avg_train_loss = total_train_loss / max(train_samples, 1)
    avg_train_accuracy = train_correct / max(train_samples, 1)

    print(f"Training loss: {avg_train_loss:.3f}")
    print(f"Training accuracy: {avg_train_accuracy:.3f}")

    # ---- Evaluation phase ----
    # NOTE(review): the figures printed as "Validation" are computed on
    # test_loader, the same data the final classification report uses.
    model.eval()
    total_eval_loss = 0.0
    eval_correct = 0
    eval_samples = 0

    for batch in tqdm(test_loader, desc="Evaluating"):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device).long()
        n_batch = b_labels.size(0)

        with torch.no_grad():
            outputs = model(
                b_input_ids,
                attention_mask=b_input_mask,
                labels=b_labels
            )

        total_eval_loss += outputs.loss.item() * n_batch
        eval_correct += (outputs.logits.argmax(dim=1) == b_labels).sum().item()
        eval_samples += n_batch

    avg_val_accuracy = eval_correct / max(eval_samples, 1)
    avg_val_loss = total_eval_loss / max(eval_samples, 1)

    print(f"Validation loss: {avg_val_loss:.3f}")
    print(f"Validation accuracy: {avg_val_accuracy:.3f}")



Epoch 1/3
-------------------------------


Training:   0%|          | 0/45 [00:00<?, ?it/s]

Training loss: 0.771
Training accuracy: 0.701


Evaluating:   0%|          | 0/12 [00:00<?, ?it/s]

Validation loss: 0.327
Validation accuracy: 0.979

Epoch 2/3
-------------------------------


Training:   0%|          | 0/45 [00:00<?, ?it/s]

Training loss: 0.162
Training accuracy: 0.986


Evaluating:   0%|          | 0/12 [00:00<?, ?it/s]

Validation loss: 0.040
Validation accuracy: 1.000

Epoch 3/3
-------------------------------


Training:   0%|          | 0/45 [00:00<?, ?it/s]

Training loss: 0.032
Training accuracy: 1.000


Evaluating:   0%|          | 0/12 [00:00<?, ?it/s]

Validation loss: 0.020
Validation accuracy: 1.000


In [None]:
from sklearn.metrics import classification_report

# Switch the model to evaluation mode
model.eval()

# Predictions and true labels collected over the whole test set
all_preds = []
all_labels = []

# Loop through the test set without tracking gradients
with torch.no_grad():
    for batch in test_loader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        # Labels are only compared on the host, so they are not moved to the device
        b_labels = batch[2]

        # Get model predictions (no labels passed: only the logits are needed)
        outputs = model(
            b_input_ids,
            attention_mask=b_input_mask
        )

        # Predicted class = index of the largest logit in each row
        preds = torch.argmax(outputs.logits, dim=1)

        # Store predictions and true labels
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(b_labels.tolist())

# Generate classification report.
# `labels` pins every class index to its name. Without it, target_names is
# matched against only the labels that occur in the data, and the call raises
# if a class is missing from both the true and the predicted labels.
class_ids = list(label_map.keys())
class_names = [label_map[class_id] for class_id in class_ids]

print("\nClassification Report:")
print(classification_report(all_labels, all_preds, labels=class_ids, target_names=class_names))



Classification Report:
              precision    recall  f1-score   support

  Accounting       1.00      1.00      1.00        60
    Hardware       1.00      1.00      1.00        60
    Software       1.00      1.00      1.00        60

    accuracy                           1.00       180
   macro avg       1.00      1.00      1.00       180
weighted avg       1.00      1.00      1.00       180



In [None]:
# Write the fine-tuned model and the tokenizer into the same directory.
output_dir = './fine_tuned_bert'
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

print("Model and tokenizer saved successfully!")


Model and tokenizer saved successfully!


In [None]:
def predict_text(text, model, tokenizer, label_map, max_length=512):
    """
    Predict the class of a given input text using the fine-tuned BERT model.

    Args:
        text (str): Input text.
        model: Fine-tuned sequence-classification model (a torch module whose
            output exposes `.logits`).
        tokenizer: Tokenizer used for BERT.
        label_map (dict): Mapping of integer label indices to class names.
        max_length (int): Texts longer than this many tokens are truncated.
            Defaults to 512, the limit this function has always used.
            NOTE(review): training tokenised with max_length=128; consider
            passing 128 here so inference sees the same truncation.
    Returns:
        The `label_map` entry for the highest-scoring class (the class name).
    """
    model.eval()

    # Run on whatever device the model itself lives on, rather than relying on
    # a notebook-level `device` variable being defined and still in sync.
    model_device = next(model.parameters()).device

    # Tokenize and encode the input text. With a single text there is nothing
    # to pad against, so padding=True adds no pad tokens; padding to
    # "max_length" filled every input up to 512 tokens that the attention mask
    # then told the model to ignore.
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    ).to(model_device)

    # Get predictions
    with torch.no_grad():
        outputs = model(**inputs)

    # Plain int so the lookup works with ordinary integer keys
    predicted_index = int(torch.argmax(outputs.logits, dim=1)[0])

    # Map prediction index to class label
    return label_map[predicted_index]

# Example usage
# Build the index -> name lookup from the original queue names (`y`). `y_train`
# holds the already-encoded integer codes, so taking the categories from it
# produces {0: 0, 1: 1, 2: 2} and the prediction prints as a bare number
# instead of the queue name.
label_map = dict(enumerate(y.astype('category').cat.categories))
text = "My software is not responding."
predicted_class = predict_text(text, model, tokenizer, label_map)
print(f"Predicted Class: {predicted_class}")



Predicted Class: 2


In [None]:
# NOTE(review): a function named predict_text is also defined in an earlier
# cell; this later definition is the one in effect once this cell has run.
# Keep a single definition when tidying the notebook.
def predict_text(text, model, tokenizer, label_map, max_length=512):
    """
    Predict the class of a given input text using the fine-tuned BERT model.

    Args:
        text (str): Input text.
        model: Fine-tuned sequence-classification model (a torch module whose
            output exposes `.logits`).
        tokenizer: Tokenizer used for BERT.
        label_map (dict): Mapping of integer label indices to class names.
        max_length (int): Texts longer than this many tokens are truncated.
            Defaults to 512, the limit this function has always used.
            NOTE(review): training tokenised with max_length=128; consider
            passing 128 here so inference sees the same truncation.
    Returns:
        The `label_map` entry for the highest-scoring class (the class name).
    """
    model.eval()

    # Run on whatever device the model itself lives on, rather than relying on
    # a notebook-level `device` variable being defined and still in sync.
    model_device = next(model.parameters()).device

    # Tokenize and encode the input text. With a single text there is nothing
    # to pad against, so padding=True adds no pad tokens; padding to
    # "max_length" filled every input up to 512 tokens that the attention mask
    # then told the model to ignore.
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    ).to(model_device)

    # Get predictions
    with torch.no_grad():
        outputs = model(**inputs)

    # Plain int so the lookup works with ordinary integer keys
    predicted_index = int(torch.argmax(outputs.logits, dim=1)[0])

    # Map prediction index to class label
    return label_map[predicted_index]

# Interactive loop for user input: classify whatever the user types until they
# enter 'exit' (in any letter case).
prompt = "Enter a ticket description (or type 'exit' to quit): "

user_text = input(prompt)
while user_text.lower() != "exit":
    # Predict the class of the entered text and show it
    predicted_class = predict_text(user_text, model, tokenizer, label_map)
    print(f"Predicted Class: {predicted_class}\n")
    user_text = input(prompt)

print("Exiting...")


Enter a ticket description (or type 'exit' to quit): How do I fix a DLL error on my system
Predicted Class: 2

Enter a ticket description (or type 'exit' to quit): "I need help installing the latest software update."
Predicted Class: 2

Enter a ticket description (or type 'exit' to quit): "I need help updating my antivirus program.
Predicted Class: 2

Enter a ticket description (or type 'exit' to quit): The monitor displays a black screen when I turn it on
Predicted Class: 1

Enter a ticket description (or type 'exit' to quit): "The hard drive is making strange clicking noises.
Predicted Class: 1

Enter a ticket description (or type 'exit' to quit): How can I reconcile the monthly expenses for the department?
Predicted Class: 0

Enter a ticket description (or type 'exit' to quit): I need help generating a financial report for the last quarter
Predicted Class: 0

Enter a ticket description (or type 'exit' to quit): I have a question about my billing statement."
Predicted Class: 0

Enter

In [None]:
import torch

def predict_text_with_response(text, model, tokenizer, label_map, max_length=512, min_confidence=0.0):
    """
    Predict the class of a given input text and return a canned response.

    Args:
        text (str): Input text.
        model: Fine-tuned sequence-classification model (a torch module whose
            output exposes `.logits`).
        tokenizer: Tokenizer used for BERT.
        label_map (dict): Mapping of integer label indices to class names.
        max_length (int): Texts longer than this many tokens are truncated.
            Defaults to 512, the limit this function has always used.
        min_confidence (float): If the winning class probability is below this
            value, the generic "not sure" response is returned instead of the
            class-specific one. The default 0.0 never triggers, which keeps
            the previous behaviour.
    Returns:
        tuple: (predicted class name, confidence score as a float, response text).
    """
    model.eval()

    # Run on whatever device the model itself lives on, rather than relying on
    # a notebook-level `device` variable being defined and still in sync.
    model_device = next(model.parameters()).device

    # Tokenize and encode the input text. With a single text there is nothing
    # to pad against, so padding=True adds no pad tokens; padding to
    # "max_length" filled every input up to 512 tokens that the attention mask
    # then told the model to ignore.
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    ).to(model_device)

    # Get predictions: softmax over the class dimension, then the top class
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        confidence, pred_class = torch.max(probs, dim=1)

    predicted_label = label_map[pred_class.item()]
    confidence_score = confidence.item()

    # Canned responses for each class
    responses = {
        "Accounting": "This seems to be an accounting-related issue. Can you provide more details, such as billing or payroll specifics?",
        "Hardware": "It looks like a hardware-related issue. Please specify the device causing the problem.",
        "Software": "This seems to be a software-related issue. Could you mention the software you're facing issues with?",
    }
    fallback = "I'm not sure about this issue. Could you provide more details?"

    # Unknown labels and (optionally) low-confidence guesses get the fallback
    if confidence_score < min_confidence:
        response = fallback
    else:
        response = responses.get(predicted_label, fallback)
    return predicted_label, confidence_score, response


In [None]:
# Index -> queue name lookup; the order must agree with the label encoding
# used during training.
label_map = dict(enumerate(["Accounting", "Hardware", "Software"]))

# Chatbot loop: answer every message until the user types 'exit' (any case).
print("Welcome to the IT Ticket Classifier  Type 'exit' to quit.\n")

user_input = input("You: ")
while user_input.lower() != "exit":
    # Classify the message and fetch the matching canned reply
    predicted_label, confidence, bot_response = predict_text_with_response(
        user_input, model, tokenizer, label_map
    )

    # Show the reply together with the predicted class and its confidence
    print(f"Chatbot: {bot_response} (Predicted Class: {predicted_label}, Confidence: {confidence:.2f})\n")

    user_input = input("You: ")

print(" Goodbye! Feel free to reach out again.")


Welcome to the IT Ticket Classifier  Type 'exit' to quit.

You: MY HEART IS BROKEN
Chatbot: It looks like a hardware-related issue. Please specify the device causing the problem. (Predicted Class: Hardware, Confidence: 0.47)

You: MY CALCULATOR IS GIVING MY CALCULATION NOT WORKING
Chatbot: It looks like a hardware-related issue. Please specify the device causing the problem. (Predicted Class: Hardware, Confidence: 0.91)

You: MY CALCULATOR APPLICATION IS NOT CORRECTLY CALCULATING THE QUESTION
Chatbot: This seems to be a software-related issue. Could you mention the software you're facing issues with? (Predicted Class: Software, Confidence: 0.83)

You: IM UNABLE TO SEND WATSAPP STICKERS
Chatbot: This seems to be a software-related issue. Could you mention the software you're facing issues with? (Predicted Class: Software, Confidence: 0.48)

You: I CANT SEE ANYTHING
Chatbot: It looks like a hardware-related issue. Please specify the device causing the problem. (Predicted Class: Hardware,

In [None]:
import shutil

# Pack the saved model directory into a zip archive named after it.
archive_base, archive_format, model_dir = 'fine_tuned_bert', 'zip', './fine_tuned_bert'
shutil.make_archive(archive_base, archive_format, model_dir)
print("Zipped model saved as fine_tuned_bert.zip")


Zipped model saved as fine_tuned_bert.zip
