![servicedesk](servicedesk.png)

CleverSupport is a company at the forefront of AI innovation, specializing in the development of AI-driven solutions to enhance customer support services. Their latest endeavor is to engineer a text classification system that can automatically categorize customer complaints. 

Your role as a data scientist involves the creation of a sophisticated machine learning model that can accurately assign complaints to specific categories, such as mortgage, credit card, money transfers, debt collection, etc.

In [59]:
from collections import Counter
import nltk, json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torchmetrics import Accuracy, Precision, Recall

In [60]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/repl/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [61]:
# Import data and labels
with open("words.json", 'r') as f1:
    words = json.load(f1)
with open("text.json", 'r') as f2:
    text = json.load(f2)
labels = np.load('labels.npy')

In [62]:
# print first 20 elements from the words list
# these are our vocab words - 10146 of them
print(len(words))
print(words[0:20])

10146
['_PAD', '_UNK', 'the', '.', 'i', 'to', ',', 'and', 'a', 'my', 'that', 'of', 'was', 'in', 'on', 'they', 'for', 'me', 'not', 'this']


In [63]:
# print first 20 elements from the text list
# these are tokenized sentences - 5000 of them
print(len(text))
print(text[0:2])

5000
[['i', 'called', 'because', 'i', 'have', 'been', 'receiving', '7', 'to', '8', 'calls', 'a', 'day', 'from', 'them', 'regarding', 'a', 'debt', 'and', 'the', 'representative', 'called', 'me', 'a', 'liar', 'after', 'i', 'asked', 'about', 'settling', 'my', 'account'], ['this', 'call', 'took', 'place', 'at', 'around', 'noon', ',', 'i', 'had', 'noticed', 'that', 'there', 'were', 'some', 'unusual', 'activities', 'on', 'my', 'credit', 'report', '.', 'i', 'called', 'midland', 'funding', 'to', 'find', 'out', 'why', 'i', 'had', '3', 'chargeoff', 'that', 'were', 'not', 'there', 'before', '.', 'after', 'a', 'few', 'rings', ',', 'i', 'was', 'greeted', 'with', 'hello', 'my', 'name', 'is', 'and', 'i', 'am', 'debt', 'collection', 'agent', '.', 'i', 'had', 'asked', 'numerous', 'times', 'as', 'to', 'his', 'full', 'name', 'without', 'any', 'luck', '.', 'he', 'kept', 'interrupting', 'me', 'and', 'saying', 'i', 'owed', 'this', 'much', 'and', 'i', 'have', 'to', 'pay', 'in', 'full', 'or', 'payment', '.', 

In [64]:
# print first 20 elements from the labels list
# these are labels for each sentence - 5000 of them
print(len(labels))
print(labels[0:2])

5000
[2 2]


In [65]:
print(np.unique(labels))
num_classes = len(np.unique(labels))

[0 1 2 3 4]


In [66]:
# Dictionaries to store the word to index mappings and vice versa
word2idx = {o:i for i,o in enumerate(words)}
idx2word = {i:o for i,o in enumerate(words)}

In [67]:
# Looking up the mapping dictionary and assigning the index to the respective words
for i, sentence in enumerate(text):
    text[i] = [word2idx[word] if word in word2idx else 0 for word in sentence]

In [68]:
print(len(text))
print(text[0:2])

5000
[[4, 62, 78, 4, 20, 48, 476, 779, 5, 953, 215, 8, 149, 26, 45, 316, 8, 81, 7, 2, 230, 62, 17, 8, 5025, 68, 4, 103, 95, 6772, 9, 23], [19, 89, 276, 445, 41, 441, 3766, 6, 4, 32, 577, 10, 88, 58, 217, 2952, 1608, 14, 9, 29, 118, 3, 4, 62, 1255, 927, 5, 312, 76, 154, 4, 32, 275, 7917, 10, 58, 18, 88, 191, 3, 68, 8, 382, 5471, 6, 4, 12, 7918, 22, 1325, 9, 152, 21, 7, 4, 67, 81, 197, 326, 3, 4, 32, 103, 574, 165, 31, 5, 218, 264, 152, 167, 66, 2556, 3, 73, 596, 0, 17, 7, 307, 4, 343, 19, 442, 7, 4, 20, 5, 85, 13, 264, 46, 43, 3, 4, 12, 49, 5, 137, 104, 11, 447, 8, 178, 206, 2, 81, 12, 109, 193, 3, 4, 378, 70, 73, 92, 645, 177, 24, 204, 14, 9, 152, 3, 73, 50, 18, 645, 3, 59, 73, 91, 12, 295, 225, 9, 152, 3, 4, 509, 32, 85, 46, 2, 197, 80, 1285, 14, 16, 779, 173, 3, 73, 604, 10, 70, 4, 180, 2, 215, 5, 17, 5, 392, 4, 32, 5, 1154, 5, 85, 3, 4, 91, 697, 4, 20, 2, 323, 5, 378, 95, 2, 539, 7, 177, 24, 462, 95, 3, 73, 91, 6, 697, 61, 169, 2, 360, 70, 61, 697, 85, 3, 61, 617, 7, 3137, 80, 33, 3

In [69]:
# Defining a function that either shortens sentences or pads sentences with 0 to a fixed length
def pad_input(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len),dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            features[ii, -len(review):] = np.array(review)[:seq_len]
    return features

text = pad_input(text, 50)

In [70]:
print(len(text))
print(text[0:2])

5000
[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    4   62   78    4   20   48  476  779    5  953
   215    8  149   26   45  316    8   81    7    2  230   62   17    8
  5025   68    4  103   95 6772    9   23]
 [  19   89  276  445   41  441 3766    6    4   32  577   10   88   58
   217 2952 1608   14    9   29  118    3    4   62 1255  927    5  312
    76  154    4   32  275 7917   10   58   18   88  191    3   68    8
   382 5471    6    4   12 7918   22 1325]]


**Preparing Training & Testing Data Data**

In [71]:
# Splitting dataset
# here inputs: text and labels are lists
# ouputs: train_text, test_text, train_label, test_label are numpy arrays
train_text, test_text, train_label, test_label = train_test_split(text, labels, test_size=0.2, random_state=42)

train_data = TensorDataset(torch.from_numpy(train_text), torch.from_numpy(train_label).long())
test_data = TensorDataset(torch.from_numpy(test_text), torch.from_numpy(test_label).long())

In [72]:
print(type(train_data))
print(len(train_data))
train_data[0]

<class 'torch.utils.data.dataset.TensorDataset'>
4000


(tensor([   4,   32,    8,  162,  190,   23,   41,  124,   30,    6,   60,  117,
          301,   51,   48,  198,    7,  909,    5,   33,  198,   90,    5,  275,
            8,  149,    5, 1847,   17, 3695,    6, 1646,    6,    7, 1866,    6,
            5,    2,  384,   11, 4383,   11, 1458, 1208, 2724, 3029,   26,    2,
         1045, 1013]),
 tensor(0))

In [73]:
# You can achieve the same functionality of TensorDataset by creating a custom dataset using torch.utils.data.Dataset. Here’s how you can rewrite that line using a custom dataset class:

from torch.utils.data import Dataset

class CustomTextDataset(Dataset):
    def __init__(self, text_data, labels):
        self.text_data = torch.from_numpy(text_data)  # Convert NumPy array to tensor
        self.labels = torch.from_numpy(labels).long()  # Convert labels to tensor (long for classification)

    def __len__(self):
        return len(self.text_data)

    def __getitem__(self, idx):
        return self.text_data[idx], self.labels[idx]

# Create dataset instance
train_dataset = CustomTextDataset(train_text, train_label)

# Key Differences from TensorDataset:
# 1.More Flexibility - You can add data augmentation, preprocessing, or additional logic in __getitem__.
# 2.Custom Processing - Allows dynamic modifications instead of a fixed TensorDataset.
# 3.Scalability - Useful when working with large datasets where on-the-fly transformations are needed.

In [74]:
print(type(train_dataset))
print(len(train_dataset))
train_dataset[0]

<class '__main__.CustomTextDataset'>
4000


(tensor([   4,   32,    8,  162,  190,   23,   41,  124,   30,    6,   60,  117,
          301,   51,   48,  198,    7,  909,    5,   33,  198,   90,    5,  275,
            8,  149,    5, 1847,   17, 3695,    6, 1646,    6,    7, 1866,    6,
            5,    2,  384,   11, 4383,   11, 1458, 1208, 2724, 3029,   26,    2,
         1045, 1013]),
 tensor(0))

In [75]:
batch_size = 400
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size) # shuffle false for test

In [76]:
#print(next(iter(train_loader))[0:2]) # this is printing the first 2 batches
batch = next(iter(train_loader))
inputs, labels = batch  # Unpack
print(inputs[:2], labels[:2])  # this will now only print first 2 samples

tensor([[   4,  751,    8,   43,   22, 1324,    7, 2741,    8,   43,  593,   68,
           41,  568, 5549,  215,   22,   45,    3,    4,   32,    5,  234,  131,
            8,  913, 2266, 5366,    5,   86,   14,    2,  593,    3,    4,  390,
            5,   45,   10,    4,  254,    2,  112,  174,    5,   33, 3167,    5,
            2,  409],
        [   4, 1702,   90,    8,  949,   81,   22,  132,  134,  441,    6,   27,
           25, 3022,   28,    3,    4,   98,   32,  217,  519,    3,   13,    4,
          172,  131,    8, 1575,  718,   81,  546,   79,    6,    5,  241,   76,
            8,   43,  593,    3,    2,  519,   58,   18,  109,   13,  264,    6,
           44,  187]]) tensor([4, 0])


**Build a CNN classifier**

In [77]:
class TicketClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(TicketClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=1, padding=1)
        self.fc = nn.Linear(embed_dim, num_classes)
        
    def forward(self, text):
        embedded = self.embedding(text).permute(0, 2, 1)
        conved = F.relu(self.conv(embedded))
        conved = conved.mean(dim=2)
        return self.fc(conved)
        # •	CrossEntropyLoss expects raw logits (not probabilities).
	    # •	If you’ve applied softmax before passing to CrossEntropyLoss, it will lead to incorrect gradients.
	    # •	Fix: Remove softmax in the model’s forward(), as CrossEntropyLoss already applies it internally.

**Train the CNN classifier on train_data**

In [78]:
vocab_size = len(word2idx) + 1 
# Adding +1 is done to include a special token that wasn’t in the original vocabulary mapping, such as:
# •	Padding token (<PAD>): Needed when using batch processing, so all sequences have the same length.
# •	Unknown token (<UNK>): Assigned to words not in the vocabulary during inference.
print(vocab_size)
embed_dim = 64

10147


In [79]:
model = TicketClassifier(vocab_size, embed_dim, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.05)

In [90]:
model.train()
for epoch in range(3):
    running_loss, num_processed = 0,0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        output = model(inputs)
        # print(output.shape) # torch.Size([400, 5])
        ## For multi-class classification, your model output should have shape (batch_size, num_classes), where each row contains logits for the classes.
        # print(labels.shape) # torch.Size([400])
        ## Your labels should have shape (batch_size,) with class indices (not one-hot encoded).
        loss = criterion(output, labels)
        ## If model output is missing the second dimension (num_classes), it won’t work for multi-class classification.
        ## If labels are one-hot encoded (instead of class indices), CrossEntropyLoss will not work.
        ### Expected: (batch_size,)
	    ### Wrong: (batch_size, num_classes)
	    ### Fix: Convert one-hot labels to class indices using torch.argmax(labels, dim=-1)
        
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        num_processed += len(inputs)
    print(f"Epoch: {epoch+1}, Loss: {running_loss/num_processed}")
    # num_processed is 4000, len(train_loader) is only 10 so cant use that


Epoch: 1, Loss: 5.51960361917736e-07
Epoch: 2, Loss: 4.874967344221659e-07
Epoch: 3, Loss: 4.508773636189289e-07


**Test & Evaluate the CNN Classifier**

In [81]:
accuracy_metric = Accuracy(task='multiclass', num_classes=num_classes)
precision_metric = Precision(task='multiclass', num_classes=num_classes, average=None)
recall_metric = Recall(task='multiclass', num_classes=num_classes, average=None)

In [94]:
model.eval()

predicted = []
true_labels = []

accuracy_metric.reset()
precision_metric.reset()
recall_metric.reset()

with torch.no_grad():
    for i, (inputs, labels) in enumerate(test_loader):
        output = model(inputs)
        # selects the class with the highest score, 
        # It’s needed because the model outputs raw logits, not class labels
        cat = torch.argmax(output, dim=-1) 
        # Why Not Use softmax Instead?
        ## softmax converts logits into probabilities, but it doesn’t change the ranking of class scores.
        ## argmax already finds the most probable class directly from logits.
        ## If we only need class labels (not confidence scores), argmax is simpler and faster than softmax.
        predicted.extend(cat.tolist()) # store predictions: refer CNN pytorch pg10 for "extend" function use
        true_labels.extend(labels.tolist()) # store labels for verifying accuracy
        
        accuracy_metric.update(cat, labels)
        precision_metric.update(cat, labels)
        recall_metric.update(cat, labels)
    
accuracy = accuracy_metric.compute().item()
precision = precision_metric.compute().tolist()
recall = recall_metric.compute().tolist()
print('Accuracy:', accuracy)
print('Precision (per class):', precision)
print('Recall (per class):', recall)

Accuracy: 0.796999990940094
Precision (per class): [0.7213930487632751, 0.75, 0.8133333325386047, 0.8087431788444519, 0.8866994976997375]
Recall (per class): [0.7552083134651184, 0.74210524559021, 0.8472222089767456, 0.7708333134651184, 0.8571428656578064]


In [95]:
# Compute manual accuracy for verification
correct = sum(p == l for p, l in zip(predicted, true_labels)) 
accuracy_manual = correct / len(true_labels)
print(f"Manual Accuracy: {accuracy_manual:.4f}") # Should match torchmetrics accuracy

Manual Accuracy: 0.7970
