![servicedesk](servicedesk.png)

CleverSupport is a company at the forefront of AI innovation, specializing in the development of AI-driven solutions to enhance customer support services. Their latest endeavor is to engineer a text classification system that can automatically categorize customer complaints. 

Your role as a data scientist involves the creation of a sophisticated machine learning model that can accurately assign complaints to specific categories, such as mortgage, credit card, money transfers, debt collection, etc.

In [2]:
!pip install torchmetrics

Defaulting to user installation because normal site-packages is not writeable
Collecting torchmetrics
  Downloading torchmetrics-1.4.0.post0-py3-none-any.whl.metadata (19 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.11.3.post0-py3-none-any.whl.metadata (4.7 kB)
Downloading torchmetrics-1.4.0.post0-py3-none-any.whl (868 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m868.8/868.8 kB[0m [31m54.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.11.3.post0-py3-none-any.whl (26 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.11.3.post0 torchmetrics-1.4.0.post0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [3]:
from collections import Counter
import nltk, json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch import nn

import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torchmetrics import Accuracy, Precision, Recall

In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/repl/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
# Import data and labels
with open("words.json", 'r') as f1:
    words = json.load(f1)
with open("text.json", 'r') as f2:
    text = json.load(f2)
labels = np.load('labels.npy')

In [6]:
for i in text[:1]:
    print('xxxxx')
    print(i)

xxxxx
['i', 'called', 'because', 'i', 'have', 'been', 'receiving', '7', 'to', '8', 'calls', 'a', 'day', 'from', 'them', 'regarding', 'a', 'debt', 'and', 'the', 'representative', 'called', 'me', 'a', 'liar', 'after', 'i', 'asked', 'about', 'settling', 'my', 'account']


In [7]:
# Dictionaries to store the word to index mappings and vice versa
word2idx = {o:i for i,o in enumerate(words)}
idx2word = {i:o for i,o in enumerate(words)}

# Looking up the mapping dictionary and assigning the index to the respective words
for i, sentence in enumerate(text):
    text[i] = [word2idx[word] if word in word2idx else 0 for word in sentence]
    
# Defining a function that either shortens sentences or pads sentences with 0 to a fixed length
def pad_input(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len),dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            features[ii, -len(review):] = np.array(review)[:seq_len]
    return features

text = pad_input(text, 50)

In [9]:
text.shape

(5000, 50)

In [10]:
# from collections import Counter 
# Counter(labels)

In [11]:
# Splitting dataset
train_text, test_text, train_label, test_label = train_test_split(text, labels, test_size=0.2, random_state=42)

train_data = TensorDataset(torch.from_numpy(train_text), torch.from_numpy(train_label).long())
test_data = TensorDataset(torch.from_numpy(test_text), torch.from_numpy(test_label).long())

In [34]:
import torch.nn as nn

class RNN_classifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, num_layers):
        super(RNN_classifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(embed_dim, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 5)

    def forward(self, x):
        x = self.embedding(x)
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.rnn(x, h0)
        out = out[:, -1, :]
        out = self.fc(out)
        return out

In [32]:
print(torch.from_numpy(train_text).shape)
print(torch.from_numpy(train_text).unsqueeze(-1).shape)

torch.Size([4000, 50])
torch.Size([4000, 50, 1])


In [36]:
batch_size = 400
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

# Assuming `word2idx` and `train_data`, `test_data` are defined
vocab_size = len(word2idx) + 1
embed_dim = 64
hidden_size = 6
num_layers = 1
model = RNN_classifier(vocab_size, embed_dim, hidden_size, num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train the model
model.train()
for i in range(10):
    running_loss, num_processed = 0,0
    for inputs, labels in train_loader:
        model.zero_grad()
        output = model(inputs)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        num_processed += len(inputs)
    print(f"Epoch: {i + 1}, Loss: {running_loss / num_processed}")


Epoch: 1, Loss: 0.004223566025495529
Epoch: 2, Loss: 0.004164794117212296
Epoch: 3, Loss: 0.00412275105714798
Epoch: 4, Loss: 0.004089788258075714
Epoch: 5, Loss: 0.004063237667083741
Epoch: 6, Loss: 0.004040512889623642
Epoch: 7, Loss: 0.004020933121442795
Epoch: 8, Loss: 0.004004344373941421
Epoch: 9, Loss: 0.003989219009876251
Epoch: 10, Loss: 0.003976047515869141


In [38]:
# Evaluate model on test set
model.eval()
accuracy_metric = Accuracy(task='multiclass', num_classes=5)
precision_metric = Precision(task='multiclass', num_classes=5, average=None)
recall_metric = Recall(task='multiclass', num_classes=5, average=None)

for inputs, labels in test_loader:
    with torch.no_grad():
        output = model(inputs)
        cat = torch.argmax(output, dim=-1)
        accuracy_metric(cat, labels)
        precision_metric(cat, labels)
        recall_metric(cat, labels)

accuracy = accuracy_metric.compute().item()
precision = precision_metric.compute().tolist()
recall = recall_metric.compute().tolist()
print('Accuracy:', accuracy)
print('Precision (per class):', precision)
print('Recall (per class):', recall)

Accuracy: 0.20600000023841858
Precision (per class): [0.1726190447807312, 0.20608898997306824, 0.1666666716337204, 0.2460317462682724, 0.20879121124744415]
Recall (per class): [0.1510416716337204, 0.46315789222717285, 0.004629629664123058, 0.1614583283662796, 0.27142858505249023]


In [24]:
# Define the classifier class
class TicketClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, target_size):
        super(TicketClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=1, padding=1)
        self.fc = nn.Linear(embed_dim, target_size)

    def forward(self, text):
        embedded = self.embedding(text).permute(0, 2, 1)
        conved = F.relu(self.conv(embedded))
        conved = conved.mean(dim=2) 
        return self.fc(conved)


vocab_size = len(word2idx) + 1
target_size = len(np.unique(labels))
embedding_dim = 64

# Create an instance of the TicketClassifier class
model = TicketClassifier(vocab_size, embedding_dim, target_size)

lr = 0.05
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

epochs = 3

# Train the model
model.train()
for i in range(epochs):
    running_loss, num_processed = 0,0
    for inputs, labels in train_loader:
        model.zero_grad()
        output = model(inputs)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        num_processed += len(inputs)
    print(f"Epoch: {i+1}, Loss: {running_loss/num_processed}")


accuracy_metric = Accuracy(task='multiclass', num_classes=5)
precision_metric = Precision(task='multiclass', num_classes=5, average=None)
recall_metric = Recall(task='multiclass', num_classes=5, average=None)

# Evaluate model on test set
model.eval()
predicted = []

for i, (inputs, labels) in enumerate(test_loader):
    output = model(inputs)
    cat = torch.argmax(output, dim=-1)
    predicted.extend(cat.tolist())
    accuracy_metric(cat, labels)
    precision_metric(cat, labels)
    recall_metric(cat, labels)

accuracy = accuracy_metric.compute().item()
precision = precision_metric.compute().tolist()
recall = recall_metric.compute().tolist()
print('Accuracy:', accuracy)
print('Precision (per class):', precision)
print('Recall (per class):', recall)

Epoch: 1, Loss: 0.0039164722561836245
Epoch: 2, Loss: 0.0018052333444356918
Epoch: 3, Loss: 0.0007527070641517639
Accuracy: 0.8019999861717224
Precision (per class): [0.7388888597488403, 0.7268292903900146, 0.8095238208770752, 0.849397599697113, 0.8807339668273926]
Recall (per class): [0.6927083134651184, 0.7842105031013489, 0.8657407164573669, 0.734375, 0.9142857193946838]
