# Build an AI model to predict customer satisfaction rating. 
Q1: Design an AI model to predict the customer satisfaction rating

**Process:**
- Features used: Customer Age Group, Customer Gender, Ticket Content, Resolution, Ticket Sentiment, User Intent
    - Customer Age group: categorical
    - Customer Gender: categorical
    - Ticket Content: text
    - Resolution: text
    - Ticket Sentiment: categorical
    - User Intent: categorical
- Target: Rating [1-5]
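- Pipeline: Preprocess Ticket Content and Resolution, embed both texts with BERT, concatenate them with the categorical-feature embeddings, and classify the rating.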

In [1]:
import pandas as pd
# Load the raw support-ticket export; the path is relative to the notebook's
# working directory.
df = pd.read_csv("../customer_support_tickets.csv")

In [2]:
# Bucket the numeric age into three coarse groups: under 30, 30 to 44, and
# 45 or older. Rows that match none of the masks (e.g. a missing age) are
# left without a group.
df.loc[df["Customer Age"].lt(30), "Customer Age Group"] = "Younger"
df.loc[
    df["Customer Age"].ge(30) & df["Customer Age"].lt(45),
    "Customer Age Group",
] = "Middle"
df.loc[df["Customer Age"].ge(45), "Customer Age Group"] = "Elder"

In [3]:
import re
import nltk
import spacy
import torch
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from transformers import AutoTokenizer
from torchtext.vocab import build_vocab_from_iterator
from collections import Counter

# Boilerplate phrases that carry no information about the actual problem.
# They are matched case-insensitively and removed from ticket descriptions.
phrases_to_remove = [
    r"i'?m having an issue with (the )?\{?product_purchased\}?\s*\.?",
    r"please assist\.?",
    r"let me know\.?",
    r"thank you\.?",
]
phrase_pattern = re.compile(r"|".join(phrases_to_remove), re.IGNORECASE)
_whitespace_pattern = re.compile(r"\s+")


def clean_text(text):
    """Return ``text`` with boilerplate phrases removed and whitespace normalized.

    Args:
        text: Raw ticket description. Anything that is not a ``str`` (for
            example ``None`` or the float NaN that pandas uses for a missing
            cell) is treated as an empty description.

    Returns:
        The cleaned string: every match of ``phrase_pattern`` deleted, runs of
        whitespace collapsed to a single space, and leading/trailing
        whitespace stripped.
    """
    # A missing cell would otherwise make ``re`` raise TypeError.
    if not isinstance(text, str):
        return ""

    text = phrase_pattern.sub("", text)
    # Removing a phrase at the start or end of the text leaves a dangling
    # space behind, so strip after collapsing.
    return _whitespace_pattern.sub(" ", text).strip()

def ticket_content_wrapper(ticket):
    """Combine a ticket's type, subject and cleaned description into one string.

    Args:
        ticket: Mapping-like row providing the keys ``"Ticket Type"``,
            ``"Ticket Subject"`` and ``"Ticket Description Processed"``.

    Returns:
        A single labelled string with the three fields separated by commas.
    """
    ticket_type = ticket["Ticket Type"]
    subject = ticket["Ticket Subject"]
    description = ticket["Ticket Description Processed"]
    return f'Ticket Type: {ticket_type}, Ticket Subject: {subject}, Ticket Description: {description}'
    
# Clean every description first; the combined "Ticket Content" column is then
# built row by row from the ticket type, subject and cleaned description.
df["Ticket Description Processed"] = df["Ticket Description"].map(clean_text)
df['Ticket Content'] = df.apply(ticket_content_wrapper, axis="columns")

  return torch._C._cuda_getDeviceCount() > 0


In [4]:
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer

class TicketDataset(Dataset):
    """Dataset that converts one ticket row into the tensors the model consumes.

    Each item is a dict holding the integer-encoded age group and gender, the
    tokenized ticket content and resolution (input ids and attention mask,
    padded/truncated to ``max_length``), and the integer-encoded rating.
    """

    def __init__(self, dataframe, tokenizer_name="bert-base-uncased", max_length=128):
        """Store the frame, load the tokenizer and build the category maps.

        Args:
            dataframe: Frame providing the columns read in ``__getitem__``.
            tokenizer_name: Name passed to ``BertTokenizer.from_pretrained``.
            max_length: Token length every text is padded/truncated to.
        """
        # Positional access via ``iloc`` needs a clean 0..n-1 index.
        self.df = dataframe.reset_index(drop=True)
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name)
        self.max_length = max_length

        # Age and gender ids follow first-appearance order in the frame;
        # ratings are sorted so class ids increase with the rating value.
        self.age_map = self._index_values(self.df['Customer Age Group'].unique())
        self.gender_map = self._index_values(self.df['Customer Gender'].unique())
        self.label_map = self._index_values(sorted(self.df['Customer Satisfaction Rating'].unique()))
        self.inverse_label_map = {class_id: rating for rating, class_id in self.label_map.items()}

    @staticmethod
    def _index_values(values):
        """Map each value to its position in ``values``."""
        return {value: position for position, value in enumerate(values)}

    def _tokenize(self, text):
        """Tokenize ``text`` to fixed-length PyTorch tensors."""
        return self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

    def __len__(self):
        """Return the number of rows in the stored frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the tensor dict for the row at position ``idx``."""
        row = self.df.iloc[idx]

        age_id = self.age_map[row['Customer Age Group']]
        age_group = torch.tensor(age_id, dtype=torch.long)
        gender_id = self.gender_map[row['Customer Gender']]
        gender = torch.tensor(gender_id, dtype=torch.long)

        ticket_tokens = self._tokenize(row['Ticket Content'])
        resolution_tokens = self._tokenize(row['Resolution'])

        label_id = self.label_map[row['Customer Satisfaction Rating']]
        target = torch.tensor(label_id, dtype=torch.long)

        # The tokenizer returns tensors with a leading batch axis of size 1;
        # squeeze it so the DataLoader can stack items into (batch, length).
        item = {
            "age_group": age_group,
            "gender": gender,
            "ticket_input_ids": ticket_tokens["input_ids"].squeeze(0),
            "ticket_attention_mask": ticket_tokens["attention_mask"].squeeze(0),
            "resolution_input_ids": resolution_tokens["input_ids"].squeeze(0),
            "resolution_attention_mask": resolution_tokens["attention_mask"].squeeze(0),
            "target": target
        }
        return item

In [5]:
# Restrict the data to tickets whose status is "Closed".
df_closed = df.loc[df["Ticket Status"] == "Closed"]
ticket_dataset = TicketDataset(df_closed, tokenizer_name="bert-base-uncased", max_length=128)

ticket_dataloader = DataLoader(ticket_dataset, batch_size=32, shuffle=True)

# Sanity check: pull a single batch and print the shape of every field.
batch = next(iter(ticket_dataloader))
for key, value in batch.items():
    print(f"{key}: {value.shape}")

age_group: torch.Size([32])
gender: torch.Size([32])
ticket_input_ids: torch.Size([32, 128])
ticket_attention_mask: torch.Size([32, 128])
resolution_input_ids: torch.Size([32, 128])
resolution_attention_mask: torch.Size([32, 128])
target: torch.Size([32])


In [6]:
import torch.nn as nn
from transformers import BertModel

class RatingPredictor(nn.Module):
    """Classifier over BERT text features plus age/gender embeddings.

    The ticket text and the resolution text are encoded by the same BERT
    model. The first-token ([CLS]) hidden state of each is concatenated with
    learned age-group and gender embeddings and passed through an MLP head
    that outputs one logit per class.
    """

    def __init__(self,
                 bert_model_name="bert-base-uncased",
                 age_vocab_size=3,
                 gender_vocab_size=3,
                 embed_dim=32,
                 num_classes=5):
        """Create the encoder, the categorical embeddings and the MLP head.

        Args:
            bert_model_name: Name passed to ``BertModel.from_pretrained``.
            age_vocab_size: Number of rows in the age-group embedding table.
            gender_vocab_size: Number of rows in the gender embedding table.
            embed_dim: Width of each categorical embedding.
            num_classes: Number of output logits.
        """
        super().__init__()

        self.bert = BertModel.from_pretrained(bert_model_name)
        self.age_embed = nn.Embedding(age_vocab_size, embed_dim)
        self.gender_embed = nn.Embedding(gender_vocab_size, embed_dim)

        # Two text vectors (ticket, resolution) plus two categorical embeddings.
        text_dim = self.bert.config.hidden_size
        total_input_dim = 2 * text_dim + 2 * embed_dim

        head_layers = [
            nn.Linear(total_input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self,
                ticket_input_ids, ticket_attention_mask,
                resolution_input_ids, resolution_attention_mask,
                age_group, gender):
        """Return raw class logits for a batch.

        The ``*_input_ids`` / ``*_attention_mask`` pairs are forwarded to BERT
        unchanged; ``age_group`` and ``gender`` are index tensors looked up in
        their embedding tables. No softmax is applied to the output.
        """
        # Run both encoder passes before touching their outputs.
        ticket_encoded = self.bert(ticket_input_ids, attention_mask=ticket_attention_mask)
        resolution_encoded = self.bert(resolution_input_ids, attention_mask=resolution_attention_mask)

        features = (
            ticket_encoded.last_hidden_state[:, 0, :],      # first-token state of the ticket
            resolution_encoded.last_hidden_state[:, 0, :],  # first-token state of the resolution
            self.age_embed(age_group),
            self.gender_embed(gender),
        )
        return self.classifier(torch.cat(features, dim=1))

In [7]:
from torch.utils.data import random_split, DataLoader

def collate_fn(batch):
    """Merge a list of per-sample dicts into one batch dict.

    The first sample decides, per key, how the field is merged: tensor fields
    are stacked along a new leading dimension, anything else is gathered into
    a plain list in sample order.
    """
    first_sample = batch[0]
    collated = {}

    for field, example_value in first_sample.items():
        column = [sample[field] for sample in batch]
        if isinstance(example_value, torch.Tensor):
            collated[field] = torch.stack(column)
        else:
            collated[field] = column

    return collated

# Build the dataset over closed tickets.
dataset = TicketDataset(df_closed)
# NOTE(review): this un-split loader is not used by the training code visible
# in this notebook; it is kept in case a later cell inspects it.
dataloader = DataLoader(dataset, batch_size=16, collate_fn=collate_fn)

# 80% train / 10% validation; the remainder goes to test so that rounding
# never drops a sample.
total_size = len(dataset)
train_size = int(0.8 * total_size)
val_size = int(0.1 * total_size)
test_size = total_size - train_size - val_size  # Ensure all data is used

# Seed the split so the same samples land in train/val/test on every run.
# Without this, re-running the cell reshuffles membership and held-out
# metrics are not comparable between runs.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=split_generator,
)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

In [8]:
from torch import nn, optim
from tqdm import tqdm

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Size the embedding tables and the output layer from the dataset's own
# category maps instead of relying on hard-coded defaults: an index outside
# the embedding table raises inside nn.Embedding, and the targets are
# label-map indices, so the logit count must match the number of labels.
model = RatingPredictor(
    age_vocab_size=len(dataset.age_map),
    gender_vocab_size=len(dataset.gender_map),
    num_classes=len(dataset.label_map),
).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

def train(model, train_loader, val_loader, criterion, optimizer, device, epochs=5):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    After every pass the mean of the per-batch losses and the training
    accuracy are printed, then ``evaluate`` is run on ``val_loader``.

    Args:
        model: Module called with every non-``"target"`` batch field as a
            keyword argument.
        train_loader: Iterable of batch dicts that include a ``"target"`` entry.
        val_loader: Loader handed to ``evaluate`` after each epoch.
        criterion: Loss called as ``criterion(outputs, targets)``.
        optimizer: Optimizer stepped once per batch.
        device: Device every batch tensor is moved to.
        epochs: Number of passes over ``train_loader``.

    Returns:
        The same ``model`` object, after training.
    """
    for epoch in range(epochs):
        model.train()
        running_loss = 0
        hits = 0
        seen = 0

        progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} - Training")
        for batch in progress:
            optimizer.zero_grad()

            # Everything except the label is a model input.
            inputs = {}
            for name, tensor in batch.items():
                if name != "target":
                    inputs[name] = tensor.to(device)
            targets = batch["target"].to(device)

            outputs = model(**inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            predicted = outputs.argmax(dim=1)
            hits += (predicted == targets).sum().item()
            seen += targets.size(0)

        avg_loss = running_loss / len(train_loader)
        acc = hits / seen
        print(f"Train Loss: {avg_loss:.4f}, Accuracy: {acc:.4f}")

        evaluate(model, val_loader, criterion, device)

    return model

def evaluate(model, val_loader, criterion, device):
    """Compute and print loss and accuracy of ``model`` on ``val_loader``.

    The model is switched to eval mode and gradients are disabled for the
    whole pass.

    Args:
        model: Module called with every non-``"target"`` batch field as a
            keyword argument.
        val_loader: Iterable of batch dicts that include a ``"target"`` entry.
        criterion: Loss called as ``criterion(outputs, targets)``.
        device: Device every batch tensor is moved to.

    Returns:
        ``(avg_loss, accuracy)``, where ``avg_loss`` is the mean of the
        per-batch losses. Both values are NaN when the loader yields no
        samples.
    """
    model.eval()
    total_loss = 0.0
    correct, total = 0, 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation", leave=False):
            inputs = {k: v.to(device) for k, v in batch.items() if k != "target"}
            targets = batch["target"].to(device)

            outputs = model(**inputs)
            loss = criterion(outputs, targets)

            total_loss += loss.item()
            preds = outputs.argmax(dim=1)
            correct += (preds == targets).sum().item()
            total += targets.size(0)

    num_batches = len(val_loader)
    if num_batches == 0 or total == 0:
        # An empty split (e.g. a tiny dataset) should not abort a training run
        # with ZeroDivisionError; report it and return NaN metrics instead.
        print("Val Loss: nan, Accuracy: nan (validation loader is empty)")
        return float("nan"), float("nan")

    avg_loss = total_loss / num_batches
    acc = correct / total
    print(f"Val Loss: {avg_loss:.4f}, Accuracy: {acc:.4f}")

    # Return the metrics so callers can log them or select a checkpoint.
    return avg_loss, acc

In [9]:
# Run 5 training epochs; train() prints training and validation metrics after
# each epoch and returns the model it was given.
trained_model = train(model, train_loader, val_loader, criterion, optimizer, device, epochs=5)


Epoch 1/5 - Training: 100%|█████████████████████████████████████████████████████████████████████| 139/139 [09:06<00:00,  3.93s/it]


Train Loss: 1.6111, Accuracy: 0.2014


                                                                                                                                  

Val Loss: 1.6093, Accuracy: 0.1884


Epoch 2/5 - Training: 100%|█████████████████████████████████████████████████████████████████████| 139/139 [09:27<00:00,  4.08s/it]


Train Loss: 1.6087, Accuracy: 0.2059


                                                                                                                                  

Val Loss: 1.6123, Accuracy: 0.1812


Epoch 3/5 - Training: 100%|█████████████████████████████████████████████████████████████████████| 139/139 [09:10<00:00,  3.96s/it]


Train Loss: 1.6082, Accuracy: 0.2176


                                                                                                                                  

Val Loss: 1.6130, Accuracy: 0.1884


Epoch 4/5 - Training: 100%|█████████████████████████████████████████████████████████████████████| 139/139 [09:27<00:00,  4.08s/it]


Train Loss: 1.6017, Accuracy: 0.2257


                                                                                                                                  

Val Loss: 1.6242, Accuracy: 0.1848


Epoch 5/5 - Training: 100%|█████████████████████████████████████████████████████████████████████| 139/139 [09:32<00:00,  4.12s/it]


Train Loss: 1.5698, Accuracy: 0.2980


                                                                                                                                  

Val Loss: 1.6735, Accuracy: 0.1594


