In [1]:
# Importing stock ml libraries
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
import logging
logging.basicConfig(level=logging.ERROR)

In [2]:
# # Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/\
                    float( len(set_true.union(set_pred)) )
        acc_list.append(tmp_a)
    return np.mean(acc_list)

In [4]:
data = pd.read_csv('/kaggle/input/mail-categories-dataset/final_data.csv')
data.head()

Unnamed: 0,mail,category,priority
0,Subject: Share Your Feedback with Us – We’re L...,Customer Feedback,Normal
1,Subject: You're Invited: The Annual Global Lea...,Event Invitations,Normal
2,Subject: Discover the Perfect Solution with Ou...,Sales Inquiries,Normal
3,Subject: Assistance Needed: System Update Issu...,Technical Support,Urgent
4,Subject: Exclusive Member Rewards & Savings Aw...,Marketing and Promotions,Normal


In [5]:
categories = [
    'General Inquiries',
    'Technical Support',
    'Customer Feedback',
    'Complaints',
    'Sales Inquiries',
    'Collaboration Opportunities',
    'Internal Communication',
    'Legal Matters',
    'Marketing and Promotions',
    'Event Invitations'
]

In [6]:
data['labels'] = [[0 if data.iloc[i,1] != c else 1 for c in categories] for i in range(len(data))]
data.head(10)

Unnamed: 0,mail,category,priority,labels
0,Subject: Share Your Feedback with Us – We’re L...,Customer Feedback,Normal,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
1,Subject: You're Invited: The Annual Global Lea...,Event Invitations,Normal,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
2,Subject: Discover the Perfect Solution with Ou...,Sales Inquiries,Normal,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"
3,Subject: Assistance Needed: System Update Issu...,Technical Support,Urgent,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
4,Subject: Exclusive Member Rewards & Savings Aw...,Marketing and Promotions,Normal,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"
5,Subject: Alignment and Integration Initiative ...,Internal Communication,Normal,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]"
6,Subject: Immediate Attention Required: Unresol...,Complaints,Urgent,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
7,Subject: 🌟 Exclusive VIP Savings Inside – Unwr...,Marketing and Promotions,Normal,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"
8,Subject: Update on Company Policies and Proced...,Internal Communication,Normal,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]"
9,Subject: Assistance with Ongoing Technical Dif...,Technical Support,Normal,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"


In [7]:
data_mod = data[['mail', 'labels']]
data_mod.head()

Unnamed: 0,mail,labels
0,Subject: Share Your Feedback with Us – We’re L...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
1,Subject: You're Invited: The Annual Global Lea...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
2,Subject: Discover the Perfect Solution with Ou...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"
3,Subject: Assistance Needed: System Update Issu...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
4,Subject: Exclusive Member Rewards & Savings Aw...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"


In [8]:
train_data = data_mod.iloc[:900]
test_data = data_mod.iloc[800:]

In [9]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 128
TRAIN_BATCH_SIZE = 1
VALID_BATCH_SIZE = 1
EPOCHS = 2
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [10]:
train_data

Unnamed: 0,mail,labels
0,Subject: Share Your Feedback with Us – We’re L...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
1,Subject: You're Invited: The Annual Global Lea...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
2,Subject: Discover the Perfect Solution with Ou...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"
3,Subject: Assistance Needed: System Update Issu...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
4,Subject: Exclusive Member Rewards & Savings Aw...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"
...,...,...
895,Subject: General Inquiry Regarding [Product/Se...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
896,Subject: Unlock Exclusive Member Savings – Lim...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"
897,Subject: Share Your Experience with Us – We Va...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
898,Subject: Exclusive VIP Preview: New Spring Col...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"


In [11]:
test_data

Unnamed: 0,mail,labels
800,Subject: General Inquiry Regarding Product Inf...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
801,Subject: Explore Our Tailored Solutions and Ex...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"
802,Subject: General Inquiry: Seeking Information ...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
803,Subject: Comprehensive Technical Support for S...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
804,Subject: Exciting Collaboration Opportunity: U...,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
...,...,...
1132,Subject: Comprehensive Technical Support Solut...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
1133,Subject: Unlock Exclusive Savings with Our Lat...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"
1134,Subject: Share Your Experience – We Value Your...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
1135,Subject: Share Your Experience with Us – We Va...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"


In [12]:
train_data.rename(columns={'mail': 'text'}, inplace=True)
test_data.rename(columns={'mail': 'text'}, inplace=True)

In [13]:
test_data = test_data.reset_index(drop=True)

In [14]:
test_data

Unnamed: 0,text,labels
0,Subject: General Inquiry Regarding Product Inf...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,Subject: Explore Our Tailored Solutions and Ex...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"
2,Subject: General Inquiry: Seeking Information ...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,Subject: Comprehensive Technical Support for S...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
4,Subject: Exciting Collaboration Opportunity: U...,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
...,...,...
332,Subject: Comprehensive Technical Support Solut...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
333,Subject: Unlock Exclusive Savings with Our Lat...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"
334,Subject: Share Your Experience – We Value Your...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
335,Subject: Share Your Experience with Us – We Va...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"


In [15]:
class MultiLabelDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [16]:
training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)

In [17]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [18]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class DistilBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistilBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, 10)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

model = DistilBERTClass()
model.to(device)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

DistilBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in

In [19]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [20]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [21]:
def train(epoch):
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        loss.backward()
        optimizer.step()

In [22]:
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Loss:  0.6934596300125122


900it [10:58,  1.37it/s]
0it [00:00, ?it/s]

Epoch: 1, Loss:  0.07908573001623154


900it [11:03,  1.36it/s]


In [23]:
def validation(tl):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in tqdm(enumerate(tl, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [24]:
outputs, targets = validation(testing_loader)

final_outputs = np.array(outputs) >=0.5

337it [00:44,  7.50it/s]


In [25]:
val_hamming_loss = metrics.hamming_loss(targets, final_outputs)
val_hamming_score = hamming_score(np.array(targets), np.array(final_outputs))

print(f"Hamming Score = {val_hamming_score}")
print(f"Hamming Loss = {val_hamming_loss}")

Hamming Score = 0.9985163204747775
Hamming Loss = 0.0002967359050445104


In [26]:
def return_loader(data):
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)
    data_set = MultiLabelDataset(data, tokenizer, 128)
    
    test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }
    
    data_loader  = DataLoader(data_set, **test_params)
    return data_loader

In [27]:
islam = train_data.iloc[2:3].reset_index(drop=True)
islam_loader = return_loader(islam)

In [28]:
for e in train_data.iloc[2:3]['text']:
    print(e)
    break

Subject: Discover the Perfect Solution with Our Customized Product Offers

Dear [Client Name],

Thank you for reaching out with your interest in our products and services. We understand that finding the perfect solution tailored to your needs is essential. Our extensive catalog offers a wide range of options, from standard features to advanced customization and configurations, ensuring compatibility, efficiency, and performance that meets the specific demands of your industry niche.

Would you be interested in receiving a detailed quotation or a comprehensive proposal? We offer competitive pricing, special discounts on bundles, and financing options to accommodate different budget requirements. Additionally, we can arrange for a trial or demo to evaluate our product's compatibility with your current systems and its potential to optimize your operations.

We value your considerations regarding warranty, support, and after-sales service. Rest assured, we stand behind our products with a 

In [29]:
def prediction(data):
    islam_loader = return_loader(data)
    for data in islam_loader:
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)
        outputs = model(ids, mask, token_type_ids)
        proba = torch.softmax(torch.tensor(outputs), dim=1)
        result = np.array(proba)
        return result

In [30]:
prediction(train_data.iloc[2:3].reset_index(drop=True)).shape

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


(1, 10)

In [31]:
train_data.iloc[2:3]

Unnamed: 0,text,labels
2,Subject: Discover the Perfect Solution with Ou...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"


In [32]:
output_model_file = 'my_model.pt'
output_vocab_file = 'vobac.pt'

torch.save(model, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

('vobac.pt',)