In [1]:
import re
import torch
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch.nn as nn
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
lr = 0.0005
vec_len = 50
seq_len = 20
num_epochs = 50
label_col = "Product"
tokens_path = "/content/drive/MyDrive/ProjectPro/nlp-attention-classification/Modular_code/Modular_code/Output/tokens.pkl"
labels_path = "/content/drive/MyDrive/ProjectPro/nlp-attention-classification/Modular_code/Modular_code/Output/labels.pkl"
data_path = "/content/drive/MyDrive/ProjectPro/nlp-attention-classification/Modular_code/Modular_code/Input/complaints.csv"
model_path = "/content/drive/MyDrive/ProjectPro/nlp-attention-classification/Modular_code/Modular_code/Output/attention.pth"
vocabulary_path = "/content/drive/MyDrive/ProjectPro/nlp-attention-classification/Modular_code/Modular_code/Output/vocabulary.pkl"
embeddings_path = "/content/drive/MyDrive/ProjectPro/nlp-attention-classification/Modular_code/Modular_code/Output/embeddings.pkl"
glove_vector_path = "/content/drive/MyDrive/ProjectPro/nlp-attention-classification/Modular_code/Modular_code/Input/glove.6B.50d.txt"
text_col_name = "Consumer complaint narrative"
label_encoder_path = "/content/drive/MyDrive/ProjectPro/nlp-attention-classification/Modular_code/Modular_code/Output/label_encoder.pkl"
product_map = {'Vehicle loan or lease': 'vehicle_loan',
               'Credit reporting, credit repair services, or other personal consumer reports': 'credit_report',
               'Credit card or prepaid card': 'card',
               'Money transfer, virtual currency, or money service': 'money_transfer',
               'virtual currency': 'money_transfer',
               'Mortgage': 'mortgage',
               'Payday loan, title loan, or personal loan': 'loan',
               'Debt collection': 'debt_collection',
               'Checking or savings account': 'savings_account',
               'Credit card': 'card',
               'Bank account or service': 'savings_account',
               'Credit reporting': 'credit_report',
               'Prepaid card': 'card',
               'Payday loan': 'loan',
               'Other financial service': 'others',
               'Virtual currency': 'money_transfer',
               'Student loan': 'loan',
               'Consumer Loan': 'loan',
               'Money transfers': 'money_transfer'}

In [4]:
def save_file(name, obj):
    """
    Function to save an object as pickle file
    """
    with open(name, 'wb') as f:
        pickle.dump(obj, f)


def load_file(name):
    """
    Function to load a pickle object
    """
    return pickle.load(open(name, "rb"))

## Process glove embeddings
---

In [5]:
with open(glove_vector_path, "rt") as f:
    emb = f.readlines()

In [6]:
vocabulary, embeddings = [], []

for item in emb:
    vocabulary.append(item.split()[0])
    embeddings.append(item.split()[1:])

In [7]:
embeddings = np.array(embeddings, dtype=np.float32)

In [8]:
vocabulary = ["<pad>", "<unk>"] + vocabulary

In [9]:
embeddings = np.vstack([np.ones(50, dtype=np.float32),
                        np.mean(embeddings, axis=0),
                        embeddings])

In [10]:
save_file(embeddings_path, embeddings)
save_file(vocabulary_path, vocabulary)

## Process text data
---

In [11]:
data = pd.read_csv(data_path)

In [12]:
data.dropna(subset=[text_col_name], inplace=True)

In [14]:
data.replace({label_col: product_map}, inplace=True)

### Encode labels

In [13]:
label_encoder = LabelEncoder()
label_encoder.fit(data[label_col])
labels = label_encoder.transform(data[label_col])

In [15]:
save_file(labels_path, labels)
save_file(label_encoder_path, label_encoder)

### Process the text column

In [16]:
input_text = list(data[text_col_name])

In [17]:
len(input_text)

809343

### Convert text to lower case

In [18]:
input_text = [i.lower() for i in tqdm(input_text)]

100%|██████████| 809343/809343 [00:01<00:00, 713849.64it/s]


### Remove punctuations except apostrophe

In [19]:
input_text = [re.sub(r"[^\w\d'\s]+", " ", i)
              for i in tqdm(input_text)]

100%|██████████| 809343/809343 [00:27<00:00, 29374.74it/s]


### Remove digits

In [20]:
input_text = [re.sub("\d+", "", i) for i in tqdm(input_text)]

100%|██████████| 809343/809343 [00:17<00:00, 46063.14it/s]


### Remove more than one consecutive instance of 'x'

In [21]:
input_text = [re.sub(r'[x]{2,}', "", i) for i in tqdm(input_text)]

100%|██████████| 809343/809343 [00:14<00:00, 56828.00it/s]


### Remove multiple spaces with single space

In [22]:
input_text = [re.sub(' +', ' ', i) for i in tqdm(input_text)]

100%|██████████| 809343/809343 [00:41<00:00, 19513.25it/s]


In [23]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Tokenize the text

In [24]:
tokens = [word_tokenize(t) for t in tqdm(input_text)]

100%|██████████| 809343/809343 [07:27<00:00, 1810.48it/s]


### Take the first 20 tokens in each complaint text

In [25]:
tokens = [i[:20] if len(i) > 19 else ['<pad>'] * (20 - len(i)) + i
          for i in tqdm(tokens)]

100%|██████████| 809343/809343 [00:07<00:00, 108220.34it/s]


### Convert tokens to integer indices from vocabulary

In [26]:
def token_index(tokens, vocabulary, missing='<unk>'):
    """
    :param tokens: List of word tokens
    :param vocabulary: All words in the embeddings
    :param missing: Token for words not present in the vocabulary
    :return: List of integers representing the word tokens
    """
    idx_token = []
    for text in tqdm(tokens):
        idx_text = []
        for token in text:
            if token in vocabulary:
                idx_text.append(vocabulary.index(token))
            else:
                idx_text.append(vocabulary.index(missing))
        idx_token.append(idx_text)
    return idx_token

In [27]:
tokens = token_index(tokens, vocabulary)

100%|██████████| 809343/809343 [1:55:35<00:00, 116.70it/s]


### Save the tokens

In [28]:
save_file(tokens_path, tokens)

## Create attention model
---

In [29]:
class AttentionModel(nn.Module):

    def __init__(self, vec_len, seq_len, n_classes):
        super(AttentionModel, self).__init__()
        self.vec_len = vec_len
        self.seq_len = seq_len
        self.attn_weights = torch.cat([torch.tensor([[0.]]),
                                       torch.randn(vec_len, 1) /
                                       torch.sqrt(torch.tensor(vec_len))])
        self.attn_weights.requires_grad = True
        self.attn_weights = nn.Parameter(self.attn_weights)
        self.activation = nn.Tanh()
        self.softmax = nn.Softmax(dim=1)
        self.linear = nn.Linear(vec_len + 1, n_classes)

    def forward(self, input_data):
        hidden = torch.matmul(input_data, self.attn_weights)
        hidden = self.activation(hidden)
        attn = self.softmax(hidden)
        attn = attn.repeat(1, 1, self.vec_len + 1).reshape(attn.shape[0],
                                                           self.seq_len,
                                                           self.vec_len + 1)
        attn_output = input_data * attn
        attn_output = torch.sum(attn_output, axis=1)
        output = self.linear(attn_output)
        return output

## Create PyTorch dataset
---

In [30]:
class TextDataset(torch.utils.data.Dataset):

    def __init__(self, tokens, embeddings, labels):
        """
        :param tokens: List of word tokens
        :param embeddings: Word embeddings (from glove)
        :param labels: List of labels
        """
        self.tokens = tokens
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, idx):
        emb = torch.tensor(self.embeddings[self.tokens[idx], :])
        input_ = torch.cat((torch.ones(emb.shape[0],1), emb), dim=1)
        return torch.tensor(self.labels[idx]), input_

### Function to train the model

In [31]:
def train(train_loader, valid_loader, model, criterion, optimizer,
          device, num_epochs, model_path):
    """
    Function to train the model
    :param train_loader: Data loader for train dataset
    :param valid_loader: Data loader for validation dataset
    :param model: Model object
    :param criterion: Loss function
    :param optimizer: Optimizer
    :param device: CUDA or CPU
    :param num_epochs: Number of epochs
    :param model_path: Path to save the model
    """
    best_loss = 1e8
    for i in range(num_epochs):
        print(f"Epoch {i+1} of {num_epochs}")
        valid_loss, train_loss = [], []
        model.train()
        # Train loop
        for batch_labels, batch_data in tqdm(train_loader):
            # Move data to GPU if available
            batch_labels = batch_labels.to(device)
            batch_data = batch_data.to(device)
            # Forward pass
            batch_output = model(batch_data)
            batch_output = torch.squeeze(batch_output)
            # Calculate loss
            loss = criterion(batch_output, batch_labels)
            train_loss.append(loss.item())
            optimizer.zero_grad()
            # Backward pass
            loss.backward()
            # Gradient update step
            optimizer.step()
        model.eval()
        # Validation loop
        for batch_labels, batch_data in tqdm(valid_loader):
            # Move data to GPU if available
            batch_labels = batch_labels.to(device)
            batch_data = batch_data.to(device)
            # Forward pass
            batch_output = model(batch_data)
            batch_output = torch.squeeze(batch_output)
            # Calculate loss
            loss = criterion(batch_output, batch_labels)
            valid_loss.append(loss.item())
        t_loss = np.mean(train_loss)
        v_loss = np.mean(valid_loss)
        print(f"Train Loss: {t_loss}, Validation Loss: {v_loss}")
        if v_loss < best_loss:
            best_loss = v_loss
            # Save model if validation loss improves
            torch.save(model.state_dict(), model_path)
        print(f"Best Validation Loss: {best_loss}")

### Function to test the model

In [32]:
def test(test_loader, model, criterion, device):
    """
    Function to test the model
    :param test_loader: Data loader for test dataset
    :param model: Model object
    :param criterion: Loss function
    :param device: CUDA or CPU
    """
    model.eval()
    test_loss = []
    test_accu = []
    for batch_labels, batch_data in tqdm(test_loader):
        # Move data to device
        batch_labels = batch_labels.to(device)
        batch_data = batch_data.to(device)
        # Forward pass
        batch_output = model(batch_data)
        batch_output = torch.squeeze(batch_output)
        # Calculate loss
        loss = criterion(batch_output, batch_labels)
        test_loss.append(loss.item())
        batch_preds = torch.argmax(batch_output, axis=1)
        # Move predictions to CPU
        if torch.cuda.is_available():
            batch_labels = batch_labels.cpu()
            batch_preds = batch_preds.cpu()
        # Compute accuracy
        test_accu.append(accuracy_score(batch_labels.detach().
                                        numpy(),
                                        batch_preds.detach().
                                        numpy()))
    test_loss = np.mean(test_loss)
    test_accu = np.mean(test_accu)
    print(f"Test Loss: {test_loss}, Test Accuracy: {test_accu}")

## Train attention model
---

### Load the files

In [33]:
tokens = load_file(tokens_path)
labels = load_file(labels_path)
embeddings = load_file(embeddings_path)
label_encoder = load_file(label_encoder_path)
num_classes = len(label_encoder.classes_)
vocabulary = load_file(vocabulary_path)

### Split data into train, validation and test sets

In [34]:
X_train, X_test, y_train, y_test = train_test_split(tokens, labels,
                                                    test_size=0.2)
X_train, X_valid, y_train, y_valid = train_test_split(X_train,
                                                      y_train,
                                                      test_size=0.25)

### Create PyTorch datasets

In [35]:
train_dataset = TextDataset(X_train, embeddings, y_train)
valid_dataset = TextDataset(X_valid, embeddings, y_valid)
test_dataset = TextDataset(X_test, embeddings, y_test)

### Create data loaders

In [36]:
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=16,
                                           shuffle=True,
                                           drop_last=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                           batch_size=16)
test_loader = torch.utils.data.DataLoader(test_dataset,
                                          batch_size=16)

### Create model object

In [37]:
device = torch.device("cuda:0" if torch.cuda.is_available()
                      else "cpu")

In [38]:
model = AttentionModel(vec_len, seq_len, num_classes)

### Move the model to GPU if available

In [39]:
if torch.cuda.is_available():
    model = model.cuda()

### Define loss function and optimizer

In [40]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

### Training loop

In [None]:
train(train_loader, valid_loader, model, criterion, optimizer,
      device, num_epochs, model_path)

Epoch 1 of 50


100%|██████████| 30350/30350 [02:46<00:00, 182.59it/s]
100%|██████████| 10117/10117 [00:58<00:00, 173.19it/s]


Train Loss: 1.3856076874067207, Validation Loss: 1.2943941283670235
Best Validation Loss: 1.2943941283670235
Epoch 2 of 50


100%|██████████| 30350/30350 [02:45<00:00, 183.82it/s]
100%|██████████| 10117/10117 [00:59<00:00, 171.28it/s]


Train Loss: 1.2750115986347592, Validation Loss: 1.2704321635185383
Best Validation Loss: 1.2704321635185383
Epoch 3 of 50


100%|██████████| 30350/30350 [02:44<00:00, 184.98it/s]
100%|██████████| 10117/10117 [00:58<00:00, 172.95it/s]


Train Loss: 1.2590644182984088, Validation Loss: 1.2585024273423817
Best Validation Loss: 1.2585024273423817
Epoch 4 of 50


100%|██████████| 30350/30350 [02:45<00:00, 183.76it/s]
100%|██████████| 10117/10117 [00:58<00:00, 172.32it/s]


Train Loss: 1.2483625302460089, Validation Loss: 1.250447915885198
Best Validation Loss: 1.250447915885198
Epoch 5 of 50


100%|██████████| 30350/30350 [02:47<00:00, 181.10it/s]
100%|██████████| 10117/10117 [00:58<00:00, 172.04it/s]


Train Loss: 1.2429968759874144, Validation Loss: 1.2464335264130473
Best Validation Loss: 1.2464335264130473
Epoch 6 of 50


100%|██████████| 30350/30350 [02:45<00:00, 183.82it/s]
100%|██████████| 10117/10117 [00:59<00:00, 170.48it/s]


Train Loss: 1.2398800742920777, Validation Loss: 1.2446453613296757
Best Validation Loss: 1.2446453613296757
Epoch 7 of 50


100%|██████████| 30350/30350 [02:45<00:00, 183.35it/s]
100%|██████████| 10117/10117 [00:59<00:00, 171.24it/s]


Train Loss: 1.237687419714802, Validation Loss: 1.243032303993629
Best Validation Loss: 1.243032303993629
Epoch 8 of 50


100%|██████████| 30350/30350 [02:45<00:00, 183.62it/s]
100%|██████████| 10117/10117 [00:58<00:00, 172.68it/s]


Train Loss: 1.2361771259178242, Validation Loss: 1.240883032653694
Best Validation Loss: 1.240883032653694
Epoch 9 of 50


100%|██████████| 30350/30350 [02:45<00:00, 182.92it/s]
100%|██████████| 10117/10117 [00:59<00:00, 171.21it/s]


Train Loss: 1.2348965735333362, Validation Loss: 1.240213788732823
Best Validation Loss: 1.240213788732823
Epoch 10 of 50


100%|██████████| 30350/30350 [02:45<00:00, 183.86it/s]
100%|██████████| 10117/10117 [00:59<00:00, 171.16it/s]


Train Loss: 1.234030796356217, Validation Loss: 1.2396467736561072
Best Validation Loss: 1.2396467736561072
Epoch 11 of 50


100%|██████████| 30350/30350 [02:44<00:00, 183.95it/s]
100%|██████████| 10117/10117 [00:58<00:00, 173.13it/s]


Train Loss: 1.2333285529668485, Validation Loss: 1.2392855263149902
Best Validation Loss: 1.2392855263149902
Epoch 12 of 50


100%|██████████| 30350/30350 [02:45<00:00, 183.68it/s]
100%|██████████| 10117/10117 [00:58<00:00, 173.80it/s]


Train Loss: 1.2325526293252995, Validation Loss: 1.2381549681748305
Best Validation Loss: 1.2381549681748305
Epoch 13 of 50


100%|██████████| 30350/30350 [02:45<00:00, 183.35it/s]
100%|██████████| 10117/10117 [00:58<00:00, 171.68it/s]


Train Loss: 1.231961713125914, Validation Loss: 1.2391605031305333
Best Validation Loss: 1.2381549681748305
Epoch 14 of 50


100%|██████████| 30350/30350 [02:44<00:00, 184.29it/s]
100%|██████████| 10117/10117 [00:58<00:00, 173.81it/s]


Train Loss: 1.231452725329352, Validation Loss: 1.2379033508933062
Best Validation Loss: 1.2379033508933062
Epoch 15 of 50


100%|██████████| 30350/30350 [02:44<00:00, 184.63it/s]
100%|██████████| 10117/10117 [00:58<00:00, 172.12it/s]


Train Loss: 1.2309516668427718, Validation Loss: 1.2372586209335452
Best Validation Loss: 1.2372586209335452
Epoch 16 of 50


100%|██████████| 30350/30350 [02:44<00:00, 184.48it/s]
100%|██████████| 10117/10117 [00:58<00:00, 172.08it/s]


Train Loss: 1.230543348399379, Validation Loss: 1.23724830464623
Best Validation Loss: 1.23724830464623
Epoch 17 of 50


100%|██████████| 30350/30350 [02:44<00:00, 184.45it/s]
100%|██████████| 10117/10117 [00:59<00:00, 169.21it/s]


Train Loss: 1.2301071119553961, Validation Loss: 1.2364942551725546
Best Validation Loss: 1.2364942551725546
Epoch 18 of 50


100%|██████████| 30350/30350 [02:45<00:00, 183.71it/s]
100%|██████████| 10117/10117 [00:58<00:00, 172.82it/s]


Train Loss: 1.2297571039710447, Validation Loss: 1.2359582294149456
Best Validation Loss: 1.2359582294149456
Epoch 19 of 50


100%|██████████| 30350/30350 [02:45<00:00, 183.46it/s]
100%|██████████| 10117/10117 [00:59<00:00, 170.76it/s]


Train Loss: 1.2293317583905414, Validation Loss: 1.2355468379505021
Best Validation Loss: 1.2355468379505021
Epoch 20 of 50


100%|██████████| 30350/30350 [02:45<00:00, 183.04it/s]
100%|██████████| 10117/10117 [00:58<00:00, 173.26it/s]


Train Loss: 1.2290590480662256, Validation Loss: 1.235365115828555
Best Validation Loss: 1.235365115828555
Epoch 21 of 50


100%|██████████| 30350/30350 [02:44<00:00, 184.14it/s]
100%|██████████| 10117/10117 [00:59<00:00, 171.09it/s]


Train Loss: 1.2286965507975913, Validation Loss: 1.2358662210205298
Best Validation Loss: 1.235365115828555
Epoch 22 of 50


100%|██████████| 30350/30350 [02:45<00:00, 183.92it/s]
100%|██████████| 10117/10117 [00:58<00:00, 171.99it/s]


Train Loss: 1.2282554430211396, Validation Loss: 1.234369355837229
Best Validation Loss: 1.234369355837229
Epoch 23 of 50


100%|██████████| 30350/30350 [02:45<00:00, 183.05it/s]
100%|██████████| 10117/10117 [00:58<00:00, 172.47it/s]


Train Loss: 1.2280105204990708, Validation Loss: 1.2345958853900296
Best Validation Loss: 1.234369355837229
Epoch 24 of 50


100%|██████████| 30350/30350 [02:45<00:00, 183.63it/s]
100%|██████████| 10117/10117 [00:59<00:00, 170.18it/s]


Train Loss: 1.2277614743391412, Validation Loss: 1.2338556774145437
Best Validation Loss: 1.2338556774145437
Epoch 25 of 50


100%|██████████| 30350/30350 [02:45<00:00, 183.09it/s]
100%|██████████| 10117/10117 [00:58<00:00, 173.62it/s]


Train Loss: 1.2274346190465144, Validation Loss: 1.2338286602144788
Best Validation Loss: 1.2338286602144788
Epoch 26 of 50


100%|██████████| 30350/30350 [02:46<00:00, 182.75it/s]
100%|██████████| 10117/10117 [00:58<00:00, 172.51it/s]


Train Loss: 1.2271637062588476, Validation Loss: 1.2346268029031258
Best Validation Loss: 1.2338286602144788
Epoch 27 of 50


100%|██████████| 30350/30350 [02:46<00:00, 182.62it/s]
100%|██████████| 10117/10117 [00:58<00:00, 174.08it/s]


Train Loss: 1.2270078858806823, Validation Loss: 1.2331775255362083
Best Validation Loss: 1.2331775255362083
Epoch 28 of 50


100%|██████████| 30350/30350 [02:45<00:00, 183.93it/s]
100%|██████████| 10117/10117 [00:59<00:00, 170.31it/s]


Train Loss: 1.2267234220771852, Validation Loss: 1.2330596871593789
Best Validation Loss: 1.2330596871593789
Epoch 29 of 50


100%|██████████| 30350/30350 [02:45<00:00, 183.43it/s]
100%|██████████| 10117/10117 [00:58<00:00, 172.49it/s]


Train Loss: 1.2264589819228628, Validation Loss: 1.2334699492021148
Best Validation Loss: 1.2330596871593789
Epoch 30 of 50


100%|██████████| 30350/30350 [02:44<00:00, 184.11it/s]
100%|██████████| 10117/10117 [00:58<00:00, 173.70it/s]


Train Loss: 1.2263308680509224, Validation Loss: 1.232252609795163
Best Validation Loss: 1.232252609795163
Epoch 31 of 50


100%|██████████| 30350/30350 [02:45<00:00, 183.19it/s]
100%|██████████| 10117/10117 [00:58<00:00, 172.89it/s]


Train Loss: 1.2260183872451893, Validation Loss: 1.232432765850286
Best Validation Loss: 1.232252609795163
Epoch 32 of 50


100%|██████████| 30350/30350 [02:45<00:00, 183.75it/s]
100%|██████████| 10117/10117 [00:59<00:00, 171.12it/s]


Train Loss: 1.2258459135983293, Validation Loss: 1.2321843576723495
Best Validation Loss: 1.2321843576723495
Epoch 33 of 50


100%|██████████| 30350/30350 [02:44<00:00, 184.27it/s]
100%|██████████| 10117/10117 [00:58<00:00, 173.28it/s]


Train Loss: 1.2255896545084932, Validation Loss: 1.2322980625227387
Best Validation Loss: 1.2321843576723495
Epoch 34 of 50


100%|██████████| 30350/30350 [02:45<00:00, 182.85it/s]
100%|██████████| 10117/10117 [00:58<00:00, 172.03it/s]


Train Loss: 1.2254182486322016, Validation Loss: 1.2315702504781534
Best Validation Loss: 1.2315702504781534
Epoch 35 of 50


100%|██████████| 30350/30350 [02:44<00:00, 184.39it/s]
100%|██████████| 10117/10117 [00:58<00:00, 171.90it/s]


Train Loss: 1.2252232726117138, Validation Loss: 1.231936327032719
Best Validation Loss: 1.2315702504781534
Epoch 36 of 50


100%|██████████| 30350/30350 [02:44<00:00, 184.40it/s]
100%|██████████| 10117/10117 [00:59<00:00, 170.78it/s]


Train Loss: 1.225044709076795, Validation Loss: 1.2317534928455223
Best Validation Loss: 1.2315702504781534
Epoch 37 of 50


100%|██████████| 30350/30350 [02:45<00:00, 183.88it/s]
100%|██████████| 10117/10117 [00:58<00:00, 172.11it/s]


Train Loss: 1.224853105563701, Validation Loss: 1.231194123668469
Best Validation Loss: 1.231194123668469
Epoch 38 of 50


100%|██████████| 30350/30350 [02:45<00:00, 183.46it/s]
100%|██████████| 10117/10117 [00:58<00:00, 172.24it/s]


Train Loss: 1.2247109383452666, Validation Loss: 1.2312119178289103
Best Validation Loss: 1.231194123668469
Epoch 39 of 50


100%|██████████| 30350/30350 [02:44<00:00, 184.42it/s]
100%|██████████| 10117/10117 [00:58<00:00, 173.69it/s]


Train Loss: 1.2245085031618985, Validation Loss: 1.2311972223087486
Best Validation Loss: 1.231194123668469
Epoch 40 of 50


100%|██████████| 30350/30350 [02:44<00:00, 184.83it/s]
100%|██████████| 10117/10117 [00:59<00:00, 170.70it/s]


Train Loss: 1.2242580569183217, Validation Loss: 1.2313656867805343
Best Validation Loss: 1.231194123668469
Epoch 41 of 50


100%|██████████| 30350/30350 [02:44<00:00, 184.12it/s]
100%|██████████| 10117/10117 [00:58<00:00, 171.50it/s]


Train Loss: 1.2242011293312274, Validation Loss: 1.2306453443638736
Best Validation Loss: 1.2306453443638736
Epoch 42 of 50


100%|██████████| 30350/30350 [02:48<00:00, 179.87it/s]
100%|██████████| 10117/10117 [01:00<00:00, 166.91it/s]


Train Loss: 1.2239855244112565, Validation Loss: 1.2308597805071835
Best Validation Loss: 1.2306453443638736
Epoch 43 of 50


100%|██████████| 30350/30350 [02:45<00:00, 183.29it/s]
100%|██████████| 10117/10117 [00:58<00:00, 174.42it/s]


Train Loss: 1.223768597792165, Validation Loss: 1.230402705031232
Best Validation Loss: 1.230402705031232
Epoch 44 of 50


100%|██████████| 30350/30350 [02:45<00:00, 183.55it/s]
100%|██████████| 10117/10117 [00:58<00:00, 173.17it/s]


Train Loss: 1.223692339877713, Validation Loss: 1.2299020967341416
Best Validation Loss: 1.2299020967341416
Epoch 45 of 50


100%|██████████| 30350/30350 [02:44<00:00, 184.53it/s]
100%|██████████| 10117/10117 [00:58<00:00, 172.28it/s]


Train Loss: 1.2235380730892131, Validation Loss: 1.2308310721659832
Best Validation Loss: 1.2299020967341416
Epoch 46 of 50


100%|██████████| 30350/30350 [02:44<00:00, 184.17it/s]
100%|██████████| 10117/10117 [00:58<00:00, 172.91it/s]


Train Loss: 1.2233515155364496, Validation Loss: 1.230960909914591
Best Validation Loss: 1.2299020967341416
Epoch 47 of 50


100%|██████████| 30350/30350 [02:44<00:00, 184.82it/s]
100%|██████████| 10117/10117 [00:58<00:00, 173.51it/s]


Train Loss: 1.2232092761659543, Validation Loss: 1.2303516044027074
Best Validation Loss: 1.2299020967341416
Epoch 48 of 50


100%|██████████| 30350/30350 [02:44<00:00, 184.69it/s]
100%|██████████| 10117/10117 [00:58<00:00, 172.90it/s]


Train Loss: 1.2231298077371604, Validation Loss: 1.230002210402873
Best Validation Loss: 1.2299020967341416
Epoch 49 of 50


100%|██████████| 30350/30350 [02:44<00:00, 184.10it/s]
100%|██████████| 10117/10117 [00:58<00:00, 173.32it/s]


Train Loss: 1.2229455044811601, Validation Loss: 1.2292159035340875
Best Validation Loss: 1.2292159035340875
Epoch 50 of 50


100%|██████████| 30350/30350 [02:44<00:00, 184.42it/s]
100%|██████████| 10117/10117 [00:58<00:00, 174.18it/s]

Train Loss: 1.2227869715015027, Validation Loss: 1.2294817762470556
Best Validation Loss: 1.2292159035340875





### Test the model

In [None]:
test(test_loader, model, criterion, device)

## Predict on new text
---

In [None]:
input_text = '''I am a victim of Identity Theft & currently have an Experian account that
I can view my Experian Credit Report and getting notified when there is activity on
my Experian Credit Report. For the past 3 days I've spent a total of approximately 9
hours on the phone with Experian. Every time I call I get transferred repeatedly and
then my last transfer and automated message states to press 1 and leave a message and
someone would call me. Every time I press 1 I get an automatic message stating than you
before I even leave a message and get disconnected. I call Experian again, explain what
is happening and the process begins again with the same end result. I was trying to have
this issue attended and resolved informally but I give up after 9 hours. There are hard
hit inquiries on my Experian Credit Report that are fraud, I didn't authorize, or recall
and I respectfully request that Experian remove the hard hit inquiries immediately just
like they've done in the past when I was able to speak to a live Experian representative
in the United States. The following are the hard hit inquiries : BK OF XXXX XX/XX/XXXX
XXXX XXXX XXXX  XX/XX/XXXX XXXX  XXXX XXXX  XX/XX/XXXX XXXX  XX/XX/XXXX XXXX  XXXX
XX/XX/XXXX'''

### Process input text

In [None]:
input_text = input_text.lower()
input_text = re.sub(r"[^\w\d'\s]+", " ", input_text)
input_text = re.sub("\d+", "", input_text)
input_text = re.sub(r'[x]{2,}', "", input_text)
input_text = re.sub(' +', ' ', input_text)
tokens = word_tokenize(input_text)

In [None]:
tokens = ['<pad>']*(20-len(tokens))+tokens

In [None]:
idx_token = []
for token in tokens:
    if token in vocabulary:
        idx_token.append(vocabulary.index(token))
    else:
        idx_token.append(vocabulary.index('<unk>'))

In [None]:
token_emb = embeddings[idx_token,:]
token_emb = token_emb[:seq_len, :]
inp = torch.from_numpy(token_emb)

In [None]:
inp = torch.cat((torch.ones(inp.shape[0],1), inp), dim=1)

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available()
                      else "cpu")

In [None]:
inp = inp.to(device)
inp = torch.unsqueeze(inp, 0)

In [None]:
label_encoder = load_file(label_encoder_path)
num_classes = len(label_encoder.classes_)

In [None]:
# Create model object
model = AttentionModel(vec_len, seq_len, num_classes)

# Load trained weights
model.load_state_dict(torch.load(model_path))

# Move the model to GPU if available
if torch.cuda.is_available():
    model = model.cuda()

# Forward pass
out = torch.squeeze(model(inp))

# Find predicted class
prediction = label_encoder.classes_[torch.argmax(out)]
print(f"Predicted  Class: {prediction}")