In [0]:
import torch
from torch import nn as nn
from torchtext import datasets,data
from matplotlib.pyplot import plot as plt
from collections import Counter, OrderedDict
from torch.autograd import Variable
import torch.optim as optim
import re
import torch.nn.functional as F
import random
import numpy as np

In [0]:
# The cell above imports the libraries used for text classification.

In [2]:
# Single source of truth for every seed below; change it here only.
seed = 1234
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

# Fields describe how raw examples are turned into tokens / labels.
text = data.Field(tokenize = 'spacy')
label = data.LabelField()

# Coarse-grained TREC: six question categories.
train_data, test_data = datasets.TREC.splits(text, label, fine_grained=False)

# `random.seed()` returns None, so passing its result as `random_state` only
# worked through the side effect of seeding the global RNG. Seed explicitly and
# hand the resulting RNG state to `split` so the train/validation partition is
# reproducible by construction.
random.seed(seed)
train_data, valid_data = train_data.split(random_state=random.getstate())

# Peek at one training example (echoed as the cell output).
vars(train_data[1])

downloading train_5500.label


train_5500.label: 336kB [00:00, 3.67MB/s]
TREC_10.label: 23.4kB [00:00, 5.63MB/s]                   


downloading TREC_10.label


{'label': 'HUM',
 'text': ['Who',
  'replaced',
  'Bert',
  'Parks',
  'as',
  'the',
  'host',
  'of',
  'The',
  'Miss',
  'America',
  'Pageant',
  '?']}

In [60]:
'''
The 6 labels (for the non-fine-grained case) correspond to the 6 types of questions in the dataset:

    HUM for questions about humans
    ENTY for questions about entities
    DESC for questions asking you for a description
    NUM for questions where the answer is numerical
    LOC for questions where the answer is a location
    ABBR for questions asking about abbreviations

'''

'\nThe 6 labels (for the non-fine-grained case) correspond to the 6 types of questions in the dataset:\n\n    HUM for questions about humans\n    ENTY for questions about entities\n    DESC for questions asking you for a description\n    NUM for questions where the answer is numerical\n    LOC for questions where the answer is a location\n    ABBR for questions asking about abbreviations\n\n'

In [3]:
# Build the token vocabulary from the training split only and attach the
# pretrained "glove.6B.100d" vectors to it (downloaded on first use, see output).
text.build_vocab(train_data, vectors="glove.6B.100d")

.vector_cache/glove.6B.zip: 862MB [02:24, 5.98MB/s]                           
100%|█████████▉| 398846/400000 [00:17<00:00, 22830.73it/s]

In [0]:
# Build the label vocabulary (class name <-> integer index) from the training split.
label.build_vocab(train_data)

In [5]:
# Show the class-name -> index mapping used as training targets.
print(label.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7fb8957b9b70>, {'HUM': 0, 'ENTY': 1, 'DESC': 2, 'NUM': 3, 'LOC': 4, 'ABBR': 5})


In [0]:
# Number of examples per mini-batch, shared by all three iterators.
batch_size = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One iterator per split, returned in the same order as the datasets are passed.
train_it, valid_it, test_it = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    device=device,
)


In [0]:
class NN(nn.Module):
  """Convolutional text classifier.

  Tokens are embedded, passed through one 2-D convolution per entry in
  `filter_sizes` (each kernel spans `fs` consecutive tokens and the full
  embedding width), max-pooled over the sequence axis, concatenated,
  regularised with dropout and projected to `output_dim` scores.

  Args:
    vocab_size: number of rows in the embedding table.
    embedding_dim: width of each embedding vector.
    n_filters: output channels of every convolution.
    filter_sizes: sequence of kernel heights (tokens covered by each filter).
    output_dim: number of output scores (classes).
    dropout: dropout probability applied to the pooled features.
  """

  def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout=0.2):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size,embedding_dim)
    self.convs = nn.ModuleList([
                                nn.Conv2d(in_channels = 1, out_channels = n_filters, kernel_size = (fs, embedding_dim)) for fs in filter_sizes
                                ])
    self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self,text):
    """Return unnormalised class scores of shape [batch, output_dim].

    Args:
      text: integer token indices of shape [seq_len, batch].

    Sequences shorter than the widest kernel are zero-padded after the
    embedding lookup, so short inputs no longer make the convolution raise.
    """
    tokens = text.permute(1,0)                 # [batch, seq_len]
    embedded = self.embedding(tokens)          # [batch, seq_len, embedding_dim]

    # A kernel of height k needs at least k time steps; pad the sequence axis
    # with zero vectors when the input is shorter than the widest kernel.
    widest_kernel = max((conv.kernel_size[0] for conv in self.convs), default=0)
    shortfall = widest_kernel - embedded.shape[1]
    if shortfall > 0:
      embedded = F.pad(embedded, (0, 0, 0, shortfall))

    embedded = embedded.unsqueeze(1)           # add the single input channel
    feature_maps = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
    # Max over the whole remaining sequence axis -> one value per filter.
    pooled = [F.max_pool1d(fm, fm.shape[2]).squeeze(2) for fm in feature_maps]
    features = self.dropout(torch.cat(pooled, dim=1))
    return self.fc(features)

In [35]:
# Tunable hyper-parameters.
embedding_dim = 100
n_filters = 300
filter_size = [2,3,4]

# Sizes derived from the fitted vocabularies.
inp_dim = len(text.vocab)
output_dim = len(label.vocab)

model = NN(
    vocab_size=inp_dim,
    embedding_dim=embedding_dim,
    n_filters=n_filters,
    filter_sizes=filter_size,
    output_dim=output_dim,
    dropout=0.2,
)
# Echo the architecture as the cell output.
model

NN(
  (embedding): Embedding(7505, 100)
  (convs): ModuleList(
    (0): Conv2d(1, 300, kernel_size=(2, 100), stride=(1, 1))
    (1): Conv2d(1, 300, kernel_size=(3, 100), stride=(1, 1))
    (2): Conv2d(1, 300, kernel_size=(4, 100), stride=(1, 1))
  )
  (fc): Linear(in_features=900, out_features=6, bias=True)
  (dropout): Dropout(p=0.2)
)

In [36]:
# Vectors attached to the vocabulary when it was built with `vectors=...`.
pretrained_embeddings = text.vocab.vectors

# Overwrite the embedding weights in place with the pretrained vectors.
# `copy_` returns the destination tensor, which is what the cell echoes.
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.1638,  0.6046,  1.0789,  ..., -0.3140,  0.1844,  0.3624],
        ...,
        [-0.3110, -0.3398,  1.0308,  ...,  0.5317,  0.2836, -0.0640],
        [ 0.0091,  0.2810,  0.7356,  ..., -0.7508,  0.8967, -0.7631],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [0]:
# Place the model and the loss on the target device *before* creating the
# optimizer, as recommended by the torch.optim documentation, so the optimizer
# is constructed from the parameters that will actually be trained.
model = model.to(device)

criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

# Adam with its default hyper-parameters over every model parameter.
optimizer = optim.Adam(model.parameters())

In [0]:
def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Args:
        preds: class scores with the class axis at dim 1.
        y: integer class labels, one per prediction.

    Returns:
        A one-element float tensor holding the fraction of correct predictions,
        located on the same device as the inputs.
    """
    predicted = preds.argmax(dim=1)  # index of the highest score per example
    correct = predicted.eq(y)
    # Create the denominator on the same device as the count; the previous
    # CPU-only FloatTensor mixed devices whenever the labels lived on the GPU.
    n_examples = torch.tensor([y.shape[0]], dtype=torch.float, device=correct.device)
    return correct.sum() / n_examples


In [0]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    For every batch the gradients are reset, the model is applied to
    `batch.text`, the loss against `batch.label` is back-propagated and the
    optimizer takes one step.

    Returns:
        (mean loss, mean accuracy), each averaged over the number of batches.
    """
    model.train()

    running_loss, running_acc = 0, 0
    for batch in iterator:
        optimizer.zero_grad()

        outputs = model(batch.text)
        batch_loss = criterion(outputs, batch.label)
        batch_acc = categorical_accuracy(outputs, batch.label)

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [0]:
def evaluate(model, iterator, criterion):
    """Score `model` on every batch of `iterator` without updating it.

    The model is switched to evaluation mode and gradient tracking is disabled
    for the duration of the pass.

    Returns:
        (mean loss, mean accuracy), each averaged over the number of batches.
    """
    model.eval()

    total_loss = 0
    total_acc = 0
    with torch.no_grad():
        for batch in iterator:
            outputs = model(batch.text)
            batch_loss = criterion(outputs, batch.label)
            batch_acc = categorical_accuracy(outputs, batch.label)

            total_loss += batch_loss.item()
            total_acc += batch_acc.item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches


In [42]:
# Number of full passes over the training iterator.
# NOTE(review): the recorded output below already starts at ~99.6% training
# accuracy in epoch 1, which suggests this cell was re-run on an already
# trained model rather than a freshly created one — confirm before citing it.
n_epochs = 10
for epoch in range(n_epochs):

    # One optimisation pass, then a gradient-free pass over the validation split.
    train_loss, train_acc = train(model, train_it, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_it, criterion)
    
    # `epoch` is zero-based, hence the +1 in the report.
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

| Epoch: 01 | Train Loss: 0.036 | Train Acc: 99.61% | Val. Loss: 0.370 | Val. Acc: 86.75% |
| Epoch: 02 | Train Loss: 0.024 | Train Acc: 99.84% | Val. Loss: 0.362 | Val. Acc: 87.31% |
| Epoch: 03 | Train Loss: 0.015 | Train Acc: 99.95% | Val. Loss: 0.371 | Val. Acc: 87.52% |
| Epoch: 04 | Train Loss: 0.011 | Train Acc: 99.97% | Val. Loss: 0.372 | Val. Acc: 86.53% |
| Epoch: 05 | Train Loss: 0.007 | Train Acc: 100.00% | Val. Loss: 0.376 | Val. Acc: 87.65% |
| Epoch: 06 | Train Loss: 0.006 | Train Acc: 100.00% | Val. Loss: 0.379 | Val. Acc: 87.35% |
| Epoch: 07 | Train Loss: 0.005 | Train Acc: 100.00% | Val. Loss: 0.376 | Val. Acc: 87.88% |
| Epoch: 08 | Train Loss: 0.004 | Train Acc: 100.00% | Val. Loss: 0.377 | Val. Acc: 87.40% |
| Epoch: 09 | Train Loss: 0.003 | Train Acc: 100.00% | Val. Loss: 0.381 | Val. Acc: 87.71% |
| Epoch: 10 | Train Loss: 0.003 | Train Acc: 100.00% | Val. Loss: 0.391 | Val. Acc: 87.41% |


In [44]:
# Final check on the held-out test split, which is not used during training or validation above.
test_loss, test_acc = evaluate(model, test_it, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

| Test Loss: 0.298 | Test Acc: 90.55% |


In [49]:
# Shell escape: list the working directory (e.g. to check for the saved checkpoint).
!ls

sample_data  text_classifier.pt


In [0]:
# Persist only the learned parameters (state_dict), not the whole module object.
torch.save(model.state_dict(), "text_classifier.pt")

In [0]:
import spacy
# Tokenizer used for ad-hoc sentences at prediction time.
# NOTE(review): 'en' is a shortcut model name; newer spaCy releases may require
# the full package name (e.g. 'en_core_web_sm') — confirm against the installed version.
nlp = spacy.load('en')

def predict_class(sentence, min_len=4):
    """Classify a single raw sentence and return the predicted label index.

    Args:
        sentence: raw text; tokenized with the module-level spaCy pipeline `nlp`.
        min_len: minimum number of tokens fed to the model; shorter inputs are
            right-padded with '<pad>'. The default of 4 matches the largest
            convolution filter size configured for the model in this notebook.

    Returns:
        The integer class index (look it up in `label.vocab.itos` for the name).

    Note:
        Switches the global `model` to evaluation mode so dropout is disabled
        and the prediction does not depend on whatever mode the model was left in.
    """
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))
    indexed = [text.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)  # [seq_len] -> [seq_len, 1]: a batch of one
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        preds = model(tensor)
    max_preds = preds.argmax(dim=1)
    return max_preds.item()

In [57]:
# Smoke test: a location question.
pred_class = predict_class("where is India?")
print(f'Predicted class is: {pred_class} = {label.vocab.itos[pred_class]}')


Predicted class is: 4 = LOC


In [58]:
# Smoke test: a question about a person.
pred_class = predict_class("who is tanmoy?")
print(f'Predicted class is: {pred_class} = {label.vocab.itos[pred_class]}')

Predicted class is: 0 = HUM


In [59]:
# Not a question: the classifier still has to pick one of the six labels,
# so the result here is not meaningful.
pred_class = predict_class("i am happy")
print(f'Predicted class is: {pred_class} = {label.vocab.itos[pred_class]}')

Predicted class is: 2 = DESC


In [61]:
# Repeat of the earlier "where is India?" query.
pred_class = predict_class("where is India?")
print(f'Predicted class is: {pred_class} = {label.vocab.itos[pred_class]}')

Predicted class is: 4 = LOC
