In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%matplotlib inline
!pip install torch>=1.3.1
!pip install torchtext==0.4

Collecting torchtext==0.4
  Downloading torchtext-0.4.0-py3-none-any.whl (53 kB)
[?25l[K     |██████▏                         | 10 kB 24.4 MB/s eta 0:00:01[K     |████████████▍                   | 20 kB 28.8 MB/s eta 0:00:01[K     |██████████████████▌             | 30 kB 12.4 MB/s eta 0:00:01[K     |████████████████████████▊       | 40 kB 9.6 MB/s eta 0:00:01[K     |██████████████████████████████▉ | 51 kB 5.0 MB/s eta 0:00:01[K     |████████████████████████████████| 53 kB 1.3 MB/s 
Installing collected packages: torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.10.0
    Uninstalling torchtext-0.10.0:
      Successfully uninstalled torchtext-0.10.0
Successfully installed torchtext-0.4.0


In [3]:
import torch
import torchtext
#from torchtext.legacy.datasets import text_classification
from torchtext.datasets import text_classification

import os
if not os.path.isdir('./.data'):
    os.mkdir('./.data')
train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](
    root='./.data', vocab=None)
BATCH_SIZE = 16

ag_news_csv.tar.gz: 100%|██████████| 11.8M/11.8M [00:00<00:00, 95.8MB/s]
120000lines [00:09, 12155.85lines/s]
120000lines [00:20, 5969.01lines/s]
7600lines [00:01, 6361.28lines/s]


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
import torch
import torchtext
from torchtext import data

In [5]:
max_seq_len = 50
TEXT = data.Field(tokenize="spacy", batch_first=True, include_lengths=True, fix_length=max_seq_len)
LABEL = data.LabelField(dtype=torch.float, batch_first=True)

In [6]:
fields = [('label', LABEL), (None, None), ('text',TEXT)]

In [7]:
%ls drive/MyDrive/UNT/AG_news

[0m[01;34mmodel[0m/  test.csv  train.csv


In [8]:
training_data=data.TabularDataset(path = 'drive/MyDrive/UNT/AG_news/train.csv',format = 'csv',fields = fields,skip_header = True)

In [9]:
print(vars(training_data.examples[0]))

{'label': '3', 'text': ['Reuters', '-', 'Short', '-', 'sellers', ',', 'Wall', 'Street', "'s", 'dwindling\\band', 'of', 'ultra', '-', 'cynics', ',', 'are', 'seeing', 'green', 'again', '.']}


In [10]:
train_data, valid_data = training_data.split(split_ratio=0.1)

In [11]:
#initialize glove embeddings
TEXT.build_vocab(train_data,min_freq=3,vectors = "glove.6B.300d")  
LABEL.build_vocab(train_data)

#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(TEXT.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

#Commonly used words
print(TEXT.vocab.freqs.most_common(10))  

#Word dictionary
# print(TEXT.vocab.stoi)

.vector_cache/glove.6B.zip: 862MB [02:40, 5.36MB/s]                           
100%|█████████▉| 399999/400000 [00:45<00:00, 8725.37it/s]


Size of TEXT vocabulary: 11584
Size of LABEL vocabulary: 4
[('the', 17214), (',', 14482), ('.', 13095), ('-', 9708), ('to', 9543), ('a', 9362), ('of', 8841), ('in', 7584), ('and', 6575), ('on', 4600)]


In [13]:
batch_size = 64

train_iterator, valid_iterator = data.BucketIterator.splits((train_data, valid_data), batch_size=batch_size,
                                                           sort_key=lambda x: len(x.text),
                                                           sort_within_batch=True)

In [14]:
# Create neural network representation
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class CNNTextClassification(nn.Module):
    def __init__(self, vocabulary_size, embedding_size, max_seq_len, out_channels,
                 kernel_heights, dropout, num_class):
        super().__init__()
        self.out_channels = out_channels
        self.kernel_heights = kernel_heights
        self.embedding_size = embedding_size
        self.max_seq_len = max_seq_len
        
        self.embedding = nn.Embedding(vocabulary_size, embedding_size)
        
        self.conv1 = nn.Sequential(nn.Conv1d(in_channels=self.embedding_size, out_channels=self.out_channels,
                               kernel_size=self.kernel_heights[0]),
                                   nn.ReLU(),
                                  nn.MaxPool1d(self.max_seq_len - self.kernel_heights[0]+1))
        
        self.conv2 = nn.Sequential(nn.Conv1d(in_channels=self.embedding_size, out_channels=self.out_channels,
                               kernel_size=self.kernel_heights[1]),
                                   nn.ReLU(),
                                  nn.MaxPool1d(self.max_seq_len - self.kernel_heights[1]+1))
        
        self.conv3 = nn.Sequential(nn.Conv1d(in_channels=self.embedding_size, out_channels=self.out_channels,
                               kernel_size=self.kernel_heights[2]),
                                   nn.ReLU(),
                                  nn.MaxPool1d(self.max_seq_len - self.kernel_heights[2]+1))
        
        self.dropout = nn.Dropout(dropout)
        
        self.fc = nn.Linear(len(self.kernel_heights) * out_channels, num_class)
        
        self.softmax = nn.LogSoftmax(dim=-1)
        
    def forward(self, text, text_lengths):
        emb = self.embedding(text).permute(0, 2, 1)
        
        conv_out1 = self.conv1(emb).squeeze(2)
        conv_out2 = self.conv2(emb).squeeze(2)
        conv_out3 = self.conv3(emb).squeeze(2)
        
        all_out = torch.cat((conv_out1, conv_out2, conv_out3), 1)
        final_feature_map = self.dropout(all_out)
        
        final_out = self.fc(final_feature_map)
        
        return self.softmax(final_out)

In [15]:
vocabulary_size = len(TEXT.vocab)
n_class = len(LABEL.vocab)
embedding_size = 300
out_channels = 100
kernel_heights = [3, 4, 5]
dropout = 0.4

model = CNNTextClassification(vocabulary_size, embedding_size, max_seq_len,
                              out_channels, kernel_heights, dropout, n_class)

In [16]:
model

CNNTextClassification(
  (embedding): Embedding(11584, 300)
  (conv1): Sequential(
    (0): Conv1d(300, 100, kernel_size=(3,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=48, stride=48, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv1d(300, 100, kernel_size=(4,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=47, stride=47, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv1d(300, 100, kernel_size=(5,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=46, stride=46, padding=0, dilation=1, ceil_mode=False)
  )
  (dropout): Dropout(p=0.4, inplace=False)
  (fc): Linear(in_features=300, out_features=4, bias=True)
  (softmax): LogSoftmax(dim=-1)
)

In [17]:
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0466,  0.2132, -0.0074,  ...,  0.0091, -0.2099,  0.0539],
        ...,
        [-0.3967,  0.2623, -0.0905,  ...,  0.2497, -0.1803, -0.0729],
        [ 0.4281,  0.4327,  0.0331,  ..., -0.3796,  0.2552,  0.0976],
        [ 0.6744,  0.0307, -0.3489,  ...,  0.0670,  0.3262,  0.1210]])

In [18]:
def training(model, iterator, optimizer, criterion):
    training_loss = 0
    training_accuracy = 0
    
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
        
        text, text_lengths = batch.text
        target = batch.label
        target = torch.autograd.Variable(target).long()
        
        output = model(text, text_lengths).squeeze()
        
        loss = criterion(output, target)
        
        training_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        
        num_corrects = (torch.max(output, 1)[1].view(target.size()).data == target.data).float().sum()

        acc = num_corrects/len(batch)
        training_accuracy += acc.item()

    scheduler.step()
    
    return training_loss / len(iterator), training_accuracy / len(iterator)

def testing(model, iterator, optimizer, criterion):
    testing_loss = 0
    testing_accuracy = 0
    model.eval()
    
    for batch in iterator:
        text, text_lengths = batch.text
        target = batch.label
        target = torch.autograd.Variable(target).long()
        
        with torch.no_grad():
            output = model(text, text_lengths).squeeze()
            loss = criterion(output, target)
            
            testing_loss += loss.item()
            num_corrects = (torch.max(output, 1)[1].view(target.size()).data == target.data).float().sum()
            acc = num_corrects/len(batch)
        
            testing_accuracy += acc.item()
            
    return testing_loss / len(iterator), testing_accuracy / len(iterator)

In [21]:
import time

n_epochs = 15
min_val_loss = float("inf")
path='drive/MyDrive/UNT/AG_news/model/saved_weights_cnn.pt'

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.3)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

for epoch in range(n_epochs):
    start_time = time.time()
    
    train_loss, train_acc = training(model, train_iterator, optimizer, criterion)
    val_loss, val_acc = testing(model, valid_iterator, optimizer, criterion)
    
    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.2f}%(train)')
    print(f'\tLoss: {val_loss:.4f}(valid)\t|\tAcc: {val_acc * 100:.2f}%(valid)')
    
    if val_loss < min_val_loss:
        min_val_loss = val_loss
        torch.save(model.state_dict(), path)

Epoch: 1  | time in 2 minutes, 48 seconds
	Loss: 0.4726(train)	|	Acc: 83.30%(train)
	Loss: 0.5348(valid)	|	Acc: 79.70%(valid)
Epoch: 2  | time in 2 minutes, 50 seconds
	Loss: 0.3948(train)	|	Acc: 86.34%(train)
	Loss: 0.4786(valid)	|	Acc: 83.19%(valid)
Epoch: 3  | time in 2 minutes, 49 seconds
	Loss: 0.3356(train)	|	Acc: 88.25%(train)
	Loss: 0.4276(valid)	|	Acc: 85.08%(valid)
Epoch: 4  | time in 2 minutes, 48 seconds
	Loss: 0.2721(train)	|	Acc: 90.58%(train)
	Loss: 0.4559(valid)	|	Acc: 85.14%(valid)
Epoch: 5  | time in 2 minutes, 49 seconds
	Loss: 0.2274(train)	|	Acc: 91.87%(train)
	Loss: 0.4271(valid)	|	Acc: 85.30%(valid)
Epoch: 6  | time in 2 minutes, 49 seconds
	Loss: 0.1791(train)	|	Acc: 94.07%(train)
	Loss: 0.4447(valid)	|	Acc: 85.37%(valid)
Epoch: 7  | time in 2 minutes, 47 seconds
	Loss: 0.1427(train)	|	Acc: 95.40%(train)
	Loss: 0.4435(valid)	|	Acc: 86.07%(valid)
Epoch: 8  | time in 2 minutes, 47 seconds
	Loss: 0.1175(train)	|	Acc: 96.28%(train)
	Loss: 0.4612(valid)	|	Acc: 85.32%

In [22]:
testing_data=data.TabularDataset(path = 'drive/MyDrive/UNT/AG_news/test.csv',format = 'csv',fields = fields,skip_header = True)

In [24]:
testing_iterator = data.BucketIterator(testing_data, batch_size=batch_size,
                                                           sort_key=lambda x: len(x.text),
                                                           sort_within_batch=True)

In [25]:
model.load_state_dict(torch.load(path))

def predict(model, iterator):
    testing_accuracy = 0
    model.eval()
    
    for batch in iterator:
        text, text_lengths = batch.text
        # text = TEXT.preprocess(text)
        label = batch.label
        target = torch.autograd.Variable(label).long()
        with torch.no_grad():
            output = model(text, text_lengths).squeeze()
            num_corrects = (torch.max(output, 1)[1].view(target.size()).data == target.data).float().sum()
            acc = num_corrects / len(batch)
            testing_accuracy += acc.item()
    
    return testing_accuracy / len(iterator)

In [26]:
test_acc = predict(model, testing_iterator)
print(f"Accuracy {test_acc * 100:.2f}")

Accuracy 85.73
