<a href="https://colab.research.google.com/github/vpw/TSAI-END3/blob/main/Assignment5/Assignment5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from torch import nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import YelpReviewPolarity, SogouNews
from torch.utils.data import DataLoader
import time
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
import random

https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html


In [None]:
help(SogouNews)

Help on function SogouNews in module torchtext.datasets.sogounews:

SogouNews(root='.data', split=('train', 'test'))
    SogouNews dataset
    
    Separately returns the train/test split
    
    Number of lines per split:
        train: 450000
    
        test: 60000
    
    
    Number of classes
        5
    
    
    Args:
        root: Directory where the datasets are saved.
            Default: .data
        split: split or splits to be returned. Can be a string or tuple of strings.
            Default: ('train', 'test')



In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Model as given in the assignment code

In [3]:
class TextClassificationModel(nn.Module):

    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)



In [4]:
tokenizer = get_tokenizer("basic_english")

## Making a TrainTestDataset class which takes as parameters: dataset, #epochs to train, LR, batch size to use

In [5]:
class TrainTestDataset:
    def __init__(self, dataset, epochs, lr, batch_size):
        self.dataset = dataset
        self.epochs = epochs
        self.lr = lr
        self.batch_size = batch_size
        

        train_iter = dataset(split='train')
        # Buid vocabulary
        self.vocab = build_vocab_from_iterator(self.yield_tokens(train_iter), specials=["<unk>"])
        self.vocab.set_default_index(self.vocab["<unk>"])
        # Text and label pipeline
        self.text_pipeline = lambda x: self.vocab(tokenizer(x))
        self.label_pipeline = lambda x: int(x) - 1

        # random sample text is selected for a sample demo at the end
        self.sample_text=""

    # function as in assignment source
    def yield_tokens(self, data_iter):
        for _, text in data_iter:
            yield tokenizer(text)

    # collate function as in assignment source
    def collate_batch(self, batch):
        label_list, text_list, offsets = [], [], [0]
        for (_label, _text) in batch:
            label_list.append(self.label_pipeline(_label))
            processed_text = torch.tensor(self.text_pipeline(_text), dtype=torch.int64)
            text_list.append(processed_text)
            offsets.append(processed_text.size(0))
        label_list = torch.tensor(label_list, dtype=torch.int64)
        offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
        text_list = torch.cat(text_list)
        return label_list.to(device), text_list.to(device), offsets.to(device)    

    # added parameters - criterion, optimizer and epoch - to be called in the train and test function loop
    def train(self, dataloader, criterion, optimizer, epoch):
        self.model.train()
        total_acc, total_count = 0, 0
        log_interval = 500
        start_time = time.time()

        for idx, (label, text, offsets) in enumerate(dataloader):
            optimizer.zero_grad()
            predited_label = self.model(text, offsets)
            loss = criterion(predited_label, label)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.1) # disuccees
            optimizer.step()
            total_acc += (predited_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
            if idx % log_interval == 0 and idx > 0:
                elapsed = time.time() - start_time
                print('| epoch {:3d} | {:5d}/{:5d} batches '
                      '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                                  total_acc/total_count))
                total_acc, total_count = 0, 0
                start_time = time.time()
    
    # added parameter - criterion
    def evaluate(self, dataloader, criterion):
        self.model.eval()
        total_acc, total_count = 0, 0

        with torch.no_grad():
            for idx, (label, text, offsets) in enumerate(dataloader):
                predited_label = self.model(text, offsets)
                loss = criterion(predited_label, label)
                total_acc += (predited_label.argmax(1) == label).sum().item()
                total_count += label.size(0)
        return total_acc/total_count

    def train_and_test(self):
        
        train_iter = self.dataset(split = 'train')
        num_class = len(set([label for (label, text) in train_iter]))
        vocab_size = len(self.vocab)
        emsize = 64
        self.model = TextClassificationModel(vocab_size, emsize, num_class).to(device)
        
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.SGD(self.model.parameters(), lr=self.lr)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
        total_accu = None
        train_iter, test_iter = self.dataset()
        
        train_dataset = to_map_style_dataset(train_iter)
        test_dataset = to_map_style_dataset(test_iter)

        self.sample_text = test_dataset[random.randint(0,len(test_dataset))]

        num_train = int(len(train_dataset) * 0.95)
        split_train_, split_valid_ = \
            random_split(train_dataset, [num_train, len(train_dataset) - num_train])

        #dataloader = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)
        train_dataloader = DataLoader(split_train_, batch_size=self.batch_size,
                                      shuffle=True, collate_fn=self.collate_batch)
        valid_dataloader = DataLoader(split_valid_, batch_size=self.batch_size,
                                      shuffle=True, collate_fn=self.collate_batch)
        test_dataloader = DataLoader(test_dataset, batch_size=self.batch_size,
                                    shuffle=True, collate_fn=self.collate_batch)

        for epoch in range(1, self.epochs + 1):
            epoch_start_time = time.time()
            self.train(train_dataloader, criterion, optimizer, epoch)
            accu_val = self.evaluate(valid_dataloader, criterion)
            if total_accu is not None and total_accu > accu_val:
              scheduler.step()
            else:
              total_accu = accu_val
            print('-' * 59)
            print('| end of epoch {:3d} | time: {:5.2f}s | '
                  'valid accuracy {:8.3f} '.format(epoch,
                                                  time.time() - epoch_start_time,
                                                  accu_val))
            print('-' * 59)

 
        print('Checking the results of test dataset.')
        accu_test = self.evaluate(test_dataloader, criterion)
        print('test accuracy {:8.3f}'.format(accu_test))

    def predict(self, text, text_pipeline):
        with torch.no_grad():
            text = torch.tensor(text_pipeline(text))
            output = self.model(text, torch.tensor([0]))
            return output.argmax(1).item() + 1
    
    def test_sample(self, labels):
        self.model = self.model.to("cpu")
        print(self.sample_text)

        print("This is a %s news" %labels[self.predict(self.sample_text, self.text_pipeline)])

In [6]:
tt = TrainTestDataset(YelpReviewPolarity, epochs=5, lr=5, batch_size=64)
tt.train_and_test()

| epoch   1 |   500/ 8313 batches | accuracy    0.783
| epoch   1 |  1000/ 8313 batches | accuracy    0.863
| epoch   1 |  1500/ 8313 batches | accuracy    0.880
| epoch   1 |  2000/ 8313 batches | accuracy    0.887
| epoch   1 |  2500/ 8313 batches | accuracy    0.892
| epoch   1 |  3000/ 8313 batches | accuracy    0.897
| epoch   1 |  3500/ 8313 batches | accuracy    0.899
| epoch   1 |  4000/ 8313 batches | accuracy    0.900
| epoch   1 |  4500/ 8313 batches | accuracy    0.906
| epoch   1 |  5000/ 8313 batches | accuracy    0.905
| epoch   1 |  5500/ 8313 batches | accuracy    0.903
| epoch   1 |  6000/ 8313 batches | accuracy    0.910
| epoch   1 |  6500/ 8313 batches | accuracy    0.908
| epoch   1 |  7000/ 8313 batches | accuracy    0.907
| epoch   1 |  7500/ 8313 batches | accuracy    0.913
| epoch   1 |  8000/ 8313 batches | accuracy    0.911
-----------------------------------------------------------
| end of epoch   1 | time: 80.01s | valid accuracy    0.908 
---------------

In [None]:
tt.test_sample(
    labels={0: "Negative polarity", 1: "Positive polarity"}
)

In [8]:
tt = TrainTestDataset(SogouNews, epochs=5, lr=5, batch_size=64)
tt.train_and_test()

100%|██████████| 384M/384M [00:03<00:00, 111MB/s]


| epoch   1 |   500/ 6680 batches | accuracy    0.812
| epoch   1 |  1000/ 6680 batches | accuracy    0.907
| epoch   1 |  1500/ 6680 batches | accuracy    0.919
| epoch   1 |  2000/ 6680 batches | accuracy    0.919
| epoch   1 |  2500/ 6680 batches | accuracy    0.924
| epoch   1 |  3000/ 6680 batches | accuracy    0.923
| epoch   1 |  3500/ 6680 batches | accuracy    0.925
| epoch   1 |  4000/ 6680 batches | accuracy    0.927
| epoch   1 |  4500/ 6680 batches | accuracy    0.928
| epoch   1 |  5000/ 6680 batches | accuracy    0.927
| epoch   1 |  5500/ 6680 batches | accuracy    0.930
| epoch   1 |  6000/ 6680 batches | accuracy    0.925
| epoch   1 |  6500/ 6680 batches | accuracy    0.929
-----------------------------------------------------------
| end of epoch   1 | time: 174.42s | valid accuracy    0.929 
-----------------------------------------------------------
| epoch   2 |   500/ 6680 batches | accuracy    0.931
| epoch   2 |  1000/ 6680 batches | accuracy    0.931
| epoch 

In [9]:
tt.test_sample(
    labels={0: 'Sports',
                  1: 'Finance',
                  2: 'Entertainment',
                  3: 'Automobile',
                  4: 'Technology'}
)

(5, ' duo1 ca3i   be3n be3n rui4 ya3n DLV-B57 cha3n pi3n mi2ng che1ng \\n  duo1 ca3i   be3n be3n rui4 ya3n DLV-B57\\n  jia4 ge2 : ￥99\\n  ji1 be3n ca1n shu4   duo1 ca3i   be3n be3n rui4 ya3n DLV-B57\\n  shi4 fo3u fa2ng da4o   fo3u \\n  shi4 yo4ng le4i xi2ng   bi3 ji4 be3n \\n  qu1 do4ng le4i xi2ng   wu2 \\n  xi4 to3ng zhi1 chi2  Win 2000/XP/ Vista de3ng ca1o zuo4 xi4 to3ng \\n  shi4 pi2n tu2 xia4ng   ge2 shi4 :YUY2\\n  jie1 ko3u le4i xi2ng  USB2.0, xia4ng xia4 jia1n ro2ng USB1.1\\n  zui4 da4 zhe1n pi2n  30\\n  she4 xia4ng to2u xia4ng su4  130 wa4n \\n  che2ng xia4ng ju4 li2  3CM-- wu2 xia4n yua3n \\n  xi4n za4o bi3   yo1u yu2 48\\n  yua2n jia4n mia2o shu4   xi1n yi1 da4i USB2.0 te4 xia4o duo1 go1ng ne2ng zhu3 ko4ng xi1n pia4n \\n  ga3n gua1ng yua2n jia4n  CMOS\\n  do4ng ta4i fe1n bia4n shua4i  1280×960\\n  xi4ng ne2ng ca1n shu4   duo1 ca3i   be3n be3n rui4 ya3n DLV-B57\\n  zui4 xia3o li2ng mi3n du4  1.0V/Lux-sec\\n  se4 ca3i we4i shu4  24 we4i \\n  shi4 cha2ng  ≥ 53 du4 \\n  ba4o gua1n

AttributeError: ignored