#### Sogou News Dataset Summary
The Sogou News dataset is a mixture of 2,909,551 news articles from the SogouCA and SogouCS news corpora, in 5 categories. For each class, 90,000 samples are selected for training and 12,000 for testing, i.e. 450,000 training and 60,000 test samples in total.

**content:** a string feature.
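**label:** a classification label. torchtext yields the raw labels as 1–5: sports (1), finance (2), entertainment (3), automobile (4), technology (5). The `label_pipeline` defined below shifts them to the zero-based values sports (0), finance (1), entertainment (2), automobile (3), technology (4).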

In [4]:
from torchtext.datasets import SogouNews
# Load the train and test splits of SogouNews in a single call.
train_iter, test_iter = SogouNews(split=('train', 'test'))

In [5]:
# Report how many samples each split holds.
print(f"Total number of train data:{len(train_iter)}")
print(f"Total number of test data:{len(test_iter)}")

Total number of train data:450000
Total number of test data:60000


In [6]:
# Show one (label, text) sample from the training split.
# NOTE: next() advances train_iter, so this sample is consumed.
next(train_iter)

(4,
 '2008 di4 qi1 jie4 qi1ng da3o guo2 ji4 che1 zha3n me3i nv3 mo2 te4  2008di4 qi1 jie4 qi1ng da3o guo2 ji4 che1 zha3n yu2 15 ri4 za4i qi1ng da3o guo2 ji4 hui4 zha3n zho1ng xi1n she4ng da4 ka1i mu4 . be3n ci4 che1 zha3n jia1ng chi2 xu4 da4o be3n yue4 19 ri4 . ji1n nia2n qi1ng da3o guo2 ji4 che1 zha3n shi4 li4 nia2n da3o che2ng che1 zha3n gui1 mo2 zui4 da4 di2 yi1 ci4 , shi3 yo4ng lia3o qi1ng da3o guo2 ji4 hui4 zha3n zho1ng xi1n di2 qua2n bu4 shi4 ne4i wa4i zha3n gua3n . yi3 xia4 we2i xia4n cha3ng mo2 te4 tu2 pia4n .')

In [7]:
# Show one (label, text) sample from the test split (this advances test_iter).
next(test_iter)

(1,
 ' ti3 ca1o shi4 jie4 be1i : che2ng fe1i na2 pi2ng he2ng mu4 zi4 yo2u ca1o ji1n pa2i  su4 du4 : ( shuo1 mi2ng : dia3n ji1 zi4 do4ng bo1 fa4ng )\\n  shuo1 mi2ng : dia3n ji1 ga1i a4n niu3 , xua3n ze2 yi1 lu4n ta2n ji2 ke3 ')

In [8]:
from torch.utils.data import DataLoader
# Request the training split again so iteration starts from a new iterator.
train_iter = SogouNews(split = 'train')


In [9]:
# Batch the raw (label, text) samples 64 at a time, in dataset order.
# NOTE: no collate_fn is given here; this loader is replaced further down by
# one that passes collate_fn=collate_batch.
dataloader = DataLoader(train_iter, batch_size=64, shuffle=False)

In [10]:
# initialize tokenizer
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [11]:
# Build torchtext's "basic_english" tokenizer.
tokenizer = get_tokenizer("basic_english")


def yield_tokens(data_iter):
    """Yield the token list of every sample's text in ``data_iter``.

    Args:
        data_iter: Iterable of ``(label, text)`` pairs; the label is ignored.

    Yields:
        The result of ``tokenizer(text)`` for each sample.
    """
    for _, text in data_iter:
        yield tokenizer(text)


# Build the vocabulary from the training tokens, reserving "<unk>" as a
# special token.
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
# Map out-of-vocabulary tokens to "<unk>". Without a default index, looking up
# a token that never appeared in the training split raises an error.
vocab.set_default_index(vocab["<unk>"])

In [12]:
def text_pipeline(x):
    """Tokenize the raw text ``x`` and map its tokens to vocabulary indices."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Convert a raw label ``x`` to an integer and shift it down by one."""
    return int(x) - 1

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim

# Use the GPU when one is available, otherwise fall back to the CPU.
# is_available must be *called*: the bare function object is always truthy,
# which would select 'cuda' even on a CPU-only machine.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def collate_batch(batch):
    """Collate ``(label, text)`` samples into flat tensors for the model.

    Each text is converted to a 1-D int64 tensor of token indices and all
    texts are concatenated into a single tensor; ``offsets`` holds the start
    position of every sample inside that concatenation.

    Args:
        batch: Sequence of ``(label, text)`` pairs.

    Returns:
        Tuple ``(labels, text, offsets)`` of tensors moved to ``device``.
    """
    labels, token_chunks, lengths = [], [], [0]
    for raw_label, raw_text in batch:
        labels.append(label_pipeline(raw_label))
        token_ids = torch.tensor(text_pipeline(raw_text), dtype=torch.int64)
        token_chunks.append(token_ids)
        lengths.append(token_ids.size(0))

    label_tensor = torch.tensor(labels, dtype=torch.int64)
    # Drop the last length and accumulate: [0, n0, n1, ...] becomes the start
    # index of each sample in the concatenated text tensor.
    offset_tensor = torch.tensor(lengths[:-1]).cumsum(dim=0)
    flat_text = torch.cat(token_chunks)
    return label_tensor.to(device), flat_text.to(device), offset_tensor.to(device)

In [17]:
# Request the training split again and batch it through collate_batch, so each
# batch arrives as (labels, text, offsets) tensors.
train_iter = SogouNews(split='train')
dataloader = DataLoader(
    train_iter,
    batch_size=64,
    shuffle=False,
    collate_fn=collate_batch,
)

In [18]:
from torch import nn

class TextClassificationModel(nn.Module):
    """Text classifier made of an ``nn.EmbeddingBag`` and a linear layer.

    The embedding bag reduces the token indices of each sample (delimited by
    ``offsets``) to one ``embed_dim`` vector; the linear layer maps that
    vector to ``num_class`` scores.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        """Create the layers and initialise their weights.

        Args:
            vocab_size: Number of rows in the embedding table.
            embed_dim: Size of each embedding vector.
            num_class: Number of output scores per sample.
        """
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Draw both weight matrices from U(-0.5, 0.5) and zero the bias."""
        bound = 0.5
        nn.init.uniform_(self.embedding.weight, -bound, bound)
        nn.init.uniform_(self.fc.weight, -bound, bound)
        nn.init.zeros_(self.fc.bias)

    def forward(self, text, offsets):
        """Return one row of class scores per sample.

        Args:
            text: 1-D tensor of token indices for the whole batch.
            offsets: 1-D tensor with the start index of each sample in ``text``.
        """
        pooled = self.embedding(text, offsets)
        return self.fc(pooled)

In [19]:
# Count the distinct labels with one full pass over the training split, then
# size the model from that count and from the vocabulary.
train_iter = SogouNews(split='train')
num_class = len({label for label, _ in train_iter})
vocab_size = len(vocab)
emsize = 64  # embedding dimension
model = TextClassificationModel(vocab_size, emsize, num_class).to(device)

In [20]:
import time

def train(dataloader):
    """Run one training pass over ``dataloader`` and log running accuracy.

    Relies on the module-level ``model``, ``optimizer`` and ``criterion``,
    and on the module-level ``epoch`` (used only in the log line), all of
    which must be defined before the call.

    Args:
        dataloader: Iterable yielding ``(label, text, offsets)`` batches.
    """
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)
        loss.backward()
        # Clip the total gradient norm to 0.1 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            # Reset so each log line reports accuracy since the previous one.
            total_acc, total_count = 0, 0

def evaluate(dataloader):
    """Return the accuracy of the module-level ``model`` over ``dataloader``.

    The model is switched to eval mode and gradients are disabled for the
    whole pass.

    Args:
        dataloader: Iterable yielding ``(label, text, offsets)`` batches.

    Returns:
        Fraction of correctly classified samples, or ``0.0`` when the
        dataloader yields no samples.
    """
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for label, text, offsets in dataloader:
            predicted_label = model(text, offsets)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)

    # Avoid ZeroDivisionError on an empty dataloader.
    if total_count == 0:
        return 0.0
    return total_acc / total_count

In [21]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
# Hyperparameters
EPOCHS = 5  # number of passes over the training split
LR = 5  # initial learning rate for SGD
BATCH_SIZE = 64  # batch size for training and evaluation

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# With step_size=1 (an integer count of scheduler steps), every
# scheduler.step() multiplies the learning rate by gamma=0.1.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.1)
total_accu = None  # best validation accuracy seen so far
train_iter, test_iter = SogouNews()
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)
# Hold out 5% of the training data for validation.
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = \
    random_split(train_dataset, [num_train, len(train_dataset) - num_train])

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
# Accuracy does not depend on sample order, so the evaluation loaders are
# not shuffled.
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=False, collate_fn=collate_batch)

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    # Decay the learning rate when validation accuracy drops below the best
    # value so far; otherwise record the new best.
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

| epoch   1 |   500/ 6680 batches | accuracy    0.825
| epoch   1 |  1000/ 6680 batches | accuracy    0.908
| epoch   1 |  1500/ 6680 batches | accuracy    0.917
| epoch   1 |  2000/ 6680 batches | accuracy    0.921
| epoch   1 |  2500/ 6680 batches | accuracy    0.920
| epoch   1 |  3000/ 6680 batches | accuracy    0.925
| epoch   1 |  3500/ 6680 batches | accuracy    0.923
| epoch   1 |  4000/ 6680 batches | accuracy    0.926
| epoch   1 |  4500/ 6680 batches | accuracy    0.926
| epoch   1 |  5000/ 6680 batches | accuracy    0.929
| epoch   1 |  5500/ 6680 batches | accuracy    0.929
| epoch   1 |  6000/ 6680 batches | accuracy    0.928
| epoch   1 |  6500/ 6680 batches | accuracy    0.930
-----------------------------------------------------------
| end of epoch   1 | time: 146.60s | valid accuracy    0.931 
-----------------------------------------------------------
| epoch   2 |   500/ 6680 batches | accuracy    0.933
| epoch   2 |  1000/ 6680 batches | accuracy    0.932
| epoch 

### Objective
Pick any 2 datasets (except AG_NEWS) from torchtext.datasets and train your model on them achieving 50% more accuracy than random prediction. Upload to Github with a proper readme file describing your datasets, and showing your logs as well.

### Result:

The SogouNews dataset was used as the input dataset for a 5-class classification problem. <br>
The objective was to achieve 50% more accuracy than random prediction (20%), i.e. at least 30%. <br>
The highest validation accuracy achieved was 93.6%, which exceeds the target of 50% more accuracy than random prediction. <br>