In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers
!pip install pytorch_pretrained_bert



In [3]:
from transformers import DistilBertConfig, BertTokenizer, \
DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertModel
from pytorch_pretrained_bert.optimization import BertAdam
import torch
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader
import numpy as np
from sklearn import preprocessing

from __future__ import absolute_import, division, print_function
import csv
import os
import sys
import logging
import copy
import time

logger = logging.getLogger()
csv.field_size_limit(2147483647) # Increase CSV reader's field limit incase we have long text.

131072

In [None]:
train_lines = []
with open('/content/drive/My Drive/data/bbc_news/train.tsv', "r", encoding="utf-8") as f:
  reader = csv.reader(f, delimiter="\t")
  
  for line in reader:
    if sys.version_info[0] == 2:
      line = list(unicode(cell, 'utf-8') for cell in line)
    train_lines.append(line)
  
train_lines = np.array(train_lines)
print(train_lines)


test_lines = []
with open('/content/drive/My Drive/data/bbc_news/test.tsv', "r", encoding="utf-8") as f:
  reader = csv.reader(f, delimiter="\t")
  
  for line in reader:
    if sys.version_info[0] == 2:
      line = list(unicode(cell, 'utf-8') for cell in line)
    test_lines.append(line)
  
test_lines = np.array(test_lines)
print(test_lines)

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_seq_length = 128
label_names = ['entertainment', 'business', 'sport', 'politics', 'tech']
le = preprocessing.LabelEncoder()
le.fit(label_names)

LabelEncoder()

In [0]:
class BBCNewsDataset(Dataset):
    def __init__(self, text_inputs, labels):
        self.all_input_ids = list()
        self.text_inputs = text_inputs
        for text in self.text_inputs:
          tokens = tokenizer.tokenize(text)

          if len(tokens) > max_seq_length - 2:
            tokens = tokens[:(max_seq_length - 2)]

          tokens = ["[CLS]"] + tokens + ["[SEP]"]
          input_ids = tokenizer.convert_tokens_to_ids(tokens)
          padding = [0] * (max_seq_length - len(input_ids))
          input_ids += padding

          self.all_input_ids.append(input_ids)
          
        self.labels = le.transform(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        x = self.all_input_ids[index]
        y = self.labels[index]

        return torch.tensor(x), torch.tensor(y)

In [7]:
train_bbc_news_dataset = BBCNewsDataset(train_lines[:, 0], train_lines[:, 1])
test_bbc_news_dataset = BBCNewsDataset(test_lines[:, 0], test_lines[:, 1])
print(train_bbc_news_dataset[0])
print(test_bbc_news_dataset[0])

(tensor([  101,  2093, 23837,  5672, 14113,  2557,  2265,  1996,  2397,  2198,
        14113,  1005,  1055,  4035,  2557,  1015,  2265,  2003,  2000,  2022,
         4594,  1999,  2337,  2011,  2093,  3065,  4354,  2011,  2093, 23837,
         7995,  2006,  7578,  1010,  2512,  1011,  3293,  2189,  1012, 15876,
         2860, 15037,  1010, 20710,  6448, 14074,  1998,  6487,  4830,  2924,
         2097,  2169,  3677,  1996,  3054,  1011,  2733,  1010,  2397,  1011,
         2305,  2335, 10994,  1010, 27696,  2866,  5848,  1012,  2557,  1015,
         2056,  1996,  2265,  2052,  2025,  3046,  2000,  5672, 14113,  1010,
         2021,  2052,  4125,  2000,  1996,  1000,  4119,  1000,  1997,  1000,
         4363,  2010,  8027,  4142,  1000,  2007, 21446,  2189,  1012, 14113,
         2351,  2044,  6114,  1037,  2540,  2886,  1999,  7304,  1999,  2255,
         1012,  2557,  1015,  2056,  1996,  2093, 23837,  2018,  2042,  4217,
         2005,  2037,  1000,  1999,  1011,  5995,  3315,   102]

In [0]:
num_classes = len(label_names)
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
train_loader = DataLoader(dataset=train_bbc_news_dataset, batch_size=64, shuffle=True, num_workers=4)
test_loader = DataLoader(dataset=test_bbc_news_dataset, batch_size=64, shuffle=True, num_workers=4)

In [9]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

for param in model.parameters():
    param.requires_grad = False

model.classifier = torch.nn.Linear(in_features=768, out_features=num_classes, bias=True)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[5000, 10000, 15000], gamma=0.5)
criterion = torch.nn.CrossEntropyLoss()

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(model)
print('Distilbert model parameter: {}'.format(count_parameters(model)))

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (dropout): Dropout(p=0.1, inplace=False)
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
         

In [10]:
print(len(train_bbc_news_dataset))

1777


In [11]:
def accuracy(output, labels):
  pred = torch.argmax(output, dim=1)
  correct = pred.eq(labels)
  return torch.mean(correct.float())

total_loss, total_acc = 0, 0
loss_list = []
acc_list = []

epochs = 20
itr = 1
p_itr = 10

# start training
best_model_wts = copy.deepcopy(model.state_dict())
best_acc = 0.0
since = time.time()

for epoch in range(epochs):
  model.train()
  for samples, labels in train_loader:
    samples, labels = samples.to(device), labels.to(device)
    optimizer.zero_grad()
    output = model(samples)[0]
    loss = criterion(output, labels)
    loss.backward()
    optimizer.step()
    total_loss += loss.item()
    total_acc += accuracy(output, labels)
    scheduler.step()

    if itr % p_itr == 0:
      print('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, train_acc: {:.3f}'\
            .format(epoch + 1, epochs, itr, total_loss / p_itr, total_acc / p_itr))
      
      loss_list.append(total_loss / p_itr)
      acc_list.append(total_acc / p_itr)
      total_loss, total_acc = 0, 0
    itr += 1

  model.eval()
  test_acc = 0.0
  for samples, labels in test_loader:
    with torch.no_grad():
      samples, labels = samples.to(device), labels.to(device)
      output = model(samples)[0]
      test_acc += accuracy(output, labels)

  print('Accuracy on test set after {} epoch: {}%'.format(epoch + 1,
                                  round(test_acc.item()*100.0/len(test_loader), 2)))

  if (test_acc.item() > best_acc):
    best_acc = test_acc.item()
    best_model_wts = copy.deepcopy(model.state_dict())
    print('update best')

  print('-' * 10)

time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
print('best acc on test set: ', best_acc/len(test_loader))
model.load_state_dict(best_model_wts)
torch.save(model.state_dict(), '/content/drive/My Drive/distil_model.pth')

[Epoch 1/20] Iteration 10 -> Train Loss: 1.5630, train_acc: 0.350
[Epoch 1/20] Iteration 20 -> Train Loss: 1.4817, train_acc: 0.375
Accuracy on test set after 1 epoch: 63.07%
update best
----------
[Epoch 2/20] Iteration 30 -> Train Loss: 1.4216, train_acc: 0.544
[Epoch 2/20] Iteration 40 -> Train Loss: 1.3444, train_acc: 0.723
[Epoch 2/20] Iteration 50 -> Train Loss: 1.2815, train_acc: 0.839
Accuracy on test set after 2 epoch: 88.59%
update best
----------
[Epoch 3/20] Iteration 60 -> Train Loss: 1.2084, train_acc: 0.868
[Epoch 3/20] Iteration 70 -> Train Loss: 1.1669, train_acc: 0.836
[Epoch 3/20] Iteration 80 -> Train Loss: 1.1113, train_acc: 0.883
Accuracy on test set after 3 epoch: 92.17%
update best
----------
[Epoch 4/20] Iteration 90 -> Train Loss: 1.0430, train_acc: 0.916
[Epoch 4/20] Iteration 100 -> Train Loss: 1.0147, train_acc: 0.898
[Epoch 4/20] Iteration 110 -> Train Loss: 0.9708, train_acc: 0.928
Accuracy on test set after 4 epoch: 94.19%
update best
----------
[Epoch 5