<a href="https://colab.research.google.com/github/vin-thomas/BBC_news_classification/blob/main/BBC_BERT_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Imports**

In [None]:
import os
import pandas as pd
import torch
import sklearn
from IPython.core.display import display, HTML
import glob
import numpy as np
from torch.utils.data import Dataset, DataLoader

## **Install and import Transformers, BertTokenizer**

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.16.1-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 8.5 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 61.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.9 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 71.2 MB/s 
Collecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 51.5 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
  

In [None]:
import transformers
from transformers import BertTokenizer

## **Download the dataset**

In [None]:
URL = "http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip" 
!wget -P 'Data/' $URL

--2022-01-29 07:06:59--  http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip
Resolving mlg.ucd.ie (mlg.ucd.ie)... 137.43.93.132
Connecting to mlg.ucd.ie (mlg.ucd.ie)|137.43.93.132|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2874078 (2.7M) [application/zip]
Saving to: ‘Data/bbc-fulltext.zip’


2022-01-29 07:07:00 (2.87 MB/s) - ‘Data/bbc-fulltext.zip’ saved [2874078/2874078]



In [None]:
!unzip "/content/Data/bbc-fulltext.zip" -d 'Data/'

In [None]:
# Paths of all article files, one sub-directory per topic under Data/bbc/.
# A sorted list (rather than a one-shot iglob iterator) can be consumed again
# when a later cell is re-run, and gives the same order on every filesystem.
# The path is relative, matching the directory the archive is extracted into.
file_name_iter = sorted(glob.glob('Data/bbc/**/*.txt'))

In [None]:
def get_topic(file):
  """Return the second-to-last '/'-separated component of `file`.

  For a path such as '.../bbc/sport/001.txt' this is the topic folder 'sport'.
  """
  # Only the last two separators matter, so split from the right.
  tail_parts = file.rsplit('/', 2)
  return tail_parts[-2]

In [None]:
def get_content(file):
  """Read `file` in binary mode and return its entire contents as bytes."""
  with open(file, 'rb') as handle:
    return handle.read()

In [None]:
# Visit every article path once, pairing the topic taken from its folder name
# with the raw file contents, then split the pairs into two parallel lists.
topic_and_body = [(get_topic(path), get_content(path)) for path in file_name_iter]
labels = [topic for topic, _body in topic_and_body]
content = [body for _topic, body in topic_and_body]

In [None]:
def _as_text(raw):
  """Return `raw` as a str, decoding bytes rather than taking their repr.

  `str()` on a bytes object yields text like "b'Title\\n\\nBody'", so the b''
  wrapper and the escape sequences would end up in the tokenizer input.
  """
  if isinstance(raw, (bytes, bytearray)):
    try:
      return raw.decode('utf-8')
    except UnicodeDecodeError:
      # latin-1 maps every byte to a code point, so this fallback cannot fail.
      return raw.decode('latin-1')
  return str(raw)


# Article texts as plain Python strings, in the same order as `content`.
mod_content = [_as_text(item) for item in content]

In [None]:
# Sanity check: container and element types, and that texts and labels have the same length.
type(mod_content), type(mod_content[0]), len(mod_content), type(labels), len(labels)

(list, str, 2225, list, 2225)

In [None]:
# Encode the topic names as integer class ids; `uniques[i]` is the name of class i.
# sort=True assigns ids in sorted order of the names, so the mapping does not
# depend on the order in which the files happened to be listed.
labels, uniques = pd.factorize(labels, sort=True)

In [None]:
# After factorize, `labels` is a NumPy array of integer codes rather than a list of names.
type(labels)

numpy.ndarray

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the articles for validation; the fixed random_state keeps
# the split identical between runs.
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    mod_content,
    labels,
    test_size=0.1,
    random_state=42,
)

In [None]:
# One frame per split, with the article text and its integer class id side by side.
train_bbc_df = pd.DataFrame({'Text': train_inputs, 'Labels': train_labels})
val_bbc_df = pd.DataFrame({'Text': val_inputs, 'Labels': val_labels})

In [None]:
# Load the tokenizer published with the 'bert-base-uncased' checkpoint
# (downloaded on first use, as the progress bars below show).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Token length passed to the tokenizer as `max_length` for every article.
max_len = 150

## **Custom Dataset Class**

In [None]:
class BBC(Dataset):
  """Torch dataset that tokenizes one article per access.

  Args:
    df: DataFrame with a `Text` column (article text) and a `Labels` column
      (integer class ids).
    tokenizer: object exposing `encode_plus` (a `BertTokenizer` in this notebook).
    max_len: number of tokens every example is truncated / padded to.
  """

  def __init__(self, df, tokenizer, max_len):
    self.text = df.Text
    self.targets = df.Labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.text)

  def __getitem__(self, item):
    """Return the tokenized example at position `item`.

    Returns:
      dict with the raw text ('review_text'), 1-D 'input_ids' and
      'attention_mask' tensors, and the class id as a long tensor ('targets').
    """
    # Positional lookup: the DataLoader hands out positions 0..len-1, which only
    # coincide with index labels when the frame has a default RangeIndex.
    text = str(self.text.iloc[item])
    target = self.targets.iloc[item]

    encoding = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True,
      max_length=self.max_len,
      # Request truncation explicitly; relying on the implicit default is what
      # produced the "Truncation was not explicitly activated" warnings.
      truncation=True,
      # Replaces the deprecated `pad_to_max_length=True`.
      padding='max_length',
      return_token_type_ids=False,
      return_attention_mask=True,
      return_tensors='pt'
    )

    return {
      'review_text': text,
      # return_tensors='pt' adds a leading batch dimension; flatten drops it.
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

## **Function to create dataloader**

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=True, num_workers=2):
  """Wrap `df` in a `BBC` dataset and return a DataLoader over it.

  Args:
    df: DataFrame with `Text` and `Labels` columns.
    tokenizer: tokenizer handed to the dataset.
    max_len: token length handed to the dataset.
    batch_size: number of examples per batch.
    shuffle: whether to reshuffle every epoch. Defaults to True (the previous
      hard-coded behaviour); pass False for a fixed evaluation order.
    num_workers: number of loader worker processes. Defaults to 2 (the previous
      hard-coded value).
  """
  ds = BBC(
    df=df,
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers
  )

In [None]:
# (rows, columns) of the training and validation frames.
train_bbc_df.shape, val_bbc_df.shape

((2002, 2), (223, 2))

In [None]:
# Distinct class ids present in the validation split.
val_bbc_df.Labels.unique()

array([0, 3, 2, 4, 1])

In [None]:
# Batch sizes for the training and validation loaders respectively.
BATCH_SIZE, val_batch_size = 50, 20

train_data_loader = create_data_loader(
    df=train_bbc_df,
    tokenizer=tokenizer,
    max_len=max_len,
    batch_size=BATCH_SIZE,
)
val_data_loader = create_data_loader(
    df=val_bbc_df,
    tokenizer=tokenizer,
    max_len=max_len,
    batch_size=val_batch_size,
)

## **Model Class**

In [None]:
from torch import nn
from transformers import BertModel, BertForSequenceClassification

class BertClassifier(nn.Module):
  """Thin wrapper around `BertForSequenceClassification` ('bert-base-uncased').

  Args:
    n_classes: number of target classes; used as the wrapped model's `num_labels`.
  """

  def __init__(self, n_classes):
    super().__init__()
    self.bert = BertForSequenceClassification.from_pretrained(
      "bert-base-uncased",
      # Previously hard-coded to 5, which silently ignored `n_classes`.
      num_labels=n_classes,
      output_attentions=False,
      output_hidden_states=False
    )
    # Not used by forward(): the wrapped model produces the logits itself.
    # Kept so the module's attributes and state_dict keys stay unchanged.
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask, labels=None):
    """Run the wrapped model and return its output object unchanged.

    Args:
      input_ids: token id tensor.
      attention_mask: attention mask tensor matching `input_ids`.
      labels: optional targets forwarded to the wrapped model; callers in this
        notebook read `.loss` and `.logits` from the result. May be omitted
        for pure inference.
    """
    return self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask,
      labels=labels
    )

In [None]:
# Build the classifier with five output classes (labels take the values 0-4).
model = BertClassifier(5)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# AdamW over every model parameter with a learning rate of 1e-5.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
device

device(type='cuda')

In [None]:
# Move the model to the device selected above.
model = model.to(device)

In [None]:
def evaluate(num_classes=5):
  """Score the global `model` on the global `val_data_loader`.

  The model is switched to eval mode for the pass (the previous `eval_model.eval`
  lacked the call parentheses, so it never took effect), gradients are disabled,
  and the model's prior train/eval mode is restored afterwards.

  Args:
    num_classes: width of the one-hot label encoding. Defaults to 5.

  Returns:
    (val_loss, val_accuracy): loss averaged over examples and accuracy in
    percent. Both are weighted by batch size so that a short final batch does
    not skew the result. Returns (nan, nan) when the loader yields no batches.
  """
  was_training = model.training
  model.eval()

  loss_sum = 0.0
  correct = 0
  seen = 0

  try:
    with torch.no_grad():
      for batch in val_data_loader:
        input_ids = batch['input_ids'].to(device, dtype=torch.long)
        attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
        labels_ = batch['targets']
        # NOTE(review): float one-hot labels appear to make
        # BertForSequenceClassification use its multi-label (BCE-with-logits)
        # loss; integer class ids would select cross-entropy. Kept as-is to
        # match training - confirm which loss is intended.
        labels = torch.nn.functional.one_hot(labels_, num_classes).squeeze(1)
        labels = labels.to(device, dtype=torch.float)

        result = model(input_ids=input_ids, attention_mask=attention_mask,
                       labels=labels)

        batch_size = labels_.size(0)
        # result.loss is a per-batch mean; scale it back up before summing.
        loss_sum += result.loss.item() * batch_size

        _, predicted = torch.max(result.logits, 1)
        correct += (predicted == labels_.to(device)).sum().item()
        seen += batch_size
  finally:
    model.train(was_training)

  if seen == 0:
    return float('nan'), float('nan')

  val_loss = loss_sum / seen
  val_accuracy = 100.0 * correct / seen

  return val_loss, val_accuracy

## **Train**

In [None]:
# Fine-tune for a fixed number of epochs, recording per-epoch loss and accuracy.
epochs = 3
train_losses = []    # mean training loss per example, one entry per epoch
train_accuracy = []  # training accuracy in percent, one entry per epoch

for e in range(epochs):
  # Re-assert training mode every epoch in case an evaluation pass ran in between.
  model.train()
  train_loss = 0.0
  correct = 0
  seen = 0

  for i, batch in enumerate(train_data_loader, start=1):
    input_ids = batch['input_ids'].to(device, dtype=torch.long)
    attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
    labels_ = batch['targets']
    # NOTE(review): float one-hot labels appear to make
    # BertForSequenceClassification use its multi-label (BCE-with-logits) loss;
    # integer class ids would select cross-entropy - confirm which is intended.
    labels = torch.nn.functional.one_hot(labels_, 5).squeeze(1)
    labels = labels.to(device, dtype=torch.float)

    # One zero_grad is enough: the optimizer holds exactly model.parameters().
    optimizer.zero_grad()

    result = model(input_ids=input_ids, attention_mask=attention_mask,
                   labels=labels)
    loss = result.loss
    logits = result.logits

    loss.backward()
    # Clip the global gradient norm to 1 before the update.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
    optimizer.step()

    _, predicted = torch.max(logits, 1)
    batch_correct = (predicted == labels_.to(device)).sum().item()
    batch_size = labels_.size(0)

    correct += batch_correct
    seen += batch_size
    # loss is a per-batch mean; .item() detaches it so no graph is kept alive.
    train_loss += loss.item() * batch_size

    print(i,')', 'loss', loss.item(), 'correct', batch_correct)

  # Previously the last batch's loss tensor was divided by the dataset size;
  # record the true per-example mean over the whole epoch instead.
  train_losses.append(train_loss / max(seen, 1))
  train_accuracy.append(100 * correct / max(seen, 1))
  print('epoch: {}, Train Loss:{:.6f} Train Accuracy: {:.2f} '.format(e+1,train_losses[-1], train_accuracy[-1]))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


1 ) loss 0.6969845294952393 correct 5
2 ) loss 0.6631826162338257 correct 7
3 ) loss 0.6283763647079468 correct 16
4 ) loss 0.6038246750831604 correct 16
5 ) loss 0.5860958695411682 correct 17
6 ) loss 0.5754627585411072 correct 15
7 ) loss 0.5704957842826843 correct 11
8 ) loss 0.5524526238441467 correct 20
9 ) loss 0.5439665913581848 correct 14
10 ) loss 0.5346143245697021 correct 22
11 ) loss 0.5335674285888672 correct 15
12 ) loss 0.5258780121803284 correct 21
13 ) loss 0.5287726521492004 correct 15
14 ) loss 0.5096898078918457 correct 23
15 ) loss 0.5172194838523865 correct 16
16 ) loss 0.5102400183677673 correct 19
17 ) loss 0.5052440762519836 correct 22
18 ) loss 0.48774704337120056 correct 25
19 ) loss 0.4771480858325958 correct 30
20 ) loss 0.49516966938972473 correct 17
21 ) loss 0.47700226306915283 correct 22
22 ) loss 0.4695652723312378 correct 34
23 ) loss 0.4696936309337616 correct 24
24 ) loss 0.4637793302536011 correct 28
25 ) loss 0.45241814851760864 correct 27
26 ) lo

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


1 ) loss 0.4183225929737091 correct 42
2 ) loss 0.41208958625793457 correct 44
3 ) loss 0.3881213665008545 correct 41
4 ) loss 0.4051302969455719 correct 42
5 ) loss 0.386962890625 correct 42
6 ) loss 0.3954249918460846 correct 44
7 ) loss 0.3857426345348358 correct 42
8 ) loss 0.3912836015224457 correct 44
9 ) loss 0.3864504098892212 correct 45
10 ) loss 0.3713667094707489 correct 46
11 ) loss 0.3652103543281555 correct 47
12 ) loss 0.3685725927352905 correct 45
13 ) loss 0.3602641224861145 correct 48
14 ) loss 0.3681393265724182 correct 44
15 ) loss 0.3602341413497925 correct 46
16 ) loss 0.35585734248161316 correct 46
17 ) loss 0.3494625389575958 correct 46
18 ) loss 0.3723527789115906 correct 46
19 ) loss 0.35114437341690063 correct 45
20 ) loss 0.3451877236366272 correct 47
21 ) loss 0.3370518088340759 correct 47
22 ) loss 0.33021998405456543 correct 45
23 ) loss 0.3226364254951477 correct 47
24 ) loss 0.32744136452674866 correct 47
25 ) loss 0.3138826787471771 correct 47
26 ) los

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


1 ) loss 0.25821653008461 correct 47
2 ) loss 0.26668378710746765 correct 47
3 ) loss 0.25083059072494507 correct 48
4 ) loss 0.24379484355449677 correct 50
5 ) loss 0.24467691779136658 correct 49
6 ) loss 0.25630420446395874 correct 47
7 ) loss 0.22729697823524475 correct 48
8 ) loss 0.24120663106441498 correct 47
9 ) loss 0.2498008906841278 correct 48
10 ) loss 0.2499818503856659 correct 46
11 ) loss 0.22905516624450684 correct 49
12 ) loss 0.21530412137508392 correct 50
13 ) loss 0.23309458792209625 correct 48
14 ) loss 0.22361105680465698 correct 49
15 ) loss 0.2246142029762268 correct 49
16 ) loss 0.23167158663272858 correct 49
17 ) loss 0.21959710121154785 correct 47
18 ) loss 0.22482788562774658 correct 47
19 ) loss 0.20101004838943481 correct 50
20 ) loss 0.20530490577220917 correct 49
21 ) loss 0.20737196505069733 correct 48
22 ) loss 0.20781707763671875 correct 48
23 ) loss 0.20758949220180511 correct 48
24 ) loss 0.19662833213806152 correct 50
25 ) loss 0.1961456686258316 co

In [None]:
# Final validation pass. Reuses evaluate() instead of repeating its loop here.
# The model is put in eval mode and gradients are disabled at this level too,
# so the pass is inference-only regardless of what evaluate() does internally.
model.eval()
with torch.no_grad():
  val_loss, val_accuracy = evaluate()
print ('validation_loss:', val_loss, 'validation accuracy:', val_accuracy)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


validation_loss: 0.1634738283852736 validation accuracy: 95.0
