# Text classification using BERT

In [None]:
!pip -q install transformers

In [3]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random

In [4]:
!wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'

--2024-03-24 19:01:21--  https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘smsspamcollection.zip’

smsspamcollection.z     [  <=>               ] 198.65K   649KB/s    in 0.3s    

2024-03-24 19:01:21 (649 KB/s) - ‘smsspamcollection.zip’ saved [203415]



In [5]:
!unzip -o smsspamcollection.zip

Archive:  smsspamcollection.zip
  inflating: SMSSpamCollection       
  inflating: readme                  


In [None]:
!head -10 SMSSpamCollection

ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
ham	Ok lar... Joking wif u oni...
spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
ham	U dun say so early hor... U c already then say...
ham	Nah I don't think he goes to usf, he lives around here though
spam	FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv
ham	Even my brother is not like to speak with me. They treat me like aids patent.
ham	As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
spam	WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
spam	H

In [6]:
file_path = '/content/SMSSpamCollection'

In [None]:
df = pd.DataFrame({'label':int(),'text':str()}, index=[])
with open(file_path) as f:
  for line in f.readlines():
    split = line.split('\t')
    df = df.append({'label':1 if split[0] =='spam' else 0,
                    'text':split[1]},
                   ignore_index=True)

df.head()

In [8]:
text = df.text.values
print(text)

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\n'
 'Ok lar... Joking wif u oni...\n'
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\n"
 ... 'Pity, * was in mood for that. So...any other suggestions?\n'
 "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free\n"
 'Rofl. Its true to its name\n']


In [9]:
labels = df.label.values
print(labels)

[0 0 1 ... 0 0 0]


In [10]:
#Download tokenizer from BertTokenizer

tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [11]:
def print_rand_sentence():
  index = random.randint(0, len(text)-1)
  table = np.array([tokenizer.tokenize(text[index]),
                    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text[index]))]).T
  print(tabulate(table,
                 headers = ['Tokens', 'Token IDs'],
                 tablefmt = 'fancy_grid'))

print_rand_sentence()

╒══════════╤═════════════╕
│ Tokens   │   Token IDs │
╞══════════╪═════════════╡
│ my       │        2026 │
├──────────┼─────────────┤
│ love     │        2293 │
├──────────┼─────────────┤
│ .        │        1012 │
├──────────┼─────────────┤
│ .        │        1012 │
├──────────┼─────────────┤
│ .        │        1012 │
├──────────┼─────────────┤
│ i        │        1045 │
├──────────┼─────────────┤
│ hope     │        3246 │
├──────────┼─────────────┤
│ your     │        2115 │
├──────────┼─────────────┤
│ not      │        2025 │
├──────────┼─────────────┤
│ doing    │        2725 │
├──────────┼─────────────┤
│ anything │        2505 │
├──────────┼─────────────┤
│ drastic  │       23956 │
├──────────┼─────────────┤
│ .        │        1012 │
├──────────┼─────────────┤
│ don      │        2123 │
├──────────┼─────────────┤
│ '        │        1005 │
├──────────┼─────────────┤
│ t        │        1056 │
├──────────┼─────────────┤
│ you      │        2017 │
├──────────┼─────────────┤
│

In [12]:
token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer):
  '''
  Returns <class transforers.tokenization_utils_base.BatchEncoding> with the following fileds:
  - input_ids: list of token ids
  - token_type_ids: list of token type ids
  - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True)
  '''

  return tokenizer.encode_plus(
      input_text,
      add_special_tokens = True,
      max_length = 32,
      pad_to_max_length = True,
      return_attention_mask = True,
      return_tensors = 'pt'
  )

for sample in text:
  encoding_dict = preprocessing(sample, tokenizer)
  token_id.append(encoding_dict['input_ids'])
  attention_masks.append(encoding_dict['attention_mask'])

token_id = torch.cat(token_id, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
token_id[6]

tensor([ 101, 2130, 2026, 2567, 2003, 2025, 2066, 2000, 3713, 2007, 2033, 1012,
        2027, 7438, 2033, 2066, 8387, 7353, 1012,  102,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])

In [None]:
def print_rand_sentence_encoding():
  index = random.randint(0, len(text)-1)
  tokens = tokenizer.tokenize(tokenizer.decode(token_id[index]))
  token_ids = [i.numpy() for i in token_id[index]]
  attention = [i.numpy() for i in attention_masks[index]]

  table = np.array([tokens, token_ids, attention]).T
  print(tabulate(table,
                 headers = ['Tokens', 'Token IDs', 'Attention Mask'],
                 tablefmt = 'fancy_grid'
  ))

print_rand_sentence_encoding()

In [13]:
val_ratio = 0.2
batch_size = 16

train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels
)

train_set = TensorDataset(token_id[train_idx],
                          attention_masks[train_idx],
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx],
                        attention_masks[val_idx],
                        labels[val_idx])

train_dataloader = DataLoader(
    train_set,
    sampler = RandomSampler(train_set),
    batch_size = batch_size
)

validation_dataloader = DataLoader(
    val_set,
    sampler = SequentialSampler(val_set),
    batch_size = batch_size
)

In [14]:
def b_tp(preds, labels):
  '''Return True Positives: count of correct predictions of actual class 1'''
  return sum([preds == labels and preds == 1 for preds, labels in zip(preds, labels)])


def b_fp(preds, labels):
  '''False Positives'''
  return sum([preds != labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_tn(preds, labels):
  return sum([preds == labels and preds == 0 for preds, labels in zip(preds, labels)])


def b_fn(preds, labels):
  return sum([preds != labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_metrics(preds, labels):
  preds = np.argmax(preds, axis=1).flatten()
  labels = labels.flatten()
  tp = b_tp(preds, labels)
  tn = b_tn(preds, labels)
  fp = b_fp(preds, labels)
  fn = b_fn(preds, labels)
  b_accuracy = (tp + tn) / len(labels)
  b_precision = tp / (tp + fp) if (tp + fp) > 0 else 'nan'
  b_recall = tp / (tp + fn) if (tp + fn) > 0 else 'nan'
  b_specificity = tn / (tn + fp) if (tn + fp) >0 else 'nan'
  return b_accuracy, b_precision, b_recall, b_specificity


In [15]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)

optimizer = torch.optim.AdamW(model.parameters(),
                              lr = 5e-5,
                              eps = 1e-08
                              )
# model.cuda()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = 'cpu'

epochs = 2

for _ in trange(epochs, desc = 'Epoch'):

  #set model to training mode
  model.train()

  #tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0

  for step, batch in enumerate(train_dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    optimizer.zero_grad()
    #Forward pass
    train_output = model(b_input_ids,
                         token_type_ids = None,
                         attention_mask = b_input_mask,
                         labels = b_labels)

    #Backward pass
    train_output.loss.backward()
    optimizer.step()
    #Update tracking variables
    tr_loss += train_output.loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  #set model to evaluation mode
  model.eval()

  val_accuracy = []
  val_precision = []
  val_recall = []
  val_specificity = []

  for batch in validation_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():
      eval_output = model(b_input_ids,
                          token_type_ids = None,
                          attention_mask = b_input_mask)
    logits = eval_output.logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
    val_accuracy.append(b_accuracy)
    if b_precision != 'nan': val_precision.append(b_precision)
    if b_recall != 'nan': val_recall.append(b_recall)
    if b_specificity != 'nan': val_specificity.append(b_specificity)

  print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
  print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)) if len(val_precision)>0 else '\t - Validation Precision: NaN')
  print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)) if len(val_recall)>0 else '\t - Validation Recall: NaN')
  print('\t - Validation Specificity: {:.4f}\n'.format(sum(val_specificity)/len(val_specificity)) if len(val_specificity)>0 else '\t - Validation Specificity: NaN')

Epoch:   0%|          | 0/2 [28:31<?, ?it/s]


NameError: name 'b_spedificity' is not defined

# Next word prediction using GPT2

In [None]:
!pip install pytorch-transformers

In [3]:
import torch
from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel

In [60]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [61]:
text = "I had a glass of orange"
indexed_tokens = tokenizer.encode(text)

In [62]:
token_tensor = torch.tensor([indexed_tokens])

In [63]:
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [64]:
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [65]:
token_tensor = token_tensor.to('cpu')
model.to('cpu')

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [66]:
with torch.no_grad():
  outputs = model(token_tensor)
  predictions = outputs[0]

In [19]:
type(outputs)

tuple

In [21]:
predictions

tensor([[[ -41.7623,  -41.7778,  -44.8669,  ...,  -49.2863,  -46.8294,
           -42.0784],
         [-112.4733, -114.1680, -120.8443,  ..., -118.4115, -116.1079,
          -117.4254],
         [-113.9452, -112.3593, -116.2751,  ..., -118.9581, -116.5398,
          -114.8422],
         [ -80.1158,  -81.0733,  -86.8785,  ...,  -91.6743,  -85.3965,
           -82.9541],
         [ -90.7379,  -91.0372,  -95.0355,  ...,  -98.3009,  -94.9098,
           -91.2529],
         [ -93.5977,  -96.3308, -100.0373,  ..., -106.6967, -104.1619,
           -96.7653]]])

In [67]:
predicted_index = torch.argmax(predictions[0, -1, :]).item()
predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])

print(predicted_text)

 I had a glass of orange juice


# Masked word prediction using BERT

In [33]:
from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM

In [34]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

100%|██████████| 231508/231508 [00:00<00:00, 13299752.51B/s]


In [35]:
text = "[CLS] Who is Vanessa Zhu? [SEP] Vanessa Zhu is a GenAI researcher [SEP]"
tokenized_text = tokenizer.tokenize(text)

In [36]:
tokenized_text

['[CLS]',
 'who',
 'is',
 'vanessa',
 'zhu',
 '?',
 '[SEP]',
 'vanessa',
 'zhu',
 'is',
 'a',
 'gen',
 '##ai',
 'researcher',
 '[SEP]']

In [37]:
masked_index = 8
tokenized_text[masked_index] = '[MASK]'

tokenized_text

['[CLS]',
 'who',
 'is',
 'vanessa',
 'zhu',
 '?',
 '[SEP]',
 'vanessa',
 '[MASK]',
 'is',
 'a',
 'gen',
 '##ai',
 'researcher',
 '[SEP]']

In [39]:
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

In [44]:
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,1]

tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

In [45]:
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()

tokens_tensor = tokens_tensor.to('cpu')
segments_tensors = segments_tensors.to('cpu')
model.to('cpu')

with torch.no_grad():
  outputs = model(tokens_tensor, token_type_ids = segments_tensors)
  predictions = outputs[0]

In [46]:
predictions

tensor([[[ -7.2632,  -7.1759,  -7.2400,  ...,  -6.4886,  -6.2659,  -4.0411],
         [-12.7186, -12.8679, -13.0356,  ..., -10.9403,  -9.8771, -11.2336],
         [-10.7826, -10.4992, -11.0582,  ...,  -9.9475,  -6.6461,  -7.1356],
         ...,
         [ -3.4770,  -3.1270,  -3.3776,  ...,  -4.7894,  -3.6793,  -2.9370],
         [ -5.3890,  -5.0365,  -5.6892,  ...,  -5.6600,  -5.3356,  -4.2611],
         [-12.5032, -12.2151, -12.3898,  ..., -10.5356, -10.1931, -10.6357]]])

In [47]:
predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]

print('Predicted token is: ', predicted_token)

Predicted token is:  zhu
