In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
# load data
data = pd.read_csv('./input/abcnews-date-text.csv')
data.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1244184 entries, 0 to 1244183
Data columns (total 2 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   publish_date   1244184 non-null  int64 
 1   headline_text  1244184 non-null  object
dtypes: int64(1), object(1)
memory usage: 19.0+ MB


In [5]:
data = data.rename(columns={'headline_text': 'text'})
data['text'] = data['text'].astype(str)

In [6]:
data[data['text'].duplicated(keep=False)].sort_values('text').head(10)

Unnamed: 0,publish_date,text
116298,20040920,10 killed in pakistan bus crash
57967,20031129,10 killed in pakistan bus crash
911080,20141023,110 with barry nicholls
672958,20120217,110 with barry nicholls
748629,20121214,110 with barry nicholls
676423,20120302,110 with barry nicholls
897042,20140820,110 with barry nicholls episode 15
826828,20131017,110 with barry nicholls episode 15
826829,20131017,110 with barry nicholls episode 16
898353,20140826,110 with barry nicholls episode 16


In [7]:
data = data.drop_duplicates('text').reset_index(drop=True)

In [8]:
data = data.head(50000)
data.head()

Unnamed: 0,publish_date,text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   publish_date  50000 non-null  int64 
 1   text          50000 non-null  object
dtypes: int64(1), object(1)
memory usage: 781.4+ KB


# 1. Labelling 
- through Natural Language Processing(NLP) Libraries
- https://www.kaggle.com/code/ahmadalijamali/headline-news-classification-with-bert
- https://www.snorkel.org/use-cases/01-spam-tutorial#3-writing-more-labeling-functions

In [10]:
from sklearn.utils import shuffle
from snorkel.labeling import LabelingFunction
from snorkel.preprocess import preprocessor
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.model import LabelModel
from snorkel.labeling import LFAnalysis
from snorkel.labeling import filter_unlabeled_dataframe
from snorkel.labeling import labeling_function

import re
import spacy
from spacy import displacy

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string


#Supervised learning
from tqdm import tqdm_notebook as tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

##Deep learning libraries and APIs
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.utils import shuffle

from textblob import TextBlob
from textblob import Word

import nltk
nltk.download('wordnet')
from textblob.wordnet import VERB
from textblob.wordnet import NOUN
from textblob.wordnet import ADJ
from textblob.wordnet import ADV

from tabulate import tabulate
from tqdm import trange
import random

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zoezh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
punc = string.punctuation
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zoezh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
df = shuffle(data)
data = df.sample(frac=0.25, replace=False, random_state=1)

In [13]:
POSITIVE = 1
NEGATIVE = 0
UNKNOWN = -1

def keyword_lookup(x, keywords, label):
    if any(word in x.text.lower() for word in keywords):
        return label
    return UNKNOWN
def make_keyword_lf(keywords, label=POSITIVE):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label),
    )

# reference: https://www.snorkel.org/use-cases/01-spam-tutorial#3-writing-more-labeling-functions
# list of positive keywords/negative keywords
#these two lists can be further extended 
"""positive news might contain the following words' """
keywords_positive = make_keyword_lf(keywords=['boosts', 'great', 'develops', 'promising', 'ambitious', 'delighted', 
                                             'record', 'win', 'breakthrough', 'recover', 'achievement', 'peace', 
                                             'party', 'hope', 'flourish', 'respect', 'partnership', 'champion', 
                                             'positive', 'happy', 'bright', 'confident', 'encouraged', 'perfect', 
                                             'complete', 'assured'], label=POSITIVE)
"""negative news might contain the following words"""
keywords_negative = make_keyword_lf(keywords=['war','solidiers', 'turmoil', 'injur','trouble', 'aggressive', 'killed', 
                                             'coup', 'evasion', 'strike', 'troops', 'dismisses', 'attacks', 'defeat', 
                                             'damage', 'dishonest', 'dead', 'fear', 'foul', 'fails', 'hostile', 'cuts', 
                                             'accusations', 'victims',  'death', 'unrest', 'fraud', 'dispute', 'destruction', 
                                             'battle', 'unhappy', 'bad', 'alarming', 'angry', 'anxious', 'dirty', 'pain', 
                                             'poison', 'unfair', 'unhealthy'], label=NEGATIVE)


In [14]:
@preprocessor(memoize=True)
def textblob_sentiment(x):
    scores = TextBlob(x.text)
    x.polarity = scores.sentiment.polarity
    x.subjectivity = scores.sentiment.subjectivity
    return x
#find polarity
@labeling_function(pre=[textblob_sentiment])
def textblob_polarity(x):
    return POSITIVE if x.polarity > 0.6 else UNKNOWN
#find subjectivity 
@labeling_function(pre=[textblob_sentiment])
def textblob_subjectivity(x):
    return POSITIVE if x.subjectivity >= 0.5 else UNKNOWN

In [15]:
lfs = [keywords_positive, keywords_negative, textblob_polarity, textblob_subjectivity]

applier = PandasLFApplier(lfs=lfs)
L_snorkel = applier.apply(df=data)

label_model = LabelModel(cardinality=2, verbose=True)

label_model.fit(L_train=L_snorkel)

100%|██████████| 12500/12500 [00:13<00:00, 930.82it/s] 
INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/100 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.044]
INFO:root:[10 epochs]: TRAIN:[loss=0.018]
INFO:root:[20 epochs]: TRAIN:[loss=0.001]
INFO:root:[30 epochs]: TRAIN:[loss=0.001]
INFO:root:[40 epochs]: TRAIN:[loss=0.001]
INFO:root:[50 epochs]: TRAIN:[loss=0.000]
 57%|█████▋    | 57/100 [00:00<00:00, 564.40epoch/s]INFO:root:[60 epochs]: TRAIN:[loss=0.000]
INFO:root:[70 epochs]: TRAIN:[loss=0.000]
INFO:root:[80 epochs]: TRAIN:[loss=0.000]
INFO:root:[90 epochs]: TRAIN:[loss=0.000]
100%|██████████| 100/100 [00:00<00:00, 632.96epoch/s]
INFO:root:Finished Training


In [16]:
data["label"] = label_model.predict(L=L_snorkel)

In [17]:
data = data.drop(columns=['publish_date'])
data

Unnamed: 0,text,label
9827,government recommends australia soccer chiefs be,-1
15309,older meares claims third national sprint title,-1
18289,nowra tafe teachers reject pay offer,-1
30089,nats welcome familys move to take over pie maker,1
12441,baghdad conditions appalling un,1
...,...,...
14159,roos not focussing on carey factor,-1
31347,photojournalist seriously wounded in liberia,1
18710,funeral held for legendary disc jockey,1
12749,qld fatality takes national road toll to 20,-1


In [18]:
data['label']=data['label'].mask(data['label']==-1,2)
data['label'].value_counts()


label
2    8642
1    2516
0    1342
Name: count, dtype: int64

In [19]:
data

Unnamed: 0,text,label
9827,government recommends australia soccer chiefs be,2
15309,older meares claims third national sprint title,2
18289,nowra tafe teachers reject pay offer,2
30089,nats welcome familys move to take over pie maker,1
12441,baghdad conditions appalling un,1
...,...,...
14159,roos not focussing on carey factor,2
31347,photojournalist seriously wounded in liberia,1
18710,funeral held for legendary disc jockey,1
12749,qld fatality takes national road toll to 20,2


In [20]:
df = data.copy()

# 2. Preprocessing

In [21]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case = True
    )

In [22]:
text = data.text.values
labels = data.label.values

In [23]:
text

array(['government recommends australia soccer chiefs be',
       'older meares claims third national sprint title',
       'nowra tafe teachers reject pay offer', ...,
       'funeral held for legendary disc jockey',
       'qld fatality takes national road toll to 20',
       'downer to raise fate of aung san suu kyi'], dtype=object)

In [24]:
labels

array([2, 2, 2, ..., 1, 2, 2])

In [25]:
def print_rand_sentence():
  '''Displays the tokens and respective IDs of a random text sample'''
  index = random.randint(0, len(text)-1)
  table = np.array([tokenizer.tokenize(text[index]), 
                    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text[index]))]).T
  print(tabulate(table,
                 headers = ['Tokens', 'Token IDs'],
                 tablefmt = 'fancy_grid'))

print_rand_sentence()

╒══════════╤═════════════╕
│ Tokens   │   Token IDs │
╞══════════╪═════════════╡
│ al       │        2632 │
├──────────┼─────────────┤
│ ja       │       14855 │
├──────────┼─────────────┤
│ ##zee    │       23940 │
├──────────┼─────────────┤
│ ##ra     │        2527 │
├──────────┼─────────────┤
│ pulls    │        8005 │
├──────────┼─────────────┤
│ plug     │       13354 │
├──────────┼─────────────┤
│ after    │        2044 │
├──────────┼─────────────┤
│ ban      │        7221 │
├──────────┼─────────────┤
│ ##s      │        2015 │
╘══════════╧═════════════╛


In [26]:
token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer):
  '''
  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = 32,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )


for sample in text:
  encoding_dict = preprocessing(sample, tokenizer)
  token_id.append(encoding_dict['input_ids']) 
  attention_masks.append(encoding_dict['attention_mask'])


token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [27]:
token_id[6]

tensor([  101, 10069,  4057,  2005,  4394,  2450,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0])

In [28]:
def print_rand_sentence_encoding():
  '''Displays tokens, token IDs and attention mask of a random text sample'''
  index = random.randint(0, len(text) - 1)
  tokens = tokenizer.tokenize(tokenizer.decode(token_id[index]))
  token_ids = [i.numpy() for i in token_id[index]]
  attention = [i.numpy() for i in attention_masks[index]]

  table = np.array([tokens, token_ids, attention]).T
  print(tabulate(table, 
                 headers = ['Tokens', 'Token IDs', 'Attention Mask'],
                 tablefmt = 'fancy_grid'))

print_rand_sentence_encoding()

╒═══════════╤═════════════╤══════════════════╕
│ Tokens    │   Token IDs │   Attention Mask │
╞═══════════╪═════════════╪══════════════════╡
│ [CLS]     │         101 │                1 │
├───────────┼─────────────┼──────────────────┤
│ indonesia │        6239 │                1 │
├───────────┼─────────────┼──────────────────┤
│ and       │        1998 │                1 │
├───────────┼─────────────┼──────────────────┤
│ australia │        2660 │                1 │
├───────────┼─────────────┼──────────────────┤
│ discuss   │        6848 │                1 │
├───────────┼─────────────┼──────────────────┤
│ fighting  │        3554 │                1 │
├───────────┼─────────────┼──────────────────┤
│ regional  │        3164 │                1 │
├───────────┼─────────────┼──────────────────┤
│ [SEP]     │         102 │                1 │
├───────────┼─────────────┼──────────────────┤
│ [PAD]     │           0 │                0 │
├───────────┼─────────────┼──────────────────┤
│ [PAD]     │

In [29]:
val_ratio = 0.2
# Recommended batch size: 16, 32.
batch_size = 16

# Indices of the train and validation splits stratified by labels
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels)

# Train and validation sets
train_set = TensorDataset(token_id[train_idx], 
                          attention_masks[train_idx], 
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx], 
                        attention_masks[val_idx], 
                        labels[val_idx])

# Prepare DataLoader
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

In [30]:
def b_tp(preds, labels):
  '''Returns True Positives (TP): count of correct predictions of actual class 1'''
  return sum([preds == labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_fp(preds, labels):
  '''Returns False Positives (FP): count of wrong predictions of actual class 1'''
  return sum([preds != labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_tn(preds, labels):
  '''Returns True Negatives (TN): count of correct predictions of actual class 0'''
  return sum([preds == labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_fn(preds, labels):
  '''Returns False Negatives (FN): count of wrong predictions of actual class 0'''
  return sum([preds != labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_metrics(preds, labels):
  '''
  Returns the following metrics:
    - accuracy    = (TP + TN) / N
    - precision   = TP / (TP + FP)
    - recall      = TP / (TP + FN)
    - specificity = TN / (TN + FP)
  '''
  preds = np.argmax(preds, axis = 1).flatten()
  labels = labels.flatten()
  tp = b_tp(preds, labels)
  tn = b_tn(preds, labels)
  fp = b_fp(preds, labels)
  fn = b_fn(preds, labels)
  b_accuracy = (tp + tn) / len(labels)
  b_precision = tp / (tp + fp) if (tp + fp) > 0 else 'nan'
  b_recall = tp / (tp + fn) if (tp + fn) > 0 else 'nan'
  b_specificity = tn / (tn + fp) if (tn + fp) > 0 else 'nan'
  return b_accuracy, b_precision, b_recall, b_specificity

## 1.3 Modelling - BERT

In [31]:
import torch
import torchvision
from transformers import BertForSequenceClassification
from tqdm import trange
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split

In [32]:
# Load the BertForSequenceClassification model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 3,
    output_attentions = False,
    output_hidden_states = False,
)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 5e-5,
                              eps = 1e-08
                              )

# Run on GPU
model.cuda()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [33]:
model.cuda()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 1
print('Training on', device)
for _ in trange(epochs, desc = 'Epoch'):
    
    # ========== Training ==========
    
    # Set model to training mode
    model.train()
    
    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids, 
                             token_type_ids = None, 
                             attention_mask = b_input_mask, 
                             labels = b_labels.long())
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables 
    val_accuracy = []
    val_precision = []
    val_recall = []
    val_specificity = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          # Forward pass
          eval_output = model(b_input_ids, 
                              token_type_ids = None, 
                              attention_mask = b_input_mask)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate validation metrics
        b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
        val_accuracy.append(b_accuracy)
        # Update precision only when (tp + fp) !=0; ignore nan
        if b_precision != 'nan': val_precision.append(b_precision)
        # Update recall only when (tp + fn) !=0; ignore nan
        if b_recall != 'nan': val_recall.append(b_recall)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if b_specificity != 'nan': val_specificity.append(b_specificity)

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
    print('\t - Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)) if len(val_precision)>0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)) if len(val_recall)>0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}\n'.format(sum(val_specificity)/len(val_specificity)) if len(val_specificity)>0 else '\t - Validation Specificity: NaN')

Training on cuda


  attn_output = torch.nn.functional.scaled_dot_product_attention(
Epoch: 100%|██████████| 1/1 [01:42<00:00, 102.62s/it]


	 - Train loss: 0.4840
	 - Validation Accuracy: 0.2607
	 - Validation Precision: 0.8502
	 - Validation Recall: 0.9557
	 - Validation Specificity: 0.7512




