In [1]:
"""
Hugging Face BERT with custom classifier (PyTorch)

https://www.kaggle.com/code/angyalfold/hugging-face-bert-with-custom-classifier-pytorch/notebook
"""

import pandas as pd
from sklearn.model_selection import train_test_split

train_csv_path = './train.csv'
train_df = pd.read_csv(train_csv_path)

all_texts = train_df['text'].values.tolist()
all_labels = train_df['target'].values.tolist()

print("Tweets are loaded. Total # of tweets: {}.".format(len(all_texts)))
print("# of labels:")
print(train_df['target'].value_counts())

# As it turns out there are couple of tweets which occurs multiple times. Among those there are some whose labels aren't consistent throughout the occurrences.

frequent_tweets = {}
for t, l in zip(all_texts, all_labels):
    if all_texts.count(t) > 2:
        frequent_tweets[t] = [l] if t not in frequent_tweets else frequent_tweets[t] + [l]
        
print("The number of tweeets which appear multiple times: {}"
      .format(len(frequent_tweets.keys())))     

print("Tweets which have inconsistent labeling:")
print()

for t, ls in frequent_tweets.items():
    if not all(element == ls[0] for element in ls):
        print(t)
        print(ls)

# The amount of tweets with inconsistent labeling seems reasonably low so they can be fixed by hand. (Note: One could argue that deleting tweets with inconsistent labeling would be a better practice because modifing the input like that is an overreach, but for the sake of the example I go with releballing.)

Tweets are loaded. Total # of tweets: 7613.
# of labels:
target
0    4342
1    3271
Name: count, dtype: int64
The number of tweeets which appear multiple times: 19
Tweets which have inconsistent labeling:

To fight bioterrorism sir.
[1, 0, 1, 0]
.POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4
[1, 1, 0, 1]
He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam
[0, 1, 1, 0, 0, 0]
Who is bringing the tornadoes and floods. Who is bringing the climate change. God is after America He is plaguing her
 
#FARRAKHAN #QUOTE
[1, 0, 0]
#foodscare #offers2go #NestleIndia slips into loss after #Magginoodle #ban unsafe and hazardous for #humanconsumption
[1, 1, 0]
The Prophet (peace be upon him) said 'Save yourself from Hellfire even if it is by giving half a date in charity.'
[0, 0, 1, 0, 0, 1]
Hellfire is surrounded by desires so be careful and d

In [2]:
should_be_real = [".POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4",
                 "#foodscare #offers2go #NestleIndia slips into loss after #Magginoodle #ban unsafe and hazardous for #humanconsumption",
                 "CLEARED:incident with injury:I-495  inner loop Exit 31 - MD 97/Georgia Ave Silver Spring"]

should_not_be_real = ["He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam",
                     "Who is bringing the tornadoes and floods. Who is bringing the climate change. God is after America He is plaguing her",
                      "The Prophet (peace be upon him) said 'Save yourself from Hellfire even if it is by giving half a date in charity.'",
                     "Hellfire is surrounded by desires so be careful and donÛªt let your desires control you! #Afterlife",
                     "#Allah describes piling up #wealth thinking it would last #forever as the description of the people of #Hellfire in Surah Humaza. #Reflect",
                     "that horrible sinking feeling when youÛªve been at home on your phone for a while and you realise its been on 3G this whole time",
                     "To fight bioterrorism sir."]


def fix_labels(tweets_to_fix, correct_label):
    for i, (tweet, label) in enumerate(zip(all_texts, all_labels)):
        if any(tweet.startswith(t) for t in tweets_to_fix):
            all_labels[i] = correct_label

        
fix_labels(should_be_real, 1)
fix_labels(should_not_be_real, 0)

print("Relabeled {} tweets in total".format(len(should_be_real) + len(should_not_be_real)))

Relabeled 10 tweets in total


In [3]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts, all_labels,
    stratify = train_df['target']
)

print('Train data is read and split into training and validation sets.')
print('Size of train data (# of entries): {}'.format(len(train_texts)))
print('Size of validation data (# of entries): {}'.format(len(val_texts)))

Train data is read and split into training and validation sets.
Size of train data (# of entries): 5709
Size of validation data (# of entries): 1904


In [4]:
# Data cleaning

"""
The most obvious step to take is to remove URLs as they are most likely just noise.

An additional consideration to take into account is that Hugging Face's tokenizer employs subword tokenization as detailed in their summary here. It essentialy means that if the tokenizer encounters a word which is unknown to it the word gets splitted into multiple tokens. Each new token gets the '##' prefix. For example: "annoyingly" becomes "annoying" + "##ly". Now it is easy to figure out which words are unknown to the model (just by searching for the '##' prefix) and thus gain ideas what sort of cleaning might worth implementing.

In this implementation URLs, @ links, non ascii characters are completely removed, the negation of some of the auxiliary verbs are fixed (eg.: shouldnt -> should not) and some of the personal pronouns (eg.: im -> i am)
"""

from ex_custom_classifier_helper import clean_tweet 


cleaned_train_texts = [clean_tweet(tweet) for tweet in train_texts]
print("Train tweets cleaned.")
cleaned_val_texts = [clean_tweet(tweet) for tweet in val_texts]
print("Validation tweets cleaned.")

Train tweets cleaned.
Validation tweets cleaned.


In [6]:
from transformers import AutoTokenizer

model_name = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The tokenizer's truncation=True setting ensures that the sequence of tokens is truncated if the sequence is longer than the maximal input length acceptable by the model. padding=True ensures that each sentence is padded to the longest sentence of the batch.
train_encodings = tokenizer(cleaned_train_texts, truncation=True, padding=True)
val_encodings = tokenizer(cleaned_val_texts, truncation=True, padding=True)
print('Train & validation texts encoded')

Train & validation texts encoded


In [8]:
# Custom dataset

"""
PyTorch uses datasets and dataloaders to handle data (see their introductionary tutorial here https://pytorch.org/tutorials/beginner/basics/data_tutorial.html). It means that in order to make the handling of tweets straightforward a custom dataset has to be defined. (Named TweetDataset in this code)

A dataset is a data structure which makes it easy to iterate through the data in training and testing loops, therefore it needs to implement three methods of its base class (which is torch.utils.data.Dataset): __init__ (to initialize the dataset with the data), __len__ (to get the number of items in the dataset) and __getitem__ (to return the ith element of the dataset).
"""
import torch

class TweetDataset(torch.utils.data.Dataset):
    """
    Class to store the tweet data as PyTorch Dataset
    """
    
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __getitem__(self, idx):
        # an encoding can have keys such as input_ids and attention_mask
        # item is a dictionary which has the same keys as the encoding has
        # and the values are the idxth value of the corresponding key (in PyTorch's tensor format)
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)
        
print(TweetDataset.__doc__)


    Class to store the tweet data as PyTorch Dataset
    


In [None]:
# Custom model

"""
A pre-trained BERT model with custom classifier. Based on this notebook.

The custom model consists of a pre-trained BERT model (a model which holds a semantical representation of English) and on the top of the BERT model there is a custom neural network which is trained to the specific task (tweet classification in this case). Therefore, it seems to be reasonable to have freeze_bert and unfreeze_bert methods apart from the mandatory __init__ and forward. Having this two additional methods makes it possible to sort of train the underlying BERT model and the custom classifier separately. (So train BERT and the custom head together, freeze BERT and then train the custom head on the classification task based on the previously trained BERT). The idea of freezing & unfreezing was taken from Milan Kalkenings' notebook.
"""