Name: Tanvir Ahmed
Student ID: 20075186

Hate speech detection with BERT

In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!apt install git-lfs

In [2]:
from datasets import load_dataset

raw_dataset = load_dataset("tweets_hate_speech_detection")

Downloading builder script:   0%|          | 0.00/3.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.46k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.28M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/683k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/31962 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/17197 [00:00<?, ? examples/s]

In [3]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'tweet'],
        num_rows: 31962
    })
    test: Dataset({
        features: ['label', 'tweet'],
        num_rows: 17197
    })
})

In [4]:
raw_dataset['train']

Dataset({
    features: ['label', 'tweet'],
    num_rows: 31962
})

In [5]:
raw_dataset['train'].features

{'label': ClassLabel(names=['no-hate-speech', 'hate-speech'], id=None),
 'tweet': Value(dtype='string', id=None)}

In [6]:
def show_samples(dataset, num_samples=5, seed=52):
    sample = dataset["train"].shuffle(seed=seed).select(range(num_samples))
    for example in sample:
        print(f"\n'>> Text: {example['tweet']}'")
        print(f"'>> Label: {example['label']}'")

In [7]:
show_samples(raw_dataset)


'>> Text: i wanted to play pokÃ©mon so i turned on my gameboy color and guess what? i lost all the datas. 15 years of datas. :'( #game #pokemon  '
'>> Label: 0'

'>> Text: #losglobos   gorilla simulator: you need to do to adapt to the environment. the need to tear the city. mater '
'>> Label: 0'

'>> Text: @user oh well that me done for work this week until next tuesday   #beeroclock #longweekend #gettingdrunk '
'>> Label: 0'

'>> Text: childless woman demands maternity leave  via @user *seems like a headline from the onion, but it's not*  '
'>> Label: 0'

'>> Text: @user @user  obama &amp; biden are the most #antiwhite  president &amp; vp in history. #trump'
'>> Label: 1'


In [8]:
import pandas as pd

raw_dataset.set_format("pandas")
tweet_df = raw_dataset["train"][:]
# Show counts for categories
tweet_df["label"].value_counts()[:20]

0    29720
1     2242
Name: label, dtype: int64

In [9]:
tweet_df.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is so...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [10]:
tweet_df[tweet_df['label'] == 1]

Unnamed: 0,label,tweet
13,1,@user #cnn calls #michigan middle school 'buil...
14,1,no comment! in #australia #opkillingbay #se...
17,1,retweet if you agree!
23,1,@user @user lumpy says i am a . prove it lumpy.
34,1,it's unbelievable that in the 21st century we'...
...,...,...
31934,1,lady banned from kentucky mall. @user #jcpenn...
31946,1,@user omfg i'm offended! i'm a mailbox and i'...
31947,1,@user @user you don't have the balls to hashta...
31948,1,"makes you ask yourself, who am i? then am i a..."


In [11]:
hate_tweets = [13, 17, 17128, 17125, 31960]

for x in hate_tweets:
    print(f'tweet_{x}: {tweet_df["tweet"][x]}')

tweet_13: @user #cnn calls #michigan middle school 'build the wall' chant '' #tcot  
tweet_17: retweet if you agree! 
tweet_17128: @user @user @user not sure you understand what bigotry is.  
tweet_17125: being creative with time management   fathers day:  via @user
tweet_31960: @user #sikh #temple vandalised in in #calgary, #wso condemns  act  


In [12]:
from sklearn.model_selection import train_test_split

X = tweet_df['tweet']
y = tweet_df['label']

# Perform stratified sampling to split the dataset
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.33, stratify=y, random_state=52)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.33, stratify=y_temp, random_state=52)

# Create new DataFrames for the train, validation, and test sets
train_df = pd.DataFrame({'tweet': X_train, 'label': y_train})
val_df = pd.DataFrame({'tweet': X_val, 'label': y_val})
test_df = pd.DataFrame({'tweet': X_test, 'label': y_test})

In [13]:
train_df.label.value_counts(), val_df.label.value_counts(), test_df.label.value_counts()

(0    19912
 1     1502
 Name: label, dtype: int64,
 0    6571
 1     496
 Name: label, dtype: int64,
 0    3237
 1     244
 Name: label, dtype: int64)

In [14]:
raw_dataset.reset_format()

In [15]:
import tensorflow as tf
from transformers import AutoTokenizer
import re
import unicodedata


def preprocess(text):
    '''Function to clean the input text by removing special characters, symbols,
     and non-ASCII characters, and normalizing the text to a standard form.
    '''
    text = re.sub(r'[^\w\s]', '', text)   # Function to substitute (replace) any characters that are not letters
                                          # or digits (\w) or whitespace (\s) with an empty string ('')
    text = text.encode('ascii', 'ignore').decode('utf-8')
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
    return text

def convert_to_tensors(X, y, tokenizer, max_length):
    '''
    Function to convert the input texts to tokens and then into input_ids and
    attention_masks tensors.
    '''
    X_preprocessed = X.apply(preprocess)

    # Tokenize the preprocessed text using the tokenizer
    tokenized = tokenizer(list(X_preprocessed), padding='max_length', max_length=max_length, return_tensors='tf')

    # Convert to TensorFlow tensors
    input_ids = tokenized['input_ids'].numpy()
    attention_mask = tokenized['attention_mask'].numpy()
    labels = y.to_numpy()

    return input_ids, attention_mask, labels


# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

# Tokenize the training data to determine the maximum length
max_length = max(X_train.apply(lambda text: len(tokenizer.encode(text))))
print('Maximum length = ', max_length)

# Convert to TensorFlow tensors
input_ids_train, attention_mask_train, labels_train = convert_to_tensors(X_train, y_train, tokenizer, max_length)
input_ids_val, attention_mask_val, labels_val = convert_to_tensors(X_val, y_val, tokenizer, max_length)
input_ids_test, attention_mask_test, labels_test = convert_to_tensors(X_test, y_test, tokenizer, max_length)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Maximum length =  171


In [16]:
# Create TensorFlow datasets with DataCollator
train_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': input_ids_train, 'attention_mask': attention_mask_train},
    labels_train
)).batch(32)

val_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': input_ids_val, 'attention_mask': attention_mask_val},
    labels_val
)).batch(32)

test_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': input_ids_test, 'attention_mask': attention_mask_test},
    labels_test
)).batch(32)

# Apply cache and prefetch optimizations
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.cache().prefetch(tf.data.AUTOTUNE)

In [17]:
from transformers import TFBertForSequenceClassification
import tensorflow as tf
from sklearn.metrics import classification_report

# Build the model
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)
optimizer = tf.keras.optimizers.AdamW(learning_rate=1e-5, weight_decay=0.004)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Train the model
history = model.fit(
    x=train_dataset,
    validation_data=val_dataset,
    epochs=3
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [19]:
# Evaluate on test dataset
results = model.evaluate(x={'input_ids': input_ids_test, 'attention_mask': attention_mask_test}, y=labels_test)

# Print evaluation results
print("Test Loss:", results[0])
print("Test Accuracy:", results[1])

# Generate classification report
y_pred = model.predict({'input_ids': input_ids_test, 'attention_mask': attention_mask_test})
y_pred_labels = tf.argmax(y_pred.logits, axis=1).numpy()
print("Classification Report:\n", classification_report(labels_test, y_pred_labels))

Test Loss: 0.15396493673324585
Test Accuracy: 0.9551852941513062
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.98      3237
           1       0.66      0.75      0.70       244

    accuracy                           0.96      3481
   macro avg       0.82      0.86      0.84      3481
weighted avg       0.96      0.96      0.96      3481



In [20]:
# Testing on unlabelled data

raw_dataset.set_format("pandas")
test_df = raw_dataset["test"][:] # this part of the raw_dataset is not labelled

In [21]:
test_df.head()

Unnamed: 0,label,tweet
0,-1,#studiolife #aislife #requires #passion #dedic...
1,-1,@user #white #supremacists want everyone to se...
2,-1,safe ways to heal your #acne!! #altwaystohe...
3,-1,is the hp and the cursed child book up for res...
4,-1,"3rd #bihday to my amazing, hilarious #nephew..."


In [22]:
test_preprocessed = test_df['tweet'].apply(preprocess)

max_length = max(test_df['tweet'].apply(lambda text: len(tokenizer.encode(text))))
print('Maximum length = ', max_length)

# Tokenize the preprocessed text using the tokenizer
tokenized_test = tokenizer(list(test_preprocessed), padding='max_length', max_length=max_length, return_tensors='tf')

# Convert to TensorFlow tensors
test_input_ids = tokenized_test['input_ids'].numpy()
test_attention_mask = tokenized_test['attention_mask'].numpy()

Maximum length =  104


In [23]:
test_pred = model.predict({'input_ids': test_input_ids, 'attention_mask': test_attention_mask})




In [24]:
test_pred

TFSequenceClassifierOutput(loss=None, logits=array([[ 3.5567145 , -3.6382093 ],
       [-2.5375369 ,  2.6794796 ],
       [ 3.8626065 , -3.856089  ],
       ...,
       [-0.06680313, -0.06182364],
       [ 3.1665664 , -3.1865592 ],
       [ 3.1237705 , -3.1450763 ]], dtype=float32), hidden_states=None, attentions=None)

In [25]:
test_pred_labels = tf.argmax(test_pred.logits, axis=1).numpy()

In [26]:
test_df['label'] = test_pred_labels
test_df[test_df['label'] == 1].tail()

Unnamed: 0,label,tweet
17128,1,@user the uk governmentâs new #anti-semitism...
17176,1,@user @user are the most racist pay ever!!!!!
17188,1,"black professor demonizes, proposes nazi style..."
17192,1,thought factory: left-right polarisation! #tru...
17194,1,#hillary #campaigned today in #ohio((omg)) &am...


In [27]:
len(test_df[test_df['label'] == 1]), len(test_df)

(1416, 17197)

In [28]:
hate_tweets = [33, 42, 17128, 17125, 17188]

for x in hate_tweets:
  print(f'tweet_{x}: {test_df["tweet"][x]}')


suppo the #taiji fisherman! no bullying! no racism! #tweet4taiji #thecove #seashepherd 
@user @user trumps invested billions into saudi. he empowers the people funding isis.   #trumpsahypocrite
@user the uk governmentâs new #anti-semitism definition conflates  with valid criticism of #israel | opendemocracy 
be careful in criticizing #obama for his decision on #israel &amp; sanctions against #russiahacking , as #liberals will consider this 
black professor demonizes, proposes nazi style confiscation of "white" assets; like 1930's germany  #breaking 


In [29]:
raw_dataset.reset_format()
raw_dataset["test"][17125]

{'label': -1,
 'tweet': 'be careful in criticizing #obama for his decision on #israel &amp; sanctions against #russiahacking , as #liberals will consider this '}

------------------------------------------------