In [None]:
%pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 27.1 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 90.4 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 72.2 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [None]:
# NOTE: the dataset and saved-model paths used further down live under
# /content/gdrive, so Google Drive has to be mounted first. On a fresh Colab
# runtime, uncomment the two lines below before running the rest.
# from google.colab import drive
# drive.mount('/content/gdrive')

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

# Pre-processing

In [None]:
# Location of the labelled dataset on the mounted Google Drive.
path = '/content/gdrive/MyDrive/6688. HTW/Emerging AI/data/df_all.csv'

df = pd.read_csv(path)
# Convert the text column to string so the .str accessor and the tokenizer
# never receive a non-string value. The column is addressed by name rather than
# by position so the cast cannot silently land on a different column if the
# CSV layout changes.
df['text'] = df['text'].astype(str)

df.head()

Unnamed: 0.1,Unnamed: 0,text,class
0,8,rt crooked hillary you didn't lose because yo...,explicit_hate
1,9,"rt estonian pm : "" i don't want blacks in es...",explicit_hate
2,34,are a bunch of religious bigots . i hope you e...,explicit_hate
3,45,hamas is the only one who lies and all those l...,explicit_hate
4,65,one of the questions never asked : if africans...,explicit_hate


In [None]:
# Character count of every text, kept as its own column so the length
# distribution can be inspected with describe().
text_lengths = df['text'].str.len()
df['len_text'] = text_lengths
df.head()

Unnamed: 0.1,Unnamed: 0,text,class,len_text
0,8,rt crooked hillary you didn't lose because yo...,explicit_hate,199
1,9,"rt estonian pm : "" i don't want blacks in es...",explicit_hate,101
2,34,are a bunch of religious bigots . i hope you e...,explicit_hate,135
3,45,hamas is the only one who lies and all those l...,explicit_hate,103
4,65,one of the questions never asked : if africans...,explicit_hate,143


In [None]:
df.describe()

Unnamed: 0.1,Unnamed: 0,len_text
count,64875.0,64875.0
mean,10958.991584,96.306975
std,6492.230353,62.724153
min,0.0,1.0
25%,5406.0,55.0
50%,10812.0,86.0
75%,16218.0,122.0
max,24782.0,1492.0


In [None]:
len(df)

64875

In [None]:
# set array dimensions
seq_len = 512
num_samples = len(df)

# Pre-allocate the token-id and attention-mask matrices, one row per sample.
# np.zeros defaults to float64; the values stored here are integer ids / 0-1
# mask flags and the model's Input layers are declared as int32, so allocate
# int32 directly. This halves the memory of each matrix and keeps the
# tf.data element dtype in line with what the model expects.
Xids = np.zeros((num_samples, seq_len), dtype=np.int32)
Xmask = np.zeros((num_samples, seq_len), dtype=np.int32)

# check shape
Xids.shape


(64875, 512)

In [None]:
from transformers import BertTokenizer

In [None]:
# initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Tokenize in chunks instead of one phrase at a time: a batched tokenizer call
# pads/truncates every text in the chunk to seq_len and returns plain NumPy
# arrays, which avoids building a TensorFlow tensor per sample only to copy it
# straight back into NumPy. Chunking keeps the intermediate Python lists small.
texts = df['text'].tolist()
chunk_size = 1024
for start in range(0, len(texts), chunk_size):
    stop = start + chunk_size  # slicing clamps the final, shorter chunk
    tokens = tokenizer(texts[start:stop], max_length=seq_len, truncation=True,
                       padding='max_length', add_special_tokens=True,
                       return_tensors='np')
    # assign tokenized outputs to the matching block of rows in the numpy arrays
    Xids[start:stop, :] = tokens['input_ids']
    Xmask[start:stop, :] = tokens['attention_mask']

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
Xids

array([[  101.,   187.,  1204., ...,     0.,     0.,     0.],
       [  101.,   187.,  1204., ...,     0.,     0.,     0.],
       [  101.,  1132.,   170., ...,     0.,     0.,     0.],
       ...,
       [  101.,  1753.,  1106., ...,     0.,     0.,     0.],
       [  101., 18542.,   157., ...,     0.,     0.,     0.],
       [  101.,  6991., 17528., ...,     0.,     0.,     0.]])

In [None]:
# Turn the class names into integer codes via a pandas categorical; the codes
# are what the one-hot label matrix is built from later on.
class_categories = df['class'].astype('category')
df['class_encoded'] = class_categories.cat.codes
df.head()

Unnamed: 0.1,Unnamed: 0,text,class,len_text,class_encoded
0,8,rt crooked hillary you didn't lose because yo...,explicit_hate,199,0
1,9,"rt estonian pm : "" i don't want blacks in es...",explicit_hate,101,0
2,34,are a bunch of religious bigots . i hope you e...,explicit_hate,135,0
3,45,hamas is the only one who lies and all those l...,explicit_hate,103,0
4,65,one of the questions never asked : if africans...,explicit_hate,143,0


In [None]:
df['class'].value_counts()

not_hate         36066
offensive        19190
implicit_hate     7100
explicit_hate     2519
Name: class, dtype: int64

In [None]:
df['class_encoded'].value_counts()

2    36066
3    19190
1     7100
0     2519
Name: class_encoded, dtype: int64

In [None]:
# first extract the integer-encoded class column as a NumPy array
arr = df['class_encoded'].values
arr

array([0, 0, 0, ..., 2, 2, 2], dtype=int8)

In [None]:
# Build the one-hot label matrix: one row per sample and one column per class
# code (codes run from 0 up to arr.max()).
num_label_columns = arr.max() + 1
sample_rows = np.arange(num_samples)
labels = np.zeros((num_samples, num_label_columns))

# For every sample, switch on the single entry at (row, class code).
labels[sample_rows, arr] = 1

In [None]:
labels

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       ...,
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.]])

In [None]:
# create the dataset object: each element is one (token ids, attention mask, one-hot label) row
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))

def map_func(input_ids, masks, labels):
    """Repack a flat ``(input_ids, masks, labels)`` triple as ``(features, labels)``.

    The features are returned as a dictionary keyed by ``'input_ids'`` and
    ``'attention_mask'``; ``labels`` is passed through untouched.
    """
    features = dict(input_ids=input_ids, attention_mask=masks)
    return features, labels

# then we use the dataset map method to apply this transformation, so every
# element becomes ({'input_ids': ..., 'attention_mask': ...}, labels)
dataset = dataset.map(map_func)

In [None]:
# we will split into batches of 10
batch_size = 10

# Shuffle ONCE over the whole dataset, then batch (dropping any remaining
# samples that don't cleanly fit into a batch of 10).
#  - buffer_size=num_samples gives a full shuffle; a 10000-element buffer is
#    smaller than the dataset, and the rows appear to be grouped by class, so
#    a partial shuffle would leave the class order largely intact.
#  - reshuffle_each_iteration=False is essential because the train/validation
#    split is made afterwards with take()/skip(): with the default (True) the
#    order is redrawn on every pass, so samples would move between the training
#    and validation sets from epoch to epoch (validation leakage).
#  - a fixed seed makes the split reproducible across runs.
# Trade-off: the batch order is now identical in every epoch, and the shuffle
# buffer holds the whole dataset in memory.
dataset = dataset.shuffle(num_samples, seed=42, reshuffle_each_iteration=False)
dataset = dataset.batch(batch_size, drop_remainder=True)

In [None]:
# set split size (80% training data) and calculate training set size,
# expressed as a number of batches because the dataset is already batched
split = 0.8
size = int((Xids.shape[0]/batch_size)*split)

# get training and validation sets: the first `size` batches are used for
# training, everything after them for validation
train_ds = dataset.take(size)
val_ds = dataset.skip(size)

# Modelling

In [None]:
# AutoModel for PyTorch, TFAutoModel for TensorFlow
from transformers import TFAutoModel

# load the pretrained 'bert-base-cased' checkpoint (same name as the tokenizer)
bert = TFAutoModel.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/527M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Count the distinct class codes; this sets the width of the output layer.
number_of_classes = df['class_encoded'].nunique()
number_of_classes

4

In [None]:
# two input layers, we ensure layer name variables match to dictionary keys in TF dataset.
# The sequence length comes from seq_len (the same value used to pad/truncate
# during tokenization) rather than a second hard-coded 512, so the two cannot
# drift apart.
input_ids = tf.keras.layers.Input(shape=(seq_len,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(seq_len,), name='attention_mask', dtype='int32')

# we access the transformer model within our bert object using the bert attribute (eg bert.bert instead of bert)
embeddings = bert.bert(input_ids, attention_mask=mask)[1]  # access pooled activations with [1]

# convert bert embeddings into number_of_classes output probabilities
x = tf.keras.layers.Dense(1024, activation='relu')(embeddings)
y = tf.keras.layers.Dense(number_of_classes, activation='softmax', name='outputs')(x)

In [None]:
# initialize model: wire the two named inputs to the softmax output layer
model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

In [None]:
# TRAINING
# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# makes Keras emit a warning when the optimizer is constructed.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-6)
# Labels are one-hot vectors, hence the categorical (not sparse) loss and metric.
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[acc])

  super(Adam, self).__init__(name, **kwargs)


In [None]:
train_ds

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(10, 512), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(10, 512), dtype=tf.float64, name=None)}, TensorSpec(shape=(10, 4), dtype=tf.float64, name=None))>

In [None]:
# Train on the training batches for 10 epochs, evaluating on the validation
# batches after each epoch; the per-epoch metrics are kept in `history`.
history = model.fit(train_ds, validation_data=val_ds, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
1048/5190 [=====>........................] - ETA: 1:15:41 - loss: 0.1503 - accuracy: 0.9438

In [None]:
# save model files to the 'NLP_hatespeech_model' directory on the mounted Google Drive
path_saved_model = '/content/gdrive/MyDrive/6688. HTW/Emerging AI/data/NLP_hatespeech_model'
model.save(path_saved_model)

# # to restore it later, load the model from the directory it was saved to
# model = tf.keras.models.load_model(path_saved_model)