# Hate Intensity Prediction (HIP): Regression

The HIP module takes a sentence (whether normalised or not) and predicts the hate intensity of that sentence.

The hate intensity is annotated on a scale of 1-10; 0 is reserved for non-hateful sentences, which we do not use in our dataset.
1 is the lowest hate intensity and 10 is the highest.

If the final activation layer is linear, the label range stays the same.
If the final activation layer is sigmoid, the input labels are normalised to the 0-1 range.


In [1]:
import tensorflow as tf
from tqdm import tqdm
import numpy as np
import pandas as pd
from transformers import BertTokenizer
from transformers import DistilBertTokenizer, RobertaTokenizer, BertConfig, TFBertModel, TFRobertaModel, RobertaConfig
from sklearn.model_selection import train_test_split
import pickle
import random
import sys
import math
from scipy import stats
from scipy.spatial import distance
import random
import os

In [33]:
# --- Paths -------------------------------------------------------------------
# Root data folder; the weight checkpoint path is built from it.
BASE_FOLDER = "./data/"
# Tab-separated dataset with sentences, intensities, profanity and SVO masks.
INPUT_FILE = "hate_int_prof_SVO.tsv"
# Sub-folder and file name used when saving / loading model weights.
OUTPUT_FOLDER = "hate_intensity_linear_weights_att/"
OUTPUT_FILE = "hate_int_linear_trans42_ATT"

# --- Transformer / tokenisation ----------------------------------------------
# Pretrained checkpoint name used for the config, the model and the tokenizer.
BERT_MODEL = "roberta-base"
# Token length every sentence (and SVO mask) is padded or truncated to.
MAX_LENGTH = 128
# Attention dropout probability written into the transformer config.
BERT_DROPOUT = 0.2

# --- Reproducibility / evaluation ---------------------------------------------
# Seed handed to random_seed().
SEED = 42
# Test split fraction (reported when evaluating).
TEST_SIZE = 0.2

# --- Regression head ----------------------------------------------------------
# When True, a self-attention layer is applied on top of the BiLSTM output.
USE_ATT = True
# Units per direction of the bidirectional LSTM.
LSTM_UNITS = 50
# Used for both the input dropout and the recurrent dropout of the LSTM.
LSTM_DROPOUT = 0.1
# Width of, and dropout after, the hidden dense layer.
DENSE_UNITS = 50
DENSE_DROPOUT = 0.2

# --- Training -----------------------------------------------------------------
EPOCHS = 2  # default is 10
BATCH_SIZE = 32


def random_seed(SEED):
    """Seed the random number generators used in this notebook.

    Sets ``PYTHONHASHSEED`` in the process environment and seeds Python's
    ``random`` module, NumPy's global generator and TensorFlow's global
    generator with the same value.

    Args:
        SEED: Integer seed applied to every generator.
    """
    seed_value = SEED
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up,
    # so setting it here presumably only matters for child processes.
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(seed_value)


# Seed everything once, before any model or data split is created.
random_seed(SEED)

### Base TRANSFORMER MODEL definitions

In [3]:
def tokenize(sentences, tokenizer):
    """Encode sentences into fixed-length transformer inputs.

    Every sentence is encoded with special tokens added and is then padded
    or truncated to exactly ``MAX_LENGTH`` tokens, so the per-sentence
    results stack into rectangular arrays.

    Args:
        sentences: Iterable of raw text strings.
        tokenizer: Hugging Face tokenizer exposing ``encode_plus``.

    Returns:
        A tuple ``(input_ids, attention_masks, token_type_ids)`` of int32
        NumPy arrays with one row per sentence.
    """
    input_ids, input_masks, input_segments = [], [], []
    for sentence in tqdm(sentences):
        # `padding='max_length'` replaces the deprecated `pad_to_max_length`;
        # `truncation=True` makes the cut-off at MAX_LENGTH explicit instead
        # of relying on the library's backward-compatibility fallback.
        inputs = tokenizer.encode_plus(sentence,
                                       add_special_tokens=True,
                                       max_length=MAX_LENGTH,
                                       padding='max_length',
                                       truncation=True,
                                       return_attention_mask=True,
                                       return_token_type_ids=True)
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])
        input_segments.append(inputs['token_type_ids'])

    return np.asarray(input_ids, dtype='int32'), np.asarray(
        input_masks, dtype='int32'), np.asarray(input_segments, dtype='int32')


# Build the transformer configuration from the pretrained checkpoint,
# overriding the attention dropout and requesting attention weights in the
# model output. Hidden states of intermediate layers are not returned.
config = RobertaConfig.from_pretrained(BERT_MODEL,
                                       output_hidden_states=False,
                                       attention_probs_dropout_prob =BERT_DROPOUT,
                                       output_attentions = True)
# Repeats the keyword argument above (same value).
config.output_hidden_states = False
# Load the pretrained weights into a TF RoBERTa encoder using that config.
transformer_model = TFRobertaModel.from_pretrained(BERT_MODEL, config=config)

# Mark the first (up to) three top-level Keras layers of the transformer as
# non-trainable.
# NOTE(review): the model wrapper may expose only a single top-level layer, in
# which case this freezes the whole encoder rather than its first three
# blocks - confirm which behaviour is intended.
for layer in transformer_model.layers[:3]:
    layer.trainable = False

# Defining tokenizer
# NOTE(review): add_special_tokens / max_length / pad_to_max_length are
# encode-time options that tokenize() passes again on every call - confirm
# they are needed here.
tokenizer = RobertaTokenizer.from_pretrained(BERT_MODEL,
                                             add_special_tokens=True,
                                             max_length=MAX_LENGTH,
                                             pad_to_max_length=True)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['distilbert.embeddings.word_embeddings.weight', 'distilbert.transformer.layer.3.ffn.lin1.bias', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.1.ffn.lin1.weight', 'distilbert.transformer.layer.4.ffn.lin2.bias', 'distilbert.transformer.layer.2.sa_layer_norm.bias', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.transformer.layer.4.attention.k_lin.bias', 'distilbert.transformer.layer.4.ffn.lin1.weight', 'distilbert.transformer.layer.2.attention.v_lin.bias', 'distilbert.transformer.layer.1.attention.q_lin.bias', 'distilbert.transformer.layer.3.attention.q_lin.weight', 'distilbert.transformer.layer.5.attention.v_lin.bias', 'distilbert.transformer.layer.5.attention.out_lin.bias', 'distilbert.transformer.layer.3.ffn.lin2.weight', 'distilbert.transformer.layer.5.output_layer_norm.weight', 'distilbert.transformer.layer.1.ffn.lin2.bias', 'distilbert.tran

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

### Model Design

In [29]:
# --- Inputs ------------------------------------------------------------------
# Token ids and attention mask produced by tokenize(), plus one 3-dimensional
# subject / verb / object indicator per token position.
input_ids_in = tf.keras.layers.Input(shape=(MAX_LENGTH, ),
                                     name='input_token',
                                     dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(MAX_LENGTH, ),
                                       name='masked_token',
                                       dtype='int32')
SVO_in = tf.keras.layers.Input(shape=(MAX_LENGTH, 3),
                                     name='svo_encoding',
                                     dtype='float32')

# First element of the transformer output: the per-token hidden states.
embedding_layer = transformer_model(input_ids_in,
                                    attention_mask=input_masks_in)[0]
# Injecting SVO encodings to the embedding layer
embedding_layer = tf.concat([embedding_layer, SVO_in], -1)

# Fusion layer to combine the embedding with the SVO before passing it to LSTM
# NOTE(review): the fusion width is derived from MAX_LENGTH (the sequence
# length) rather than from the embedding size - confirm this is intended.
mlp_units = MAX_LENGTH + 3
fusion_layer = tf.keras.layers.Dense(mlp_units, activation='relu')(embedding_layer)

# --- Sequence encoder --------------------------------------------------------
X = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(LSTM_UNITS,
                         return_sequences=True,
                         dropout=LSTM_DROPOUT,
                         recurrent_dropout=LSTM_DROPOUT,
                         kernel_initializer='normal'))(fusion_layer)
if USE_ATT:
    # Self-attention: the BiLSTM output is used as both query and value.
    X = tf.keras.layers.Attention(use_scale=True)([X, X])  # Use attention.
# Collapse the time axis by taking the maximum of every feature.
X = tf.keras.layers.GlobalMaxPool1D()(X)

# --- Regression head ---------------------------------------------------------
X = tf.keras.layers.Dense(DENSE_UNITS,
                          activation='relu',
                          kernel_initializer='normal')(X)
X = tf.keras.layers.Dropout(DENSE_DROPOUT)(X)
X = tf.keras.layers.Dense(
    1,
    activation='linear',  # Can be with activation="sigmoid" here.
    kernel_initializer='normal')(X)
model = tf.keras.Model(inputs=[input_ids_in, input_masks_in, SVO_in], outputs=X)
model.compile(
    optimizer='adam',
    loss='mean_squared_error',  # Treat HIP as a regression problem
    # Accuracy is not meaningful for a continuous target, so report mean
    # absolute error alongside RMSE instead.
    metrics=[tf.keras.metrics.MeanAbsoluteError(),
             tf.keras.metrics.RootMeanSquaredError()])
model.summary()



Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_token (InputLayer)    [(None, 128)]                0         []                            
                                                                                                  
 masked_token (InputLayer)   [(None, 128)]                0         []                            
                                                                                                  
 tf_bert_model (TFBertModel  TFBaseModelOutputWithPooli   1094822   ['input_token[0][0]',         
 )                           ngAndCrossAttentions(last_   40         'masked_token[0][0]']        
                             hidden_state=(None, 128, 7                                           
                             68),                                                           

### Dataset prep

In [36]:
# Load the annotated dataset (tab-separated).
# NOTE(review): INPUT_FILE is read relative to the working directory, not
# BASE_FOLDER - confirm that this is the intended location.
SVO_P_data = pd.read_csv(INPUT_FILE, sep='\t')

# The Subject / Verb / Object columns store Python expressions as strings
# (presumably lists of 0/1 flags per token); parse every cell into an array.
# SECURITY: eval() executes whatever code the file contains. Only load trusted
# files, or switch to ast.literal_eval if the cells are plain literals.
# Column-wise Series.map is used instead of the deprecated DataFrame.applymap.
for svo_column in ('Subject', 'Verb', 'Object'):
    SVO_P_data[svo_column] = SVO_P_data[svo_column].map(
        lambda cell: np.array(eval(cell)))

# Stack the three per-token masks of each row into one (3, n_tokens) matrix.
# np.vstack is the non-deprecated name of np.row_stack.
SVO_P_data['SVO'] = SVO_P_data.apply(
    lambda row: np.vstack((row['Subject'], row['Verb'], row['Object'])), axis=1)

sentences = SVO_P_data['Sentence'].to_numpy()
hate_intensities = SVO_P_data['Intensity'].to_numpy()
profanity = SVO_P_data['Profanity'].to_numpy()
# SVO labelled after using roberta base tokenizer
SVO = SVO_P_data['SVO'].to_numpy()

# MAX_LENGTH comes from the configuration cell and is deliberately not
# redefined here, so the padding length always matches the model input shape.

def padd_array_with_zeros(arr, desired_len):
    """Shift a mask right by one slot and force it to a fixed length.

    A zero is inserted at the front to line the mask up with the leading
    special token added by the tokenizer. The result is then right-padded
    with zeros, or cut off, so that it has exactly ``desired_len`` entries.

    Args:
        arr: 1-D array-like of per-token values.
        desired_len: Length of the returned array.

    Returns:
        NumPy array of length ``desired_len``.
    """
    shifted = np.insert(arr, 0, 0)
    missing = desired_len - len(shifted)
    if missing > 0:
        # Too short: append the missing zeros on the right.
        return np.pad(shifted, (0, missing), mode='constant')
    # Long enough already: keep only the first desired_len entries.
    return shifted[:desired_len]

# Bundle the per-sample features so they are shuffled and split together.
inp = list(zip(sentences, SVO, profanity))

# Hold out TEST_SIZE of the samples for testing (the configured fraction,
# rather than a second hard-coded copy of it).
# random_state is pinned to 7, independently of SEED, to keep the existing split.
X_tr, X_te, y_tr, y_te = train_test_split(inp, hate_intensities,
                                          test_size=TEST_SIZE, random_state=7)

# Unbundle each split back into its three feature groups.
train_sentences = np.array([t[0] for t in X_tr])
train_SVO = [t[1] for t in X_tr]
train_profanity = np.array([t[2] for t in X_tr])

test_sentences = np.array([t[0] for t in X_te])
test_SVO = [t[1] for t in X_te]
test_profanity = np.array([t[2] for t in X_te])

## Padding zeros to SVO to make all of them same length
# Each sample holds three masks (subject, verb, object); every mask is shifted
# and padded / truncated to MAX_LENGTH so the samples stack into one array.
train_SVO_padded = [[padd_array_with_zeros(arr, MAX_LENGTH) for arr in sample]
                    for sample in train_SVO]
train_SVO = np.array(train_SVO_padded)

test_SVO_padded = [[padd_array_with_zeros(arr, MAX_LENGTH) for arr in sample]
                   for sample in test_SVO]
test_SVO = np.array(test_SVO_padded)

print('Sentences: ', 'train', train_sentences.shape, 'test', test_sentences.shape)
print('SVO: ', 'train', train_SVO.shape, 'test', test_SVO.shape)
print('Profanity: ', 'train', train_profanity.shape, 'test', test_profanity.shape)
# Reorder (sample, 3, MAX_LENGTH) -> (sample, MAX_LENGTH, 3) so the last axis
# can be concatenated with the per-token embeddings inside the model.
train_SVO = tf.cast(tf.transpose(train_SVO, [0, 2, 1]), tf.float32)
test_SVO = tf.cast(tf.transpose(test_SVO, [0, 2, 1]), tf.float32)

# Encode the raw sentences into ids / attention masks / token type ids.
train_input_ids, train_input_masks, train_input_segment = tokenize(
    train_sentences, tokenizer)
test_input_ids, test_input_masks, test_input_segment = tokenize(
    test_sentences, tokenizer)
y_tr = np.asarray(y_tr)
y_te = np.asarray(y_te)


Sentences:  train (4843,) test (1211,)
SVO:  train (4843, 3, 128) test (1211, 3, 128)
Profanity:  train (4843,) test (1211,)


100%|██████████| 4843/4843 [00:07<00:00, 656.12it/s]
100%|██████████| 1211/1211 [00:01<00:00, 921.45it/s]


In [37]:
# First training sample: a (sentence, SVO matrix, profanity) tuple.
print(X_tr[0])

# Encoded training inputs, each printed after its label:
# input_ids, attention_masks, token_type_ids
labelled_arrays = (
    ("\n\nTrain input ids", train_input_ids),
    ("\n\nAttention masks", train_input_masks),
    ("\n\nToken type ids", train_input_segment),
)
print(*(part for pair in labelled_arrays for part in pair))

('Did you know that the rape epidemic in Europe is due to Muslim immigrants? They have been raping our children for many years with no counteraction from the police and government. Islam is a death cult!', array([[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 1)


Train input ids [[  101  2106  2017 ...     0     0     0]
 [  101  1030  5310 ...     0     0     0]
 [  101 19817  9453 ...     0     0     0]
 ...
 [  101  2016  2106 ...     0     0     0]
 [  101  2064  2619 ...     0     0     0]
 [  101  2017  2035 ...     0     0     0]] 

Attention masks [[1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 ...
 [1 1 1 ... 0 0 0]
 [1 1 

### Train and evaluate

In [32]:
# Train on the training split; the last 10% of it is held out by Keras for
# validation after every epoch.
model.fit(x=[train_input_ids, train_input_masks, train_SVO],
          y=y_tr,
          epochs=EPOCHS,
          validation_split=0.1,
          batch_size=BATCH_SIZE)

print("\n\n TEST split", TEST_SIZE)
# The model has three inputs (ids, mask, SVO); evaluation and prediction must
# both receive all of them.
test_inputs = [test_input_ids, test_input_masks, test_SVO]
results = model.evaluate(x=test_inputs, y=y_te)
print(results)
result = model.predict(x=test_inputs)
# np.float was removed from NumPy; np.float64 is the dtype it aliased.
result = np.array(result, dtype=np.float64)
result = result.flatten()
# Agreement between predicted and annotated intensities.
print("pear", stats.pearsonr(result, y_te))
print("cosine", 1 - distance.cosine(result, y_te))

Epoch 1/2

KeyboardInterrupt: ignored

In [None]:
# Write the trained weights to BASE_FOLDER + OUTPUT_FOLDER + OUTPUT_FILE.
weights_path = f"{BASE_FOLDER}{OUTPUT_FOLDER}{OUTPUT_FILE}"
model.save_weights(weights_path)

### To save model
Run
```
# model.save_weights(BASE_FOLDER + OUTPUT_FOLDER + OUTPUT_FILE)
```

### To load model
Run all cells up to and including the `Model Design` section, and then run
```
model.load_weights(BASE_FOLDER+OUTPUT_FOLDER+OUTPUT_FILE)
```