In [50]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import random 


## Dataset

In [51]:
# NOTE(review): absolute, machine-specific path -- this cell only runs where the file exists.
SVO_P_FILE = '/home/karet/Documents/IRE/Project/hatetext-socialmedia/novelty/data/hate_int_prof_SVO.tsv'
SVO_P_data = pd.read_csv(SVO_P_FILE, sep='\t')

# Each Subject/Verb/Object cell is a string that eval() turns into a sequence,
# which is then wrapped in a NumPy array.
# NOTE(review): eval() executes whatever expression the TSV contains. If the cells are
# plain list literals, ast.literal_eval would be the safe replacement -- confirm the
# file format before switching.
label_columns = ['Subject', 'Verb', 'Object']
SVO_P_data[label_columns] = SVO_P_data[label_columns].map(lambda x: np.array(eval(x)))

# Stack the three label rows of a sentence into one 2-D array (Subject, Verb, Object order).
# np.vstack is used instead of np.row_stack: row_stack is only an alias of vstack and is
# deprecated as of NumPy 2.0, so the result is unchanged.
SVO_P_data['SVO'] = SVO_P_data.apply(
    lambda row: np.vstack((row['Subject'], row['Verb'], row['Object'])),
    axis=1,
)

sentences = SVO_P_data['Sentence'].to_numpy()
hate_intensities = SVO_P_data['Intensity'].to_numpy()
profanity = SVO_P_data['Profanity'].to_numpy()
# SVO labelled after using roberta base tokenizer
SVO = SVO_P_data['SVO'].to_numpy()

# Sanity check: one entry per sentence in every array; the SVO matrices vary in width.
print(
    'Sentences: ', sentences.shape,
    '\nhate_intensities: ', hate_intensities.shape,
    '\nprofanity: ', profanity.shape,
    '\nSVO: ', SVO.shape,
    '\nSVO[0]:', SVO[0].shape, ' SVO[1]:', SVO[1].shape,
)

Sentences:  (6054,) 
hate_intensities:  (6054,) 
profanity:  (6054,) 
SVO:  (6054,) 
SVO[0]: (3, 31)  SVO[1]: (3, 17)


In [52]:
# Fixed length that every SVO label row is padded or truncated to further down
# (the length includes the extra leading 0 that padd_array_with_zeros prepends).
MAX_LENGTH = 128

In [53]:
def padd_array_with_zeros(arr, desired_len):
    """Prepend a 0 to ``arr`` and force the result to ``desired_len`` entries.

    The leading 0 reserves a slot for the BERT [CLS] token. After it is added,
    the array is right-padded with zeros when it is too short, or cut off at
    ``desired_len`` entries when it is long enough already.

    Args:
        arr: 1-D sequence of values (e.g. one row of token labels).
        desired_len: Length of the returned array.

    Returns:
        A 1-D NumPy array: 0, then the values of ``arr``, then zero padding
        (or a truncation of that sequence).
    """
    # Slot 0 belongs to the [CLS] token
    shifted = np.insert(arr, 0, 0)

    # How many trailing zeros are still needed; <= 0 means there is nothing to pad
    deficit = desired_len - len(shifted)
    if deficit <= 0:
        return shifted[:desired_len]

    return np.pad(shifted, (0, deficit), mode='constant')

In [54]:
# Bundle the per-sample inputs so that one split keeps them aligned with the labels
inp = list(zip(sentences, SVO, profanity))

X_tr, X_te, y_tr, y_te = train_test_split(
    inp,
    hate_intensities,
    test_size=0.2,
    random_state=78,
)

# Unbundle the training samples: (sentence, SVO matrix, profanity)
train_sentences = np.array([triple[0] for triple in X_tr])
train_SVO = [triple[1] for triple in X_tr]
train_profanity = np.array([triple[2] for triple in X_tr])

# Same for the held-out samples
test_sentences = np.array([triple[0] for triple in X_te])
test_SVO = [triple[1] for triple in X_te]
test_profanity = np.array([triple[2] for triple in X_te])

## Bring every row of every SVO matrix to MAX_LENGTH so the samples stack into one array
train_SVO_padded = [
    [padd_array_with_zeros(label_row, MAX_LENGTH) for label_row in svo_matrix]
    for svo_matrix in train_SVO
]
train_SVO = np.array(train_SVO_padded)

test_SVO_padded = [
    [padd_array_with_zeros(label_row, MAX_LENGTH) for label_row in svo_matrix]
    for svo_matrix in test_SVO
]
test_SVO = np.array(test_SVO_padded)


In [55]:
# Report the train/test shapes of each input, one line per input
for label, train_part, test_part in (
    ('Sentences: ', train_sentences, test_sentences),
    ('SVO: ', train_SVO, test_SVO),
    ('Profanity: ', train_profanity, test_profanity),
):
    print(label, 'train', train_part.shape, 'test', test_part.shape)

Sentences:  train (4843,) test (1211,)
SVO:  train (4843, 3, 128) test (1211, 3, 128)
Profanity:  train (4843,) test (1211,)
