# Notebook 1: Preprocessing of data

This notebook implements the data preprocessing described in Section 4.2 of the paper: the posts are cleaned and prepared for the subsequent feature extraction, and the annotators' labels and rationales are aggregated per post.


**Table of Contents**:
0. [Technical setup](#setup)
1. [Load documents and aggregate labels/rationales](#load)
2. [Preprocess posts](#preprocessing)
3. [Export preprocessed dataframe](#export)

# 1.0 Technical setup <a id="setup"></a>


In [None]:
# set up Google Colab workspace

from google.colab import drive
drive.mount('/content/drive')

!pip install emoji

Mounted at /content/drive


In [None]:
# loading modules

import pandas as pd
pd.set_option('display.max_colwidth', 300)
import re
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from collections import Counter
import nltk
from nltk.corpus import stopwords, words, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import WordNetLemmatizer, pos_tag, word_tokenize
from nltk.corpus.reader.wordnet import WordNetError
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from sklearn import preprocessing
import pickle
from copy import deepcopy
import json
import emoji
import itertools

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
# define functions for saving and loading pickled objects
def save_pickle(objectname, picklename):
    """Serialize ``objectname`` with pickle and write it to ``picklename``.

    Args:
        objectname: The Python object to pickle.
        picklename: Path of the output file. It is opened in binary write
            mode, so an existing file at that path is overwritten.

    Prints a confirmation message after the object has been written.
    """
    # The context manager closes the file even if pickle.dump raises.
    with open(picklename, "wb") as pickle_out:
        pickle.dump(objectname, pickle_out)
    print(picklename, 'successfully pickled.')

def load_pickle(picklename):
    """Load and return the object stored in the pickle file ``picklename``.

    Args:
        picklename: Path of a file containing a pickled object.

    Returns:
        The unpickled Python object.
    """
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    # The context manager makes sure the file handle is closed again.
    with open(picklename, "rb") as pickle_in:
        return pickle.load(pickle_in)

# 1.1 Load documents and aggregate labels/rationales <a id="load"></a>

In [None]:
# import dataset

# Pass the encoding explicitly so that reading the file does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes dataset.txt is UTF-8 encoded JSON -- confirm.
with open('/content/drive/MyDrive/Seminar/dataset.txt', encoding='utf-8') as json_file:
    hateXplain_raw = json.load(json_file)

In [None]:
# create binary classification problem by majority voting (0: normal, 1: hateful)
def gen_label(dict_lab):
    """Collapse the annotator labels of one post into a binary label.

    Every vote is binarized first ('normal' vs. any other label) and the
    majority is taken afterwards. Picking the most common *raw* label instead
    makes the result depend on the annotator order whenever all raw labels
    differ (e.g. 'normal', 'offensive', 'hatespeech'), although two of the
    three votes are non-normal in that case.

    Args:
        dict_lab: Sequence of annotation dicts, each holding a 'label' entry.

    Returns:
        1 if strictly more votes are non-normal than normal, otherwise 0.

    Raises:
        ValueError: If ``dict_lab`` contains no annotation.
    """
    n_votes = len(dict_lab)
    if n_votes == 0:
        raise ValueError('gen_label requires at least one annotation')

    n_normal = sum(1 for annotation in dict_lab if annotation['label'] == 'normal')
    n_hateful = n_votes - n_normal
    return 1 if n_hateful > n_normal else 0

# combine rationales by adding the ones of all annotators
def gen_rational(list_rat):
    """Merge the rationale masks of all annotators into one mask.

    The masks are summed position by position and every sum above 1 is capped
    at 1, so for 0/1 masks a position is 1 as soon as one annotator marked it.

    Args:
        list_rat: List of per-annotator masks (lists of numbers).

    Returns:
        The combined mask; an empty list if ``list_rat`` is empty. The result
        is as long as the shortest mask because zip stops there.
    """
    # zip(*...) walks the masks column by column (one column per position);
    # an empty ``list_rat`` yields no columns and hence an empty result.
    return [min(sum(position_votes), 1) for position_votes in zip(*list_rat)]

In [None]:
# create dataframe of data

# One row per post; the redundant 'post_id' column is dropped.
hateXplain_df = pd.DataFrame.from_records(hateXplain_raw).T.drop(columns=['post_id'])

# Apply the aggregation helpers element-wise. Series.map avoids integer lookups
# such as hateXplain_df["annotators"][i] on a non-integer index, which rely on
# a positional fallback that pandas has deprecated.
hateXplain_df['label'] = hateXplain_df['annotators'].map(gen_label)
hateXplain_df['rationales_comb'] = hateXplain_df['rationales'].map(gen_rational)

hateXplain_df.head()

Unnamed: 0,annotators,rationales,post_tokens,label,rationales_comb
10001291_gab,"[{'label': 'hatespeech', 'annotator_id': 199, 'target': ['Jewish']}, {'label': 'hatespeech', 'annotator_id': 200, 'target': ['Jewish']}, {'label': 'hatespeech', 'annotator_id': 201, 'target': ['Jewish']}]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]","[jews, are, responsible, for, destroying, my, towns, culture, and, turning, leeds, into, a, ghetto, hell, of, dollarstan, full, of, dollarstanis, please, killalljewz]",1,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
10002346_gab,"[{'label': 'normal', 'annotator_id': 202, 'target': ['Men', 'Other']}, {'label': 'offensive', 'annotator_id': 203, 'target': ['Other']}, {'label': 'normal', 'annotator_id': 204, 'target': ['Women']}]",[],"[leftist, mind, control, fails, i, think, this, overstates, it, men, prefer, women, who, are, real, people, and, prefer, tall, thin, young, bodies, if, asked, in, abstraction, the, two, are, not, the, same, question]",0,[]
10007406_gab,"[{'label': 'offensive', 'annotator_id': 205, 'target': ['Homosexual']}, {'label': 'hatespeech', 'annotator_id': 206, 'target': ['Homosexual']}, {'label': 'normal', 'annotator_id': 207, 'target': ['Homosexual']}]",[],"[the, scientific, findings, that, advocates, and, the, pro, gay, media, cite, tend, to, come, from, decidedly, unrepresentative, samples, that, have, been, collected, from, places, like, bulletin, boards, at, gay, parenting, support, groups, where, everyone, has, an, interest, in, proving, that,...",1,[]
10008196_gab,"[{'label': 'hatespeech', 'annotator_id': 200, 'target': ['African']}, {'label': 'hatespeech', 'annotator_id': 208, 'target': ['African']}, {'label': 'normal', 'annotator_id': 209, 'target': ['None']}]","[[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[it, why, they, behave, like, animals, no, matter, where, you, put, them, they, will, destroy, anything, of, value, black, people, want, to, claim, we, are, the, racists, but, then, why, do, they, move, out, of, the, ghetto, once, they, have, success, even, they, do, not, want, to, live, around,...",1,"[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
10009556_gab,"[{'label': 'offensive', 'annotator_id': 27, 'target': ['Asian']}, {'label': 'offensive', 'annotator_id': 1, 'target': ['Arab']}, {'label': 'offensive', 'annotator_id': 52, 'target': ['Arab']}]","[[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]","[why, are, people, so, offended, by, the, word, paki, surely, the, word, fucking, that, comes, before, is, more, offensive]",1,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


# 1.2 Preprocess posts <a id="preprocessing"></a>

In [None]:
# Preprocessing: clean the tokens of a post and return them as a list of strings

lem = WordNetLemmatizer()  # WordNet lemmatizer instance used by lemmatize()
eng_vocab = set(words.words())  # English vocabulary from the NLTK 'words' corpus
# NOTE(review): eng_vocab is not referenced in the cells shown here -- confirm it is still needed
special_characters = '!@#$%^&*()-+?_=,<>/'  # tokens containing any of these characters are discarded in text_process()

def lemmatize(word):
    """Lemmatize a single word based on its part-of-speech tag.

    The word is tagged on its own with nltk.pos_tag and the lower-cased first
    character of the tag selects the lemmatization strategy:

    * 'r' (adverb): return the name of the first pertainym of the first lemma
      of the synset '<word>.r.1'; if that lookup fails, return the word as is.
    * 'a', 's', 'v': lemmatize with the WordNet lemmatizer using that tag.
    * anything else: lemmatize with the lemmatizer's default part of speech.

    Args:
        word: The token to lemmatize.

    Returns:
        The lemmatized token.
    """
    penn_tag = nltk.pos_tag([word])[0][1]
    pos_label = penn_tag[0].lower()

    # Adjective tags start with 'j', whereas WordNet labels adjectives 'a'.
    if pos_label == 'j':
        pos_label = 'a'

    if pos_label == 'r':
        try:
            first_lemma = wordnet.synset(word + '.r.1').lemmas()[0]
            return first_lemma.pertainyms()[0].name()
        except (IndexError, WordNetError):
            # no such synset, or no lemma/pertainym available
            return word

    if pos_label in ('a', 's', 'v'):
        return lem.lemmatize(word, pos=pos_label)

    # nouns and everything else
    return lem.lemmatize(word)
    
def _keep_token(word, stop_words):
    """Return True if ``word`` passes all cleaning filters of text_process."""
    return (
        word.lower() not in stop_words  # drop stopwords, whatever their casing
        and not word.isdigit()  # drop pure digit tokens
        and not any(c in special_characters for c in word)  # drop tokens with listed special characters
        and re.search(r'[^\w\s,.]', word) is None  # drop emojis and other special sequences
        and len(word) > 1  # drop empty strings and single-character tokens
        and word.isascii()  # drop tokens with non-ASCII characters
    )


def text_process(text):
    """Clean and lemmatize the tokens of one post.

    A token is discarded if it is an English stopword, consists of digits
    only, contains a character from ``special_characters`` or any other
    character outside of word characters, whitespace, ',' and '.', has fewer
    than two characters, or contains non-ASCII characters. The remaining
    tokens are lower-cased and lemmatized.

    The stopword test is done on the lower-cased token so that capitalised
    stopwords are removed as well.

    Args:
        text: List of tokens (strings) of one post.

    Returns:
        A tuple ``(text_lemmatized, ind_rem_words)`` with the lemmatized kept
        tokens and the positions in ``text`` of all discarded tokens, in
        ascending order.
    """
    # Build the stopword set once per call instead of once per token.
    stop_words = set(stopwords.words('english'))

    kept_words = []
    ind_rem_words = []
    # Whether a token is kept depends only on the token itself, so one pass
    # yields both the kept tokens and the positions of the removed ones.
    for position, word in enumerate(text):
        if _keep_token(word, stop_words):
            kept_words.append(word.lower())
        else:
            ind_rem_words.append(position)

    text_lemmatized = [lemmatize(word) for word in kept_words]  # lemmatize words
    return text_lemmatized, ind_rem_words

# adapt rationales by removing indices of the words which were removed
# during the preprocessing
def new_rationale(old_ind, ind_remove):
    """Drop the rationale entries that belong to removed tokens.

    Args:
        old_ind: Rationale mask of a post, one entry per original token.
        ind_remove: Positions of the tokens removed during preprocessing.

    Returns:
        The entries of ``old_ind`` whose position is not in ``ind_remove``;
        an empty list if ``old_ind`` has fewer than two entries.
    """
    # NOTE(review): masks of length 1 are also mapped to [] here, which drops
    # the rationale of a one-token post -- confirm that this is intended.
    if len(old_ind) < 2:
        return []

    # Set membership keeps the filter linear in the mask length.
    removed_positions = set(ind_remove)
    return [value for position, value in enumerate(old_ind)
            if position not in removed_positions]

In [None]:
# Preprocessing of texts

processed_text = []
rationales_adapted = []

# Iterate over the column values directly instead of looking rows up with
# hateXplain_df[col][ind]: integer lookups on a non-integer index rely on a
# positional fallback that pandas has deprecated.
# The combined rationales are derived from the raw 'rationales' column, so
# re-running this cell does not filter already adapted rationales again.
posts_and_rationales = zip(hateXplain_df['post_tokens'], hateXplain_df['rationales'])
for post_tokens, annotator_rationales in tqdm(posts_and_rationales,
                                              desc="Text Processing",
                                              total=hateXplain_df.shape[0]):
    processed_sentence, ind_remove = text_process(post_tokens)
    processed_text.append(processed_sentence)
    rationales_adapted.append(new_rationale(gen_rational(annotator_rationales), ind_remove))

hateXplain_df['tokens_processed'] = processed_text
hateXplain_df['rationales_comb'] = rationales_adapted

HBox(children=(FloatProgress(value=0.0, description='Text Processing', max=20148.0, style=ProgressStyle(descri…




In [None]:
# replace the index by a plain 0..n-1 range
hateXplain_df = hateXplain_df.reset_index(drop=True)

# keep only the posts that still have at least one token after preprocessing
has_tokens = hateXplain_df['tokens_processed'].map(len) > 0
hateXplain_df = hateXplain_df[has_tokens].reset_index(drop=True)

hateXplain_df.head()

Unnamed: 0,annotators,rationales,post_tokens,label,rationales_comb,tokens_processed
0,"[{'label': 'hatespeech', 'annotator_id': 199, 'target': ['Jewish']}, {'label': 'hatespeech', 'annotator_id': 200, 'target': ['Jewish']}, {'label': 'hatespeech', 'annotator_id': 201, 'target': ['Jewish']}]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]","[jews, are, responsible, for, destroying, my, towns, culture, and, turning, leeds, into, a, ghetto, hell, of, dollarstan, full, of, dollarstanis, please, killalljewz]",1,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[jew, responsible, destroy, town, culture, turn, leeds, ghetto, hell, dollarstan, full, dollarstanis, please, killalljewz]"
1,"[{'label': 'normal', 'annotator_id': 202, 'target': ['Men', 'Other']}, {'label': 'offensive', 'annotator_id': 203, 'target': ['Other']}, {'label': 'normal', 'annotator_id': 204, 'target': ['Women']}]",[],"[leftist, mind, control, fails, i, think, this, overstates, it, men, prefer, women, who, are, real, people, and, prefer, tall, thin, young, bodies, if, asked, in, abstraction, the, two, are, not, the, same, question]",0,[],"[leftist, mind, control, fails, think, overstates, men, prefer, woman, real, people, prefer, tall, thin, young, body, ask, abstraction, two, question]"
2,"[{'label': 'offensive', 'annotator_id': 205, 'target': ['Homosexual']}, {'label': 'hatespeech', 'annotator_id': 206, 'target': ['Homosexual']}, {'label': 'normal', 'annotator_id': 207, 'target': ['Homosexual']}]",[],"[the, scientific, findings, that, advocates, and, the, pro, gay, media, cite, tend, to, come, from, decidedly, unrepresentative, samples, that, have, been, collected, from, places, like, bulletin, boards, at, gay, parenting, support, groups, where, everyone, has, an, interest, in, proving, that,...",1,[],"[scientific, finding, advocate, pro, gay, medium, cite, tend, come, decided, unrepresentative, sample, collect, place, like, bulletin, board, gay, parent, support, group, everyone, interest, prove, gay, parent, success, austin, ruse]"
3,"[{'label': 'hatespeech', 'annotator_id': 200, 'target': ['African']}, {'label': 'hatespeech', 'annotator_id': 208, 'target': ['African']}, {'label': 'normal', 'annotator_id': 209, 'target': ['None']}]","[[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[it, why, they, behave, like, animals, no, matter, where, you, put, them, they, will, destroy, anything, of, value, black, people, want, to, claim, we, are, the, racists, but, then, why, do, they, move, out, of, the, ghetto, once, they, have, success, even, they, do, not, want, to, live, around,...",1,"[1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[behave, like, animal, matter, put, destroy, anything, value, black, people, want, claim, racist, move, ghetto, success, even, want, live, around, kind]"
4,"[{'label': 'offensive', 'annotator_id': 27, 'target': ['Asian']}, {'label': 'offensive', 'annotator_id': 1, 'target': ['Arab']}, {'label': 'offensive', 'annotator_id': 52, 'target': ['Arab']}]","[[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]","[why, are, people, so, offended, by, the, word, paki, surely, the, word, fucking, that, comes, before, is, more, offensive]",1,"[0, 0, 0, 1, 0, 0, 0, 0, 0]","[people, offend, word, paki, sure, word, fuck, come, offensive]"


# 1.3 Export preprocessed dataframe <a id="export"></a>

In [None]:
# export dataset as pickle
# Writes the preprocessed dataframe to Google Drive with the save_pickle helper.
# NOTE(review): presumably the follow-up notebooks load this file -- confirm the path.
save_pickle(hateXplain_df, '/content/drive/MyDrive/Seminar/hateXplain_processed.pickle')

/content/drive/MyDrive/Seminar/hateXplain_processed.pickle successfully pickled.
