Adding all datasets together

In [1]:
import re
from pathlib import Path

import pandas as pd
import json
import csv

### Conan dataset:

Hate  speech  collection: For  each  language we asked two native speaker experts (NGO train-ers) to write around 50 prototypical islamophobic short hate texts. This step was used to ensure that:(i) the sample uniformly covers the typical ‘arguments’ against Islam as much as possible, (ii) wecan distribute to the NLP community the originalhate speech as well as its counter-narrative

In [2]:
with open(r'../data/conan/CONAN.json') as json_file:
    data = json.load(json_file)
conan = (pd.DataFrame.from_dict(data=data['conan'][0], orient='index')).T
for i in range(len(data['conan'])):
    sett = (pd.DataFrame.from_dict(data=data['conan'][i], orient='index')).T
    if sett['cn_id'][0][0:2] == 'EN':
        conan = conan.append(sett, ignore_index=True)
conan_dt = pd.DataFrame(conan.hateSpeech.unique())
conan_dt['classification'] = 'islamophobic'
conan_dt.columns = ['text', 'classification']

### LoL dataset

In [3]:
lol = pd.read_csv (r'../data/lol/lol_data_ok.csv')
lol_data = lol[lol['classification'] != 'neutral']
lol_data.classification = lol_data.classification.replace('harrasment', 'cyberbullying')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


### Reddit

In [4]:
reddit = pd.read_csv (r'../data/reddit_binary-jing-qian/reddit.csv')
red_data = reddit[reddit['hate_speech_idx'].isnull() == False][['text']]
red_data['classification'] = 'hateful'

### Gab

In [5]:
gab = pd.read_csv(r'../data/gab_binary-jing-qian/gab.csv')
gab_data = gab[~gab['hate_speech_idx'].isnull()][['text']]
gab_data['classification'] = 'hateful'

### Twitter hierarchy

In [6]:
twitter_hierarchy = pd.read_csv (r'../data/twitter_hierarchy_t-davidson/labeled_data.csv')
twitter_h = twitter_hierarchy[twitter_hierarchy['offensive_language'] > 0][['tweet']]
twitter_h['classification'] = 'offensive' 
twitter_h.columns = ['text', 'classification']

### Abusive speech

In [7]:
hate_speech = pd.read_csv (r'../data/multilingual_and_multi-aspect_hate_speech_analysis/hate_speech_mlma/en_dataset.csv')
abusive = hate_speech[hate_speech['sentiment'].str.contains("abusive")][['tweet']]
abusive['classification'] = 'abusive' 
abusive.columns = ['text', 'classification']

### Online hate speech

In [8]:
personal_attack = pd.read_csv(r'../data/personal_attacks_seen_at_scale_wulczyn/train.csv')
insults = personal_attack[personal_attack.Insult == 1][['Comment']]
insults['classification'] = 'insult' 
insults.columns = ['text', 'classification']

### White suppremacy forum

In [9]:
annotations = pd.read_csv(r'../data/stormfront_ternary_vicomtech/annotations_metadata.csv')
hates = list()
for file in annotations[annotations['label'] == 'hate'].file_id.unique():
    hates_dt = pd.read_csv('../data/stormfront_ternary_vicomtech/all_files/' + str(file) + '.txt', header = None)
    hates.append(hates_dt[0][0])
hates = pd.DataFrame(hates)
hates['classification'] = 'hateful' 
hates.columns = ['text', 'classification']

### Multimodal twitter

In [10]:
data_dir = Path("../data/twitter_mutlimodal_hate_speech")

with open(data_dir / "MMHS150K_GT.json", "r") as json_file:
    data = json.load(json_file)

twits = []
for key, value in data.items():
    value["tweet_id"] = key
    try:
        with open(data_dir / "img_txt" / f"{key}.json") as img_txt:
            img_txt = json.load(img_txt)
        value["img_text"] = img_txt["img_text"]
    except FileNotFoundError:
        value["img_text"] = None
    twits.append(value)
df = pd.DataFrame(twits)
df.to_feather(data_dir / "data.feather")

In [11]:
twitter2 = pd.read_feather("../data/twitter_mutlimodal_hate_speech/data.feather")
labels = twitter2["labels_str"].apply(pd.Series)

oh_labels = pd.concat([
    pd.get_dummies(labels[li])
    for li in labels.columns
]).groupby(level=0).max().astype(bool)

twitter2[oh_labels.columns] = oh_labels

twitter2["classification"] = labels.mode(axis=1)[0]

twitter2["text"] = (twitter2["tweet_text"] + twitter2["img_text"].astype(str))
# twitter2 = twitter2[["tweet_text", "text", "classification"]]
twitter2 = twitter2[["text", "classification"]]
twitter2

Unnamed: 0,text,classification
0,@FriskDontMiss Nigga https://t.co/cAsaLWEpue#Y...,Homophobe
1,My horses are retarded https://t.co/HYhqc6d5WN...,OtherHate
2,“NIGGA ON MA MOMMA YOUNGBOY BE SPITTING REAL S...,NotHate
3,RT xxSuGVNGxx: I ran into this HOLY NIGGA TODA...,NotHate
4,“EVERYbody calling you Nigger now!” https://t....,Racist
...,...,...
149818,@svdate @gtconway3d I would just say hes Donny...,NotHate
149819,@Cheftime_Dev congrats my nigga keep on grindi...,NotHate
149820,My nigga big shitty https://t.co/e0snJGBgH9None,NotHate
149821,did she just say “my nigga” to Rich? &amp; she...,NotHate


### Appending datasets

In [12]:
dataset = conan_dt
dataset = dataset.append(conan_dt, ignore_index=True)
dataset = dataset.append(lol_data, ignore_index=True)
dataset = dataset.append(red_data, ignore_index=True)
dataset = dataset.append(twitter_h, ignore_index=True)
dataset = dataset.append(abusive, ignore_index=True)
dataset = dataset.append(insults, ignore_index=True)
dataset = dataset.append(hates, ignore_index=True)
dataset = dataset.append(gab_data, ignore_index=True)
dataset = dataset.append(twitter2, ignore_index=True)

### Clean the dataset

In [13]:
def clean_tweets(vTEXT):
    """
    Removes URLs, tags and more from tweets. Extend it if needed.
    
    short words: \b\w{1,2}\b
    tags starting with @: @\w*
    URL: (https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b
    """
    vTEXT = re.sub(r'\b\w{1,2}\b|@\w*|(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', vTEXT, flags=re.MULTILINE)
    return(vTEXT)


In [14]:
dataset["text_raw"] = dataset["text"]
dataset["text"] = dataset["text"].apply(clean_tweets)
dataset

Unnamed: 0,text,classification,text_raw
0,"According recent OFSTED report, school Bir...",islamophobic,"According to a recent OFSTED report, a school ..."
1,Birmingham there school where girls and boy...,islamophobic,In Birmingham there is a school where girls an...
2,school Birningham still separating girls fr...,islamophobic,A school in Birningham is still separating gir...
3,The police and politicians are covering these...,islamophobic,The police and politicians are covering up the...
4,Muslims grooming gangs are protected the gove...,islamophobic,Muslims grooming gangs are protected by the go...
...,...,...,...
190117,would just say hes Donny the retard MOGIF ...,NotHate,@svdate @gtconway3d I would just say hes Donny...
190118,congrats nigga keep grinding AIR Elapsed T...,NotHate,@Cheftime_Dev congrats my nigga keep on grindi...
190119,nigga big shitty,NotHate,My nigga big shitty https://t.co/e0snJGBgH9None
190120,did she just say “ nigga” Rich? &amp; she sai...,NotHate,did she just say “my nigga” to Rich? &amp; she...


In [15]:
dataset.classification.unique()

array(['islamophobic', 'cyberbullying', 'hateful', 'offensive', 'abusive',
       'insult', 'Homophobe', 'OtherHate', 'NotHate', 'Racist', 'Sexist',
       'Religion'], dtype=object)

In [17]:
dataset.to_csv('../data/main_dataset.csv', index=False)