Adding all datasets together

In [1]:
import pandas as pd
import json
import csv

### Conan dataset:

Hate  speech  collection: For  each  language we asked two native speaker experts (NGO train-ers) to write around 50 prototypical islamophobic short hate texts. This step was used to ensure that:(i) the sample uniformly covers the typical ‘arguments’ against Islam as much as possible, (ii) wecan distribute to the NLP community the originalhate speech as well as its counter-narrative

In [2]:
with open(r'..\data\conan\CONAN.json') as json_file:
    data = json.load(json_file)
conan = (pd.DataFrame.from_dict(data=data['conan'][0], orient='index')).T
for i in range(len(data['conan'])):
    sett = (pd.DataFrame.from_dict(data=data['conan'][i], orient='index')).T
    if sett['cn_id'][0][0:2] == 'EN':
        conan = conan.append(sett, ignore_index=True)
conan_dt = pd.DataFrame(conan.hateSpeech.unique())
conan_dt['classification'] = 'islamophobic'
conan_dt.columns = ['text', 'classification']

### LoL dataset

In [3]:
lol = pd.read_csv (r'..\data\lol\lol_data_ok.csv')
lol_data = lol[lol['classification'] != 'neutral']
lol_data.classification = lol_data.classification.replace('harrasment', 'cyberbullying')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


### Reddit

In [4]:
reddit = pd.read_csv (r'..\data\reddit_binary-jing-qian\reddit.csv')
red_data = reddit[reddit['hate_speech_idx'].isnull() == False][['text']]
red_data['classification'] = 'hateful'

### Gab

In [5]:
gab = pd.read_csv(r'..\data\gab_binary-jing-qian\gab.csv')
gab_data = gab[~gab['hate_speech_idx'].isnull()][['text']]
gab_data['classification'] = 'hateful'

### Twitter hierarchy

In [6]:
twitter_hierarchy = pd.read_csv (r'..\data\twitter_hierarchy_t-davidson\labeled_data.csv')
twitter_h = twitter_hierarchy[twitter_hierarchy['offensive_language'] > 0][['tweet']]
twitter_h['classification'] = 'offensive' 
twitter_h.columns = ['text', 'classification']

### Abusive speech

In [7]:
hate_speech = pd.read_csv (r'..\data\multilingual_and_multi-aspect_hate_speech_analysis\hate_speech_mlma\en_dataset.csv')
abusive = hate_speech[hate_speech['sentiment'].str.contains("abusive")][['tweet']]
abusive['classification'] = 'abusive' 
abusive.columns = ['text', 'classification']

### Online hate speech

In [8]:
personal_attack = pd.read_csv(r'..\data\personal_attacks_seen_at_scale_wulczyn\train.csv')
insults = personal_attack[personal_attack.Insult == 1][['Comment']]
insults['classification'] = 'insult' 
insults.columns = ['text', 'classification']

### White suppremacy forum

In [9]:
annotations = pd.read_csv(r'..\data\stormfront_ternary_vicomtech\annotations_metadata.csv')
hates = list()
for file in annotations[annotations['label'] == 'hate'].file_id.unique():
    hates_dt = pd.read_csv('../data/stormfront_ternary_vicomtech/all_files/' + str(file) + '.txt', header = None)
    hates.append(hates_dt[0][0])
hates = pd.DataFrame(hates)
hates['classification'] = 'hateful' 
hates.columns = ['text', 'classification']

### Multimodal twitter

In [10]:
twitter2 = pd.read_excel('twitter_mutlimodal_hate_speech.xlsx', sheet_name=None)
twitter2 = twitter2['Sheet1'][['text', 'Homophobe', 'NotHate', 'OtherHate', 'Racist', 'Religion', 'Sexist']]
twitter_melted = pd.melt(twitter2, id_vars=['text'], value_vars=['Homophobe', 'NotHate', 'OtherHate', 'Racist', 'Religion', 'Sexist'])
twitter_melted = twitter_melted[twitter_melted['value'] == True][['text', 'variable']]
twitter_melted.columns = ['text', 'classification']

### Appending datasets

In [11]:
dataset = conan_dt
dataset = dataset.append(conan_dt, ignore_index=True)
dataset = dataset.append(lol_data, ignore_index=True)
dataset = dataset.append(red_data, ignore_index=True)
dataset = dataset.append(twitter_h, ignore_index=True)
dataset = dataset.append(abusive, ignore_index=True)
dataset = dataset.append(insults, ignore_index=True)
dataset = dataset.append(hates, ignore_index=True)
dataset = dataset.append(gab_data, ignore_index=True)
dataset = dataset.append(twitter_melted, ignore_index=True)

In [12]:
dataset.classification.unique()

array(['islamophobic', 'cyberbullying', 'hateful', 'offensive', 'abusive',
       'insult', 'Homophobe', 'NotHate', 'OtherHate', 'Racist',
       'Religion', 'Sexist'], dtype=object)

In [13]:
dataset.to_csv('..\data\main_dataset.csv')