# Input Statistical Analysis for all datasets

In [76]:
import numpy as np
import pandas as pd
import spacy
import emoji
from collections import Counter

In [77]:
DATA_DIR = '../data'

### Utility Functions

In [78]:
def remove_emoji(text):
    return emoji.get_emoji_regexp().sub(u'', text)

def slice_dataframe_and_compute_word_frequency(df, slice_cols, slice_vals, text_col, spacy_lang_pkg):
    sliced_df = df.copy()
    for i in range(len(slice_cols)):
        sliced_df = sliced_df[sliced_df[slice_cols[i]] == slice_vals[i]]
    print(f'Found a total of {len(sliced_df)} examples')
    nlp = spacy.load(spacy_lang_pkg)
    text = ' '.join(sliced_df[text_col])
    text = emoji.get_emoji_regexp().sub(r'', text)
    doc = nlp(text)
    words = [token.text for token in doc if not token.is_stop and not token.is_punct and len(token) > 1]
    freqs = Counter(words)
    return freqs

### Spanish (Basile et al.)

In [79]:
df = pd.read_csv(f'{DATA_DIR}/spanish-basile/hateval2019_es_train.csv')

In [80]:
df.head()

Unnamed: 0,id,text,HS,TR,AG
0,20001,Easyjet quiere duplicar el número de mujeres p...,1,0,0
1,20002,El gobierno debe crear un control estricto de ...,1,0,0
2,20003,Yo veo a mujeres destruidas por acoso laboral ...,0,0,0
3,20004,"— Yo soy respetuoso con los demás, sólamente l...",0,0,0
4,20007,Antonio Caballero y como ser de mal gusto e ig...,0,0,0


#### 1.1 Word Frequecy for Examples that are Hate Speech

In [81]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['HS'], [1], 'text', 'es_core_news_sm')
freqs.most_common()[:10]

Found a total of 1857 examples


[('puta', 418),
 ('perra', 345),
 ('zorra', 216),
 ('Cállate', 122),
 ('callate', 110),
 ('inmigrantes', 102),
 ('mujer', 96),
 ('Callate', 91),
 ('mereces', 82),
 ('cállate', 78)]

#### 1.2 Word Frequecy for Examples that are Hate Speech and Aggressive

In [82]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['HS', 'AG'], [1, 1], 'text', 'es_core_news_sm')
freqs.most_common()[:10]

Found a total of 1502 examples


[('puta', 398),
 ('perra', 322),
 ('zorra', 207),
 ('Cállate', 122),
 ('callate', 109),
 ('Callate', 90),
 ('mereces', 80),
 ('cállate', 78),
 ('mierda', 74),
 ('PUTA', 67)]

#### 1.3 Word Frequecy for Examples that are Hate Speech but not Aggressive

In [83]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['HS', 'AG'], [1, 0], 'text', 'es_core_news_sm')
freqs.most_common()[:10]

Found a total of 355 examples


[('mujer', 54),
 ('inmigrantes', 36),
 ('mujeres', 32),
 ('subsaharianos', 23),
 ('perra', 23),
 ('puta', 20),
 ('España', 17),
 ('país', 17),
 ('papeles', 16),
 ('inmigrante', 15)]

#### 2.  Word Frequecy for Examples that are not Hate Speech

In [84]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['HS'], [0], 'text', 'es_core_news_sm')
freqs.most_common()[:10]

Found a total of 2643 examples


[('puta', 784),
 ('hijo', 237),
 ('acoso', 198),
 ('madre', 149),
 ('perra', 142),
 ('polla', 141),
 ('PUTA', 139),
 ('mierda', 117),
 ('mereces', 112),
 ('violación', 110)]

### Spanish (Pereira et al.)

In [85]:
# Necessary wrangling to fit it into a DataFrame
data = []
with open(f'{DATA_DIR}/spanish-pereira/labeled_corpus_6K.txt', 'r',encoding='utf-8') as f:
    for line in f.readlines():
        curr = line.split(";||;")
        data.append(curr[:-1] + [int(curr[-1][0])])

In [86]:
df = pd.DataFrame(data=np.array(data), columns=['id', 'text', 'HS'])

In [87]:
df['HS'] = df.HS.astype(int)

#### 1. Word Frequecy for Examples that are Hate Speech

In [88]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['HS'], [1], 'text', 'es_core_news_sm')
freqs.most_common()[:10]

Found a total of 1567 examples


[('subnormal', 350),
 ('mierda', 144),
 ('puto', 97),
 ('puta', 91),
 ('fascista', 85),
 ('fachas', 82),
 ('facha', 66),
 ('fascistas', 61),
 ('nazi', 44),
 ('retrasado', 42)]

#### 2. Word Frequecy for Examples that are not Hate Speech

In [89]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['HS'], [0], 'text', 'es_core_news_sm')
freqs.most_common()[:10]

Found a total of 4433 examples


[('independentista', 501),
 ('subnormal', 306),
 ('fascista', 200),
 ('facha', 196),
 ('fascistas', 157),
 ('negro', 145),
 ('fachas', 135),
 ('mayoría', 124),
 ('género', 121),
 ('gente', 118)]

### French (Ousidhoum et al.)

In [90]:
df = pd.read_csv(f'{DATA_DIR}/french-ousidhoum/fr_dataset.csv')

In [91]:
df.head()

Unnamed: 0,HITId,tweet,sentiment,directness,annotator_sentiment,target,group
0,1,rt @user personnage ainsi q tte la clique gauc...,abusive,direct,shock,origin,other
1,2,@user @user @user bah oui t'as raison l'autre ...,offensive,indirect,shock,other,individual
2,3,moi j'ai personne à qui m'adresser pour réclam...,hateful,direct,anger_confusion_sadness_fear_disgust,other,women
3,4,@user @user c...est vrai que les anchois sont ...,normal,indirect,indifference,other,other
4,5,eh les renois faut se réveiller la @url,normal,direct,sadness,origin,african_descent


#### 1. Word Frequency for Examples that are not Hate Speech

In [92]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['normal'], 'tweet', 'fr_core_news_sm')
freqs.most_common()[:10]

Found a total of 821 examples


[('@user', 970),
 ('@url', 475),
 ('gauchiste', 93),
 ('renois', 79),
 ('attarde', 66),
 ('migrants', 62),
 ('violence', 60),
 ('contre', 53),
 ('arabes', 43),
 ('attardé', 42)]

#### 2. Word Frequency for Examples that are Hate Speech

In [93]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['abusive'], 'tweet', 'fr_core_news_sm')
freqs += slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['offensive'], 'tweet', 'fr_core_news_sm')
freqs += slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['hateful'], 'tweet', 'fr_core_news_sm')
freqs += slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['disrespectful'], 'tweet', 'fr_core_news_sm')
freqs += slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['fearful'], 'tweet', 'fr_core_news_sm')
freqs.most_common()[:10]

Found a total of 594 examples
Found a total of 1336 examples
Found a total of 207 examples
Found a total of 142 examples
Found a total of 236 examples


[('@user', 2335),
 ('@url', 1104),
 ('mongol', 517),
 ('attardé', 379),
 ('gauchiste', 312),
 ('renois', 253),
 ('sale', 228),
 ('rebeus', 188),
 ('arabe', 133),
 ('migrants', 105)]

### Turkish

In [94]:
df = pd.read_csv(f'{DATA_DIR}/turkish/offenseval-tr-training-v1/offenseval-tr-training-v1.tsv', sep='\t')

In [95]:
df.head()

Unnamed: 0,id,tweet,subtask_a
0,20948,@USER en güzel uyuyan insan ödülü jeon jungkoo...,NOT
1,10134,"@USER Mekanı cennet olsun, saygılar sayın avuk...",NOT
2,23457,Kızlar aranızda kas yığını beylere düşenler ol...,NOT
3,18401,Biraz ders çalışayım. Tembellik ve uyku düşman...,NOT
4,17525,@USER Trezeguet yerine El Sharawy daha iyi olm...,NOT


#### 1. Word Frequency for tweets that are Hate Speech

In [96]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['subtask_a'], ['OFF'], 'tweet', 'en_core_web_sm')
freqs.most_common()[:10]

Found a total of 6046 examples


[('@USER', 5367),
 ('  ', 1380),
 ('bir', 1267),
 ('bu', 1167),
 ('ve', 847),
 ('ne', 718),
 ('de', 688),
 ('da', 660),
 ('için', 546),
 ('gibi', 528)]

#### 2. Word Frequency for tweets that are not Hate Speech

In [97]:
# This is too long for Spacy. DON'T RUN
# freqs = slice_dataframe_and_compute_word_frequency(df, ['subtask_a'], ['NOT'], 'tweet', 'en_core_web_sm')

### Danish

In [98]:
df = pd.read_csv(f'{DATA_DIR}/danish/data/offenseval-da-training-v1.tsv', sep='\t')

In [99]:
df.head()

Unnamed: 0,id,tweet,subtask_a
0,3131,"Jeg tror det vil være dejlig køligt, men jeg v...",NOT
1,711,Så kommer de nok til at investere i en ny cyke...,NOT
2,2500,Nu er det jo også de Ikea-aber der har lavet s...,OFF
3,2678,"128 Varme emails, er vi enige om at det er sex...",NOT
4,784,"Desværre tyder det på, at amerikanerne er helt...",NOT


#### 1. Word Frequency for tweets that are Hate Speech

In [100]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['subtask_a'], ['OFF'], 'tweet', 'da_core_news_sm')
freqs.most_common()[:10]

Found a total of 384 examples


[('lort', 45),
 ('@USER', 31),
 ('bare', 29),
 ('  ', 24),
 ('godt', 22),
 ('fandme', 22),
 ('når', 19),
 ('folk', 19),
 ('Fuck', 18),
 ('lortet', 15)]

#### 2. Word Frequency for tweets that are not Hate Speech

In [101]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['subtask_a'], ['NOT'], 'tweet', 'da_core_news_sm')
freqs.most_common()[:10]

Found a total of 2576 examples


[('bare', 147),
 ('@USER', 126),
 ('godt', 123),
 ('når', 121),
 ('  ', 117),
 ('Danmark', 100),
 ('se', 82),
 ('helt', 81),
 ('URL', 77),
 ('år', 74)]

### Hindi

In [102]:
df = pd.read_csv(f'{DATA_DIR}/hindi/agr_hi_train.csv')

In [103]:
df.columns = ['id', 'text', 'agr']

In [104]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['agr'], ['OAG'], 'tweet', 'da_core_news_sm')
freqs.most_common()[:10]

Found a total of 4855 examples


KeyError: 'tweet'

### English (basile et al)

In [105]:
df = pd.read_csv(f'{DATA_DIR}/english-basile/hateval2019_en_train.csv')

In [106]:
df.head()

Unnamed: 0,id,text,HS,TR,AG
0,201,"Hurray, saving us $$$ in so many ways @potus @...",1,0,0
1,202,Why would young fighting age men be the vast m...,1,0,0
2,203,@KamalaHarris Illegals Dump their Kids at the ...,1,0,0
3,204,NY Times: 'Nearly All White' States Pose 'an A...,0,0,0
4,205,Orban in Brussels: European leaders are ignori...,0,0,0


#### 1.1 Word Frequency for Examples that are Hate Speech

In [107]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['HS'], [1], 'text', 'en_core_web_sm')
freqs.most_common()[:10]

Found a total of 3783 examples


[('bitch', 693),
 ('women', 336),
 ('like', 301),
 ('BuildThatWall', 291),
 ('refugees', 246),
 ('illegal', 227),
 ('whore', 219),
 ('cunt', 207),
 ('want', 203),
 ('migrants', 198)]

#### 1.2 Word Frequency for Examples that are Hate Speech and Aggressive

In [108]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['HS', 'AG'], [1, 1], 'text', 'en_core_web_sm')
freqs.most_common()[:10]

Found a total of 1559 examples


[('bitch', 220),
 ('BuildThatWall', 197),
 ('refugees', 121),
 ('MAGA', 114),
 ('illegal', 113),
 ('like', 107),
 ('Trump', 102),
 ('Illegal', 100),
 ('@realDonaldTrump', 97),
 ('immigration', 92)]

#### 1.3 Word Frequency for Examples that are Hate Speech but not Aggressive

In [109]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['HS', 'AG'], [1, 0], 'text', 'en_core_web_sm')
freqs.most_common()[:10]

Found a total of 2224 examples


[('bitch', 473),
 ('women', 258),
 ('like', 194),
 ('whore', 152),
 ('woman', 147),
 ('hoe', 144),
 ('cunt', 140),
 ('refugees', 125),
 ('ass', 118),
 ('want', 117)]

#### Word Frequency for Examples that are not Hate Speech

In [110]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['HS'], [0], 'text', 'en_core_web_sm')
freqs.most_common()[:10]

Found a total of 5217 examples


[('refugees', 557),
 ('immigrant', 509),
 ('women', 405),
 ('men', 377),
 ('like', 332),
 ('migrants', 292),
 ('people', 259),
 ('rape', 234),
 ('refugee', 233),
 ('woman', 231)]

### English (ousidhoum et al)

In [111]:
df = pd.read_csv(f'{DATA_DIR}/english-ousidhoum/en_dataset.csv')

In [112]:
df.head()

Unnamed: 0,HITId,tweet,sentiment,directness,annotator_sentiment,target,group
0,1,call sis im boy girl still faggot shut,offensive,indirect,confusion_disgust,origin,gay
1,2,@user @user legit nilas retarded idk,offensive_disrespectful,indirect,anger_confusion,disability,special_needs
2,3,said retard @url,fearful_abusive_offensive_hateful,indirect,confusion_disgust,disability,special_needs
3,4,america another 8 years obama's ideology via h...,fearful_abusive_hateful_disrespectful_normal,indirect,anger_fear_shock_sadness_disgust,origin,other
4,5,@user don... fucking made cry twat.,offensive,indirect,shock_disgust,gender,women


#### 1. Word Frequency for Examples that are not Hate Speech

In [113]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['normal'], 'tweet', 'en_core_web_sm')
freqs.most_common()[:10]

Found a total of 661 examples


[('@user', 396),
 ('@url', 304),
 ('retarded', 62),
 ('shithole', 61),
 ('like', 54),
 ('faggot', 51),
 ('spic', 49),
 ('retard', 45),
 ('twat', 42),
 ('cunt', 39)]

#### 2. Word Frequency for Examples that are Hate Speech

In [114]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['offensive'], 'tweet', 'en_core_web_sm')
freqs += slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['abusive'], 'tweet', 'en_core_web_sm')
freqs += slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['hateful'], 'tweet', 'en_core_web_sm')
freqs += slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['disrespectful'], 'tweet', 'en_core_web_sm')
freqs += slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['fearful'], 'tweet', 'en_core_web_sm')
freqs.most_common()[:10]

Found a total of 2954 examples
Found a total of 1 examples
Found a total of 315 examples
Found a total of 10 examples
Found a total of 28 examples


[('@user', 2787),
 ('@url', 1340),
 ('retard', 406),
 ('retarded', 372),
 ('faggot', 324),
 ('cunt', 287),
 ('shithole', 283),
 ('twat', 280),
 ('like', 220),
 ('fucking', 207)]

### Arabic

In [115]:
df = pd.read_csv(f'{DATA_DIR}/arabic-ousidhoum/ar_dataset.csv')

In [116]:
df.head()

Unnamed: 0,HITId,tweet,sentiment,directness,annotator_sentiment,target,group
0,1,صلاة الفجر خير لك من ترديد بول البعير وسبي الن...,hateful_normal,indirect,shock,gender,individual
1,2,صراحة نفسي اشوف ولاد الوسخة اللي قالوا مدرب اج...,offensive,indirect,anger_confusion_sadness_indifference_disgust,other,other
2,3,طيب! هي متبرجة وعبايتها ملونه وطالعة من بيتهم ...,offensive,indirect,indifference,other,individual
3,4,@user @user انا اوافقك بخصوص السوريين و العراق...,normal,direct,indifference,origin,other
4,5,هذه السعودية التي شعبها شعب الخيم و بول البعير...,normal,indirect,indifference,origin,other


#### 1. Word Frequency for Examples that are not Hate Speech

In [117]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['normal'], 'tweet', 'en_core_web_sm')
freqs.most_common()[:10]

Found a total of 915 examples


[('@user', 529),
 ('@url', 336),
 ('في', 226),
 ('من', 218),
 ('التحرش', 214),
 ('الحريم', 119),
 ('على', 105),
 ('البعير', 93),
 ('بول', 91),
 ('ما', 77)]

#### 2. Word Frequency for Examples that are Hate Speech

In [118]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['offensive'], 'tweet', 'en_core_web_sm')
freqs += slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['abusive'], 'tweet', 'en_core_web_sm')
freqs += slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['hateful'], 'tweet', 'en_core_web_sm')
freqs += slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['disrespectful'], 'tweet', 'en_core_web_sm')
freqs += slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['fearful'], 'tweet', 'en_core_web_sm')
freqs.most_common()[:10]

Found a total of 950 examples
Found a total of 19 examples
Found a total of 460 examples
Found a total of 167 examples
Found a total of 12 examples


[('@user', 1984),
 ('@url', 488),
 ('بول', 402),
 ('البعير', 389),
 ('من', 374),
 ('يا', 374),
 ('خنازير', 342),
 ('خنزير', 248),
 ('في', 213),
 ('على', 161)]