# Word Count Input Statistical Analysis for all datasets

In [134]:
import numpy as np
import pandas as pd
import spacy
import emoji
from collections import Counter

In [135]:
DATA_DIR = '../data'

### Utility Functions

In [136]:
def remove_emoji(text):
    return emoji.get_emoji_regexp().sub(u'', text)

def slice_dataframe_and_compute_word_frequency(df, slice_cols, slice_vals, text_col, spacy_lang_pkg):
    sliced_df = df.copy()
    for i in range(len(slice_cols)):
        sliced_df = sliced_df[sliced_df[slice_cols[i]] == slice_vals[i]]
    print(f'Found a total of {len(sliced_df)} examples')
    nlp = spacy.load(spacy_lang_pkg)
    text = ' '.join(sliced_df[text_col])
    text = emoji.get_emoji_regexp().sub(r'', text)
    doc = nlp(text)
    words = [token.text for token in doc if not token.is_stop and not token.is_punct and len(token) > 1]
    freqs = Counter(words)
    pos_counts = doc.count_by(spacy.attrs.POS)
    for k,v in sorted(pos_counts.items()):
        print(f'{k:{4}}. {doc.vocab[k].text:{5}}: {v}')
    return freqs

def slice_dataframe_and_compute_pos_tags(df, slice_cols, slice_vals, text_col, spacy_lang_pkg):
    sliced_df = df.copy()
    for i in range(len(slice_cols)):
        sliced_df = sliced_df[sliced_df[slice_cols[i]] == slice_vals[i]]
    print(f'Found a total of {len(sliced_df)} examples')
    nlp = spacy.load(spacy_lang_pkg)
    text = ' '.join(sliced_df[text_col])
    text = emoji.get_emoji_regexp().sub(r'', text)
    doc = nlp(text)
    pos_counts = doc.count_by(spacy.attrs.POS)
    for k,v in sorted(pos_counts.items()):
        print(f'{k:{4}}. {doc.vocab[k].text:{5}}: {v}')
    return pos_counts


### Spanish (Basile et al.)

In [137]:
df = pd.read_csv(f'{DATA_DIR}/spanish-basile/hateval2019_es_train.csv')

In [138]:
df.head()

Unnamed: 0,id,text,HS,TR,AG
0,20001,Easyjet quiere duplicar el número de mujeres p...,1,0,0
1,20002,El gobierno debe crear un control estricto de ...,1,0,0
2,20003,Yo veo a mujeres destruidas por acoso laboral ...,0,0,0
3,20004,"— Yo soy respetuoso con los demás, sólamente l...",0,0,0
4,20007,Antonio Caballero y como ser de mal gusto e ig...,0,0,0


#### 1.1 Word Frequecy for Examples that are Hate Speech

In [139]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['HS'], [1], 'text', 'es_core_news_sm')
freqs.most_common()[:10]

Found a total of 1857 examples
  84. ADJ  : 2986
  85. ADP  : 4253
  86. ADV  : 1742
  87. AUX  : 1624
  89. CCONJ: 1452
  90. DET  : 4246
  91. INTJ : 40
  92. NOUN : 7060
  93. NUM  : 389
  94. PART : 10
  95. PRON : 3129
  96. PROPN: 4764
  97. PUNCT: 4489
  98. SCONJ: 1171
  99. SYM  : 140
 100. VERB : 4512
 103. SPACE: 359


[('puta', 418),
 ('perra', 345),
 ('zorra', 216),
 ('Cállate', 122),
 ('callate', 110),
 ('inmigrantes', 102),
 ('mujer', 96),
 ('Callate', 91),
 ('mereces', 82),
 ('cállate', 78)]

#### 1.2 Word Frequecy for Examples that are Hate Speech and Aggressive

In [140]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['HS', 'AG'], [1, 1], 'text', 'es_core_news_sm')
freqs.most_common()[:10]

Found a total of 1502 examples
  84. ADJ  : 2369
  85. ADP  : 3153
  86. ADV  : 1342
  87. AUX  : 1256
  89. CCONJ: 1126
  90. DET  : 3133
  91. INTJ : 29
  92. NOUN : 5371
  93. NUM  : 263
  94. PART : 9
  95. PRON : 2457
  96. PROPN: 4005
  97. PUNCT: 3335
  98. SCONJ: 861
  99. SYM  : 114
 100. VERB : 3486
 103. SPACE: 287


[('puta', 398),
 ('perra', 322),
 ('zorra', 207),
 ('Cállate', 122),
 ('callate', 109),
 ('Callate', 90),
 ('mereces', 80),
 ('cállate', 78),
 ('mierda', 74),
 ('PUTA', 67)]

#### 1.3 Word Frequecy for Examples that are Hate Speech but not Aggressive

In [141]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['HS', 'AG'], [1, 0], 'text', 'es_core_news_sm')
freqs.most_common()[:10]

Found a total of 355 examples
  84. ADJ  : 618
  85. ADP  : 1105
  86. ADV  : 396
  87. AUX  : 370
  89. CCONJ: 325
  90. DET  : 1114
  91. INTJ : 13
  92. NOUN : 1687
  93. NUM  : 122
  95. PRON : 666
  96. PROPN: 773
  97. PUNCT: 1149
  98. SCONJ: 318
  99. SYM  : 27
 100. VERB : 1015
 103. SPACE: 72


[('mujer', 54),
 ('inmigrantes', 36),
 ('mujeres', 32),
 ('subsaharianos', 23),
 ('perra', 23),
 ('puta', 20),
 ('España', 17),
 ('país', 17),
 ('papeles', 16),
 ('inmigrante', 15)]

#### 2.  Word Frequecy for Examples that are not Hate Speech

In [142]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['HS'], [0], 'text', 'es_core_news_sm')
freqs.most_common()[:10]

Found a total of 2643 examples
  84. ADJ  : 4239
  85. ADP  : 7222
  86. ADV  : 2678
  87. AUX  : 2382
  89. CCONJ: 2147
  90. DET  : 6409
  91. INTJ : 118
  92. NOUN : 11411
  93. NUM  : 733
  94. PART : 10
  95. PRON : 4591
  96. PROPN: 7449
  97. PUNCT: 6388
  98. SCONJ: 1819
  99. SYM  : 352
 100. VERB : 6922
 103. SPACE: 506


[('puta', 784),
 ('hijo', 237),
 ('acoso', 198),
 ('madre', 149),
 ('perra', 142),
 ('polla', 141),
 ('PUTA', 139),
 ('mierda', 117),
 ('mereces', 112),
 ('violación', 110)]

### Spanish (Pereira et al.)

In [143]:
# Necessary wrangling to fit it into a DataFrame
data = []
with open(f'{DATA_DIR}/spanish-pereira/labeled_corpus_6K.txt', 'r',encoding='utf-8') as f:
    for line in f.readlines():
        curr = line.split(";||;")
        data.append(curr[:-1] + [int(curr[-1][0])])

In [144]:
df = pd.DataFrame(data=np.array(data), columns=['id', 'text', 'HS'])

In [145]:
df['HS'] = df.HS.astype(int)

#### 1. Word Frequecy for Examples that are Hate Speech

In [146]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['HS'], [1], 'text', 'es_core_news_sm')
freqs.most_common()[:10]

Found a total of 1567 examples
  84. ADJ  : 2410
  85. ADP  : 3006
  86. ADV  : 1390
  87. AUX  : 1427
  89. CCONJ: 999
  90. DET  : 2773
  91. INTJ : 52
  92. NOUN : 4982
  93. NUM  : 238
  94. PART : 4
  95. PRON : 2119
  96. PROPN: 3030
  97. PUNCT: 3351
  98. SCONJ: 999
  99. SYM  : 61
 100. VERB : 3164
 103. SPACE: 242


[('subnormal', 350),
 ('mierda', 144),
 ('puto', 97),
 ('puta', 91),
 ('fascista', 85),
 ('fachas', 82),
 ('facha', 66),
 ('fascistas', 61),
 ('nazi', 44),
 ('retrasado', 42)]

#### 2. Word Frequecy for Examples that are not Hate Speech

In [147]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['HS'], [0], 'text', 'es_core_news_sm')
freqs.most_common()[:10]

Found a total of 4433 examples
  84. ADJ  : 6948
  85. ADP  : 9660
  86. ADV  : 4262
  87. AUX  : 4655
  89. CCONJ: 3070
  90. DET  : 9212
  91. INTJ : 145
  92. NOUN : 15314
  93. NUM  : 1058
  94. PART : 14
  95. PRON : 5951
  96. PROPN: 9354
  97. PUNCT: 10326
  98. SCONJ: 2971
  99. SYM  : 259
 100. VERB : 9312
 103. SPACE: 787


[('independentista', 501),
 ('subnormal', 306),
 ('fascista', 200),
 ('facha', 196),
 ('fascistas', 157),
 ('negro', 145),
 ('fachas', 135),
 ('mayoría', 124),
 ('género', 121),
 ('gente', 118)]

### French (Ousidhoum et al.)

In [148]:
df = pd.read_csv(f'{DATA_DIR}/french-ousidhoum/fr_dataset.csv')

In [149]:
df.head()

Unnamed: 0,HITId,tweet,sentiment,directness,annotator_sentiment,target,group
0,1,rt @user personnage ainsi q tte la clique gauc...,abusive,direct,shock,origin,other
1,2,@user @user @user bah oui t'as raison l'autre ...,offensive,indirect,shock,other,individual
2,3,moi j'ai personne à qui m'adresser pour réclam...,hateful,direct,anger_confusion_sadness_fear_disgust,other,women
3,4,@user @user c...est vrai que les anchois sont ...,normal,indirect,indifference,other,other
4,5,eh les renois faut se réveiller la @url,normal,direct,sadness,origin,african_descent


#### 1. Word Frequency for Examples that are not Hate Speech

In [150]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['normal'], 'tweet', 'fr_core_news_sm')
freqs.most_common()[:10]

Found a total of 821 examples
  84. ADJ  : 1311
  85. ADP  : 1596
  86. ADV  : 1041
  87. AUX  : 567
  89. CCONJ: 373
  90. DET  : 1541
  92. NOUN : 2926
  93. NUM  : 98
  95. PRON : 1500
  96. PROPN: 276
  97. PUNCT: 699
  98. SCONJ: 306
  99. SYM  : 3
 100. VERB : 2573
 101. X    : 9
 103. SPACE: 50


[('@user', 970),
 ('@url', 475),
 ('gauchiste', 93),
 ('renois', 79),
 ('attarde', 66),
 ('migrants', 62),
 ('violence', 60),
 ('contre', 53),
 ('arabes', 43),
 ('attardé', 42)]

#### 2. Word Frequency for Examples that are Hate Speech

In [151]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['abusive'], 'tweet', 'fr_core_news_sm')
freqs += slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['offensive'], 'tweet', 'fr_core_news_sm')
freqs += slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['hateful'], 'tweet', 'fr_core_news_sm')
freqs += slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['disrespectful'], 'tweet', 'fr_core_news_sm')
freqs += slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['fearful'], 'tweet', 'fr_core_news_sm')
freqs.most_common()[:10]

Found a total of 594 examples
  84. ADJ  : 1088
  85. ADP  : 1147
  86. ADV  : 497
  87. AUX  : 349
  89. CCONJ: 221
  90. DET  : 1262
  92. NOUN : 2142
  93. NUM  : 87
  95. PRON : 911
  96. PROPN: 182
  97. PUNCT: 492
  98. SCONJ: 141
  99. SYM  : 3
 100. VERB : 1367
 101. X    : 13
 103. SPACE: 43
Found a total of 1336 examples
  84. ADJ  : 2016
  85. ADP  : 1976
  86. ADV  : 1166
  87. AUX  : 851
  89. CCONJ: 482
  90. DET  : 2152
  91. INTJ : 2
  92. NOUN : 3689
  93. NUM  : 195
  95. PRON : 2027
  96. PROPN: 387
  97. PUNCT: 906
  98. SCONJ: 429
  99. SYM  : 4
 100. VERB : 4465
 101. X    : 16
 103. SPACE: 64
Found a total of 207 examples
  84. ADJ  : 389
  85. ADP  : 375
  86. ADV  : 201
  87. AUX  : 130
  89. CCONJ: 91
  90. DET  : 440
  92. NOUN : 761
  93. NUM  : 23
  95. PRON : 349
  96. PROPN: 76
  97. PUNCT: 185
  98. SCONJ: 54
 100. VERB : 554
 101. X    : 5
 103. SPACE: 24
Found a total of 142 examples
  84. ADJ  : 269
  85. ADP  : 266
  86. ADV  : 135
  87. AUX  : 98
  

[('@user', 2335),
 ('@url', 1104),
 ('mongol', 517),
 ('attardé', 379),
 ('gauchiste', 312),
 ('renois', 253),
 ('sale', 228),
 ('rebeus', 188),
 ('arabe', 133),
 ('migrants', 105)]

### Turkish

In [152]:
df = pd.read_csv(f'{DATA_DIR}/turkish/offenseval-tr-training-v1/offenseval-tr-training-v1.tsv', sep='\t')

In [153]:
df.head()

Unnamed: 0,id,tweet,subtask_a
0,20948,@USER en güzel uyuyan insan ödülü jeon jungkoo...,NOT
1,10134,"@USER Mekanı cennet olsun, saygılar sayın avuk...",NOT
2,23457,Kızlar aranızda kas yığını beylere düşenler ol...,NOT
3,18401,Biraz ders çalışayım. Tembellik ve uyku düşman...,NOT
4,17525,@USER Trezeguet yerine El Sharawy daha iyi olm...,NOT


#### 1. Word Frequency for tweets that are Hate Speech

In [154]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['subtask_a'], ['OFF'], 'tweet', 'en_core_web_sm')
freqs.most_common()[:10]

Found a total of 6046 examples
  84. ADJ  : 6137
  85. ADP  : 1100
  86. ADV  : 962
  87. AUX  : 479
  89. CCONJ: 46
  90. DET  : 184
  91. INTJ : 742
  92. NOUN : 26053
  93. NUM  : 1753
  94. PART : 260
  95. PRON : 571
  96. PROPN: 67725
  97. PUNCT: 11159
  98. SCONJ: 4
  99. SYM  : 820
 100. VERB : 6404
 101. X    : 2283
 103. SPACE: 4165


[('@USER', 5367),
 ('  ', 1380),
 ('bir', 1267),
 ('bu', 1167),
 ('ve', 847),
 ('ne', 718),
 ('de', 688),
 ('da', 660),
 ('için', 546),
 ('gibi', 528)]

#### 2. Word Frequency for tweets that are not Hate Speech

In [155]:
# This is too long for Spacy. DON'T RUN
# freqs = slice_dataframe_and_compute_word_frequency(df, ['subtask_a'], ['NOT'], 'tweet', 'en_core_web_sm')

### Danish

In [156]:
df = pd.read_csv(f'{DATA_DIR}/danish/data/offenseval-da-training-v1.tsv', sep='\t')

In [157]:
df.head()

Unnamed: 0,id,tweet,subtask_a
0,3131,"Jeg tror det vil være dejlig køligt, men jeg v...",NOT
1,711,Så kommer de nok til at investere i en ny cyke...,NOT
2,2500,Nu er det jo også de Ikea-aber der har lavet s...,OFF
3,2678,"128 Varme emails, er vi enige om at det er sex...",NOT
4,784,"Desværre tyder det på, at amerikanerne er helt...",NOT


#### 1. Word Frequency for tweets that are Hate Speech

In [158]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['subtask_a'], ['OFF'], 'tweet', 'da_core_news_sm')
freqs.most_common()[:10]

Found a total of 384 examples
  84. ADJ  : 763
  85. ADP  : 1057
  86. ADV  : 1286
  87. AUX  : 728
  89. CCONJ: 295
  90. DET  : 639
  91. INTJ : 24
  92. NOUN : 1816
  93. NUM  : 70
  94. PART : 154
  95. PRON : 1160
  96. PROPN: 446
  97. PUNCT: 1279
  98. SCONJ: 213
  99. SYM  : 25
 100. VERB : 1246
 101. X    : 86
 103. SPACE: 135


[('lort', 45),
 ('@USER', 31),
 ('bare', 29),
 ('  ', 24),
 ('godt', 22),
 ('fandme', 22),
 ('når', 19),
 ('folk', 19),
 ('Fuck', 18),
 ('lortet', 15)]

#### 2. Word Frequency for tweets that are not Hate Speech

In [159]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['subtask_a'], ['NOT'], 'tweet', 'da_core_news_sm')
freqs.most_common()[:10]

Found a total of 2576 examples
  84. ADJ  : 3431
  85. ADP  : 4746
  86. ADV  : 6399
  87. AUX  : 3265
  89. CCONJ: 1426
  90. DET  : 2674
  91. INTJ : 98
  92. NOUN : 8200
  93. NUM  : 449
  94. PART : 708
  95. PRON : 4896
  96. PROPN: 2558
  97. PUNCT: 6201
  98. SCONJ: 1010
  99. SYM  : 262
 100. VERB : 5735
 101. X    : 554
 103. SPACE: 653


[('bare', 147),
 ('@USER', 126),
 ('godt', 123),
 ('når', 121),
 ('  ', 117),
 ('Danmark', 100),
 ('se', 82),
 ('helt', 81),
 ('URL', 77),
 ('år', 74)]

### Hindi

In [160]:
df = pd.read_csv(f'{DATA_DIR}/hindi/agr_hi_train.csv')

In [161]:
df.columns = ['id', 'text', 'agr']

In [162]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['agr'], ['OAG'], 'tweet', 'da_core_news_sm')
freqs.most_common()[:10]

Found a total of 4855 examples


KeyError: 'tweet'

### English (basile et al)

In [163]:
df = pd.read_csv(f'{DATA_DIR}/english-basile/hateval2019_en_train.csv')

In [164]:
df.head()

Unnamed: 0,id,text,HS,TR,AG
0,201,"Hurray, saving us $$$ in so many ways @potus @...",1,0,0
1,202,Why would young fighting age men be the vast m...,1,0,0
2,203,@KamalaHarris Illegals Dump their Kids at the ...,1,0,0
3,204,NY Times: 'Nearly All White' States Pose 'an A...,0,0,0
4,205,Orban in Brussels: European leaders are ignori...,0,0,0


#### 1.1 Word Frequency for Examples that are Hate Speech

In [165]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['HS'], [1], 'text', 'en_core_web_sm')
freqs.most_common()[:10]

Found a total of 3783 examples
  84. ADJ  : 6059
  85. ADP  : 7267
  86. ADV  : 4209
  87. AUX  : 4511
  89. CCONJ: 2162
  90. DET  : 6672
  91. INTJ : 454
  92. NOUN : 18766
  93. NUM  : 986
  94. PART : 2941
  95. PRON : 8077
  96. PROPN: 10426
  97. PUNCT: 9374
  98. SCONJ: 1007
  99. SYM  : 2719
 100. VERB : 12887
 101. X    : 382
 103. SPACE: 756


[('bitch', 693),
 ('women', 336),
 ('like', 301),
 ('BuildThatWall', 291),
 ('refugees', 246),
 ('illegal', 227),
 ('whore', 219),
 ('cunt', 207),
 ('want', 203),
 ('migrants', 198)]

#### 1.2 Word Frequency for Examples that are Hate Speech and Aggressive

In [166]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['HS', 'AG'], [1, 1], 'text', 'en_core_web_sm')
freqs.most_common()[:10]

Found a total of 1559 examples
  84. ADJ  : 2664
  85. ADP  : 3336
  86. ADV  : 1784
  87. AUX  : 1985
  89. CCONJ: 1034
  90. DET  : 2858
  91. INTJ : 206
  92. NOUN : 8538
  93. NUM  : 463
  94. PART : 1276
  95. PRON : 3495
  96. PROPN: 5230
  97. PUNCT: 4462
  98. SCONJ: 424
  99. SYM  : 1611
 100. VERB : 5846
 101. X    : 156
 103. SPACE: 309


[('bitch', 220),
 ('BuildThatWall', 197),
 ('refugees', 121),
 ('MAGA', 114),
 ('illegal', 113),
 ('like', 107),
 ('Trump', 102),
 ('Illegal', 100),
 ('@realDonaldTrump', 97),
 ('immigration', 92)]

#### 1.3 Word Frequency for Examples that are Hate Speech but not Aggressive

In [167]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['HS', 'AG'], [1, 0], 'text', 'en_core_web_sm')
freqs.most_common()[:10]

Found a total of 2224 examples
  84. ADJ  : 3396
  85. ADP  : 3925
  86. ADV  : 2419
  87. AUX  : 2535
  89. CCONJ: 1128
  90. DET  : 3820
  91. INTJ : 246
  92. NOUN : 10197
  93. NUM  : 525
  94. PART : 1663
  95. PRON : 4581
  96. PROPN: 5208
  97. PUNCT: 4924
  98. SCONJ: 583
  99. SYM  : 1131
 100. VERB : 7040
 101. X    : 210
 103. SPACE: 447


[('bitch', 473),
 ('women', 258),
 ('like', 194),
 ('whore', 152),
 ('woman', 147),
 ('hoe', 144),
 ('cunt', 140),
 ('refugees', 125),
 ('ass', 118),
 ('want', 117)]

#### Word Frequency for Examples that are not Hate Speech

In [168]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['HS'], [0], 'text', 'en_core_web_sm')
freqs.most_common()[:10]

Found a total of 5217 examples
  84. ADJ  : 7958
  85. ADP  : 10639
  86. ADV  : 5168
  87. AUX  : 5378
  89. CCONJ: 2961
  90. DET  : 8299
  91. INTJ : 570
  92. NOUN : 24157
  93. NUM  : 1639
  94. PART : 3843
  95. PRON : 9613
  96. PROPN: 13798
  97. PUNCT: 12669
  98. SCONJ: 1239
  99. SYM  : 1816
 100. VERB : 15777
 101. X    : 504
 103. SPACE: 1015


[('refugees', 557),
 ('immigrant', 509),
 ('women', 405),
 ('men', 377),
 ('like', 332),
 ('migrants', 292),
 ('people', 259),
 ('rape', 234),
 ('refugee', 233),
 ('woman', 231)]

### English (ousidhoum et al)

In [169]:
df = pd.read_csv(f'{DATA_DIR}/english-ousidhoum/en_dataset.csv')

In [170]:
df.head()

Unnamed: 0,HITId,tweet,sentiment,directness,annotator_sentiment,target,group
0,1,call sis im boy girl still faggot shut,offensive,indirect,confusion_disgust,origin,gay
1,2,@user @user legit nilas retarded idk,offensive_disrespectful,indirect,anger_confusion,disability,special_needs
2,3,said retard @url,fearful_abusive_offensive_hateful,indirect,confusion_disgust,disability,special_needs
3,4,america another 8 years obama's ideology via h...,fearful_abusive_hateful_disrespectful_normal,indirect,anger_fear_shock_sadness_disgust,origin,other
4,5,@user don... fucking made cry twat.,offensive,indirect,shock_disgust,gender,women


#### 1. Word Frequency for Examples that are not Hate Speech

In [171]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['normal'], 'tweet', 'en_core_web_sm')
freqs.most_common()[:10]

Found a total of 661 examples
  84. ADJ  : 784
  85. ADP  : 121
  86. ADV  : 275
  87. AUX  : 166
  89. CCONJ: 29
  90. DET  : 125
  91. INTJ : 54
  92. NOUN : 2104
  93. NUM  : 107
  94. PART : 73
  95. PRON : 195
  96. PROPN: 633
  97. PUNCT: 558
  98. SCONJ: 10
  99. SYM  : 72
 100. VERB : 1318
 101. X    : 360
 103. SPACE: 45


[('@user', 396),
 ('@url', 304),
 ('retarded', 62),
 ('shithole', 61),
 ('like', 54),
 ('faggot', 51),
 ('spic', 49),
 ('retard', 45),
 ('twat', 42),
 ('cunt', 39)]

#### 2. Word Frequency for Examples that are Hate Speech

In [172]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['offensive'], 'tweet', 'en_core_web_sm')
freqs += slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['abusive'], 'tweet', 'en_core_web_sm')
freqs += slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['hateful'], 'tweet', 'en_core_web_sm')
freqs += slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['disrespectful'], 'tweet', 'en_core_web_sm')
freqs += slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['fearful'], 'tweet', 'en_core_web_sm')
freqs.most_common()[:10]

Found a total of 2954 examples
  84. ADJ  : 3441
  85. ADP  : 379
  86. ADV  : 1181
  87. AUX  : 720
  89. CCONJ: 124
  90. DET  : 417
  91. INTJ : 261
  92. NOUN : 8210
  93. NUM  : 338
  94. PART : 264
  95. PRON : 776
  96. PROPN: 2588
  97. PUNCT: 2408
  98. SCONJ: 42
  99. SYM  : 178
 100. VERB : 5376
 101. X    : 2131
 103. SPACE: 208
Found a total of 1 examples
  86. ADV  : 3
  87. AUX  : 1
  92. NOUN : 1
  95. PRON : 1
  96. PROPN: 2
  97. PUNCT: 2
 100. VERB : 2
Found a total of 315 examples
  84. ADJ  : 359
  85. ADP  : 35
  86. ADV  : 149
  87. AUX  : 80
  89. CCONJ: 12
  90. DET  : 45
  91. INTJ : 22
  92. NOUN : 930
  93. NUM  : 30
  94. PART : 37
  95. PRON : 102
  96. PROPN: 283
  97. PUNCT: 283
  98. SCONJ: 8
  99. SYM  : 20
 100. VERB : 586
 101. X    : 172
 103. SPACE: 13
Found a total of 10 examples
  84. ADJ  : 9
  85. ADP  : 1
  86. ADV  : 3
  87. AUX  : 1
  91. INTJ : 1
  92. NOUN : 31
  94. PART : 1
  95. PRON : 2
  96. PROPN: 13
  97. PUNCT: 2
  99. SYM  : 1
 10

[('@user', 2787),
 ('@url', 1340),
 ('retard', 406),
 ('retarded', 372),
 ('faggot', 324),
 ('cunt', 287),
 ('shithole', 283),
 ('twat', 280),
 ('like', 220),
 ('fucking', 207)]

### Arabic

In [173]:
df = pd.read_csv(f'{DATA_DIR}/arabic-ousidhoum/ar_dataset.csv')

In [174]:
df.head()

Unnamed: 0,HITId,tweet,sentiment,directness,annotator_sentiment,target,group
0,1,صلاة الفجر خير لك من ترديد بول البعير وسبي الن...,hateful_normal,indirect,shock,gender,individual
1,2,صراحة نفسي اشوف ولاد الوسخة اللي قالوا مدرب اج...,offensive,indirect,anger_confusion_sadness_indifference_disgust,other,other
2,3,طيب! هي متبرجة وعبايتها ملونه وطالعة من بيتهم ...,offensive,indirect,indifference,other,individual
3,4,@user @user انا اوافقك بخصوص السوريين و العراق...,normal,direct,indifference,origin,other
4,5,هذه السعودية التي شعبها شعب الخيم و بول البعير...,normal,indirect,indifference,origin,other


#### 1. Word Frequency for Examples that are not Hate Speech

In [175]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['normal'], 'tweet', 'en_core_web_sm')
freqs.most_common()[:10]

Found a total of 915 examples
  84. ADJ  : 532
  85. ADP  : 254
  86. ADV  : 191
  87. AUX  : 14
  89. CCONJ: 13
  90. DET  : 91
  91. INTJ : 41
  92. NOUN : 2154
  93. NUM  : 52
  94. PART : 3
  95. PRON : 82
  96. PROPN: 7300
  97. PUNCT: 715
  99. SYM  : 54
 100. VERB : 1058
 101. X    : 603
 103. SPACE: 142


[('@user', 529),
 ('@url', 336),
 ('في', 226),
 ('من', 218),
 ('التحرش', 214),
 ('الحريم', 119),
 ('على', 105),
 ('البعير', 93),
 ('بول', 91),
 ('ما', 77)]

#### 2. Word Frequency for Examples that are Hate Speech

In [176]:
freqs = slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['offensive'], 'tweet', 'en_core_web_sm')
freqs += slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['abusive'], 'tweet', 'en_core_web_sm')
freqs += slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['hateful'], 'tweet', 'en_core_web_sm')
freqs += slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['disrespectful'], 'tweet', 'en_core_web_sm')
freqs += slice_dataframe_and_compute_word_frequency(df, ['sentiment'], ['fearful'], 'tweet', 'en_core_web_sm')
freqs.most_common()[:10]

Found a total of 950 examples
  84. ADJ  : 570
  85. ADP  : 204
  86. ADV  : 222
  87. AUX  : 14
  89. CCONJ: 10
  90. DET  : 93
  91. INTJ : 46
  92. NOUN : 2255
  93. NUM  : 31
  94. PART : 3
  95. PRON : 78
  96. PROPN: 6650
  97. PUNCT: 614
  99. SYM  : 49
 100. VERB : 1133
 101. X    : 1007
 103. SPACE: 146
Found a total of 19 examples
  84. ADJ  : 12
  85. ADP  : 5
  86. ADV  : 4
  90. DET  : 2
  92. NOUN : 67
  93. NUM  : 4
  96. PROPN: 147
  97. PUNCT: 13
  99. SYM  : 2
 100. VERB : 24
 101. X    : 44
Found a total of 460 examples
  84. ADJ  : 314
  85. ADP  : 136
  86. ADV  : 118
  87. AUX  : 10
  89. CCONJ: 2
  90. DET  : 31
  91. INTJ : 20
  92. NOUN : 1203
  93. NUM  : 23
  95. PRON : 33
  96. PROPN: 3377
  97. PUNCT: 320
  99. SYM  : 14
 100. VERB : 581
 101. X    : 634
 103. SPACE: 98
Found a total of 167 examples
  84. ADJ  : 72
  85. ADP  : 29
  86. ADV  : 52
  87. AUX  : 3
  90. DET  : 15
  91. INTJ : 8
  92. NOUN : 322
  93. NUM  : 3
  95. PRON : 6
  96. PROPN: 966
  

[('@user', 1984),
 ('@url', 488),
 ('بول', 402),
 ('البعير', 389),
 ('من', 374),
 ('يا', 374),
 ('خنازير', 342),
 ('خنزير', 248),
 ('في', 213),
 ('على', 161)]