# Inter-dataset Input Statistical Analysis for all datasets

In [1]:
import numpy as np
import pandas as pd
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import glob

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Nicholas\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
DATA_DIR = '../data/all-processed'

In [3]:
def slice_dataframe_and_compute_sentiment(df, slice_cols, slice_vals):
    sliced_df = df.copy()
    for i in range(len(slice_cols)):
        sliced_df = sliced_df[sliced_df[slice_cols[i]] == slice_vals[i]]
    print(f'Found a total of {len(sliced_df)} examples')
    text = ' '.join(sliced_df['text'])
    return SentimentIntensityAnalyzer().polarity_scores(text)

## Example Sentiment For Hateful Labels in English Basile dataset

In [4]:
df = pd.read_csv(f'{DATA_DIR}/B_english_basile_processed.csv')
df.head()

Unnamed: 0,text,hs
0,"hurray , saving us $ $ $ many ways @ potus @ r...",1
1,would young fighting age men vast majority one...,1
2,@ kamalaharris illegals dump kids border like ...,1
3,ny times : 'nearly white ' states pose 'an arr...,0
4,orban brussels : european leaders ignoring peo...,0


In [5]:
print(slice_dataframe_and_compute_sentiment(df, ['hs'], [1]))

Found a total of 5470 examples
{'neg': 0.361, 'neu': 0.517, 'pos': 0.121, 'compound': -1.0}


## Sentiment for all hateful datasets 

In [6]:
hateful_sentiment_dict = {}
for path in glob.glob('../data/all-processed/*.csv'):
    path_in_str = str(path)
    print(path_in_str)
    df = pd.read_csv(path_in_str)
    hateful_sentiment_dict[path_in_str] = slice_dataframe_and_compute_sentiment(df, ['hs'], [1])
    print(hateful_sentiment_dict[path_in_str])


../data/all-processed\B_arabic_mulki_processed.csv
Found a total of 468 examples
{'neg': 0.001, 'neu': 0.981, 'pos': 0.019, 'compound': 0.9988}
../data/all-processed\B_danish_processed.csv
Found a total of 425 examples
{'neg': 0.066, 'neu': 0.894, 'pos': 0.041, 'compound': -0.9997}
../data/all-processed\B_english_basile_processed.csv
Found a total of 5470 examples
{'neg': 0.361, 'neu': 0.517, 'pos': 0.121, 'compound': -1.0}
../data/all-processed\B_english_davidson_processed.csv
Found a total of 1430 examples
{'neg': 0.359, 'neu': 0.542, 'pos': 0.099, 'compound': -1.0}
../data/all-processed\B_english_founta_processed.csv
Found a total of 2075 examples
{'neg': 0.297, 'neu': 0.603, 'pos': 0.101, 'compound': -1.0}
../data/all-processed\B_english_gilbert_processed.csv
Found a total of 1196 examples
{'neg': 0.186, 'neu': 0.66, 'pos': 0.154, 'compound': -1.0}
../data/all-processed\B_english_ousidhoum_processed.csv
Found a total of 1278 examples
{'neg': 0.317, 'neu': 0.572, 'pos': 0.111, 'comp

## Sentiment for all normal labels in hateful datasets

In [7]:
non_hateful_sentiment_dict = {}
for path in glob.glob('../data/all-processed/*.csv'):
    path_in_str = str(path)
    print(path_in_str)
    df = pd.read_csv(path_in_str)
    if 'danish' not in path_in_str and 'indonesian_ibrohim' not in path_in_str and 'german_bretschneider' not in path_in_str:
            non_hateful_sentiment_dict[path_in_str] = slice_dataframe_and_compute_sentiment(df, ['hs'], [0])
            print(non_hateful_sentiment_dict[path_in_str])

../data/all-processed\B_arabic_mulki_processed.csv
Found a total of 3649 examples
{'neg': 0.001, 'neu': 0.974, 'pos': 0.025, 'compound': 1.0}
../data/all-processed\B_danish_processed.csv
../data/all-processed\B_english_basile_processed.csv
Found a total of 7530 examples
{'neg': 0.222, 'neu': 0.646, 'pos': 0.132, 'compound': -1.0}
../data/all-processed\B_english_davidson_processed.csv
Found a total of 4163 examples
{'neg': 0.085, 'neu': 0.785, 'pos': 0.13, 'compound': 1.0}
../data/all-processed\B_english_founta_processed.csv
Found a total of 34487 examples
{'neg': 0.111, 'neu': 0.68, 'pos': 0.209, 'compound': 1.0}
../data/all-processed\B_english_gilbert_processed.csv
Found a total of 9507 examples
{'neg': 0.118, 'neu': 0.708, 'pos': 0.174, 'compound': 1.0}
../data/all-processed\B_english_ousidhoum_processed.csv
Found a total of 4369 examples
{'neg': 0.291, 'neu': 0.59, 'pos': 0.119, 'compound': -1.0}
../data/all-processed\B_english_waseem_processed.csv
Found a total of 7679 examples
{'n

In [12]:
print("Comparing sentiment for hate speech v non hatespeech")
for dataset in hateful_sentiment_dict:
    if dataset in non_hateful_sentiment_dict:
        print("For {}, \n \t HSvNormal: (Neg: {} v {}, Neu: {} v {}, Pos: {} v {})".format(dataset,
         hateful_sentiment_dict[dataset]['neg'], non_hateful_sentiment_dict[dataset]['neg'], 
         hateful_sentiment_dict[dataset]['neu'], non_hateful_sentiment_dict[dataset]['neu'], 
         hateful_sentiment_dict[dataset]['pos'], non_hateful_sentiment_dict[dataset]['pos'],))

Comparing sentiment for hate speech v non hatespeech
For ../data/all-processed\B_arabic_mulki_processed.csv, 
 	 HSvNormal: (Neg: 0.001 v 0.001, Neu: 0.981 v 0.974, Pos: 0.019 v 0.025)
For ../data/all-processed\B_english_basile_processed.csv, 
 	 HSvNormal: (Neg: 0.361 v 0.222, Neu: 0.517 v 0.646, Pos: 0.121 v 0.132)
For ../data/all-processed\B_english_davidson_processed.csv, 
 	 HSvNormal: (Neg: 0.359 v 0.085, Neu: 0.542 v 0.785, Pos: 0.099 v 0.13)
For ../data/all-processed\B_english_founta_processed.csv, 
 	 HSvNormal: (Neg: 0.297 v 0.111, Neu: 0.603 v 0.68, Pos: 0.101 v 0.209)
For ../data/all-processed\B_english_gilbert_processed.csv, 
 	 HSvNormal: (Neg: 0.186 v 0.118, Neu: 0.66 v 0.708, Pos: 0.154 v 0.174)
For ../data/all-processed\B_english_ousidhoum_processed.csv, 
 	 HSvNormal: (Neg: 0.317 v 0.291, Neu: 0.572 v 0.59, Pos: 0.111 v 0.119)
For ../data/all-processed\B_english_waseem_processed.csv, 
 	 HSvNormal: (Neg: 0.131 v 0.101, Neu: 0.743 v 0.765, Pos: 0.126 v 0.134)
For ../da