In [20]:
import numpy as np
import pandas as pd
import spacy
import emoji
from collections import Counter
from scipy.stats import beta
from calc_prob import calc_prob_between
import glob

In [21]:
DATA_DIR = '../data/all-processed'

In [22]:
language_codes = {
    'english':'en',
    'arabic':'ar',
    'german':'de',
    'indonesian':'id',
    'italian':'it',
    'portuguese':'pt',
    'spanish':'es',
    'french':'fr',
    'turkish':'tr',
    'danish':'da',
    'hindi':'hi'
}

def remove_emoji(text):
    return emoji.get_emoji_regexp().sub(u'', text)

def slice_dataframe_and_compute_word_frequency(df, slice_cols, slice_vals, text_col, spacy_lang_pkg):
    sliced_df = df.copy()
    for i in range(len(slice_cols)):
        sliced_df = sliced_df[sliced_df[slice_cols[i]] == slice_vals[i]]
    print(f'Found a total of {len(sliced_df)} examples')
    nlp = spacy.load(spacy_lang_pkg)
    text = ' '.join(sliced_df[text_col])
    text = emoji.get_emoji_regexp().sub(r'', text)
    doc = nlp(text)
    words = [token.text for token in doc if not token.is_stop and not token.is_punct and len(token) > 1]
    freqs = Counter(words)
    pos_counts = doc.count_by(spacy.attrs.POS)
    total = 0
    for k,v in sorted(pos_counts.items()):
        total += v
        print(f'{k:{4}}. {doc.vocab[k].text:{5}}: {v}')
    print('total', total)
    return freqs

def slice_dataframe_and_compute_pos_tags(df, slice_cols, slice_vals, text_col, spacy_lang_pkg):
    sliced_df = df.copy()
    for i in range(len(slice_cols)):
        sliced_df = sliced_df[sliced_df[slice_cols[i]] == slice_vals[i]]
    print(f'Found a total of {len(sliced_df)} examples')
    nlp = spacy.load(spacy_lang_pkg)
    nlp.max_length = 4000000 
    text = ' '.join(sliced_df[text_col])
    text = emoji.get_emoji_regexp().sub(r'', text)
    doc = nlp(text)
    pos_counts = doc.count_by(spacy.attrs.POS)
    total = 0
    for k,v in sorted(pos_counts.items()):
        total += v
        # print(f'{k:{4}}. {doc.vocab[k].text:{5}}: {v}')
    # print('total', total)
    return pos_counts, total

def ab_test(imps_ctrl, convs_ctrl, imps_test, convs_test):
    a_C, b_C = convs_ctrl+1, imps_ctrl-convs_ctrl+1
    beta_C = beta(a_C, b_C)
    a_T, b_T = convs_test+1, imps_test-convs_test+1
    beta_T = beta(a_T, b_T)

    lift=(beta_T.mean()-beta_C.mean())/beta_C.mean()
    prob=calc_prob_between(beta_T, beta_C)
    # print (f"Test option lift Conversion Rates by {lift*100:2.2f}% with {prob*100:2.1f}% probability.")
    return lift, prob

pos_dict = {'84': 'adjective', '85': 'adposition', '86': 'adverb', '87': 'auxiliary', '89': 'coordinating conjunction', \
'90': 'determiner', '91': 'interjection', '92': 'noun', '93': 'numeral', '94': 'particle', '95': 'pronoun', \
'96': 'proper noun', '97': 'punctuation', '98': 'subordinating conjunction', '99': 'symbol', '100': 'verb', '103': 'space'}

def print_pos_stats(df, spacy_pkg):
    normal_tags, normal_total = slice_dataframe_and_compute_pos_tags(df, ['hs'], [0], 'text', spacy_pkg)
    hs_tags, hs_total = slice_dataframe_and_compute_pos_tags(df, ['hs'], [1], 'text', spacy_pkg)
    for k,v in sorted(hs_tags.items()):
        try:    
            pos = pos_dict[str(k)]
            hs_pos_count = v
            normal_pos_count = normal_tags[k]
            hs_pos_percent = v / hs_total
            normal_pos_percent = normal_pos_count / normal_total
            lift, prob = ab_test(normal_total, normal_pos_count, hs_total, hs_pos_count)
            print(f'| {pos} | {normal_pos_count} | {hs_pos_count} | {normal_pos_percent*100:2.2f}% | {hs_pos_percent*100:2.2f}% | {lift*100:2.2f}% | {prob:2.6f} |')
        except Exception:
            continue

In [23]:
hateful_sentiment_dict = {}
for path in glob.glob('../data/all-processed/*.csv'):
    try:
        path_in_str = str(path)
        print(path_in_str)
        df = pd.read_csv(path_in_str)
        print_pos_stats(df, 'tacobell')

    except Exception as e:
        print(e.with_traceback)
        continue

../data/all-processed\B_arabic_mulki_processed.csv
Found a total of 3649 examples
<built-in method with_traceback of OSError object at 0x0000020F0560FA68>
../data/all-processed\B_danish_processed.csv
Found a total of 2850 examples
<built-in method with_traceback of OSError object at 0x0000020F77A6E0D8>
../data/all-processed\B_english_basile_processed.csv
Found a total of 7530 examples
<built-in method with_traceback of OSError object at 0x0000020F04668CA8>
../data/all-processed\B_english_davidson_processed.csv
Found a total of 4163 examples
<built-in method with_traceback of OSError object at 0x0000020F0435B1F8>
../data/all-processed\B_english_founta_processed.csv
Found a total of 34487 examples
<built-in method with_traceback of OSError object at 0x0000020F04406048>
../data/all-processed\B_english_gilbert_processed.csv
Found a total of 9507 examples
<built-in method with_traceback of OSError object at 0x0000020F017860D8>
../data/all-processed\B_english_ousidhoum_processed.csv
Found a 

In [24]:
df = pd.read_csv(f'{DATA_DIR}/B_spanish_pereira_processed.csv')
try:
    print_pos_stats(df, 'es_core_news_sm')
except Exception as e:
    print(e)

Found a total of 4433 examples
Found a total of 1567 examples
| adjective | 11158 | 3924 | 16.80% | 18.45% | 9.83% | 0.000000 |
| adposition | 334 | 62 | 0.50% | 0.29% | -41.28% | 0.000017 |
| adverb | 1641 | 485 | 2.47% | 2.28% | -7.58% | 0.058753 |
| auxiliary | 932 | 355 | 1.40% | 1.67% | 19.14% | 0.002674 |
| coordinating conjunction | 250 | 83 | 0.38% | 0.39% | 4.50% | 0.371782 |
| determiner | 544 | 184 | 0.82% | 0.87% | 5.99% | 0.251655 |
| interjection | 93 | 26 | 0.14% | 0.12% | -10.31% | 0.293615 |
| noun | 14591 | 4707 | 21.97% | 22.13% | 0.74% | 0.309234 |
| numeral | 1593 | 386 | 2.40% | 1.82% | -24.19% | 0.000000 |
| particle | 3 | 2 | 0.00% | 0.01% | 134.18% | 0.157848 |
| pronoun | 378 | 117 | 0.57% | 0.55% | -2.78% | 0.386884 |
| proper noun | 9332 | 2623 | 14.05% | 12.33% | -12.21% | 0.000000 |
| punctuation | 12655 | 3856 | 19.06% | 18.13% | -4.84% | 0.001317 |
| subordinating conjunction | 720 | 192 | 1.08% | 0.90% | -16.42% | 0.011401 |
| symbol | 2962 | 1286 | 4.4