In [None]:
import glob
import traceback
import warnings
from collections import Counter

import emoji
import numpy as np
import pandas as pd
import spacy
from scipy.stats import beta

from calc_prob import calc_prob_between

In [None]:
# Relative path of the folder that holds the processed dataset CSV files.
DATA_DIR = "../data/all-processed"

In [None]:
# Lower-case language name -> two-letter language code.
language_codes = dict(
    english='en',
    arabic='ar',
    german='de',
    indonesian='id',
    italian='it',
    portuguese='pt',
    spanish='es',
    french='fr',
    turkish='tr',
    danish='da',
    hindi='hi',
)

def remove_emoji(text):
    """Return ``text`` with all emoji removed.

    ``emoji.get_emoji_regexp()`` was deprecated in emoji 1.7 and removed in
    emoji 2.0, so calling it unconditionally raises ``AttributeError`` on
    current releases. ``emoji.replace_emoji`` is used when the installed
    package provides it; the regex-based path is kept as a fallback for older
    installations.

    Args:
        text: the string to clean.

    Returns:
        The input string with emoji replaced by the empty string.
    """
    replace_emoji = getattr(emoji, 'replace_emoji', None)
    if replace_emoji is not None:
        return replace_emoji(text, replace='')
    # Older emoji releases only expose the compiled-regex helper.
    return emoji.get_emoji_regexp().sub(u'', text)

def slice_dataframe_and_compute_word_frequency(df, slice_cols, slice_vals, text_col, spacy_lang_pkg, max_length=4000000):
    """Count the content words in the rows of ``df`` that match a slice.

    A row is kept when ``df[slice_cols[i]] == slice_vals[i]`` holds for every
    ``i``. The text of the kept rows is joined with single spaces, emoji are
    stripped with ``remove_emoji`` and the result is parsed as one spaCy
    document. As a side effect the number of matching rows and a per-POS token
    count table are printed.

    Args:
        df: DataFrame holding the examples; it is not modified.
        slice_cols: column names to filter on.
        slice_vals: value required in the corresponding column of ``slice_cols``.
        text_col: name of the column that holds the text.
        spacy_lang_pkg: pipeline name passed to ``spacy.load``.
        max_length: value assigned to ``nlp.max_length`` (maximum number of
            characters spaCy accepts in one text). Defaults to the same limit
            that ``slice_dataframe_and_compute_pos_tags`` uses; spaCy raises
            ``ValueError`` when the joined text is longer.

    Returns:
        A ``collections.Counter`` mapping token text to its count, restricted
        to tokens that are not stop words, not punctuation and longer than one
        character.
    """
    # Boolean indexing returns a new frame, so no defensive copy is needed.
    sliced_df = df
    for idx, col in enumerate(slice_cols):
        sliced_df = sliced_df[sliced_df[col] == slice_vals[idx]]
    print(f'Found a total of {len(sliced_df)} examples')

    nlp = spacy.load(spacy_lang_pkg)
    nlp.max_length = max_length

    # NaN entries cannot be joined as strings; drop them and coerce the rest.
    texts = sliced_df[text_col].dropna().astype(str)
    doc = nlp(remove_emoji(' '.join(texts)))

    words = [token.text for token in doc if not token.is_stop and not token.is_punct and len(token) > 1]
    freqs = Counter(words)

    pos_counts = doc.count_by(spacy.attrs.POS)
    for k, v in sorted(pos_counts.items()):
        print(f'{k:{4}}. {doc.vocab[k].text:{5}}: {v}')
    print('total', sum(pos_counts.values()))
    return freqs

def slice_dataframe_and_compute_pos_tags(df, slice_cols, slice_vals, text_col, spacy_lang_pkg, max_length=4000000):
    """Count part-of-speech tags in the rows of ``df`` that match a slice.

    A row is kept when ``df[slice_cols[i]] == slice_vals[i]`` holds for every
    ``i``. The text of the kept rows is joined with single spaces, emoji are
    stripped with ``remove_emoji`` and the result is parsed as one spaCy
    document. The number of matching rows is printed as a side effect.

    Args:
        df: DataFrame holding the examples; it is not modified.
        slice_cols: column names to filter on.
        slice_vals: value required in the corresponding column of ``slice_cols``.
        text_col: name of the column that holds the text.
        spacy_lang_pkg: pipeline name passed to ``spacy.load``.
        max_length: value assigned to ``nlp.max_length`` (maximum number of
            characters spaCy accepts in one text). The default keeps the
            previously hard-coded limit; spaCy raises ``ValueError`` when the
            joined text is longer.

    Returns:
        A ``(pos_counts, total)`` tuple: ``pos_counts`` is the dict returned by
        ``doc.count_by(spacy.attrs.POS)`` (POS attribute id -> token count) and
        ``total`` is the sum of its values.
    """
    # Boolean indexing returns a new frame, so no defensive copy is needed.
    sliced_df = df
    for idx, col in enumerate(slice_cols):
        sliced_df = sliced_df[sliced_df[col] == slice_vals[idx]]
    print(f'Found a total of {len(sliced_df)} examples')

    nlp = spacy.load(spacy_lang_pkg)
    nlp.max_length = max_length

    # NaN entries cannot be joined as strings; drop them and coerce the rest.
    texts = sliced_df[text_col].dropna().astype(str)
    doc = nlp(remove_emoji(' '.join(texts)))

    pos_counts = doc.count_by(spacy.attrs.POS)
    total = sum(pos_counts.values())
    return pos_counts, total

def ab_test(imps_ctrl, convs_ctrl, imps_test, convs_test):
    """Compare a test group with a control group using Beta distributions.

    Each group is modelled as ``Beta(conversions + 1, impressions -
    conversions + 1)``; the ``+ 1`` terms correspond to a uniform Beta(1, 1)
    prior.

    Args:
        imps_ctrl: number of trials (impressions) in the control group.
        convs_ctrl: number of successes (conversions) in the control group.
        imps_test: number of trials in the test group.
        convs_test: number of successes in the test group.

    Returns:
        A ``(lift, prob)`` tuple. ``lift`` is the relative difference between
        the test and control distribution means. ``prob`` is whatever
        ``calc_prob_between(test, control)`` returns -- presumably the
        probability that the test rate beats the control rate; confirm against
        the ``calc_prob`` module.
    """
    posterior_ctrl = beta(convs_ctrl + 1, imps_ctrl - convs_ctrl + 1)
    posterior_test = beta(convs_test + 1, imps_test - convs_test + 1)

    ctrl_mean = posterior_ctrl.mean()
    relative_lift = (posterior_test.mean() - ctrl_mean) / ctrl_mean
    return relative_lift, calc_prob_between(posterior_test, posterior_ctrl)

# spaCy POS attribute id (as a string) -> human-readable part-of-speech name.
# Keys are strings because print_pos_stats looks them up with str(id).
pos_dict = {
    '84': 'adjective',
    '85': 'adposition',
    '86': 'adverb',
    '87': 'auxiliary',
    '89': 'coordinating conjunction',
    '90': 'determiner',
    '91': 'interjection',
    '92': 'noun',
    '93': 'numeral',
    '94': 'particle',
    '95': 'pronoun',
    '96': 'proper noun',
    '97': 'punctuation',
    '98': 'subordinating conjunction',
    '99': 'symbol',
    '100': 'verb',
    '103': 'space',
}

def print_pos_stats(df, spacy_pkg):
    """Print Markdown table rows comparing POS-tag usage of ``hs == 1`` vs ``hs == 0`` rows.

    POS counts are computed separately for the rows of ``df`` whose ``hs``
    column equals 1 and those where it equals 0 (presumably hateful vs. normal
    examples, going by the variable names), using the ``text`` column. For
    every tag id that has a label in ``pos_dict`` one row is printed with the
    two counts, the two shares of the slice total, and the lift/probability
    returned by ``ab_test`` (``hs == 0`` as control, ``hs == 1`` as test).

    A tag seen in only one slice is reported with a count of 0 for the other
    slice. Tag ids without a label in ``pos_dict`` are skipped. If either
    slice has no tokens, or ``ab_test`` fails for a tag, a warning is issued
    instead of silently dropping the output.

    Args:
        df: DataFrame with an ``hs`` column and a ``text`` column.
        spacy_pkg: spaCy pipeline name forwarded to the POS-counting helper.

    Returns:
        None. Results are written to stdout.
    """
    hs_tags, hs_total = slice_dataframe_and_compute_pos_tags(df, ['hs'], [1], 'text', spacy_pkg)
    normal_tags, normal_total = slice_dataframe_and_compute_pos_tags(df, ['hs'], [0], 'text', spacy_pkg)

    # The shares below divide by both totals, so an empty slice cannot be compared.
    if hs_total == 0 or normal_total == 0:
        warnings.warn(
            f'print_pos_stats: nothing to compare (hs=1 tokens: {hs_total}, hs=0 tokens: {normal_total})'
        )
        return

    for tag_id in sorted(set(hs_tags) | set(normal_tags)):
        pos = pos_dict.get(str(tag_id))
        if pos is None:
            # No human-readable label for this tag id: leave it out of the table.
            continue
        hs_pos_count = hs_tags.get(tag_id, 0)
        normal_pos_count = normal_tags.get(tag_id, 0)
        hs_pos_percent = hs_pos_count / hs_total
        normal_pos_percent = normal_pos_count / normal_total
        try:
            lift, prob = ab_test(normal_total, normal_pos_count, hs_total, hs_pos_count)
        except Exception as err:
            # Stay best-effort for the remaining tags, but report the failure.
            warnings.warn(f'print_pos_stats: skipping {pos}: {type(err).__name__}: {err}')
            continue
        print(f'| {pos} | {normal_pos_count} | {hs_pos_count} | {normal_pos_percent*100:2.2f}% | {hs_pos_percent*100:2.2f}% | {lift*100:2.2f}% | {prob:2.6f} |')

In [None]:
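# Disabled batch run: would call print_pos_stats on every CSV under ../data/all-processed.
# NOTE: the loop always passes 'en_core_web_sm', whatever the dataset's language.
# hateful_sentiment_dict = {}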
# for path in glob.glob('../data/all-processed/*.csv'):
#     try:
#         path_in_str = str(path)
#         print(path_in_str)
#         df = pd.read_csv(path_in_str)
#         print_pos_stats(df, 'en_core_web_sm')

#     except Exception as e:
#         print(e.with_traceback)
#         continue

In [None]:
# Load the Spanish dataset and print its POS comparison table.
df = pd.read_csv(f'{DATA_DIR}/B_spanish_pereira_processed.csv')
try:
    print_pos_stats(df, 'es_core_news_sm')
except Exception:
    # Keep the notebook running, but show the exception type and traceback:
    # print(e) alone hides both (a KeyError would print only the missing key).
    traceback.print_exc()