In [1]:
%load_ext autoreload
%autoreload 2

In [33]:
import sys
sys.path.insert(0, '../libs/semaxis')
from semaxis import CoreUtil
from semaxis import SemAxis
import pandas as pd
import logging
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import random
import numpy as np
import re
import glob
import os
import gc

In [3]:
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                    datefmt="%Y-%m-%d %H:%M:%S")
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

## Read data

In [4]:
dem = pickle.load(open('../../../data/all_tweet_texts_dem.p', 'rb'))
rep = pickle.load(open('../../../data/all_tweet_texts_rep.p', 'rb'))

In [9]:
moe_1 = pickle.load(open('../../../data/moe_sample_tweets_feb.p', 'rb'))
moe_2 = pickle.load(open('../../../data/moe_sample_tweets_mar.p', 'rb'))
moe_3 = pickle.load(open('../../../data/moe_sample_tweets_apr.p', 'rb'))
moe_4 = pd.read_csv('../../../data/moe_may_to_aug_sample.tsv', sep='\t',lineterminator='\n')
moe_4.columns = ['unnamed', 'Text', 'Time']
moe_4 = moe_4[['Text', 'Time']]
bg = pd.concat([moe_1, moe_2, moe_3, moe_4])

In [10]:
bg.head(2)

Unnamed: 0,Text,Time
0,RT @tedlieu: If you believe @realDonaldTrump’s...,Sat Feb 29 11:00:48 -0500 2020
1,No hay #Coronavirus #Curico DESCARTADO https:/...,Sat Feb 29 11:00:48 -0500 2020


In [11]:
dem['Party'] = 'Dem'
rep['Party'] = 'Rep'
bg['Party'] = 'Bg'

In [12]:
covid_vocab = pickle.load(open('../../../data/all_covid_words.p', 'rb'))

In [13]:
big_regex = re.compile('|'.join(map(re.escape, covid_vocab)))

In [14]:
dem['Text'] = dem.apply(lambda row: big_regex.sub('covid', row['Text'].lower()), axis=1)
rep['Text'] = dem.apply(lambda row: big_regex.sub('covid', row['Text'].lower()), axis=1)
bg['Text'] = dem.apply(lambda row: big_regex.sub('covid', row['Text'].lower()), axis=1)

###  If using non-covid politician tweets as background

In [22]:
# def get_covid_tweets(df):
#     all_covid_tweets = []
#     all_bg_tweets = []
#     for idx, row in df.iterrows():
#         try:
#             t = row['Text']
#             if 'covid' in t or 'coronavirus' in t:
#                 all_covid_tweets.append(row)
#             else:
#                 all_bg_tweets.append(row)
#         except Exception as ex:
#             continue
        
#     return pd.DataFrame(all_covid_tweets), pd.DataFrame(all_bg_tweets)

In [23]:
# dem_covid, dem_bg = get_covid_tweets(dem)
# rep_covid, rep_bg = get_covid_tweets(rep)
# bg = pd.concat([dem_bg, rep_bg])
# bg['Party'] = 'Bg'

### End if

In [15]:
df_all = pd.concat([bg, dem, rep])

In [16]:
df_all.head(2)

Unnamed: 0,Text,Time,Party
0,we had our own bad weather &amp; power outages...,Sat Feb 29 11:00:48 -0500 2020,Bg
1,rt @agneronha: questions about scams? price go...,Sat Feb 29 11:00:48 -0500 2020,Bg


In [17]:
df_all['Party'].value_counts()

Bg     837156
Dem    134337
Rep     85456
Name: Party, dtype: int64

In [18]:
len(df_all)

1056949

## Create semaxis object

In [31]:
# from gensim.test.utils import datapath, get_tmpfile
# from gensim.models import KeyedVectors
# from gensim.scripts.glove2word2vec import glove2word2vec
# glove_file = '../../data/embeddings/glove.840B.300d.txt'
# tmp_file = "../../data/embeddings/test_word2vec.txt"
# output = glove2word2vec(glove_file, tmp_file)


In [32]:
# output

In [19]:
# %%time
# sa = SemAxis(CoreUtil.load_embedding("glove/all_glove_gensim_word2vec.txt", is_binary=False), 
#                axes_str=CoreUtil.load_wordnet_antonyms_axes())

In [22]:
# pickle.dump(sa, open('sa_object.p', 'wb'))

In [20]:
sa = pickle.load(open('sa_object.p', 'rb'))

In [21]:
sa

<semaxis.semaxis.SemAxis at 0x1a26cb6f90>

In [22]:
len(sa.axes)

1367

In [25]:
df_all = df_all.head(1000)

# big table for bias

In [26]:
COLUMNS = [str(c) for c in sorted(sa.axes.keys()) if len(c) == 2]
with open("big_table_by_average.tsv", "w") as fo_a, open("big_table_by_kurtosis.tsv", "w") as fo_k:
    fo_a.write("party\ttext\t{}\n".format("\t".join(COLUMNS)))
    fo_k.write("party\ttext\t{}\n".format("\t".join(COLUMNS)))
    print(df_all['Party'].value_counts())
    for loop_index, (row_index, row) in enumerate(df_all.iterrows()):
        if loop_index % (1000) == 0:
            logger.info(loop_index)
        try:
            _, mean, kurtosis = sa.compute_document_mean_kurtosis_with_tf([row['Text']], min_freq = 1)
        except ValueError as ve:
#             logger.exception("no vocab")
            continue
        except:
            logger.exception("???")
            continue
        fo_a.write("{}\t{}\t{}\n".format(row['Party'], row['Text'], "\t".join([str(v) for v in mean])))
        fo_k.write("{}\t{}\t{}\n".format(row['Party'], row['Text'], "\t".join([str(v) for v in kurtosis])))


2020-08-31 17:12:43 __main__     INFO     0


Bg    1000
Name: Party, dtype: int64


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# big table for intensity

In [30]:
COLUMNS = [str(c) for c in sorted(sa.axes.keys()) if len(c) == 2]

with open("big_table_by_second_moment_with_corpus_mean.tsv", "w") as fo_k:
    fo_k.write("party\ttext\t{}\n".format("\t".join(COLUMNS)))
    df_corpus = pd.read_csv("big_table_by_average.tsv", sep="\t")
    df_corpus.dropna(inplace=True)
    corpus_mean = np.mean(df_corpus.drop(
        columns=[c for c in df_corpus.columns if '(' not in c]).values, axis=0)
    print(df_all['Party'].value_counts())
    for loop_index, (row_index, row) in enumerate(df_all.iterrows()):
        if loop_index % (1000) == 0:
            logger.info(loop_index)
        try:
            sm = sa.compute_document_second_moment_with_tf([row['Text']], corpus_mean, min_freq = 1)  
        except ValueError as ve:
#             logger.exception("no vocab")
            continue
        except:
            logger.exception("???")
            continue
        fo_k.write("{}\t{}\t{}\n".format(row['Party'], row['Text'], "\t".join([str(v) for v in sm[0]])))



2020-08-31 17:15:40 __main__     INFO     0


Bg    1000
Name: Party, dtype: int64


## Bootstrap avg

In [42]:
df_bias = pd.read_csv('big_table_by_average.tsv', sep='\t')

In [44]:
df_bias.head(1)

Unnamed: 0,party,text,"('abaxial', 'adaxial')","('able', 'unable')","('abnormal', 'normal')","('aboral', 'oral')","('abridged', 'unabridged')","('absent', 'present')","('absolute', 'relative')","('abstemious', 'gluttonous')",...,"('unwrinkled', 'wrinkled')","('unwritten', 'written')","('useful', 'useless')","('valuable', 'worthless')","('vernal', 'wintry')","('virtuous', 'wicked')","('waning', 'waxing')","('weightless', 'weighty')","('winged', 'wingless')","('wired', 'wireless')"
0,Bg,we had our own bad weather &amp; power outages...,0.022656,-0.051889,0.000475,-0.013127,0.020974,0.033418,0.018463,0.007195,...,0.003468,0.00167,-0.019633,-0.02396,0.02789,0.017726,-0.007187,-0.005524,0.026845,-0.019947


In [45]:
N = 1000
mode = "average"
logger.info(mode)
try:
    os.mkdir(mode)
except:
    pass

# df = pd.read_csv("big_table_by_{}.tsv".format(mode), sep="\t").dropna()
COLUMNS = [c for c in df.columns if '(' in c]
len(COLUMNS)

2020-08-31 17:26:52 __main__     INFO     average


1367

In [46]:
df_bias = df_bias[df_bias['party'].isin(['Bg', 'Dem', 'Rep'])]
df_bias.dropna(inplace=True)

In [47]:
for party, a_count in df_bias['party'].value_counts().iteritems():
    logger.info("{}...".format(party))
    with open("{}/bootstrap_{}_average_by_party.tsv".format(mode, party), "w") as fo:
        fo.write("{}\n".format("\t".join(COLUMNS)))
        A = df.drop(columns=[c for c in df.columns if '(' not in c]).values
        for i in range(N):
            if i % 100 == 0:
                logger.info(i)
            fmean = np.mean(A[np.random.choice(A.shape[0], a_count, replace=False), :], axis=0)
            fo.write("{}\n".format("\t".join([str(v) for v in (fmean)])))        


2020-08-31 17:27:05 __main__     INFO     Bg...
2020-08-31 17:27:05 __main__     INFO     0
2020-08-31 17:27:06 __main__     INFO     100
2020-08-31 17:27:06 __main__     INFO     200
2020-08-31 17:27:07 __main__     INFO     300
2020-08-31 17:27:08 __main__     INFO     400
2020-08-31 17:27:09 __main__     INFO     500
2020-08-31 17:27:09 __main__     INFO     600
2020-08-31 17:27:10 __main__     INFO     700
2020-08-31 17:27:11 __main__     INFO     800
2020-08-31 17:27:12 __main__     INFO     900


## Significant axes avg

In [48]:
N = 1000
BOOTSTRAP_TEMPLATE = "{}/bootstrap_{}_{}_by_party.tsv"
OUT_TEMPLATE = "{}/significant_axes_{}_{}_by_party.tsv"
OUT_TEMPLATE2 = "{}/effect_size_significant_axes_{}_{}_by_party.tsv"

for party, a_count in df_bias['party'].value_counts().iteritems():
    
    results = []        
    logger.info("{}...".format(party))
    
    df_actual = df.query('party==@party')
    df_bootstrap = pd.read_csv(BOOTSTRAP_TEMPLATE.format(mode, party, mode), sep="\t").dropna()

    for axis in COLUMNS:
        actual = np.mean(df_actual[axis], axis=0)
        significance = sum(abs(df_bootstrap[axis]) > abs(actual))/float(N)
        results.append([axis, actual-np.mean(df_bootstrap[axis], axis=0), significance])

    pd.DataFrame(sorted(results, key=lambda x:x[2]), 
                 columns = ["axis", "diff_a_b", "p"]
                ).to_csv(OUT_TEMPLATE.format(mode, party, mode), sep="\t", index=False)

    pd.DataFrame(sorted(results, key=lambda x:abs(x[1]), reverse=True), 
         columns = ["axis", "diff_a_b", "p"]
        ).query('p <= 0.05').to_csv(OUT_TEMPLATE2.format(mode, party, mode), sep="\t", index=False)

2020-08-31 17:27:15 __main__     INFO     Bg...


## Bootstrap second moment

In [49]:
df_intensity = pd.read_csv('big_table_by_second_moment_with_corpus_mean.tsv', sep='\t')

In [50]:
df_intensity = df_intensity[df_intensity['party'].isin(['Bg', 'Dem', 'Rep'])]
df_intensity.dropna(inplace=True)


In [51]:
df_intensity.head(1)

Unnamed: 0,party,text,"('abaxial', 'adaxial')","('able', 'unable')","('abnormal', 'normal')","('aboral', 'oral')","('abridged', 'unabridged')","('absent', 'present')","('absolute', 'relative')","('abstemious', 'gluttonous')",...,"('unwrinkled', 'wrinkled')","('unwritten', 'written')","('useful', 'useless')","('valuable', 'worthless')","('vernal', 'wintry')","('virtuous', 'wicked')","('waning', 'waxing')","('weightless', 'weighty')","('winged', 'wingless')","('wired', 'wireless')"
0,Bg,we had our own bad weather &amp; power outages...,0.001675,0.005763,0.007386,0.004856,0.003896,0.006247,0.002556,0.002861,...,0.001968,0.005208,0.001898,0.003583,0.001832,0.006001,0.003959,0.003805,0.00401,0.004992


In [52]:
N = 1000
mode = "second_moment"
logger.info(mode)
try:
    os.mkdir(mode)
except:
    pass

COLUMNS = [c for c in df_intensity.columns if '(' in c]
len(COLUMNS)

2020-08-31 17:28:02 __main__     INFO     second_moment


1367

In [53]:
for party, a_count in df['party'].value_counts().iteritems():
    logger.info("{}...".format(party))
    with open("{}/bootstrap_{}_second_moment_with_corpus_mean_by_party.tsv".format(mode, party), "w") as fo:
        fo.write("{}\n".format("\t".join(COLUMNS)))
        A = df.drop(columns=[c for c in df.columns if '(' not in c]).values

        for i in range(N):
            if i % 100 == 0:
                logger.info(i)
            fmean = np.mean(A[np.random.choice(A.shape[0], a_count, replace=False), :], axis=0)
            fo.write("{}\n".format("\t".join([str(v) for v in (fmean)])))        

2020-08-31 17:31:13 __main__     INFO     Bg...
2020-08-31 17:31:13 __main__     INFO     0
2020-08-31 17:31:14 __main__     INFO     100
2020-08-31 17:31:15 __main__     INFO     200
2020-08-31 17:31:15 __main__     INFO     300
2020-08-31 17:31:16 __main__     INFO     400
2020-08-31 17:31:17 __main__     INFO     500
2020-08-31 17:31:18 __main__     INFO     600
2020-08-31 17:31:19 __main__     INFO     700
2020-08-31 17:31:19 __main__     INFO     800
2020-08-31 17:31:20 __main__     INFO     900


## Significant axes second moment

In [54]:
N = 1000
BOOTSTRAP_TEMPLATE = "{}/bootstrap_{}_second_moment_with_corpus_mean_by_party.tsv"
OUT_TEMPLATE = "{}/significant_axes_{}_second_moment_with_corpus_mean_by_party.tsv"
OUT_TEMPLATE2 = "{}/effect_size_significant_axes_{}_second_moment_with_corpus_mean_by_party.tsv"

for party, a_count in df['party'].value_counts().iteritems():

    results = []        
    logger.info("{}...".format(party))
    
    df_actual = df.query('party==@party')
    df_bootstrap = pd.read_csv(BOOTSTRAP_TEMPLATE.format(mode, party), sep="\t").dropna()
    for axis in COLUMNS:
        actual = np.mean(df_actual[axis], axis=0)
        significance = sum(abs(df_bootstrap[axis]) > abs(actual))/float(N)
        results.append([axis, actual-np.mean(df_bootstrap[axis], axis=0), significance])

    pd.DataFrame(sorted(results, key=lambda x:x[2]), 
                 columns = ["axis", "diff_a_b", "p"]
                ).to_csv(OUT_TEMPLATE.format(mode, party), sep="\t", index=False)

    pd.DataFrame(sorted(results, key=lambda x:abs(x[1]), reverse=True), 
         columns = ["axis", "diff_a_b", "p"]
        ).query('p <= 0.05').to_csv(OUT_TEMPLATE2.format(mode, party), sep="\t", index=False)

2020-08-31 17:32:38 __main__     INFO     Bg...
