In [1]:
import pandas as pd

In [2]:
# Load the alpha results table; the path is relative to the notebook's working directory.
df = pd.read_csv('../results_kalpha.csv')

In [3]:
def clean_df(row):
    """Round the alpha columns and tag a row with its confidence bound.

    Intended to be used as ``df.apply(clean_df, axis=1)``.

    Parameters
    ----------
    row : pandas.Series (or any mapping that offers ``.copy()``)
        Must provide the keys ``'est.alpha'``, ``'ci.boot.alpha'`` and
        ``'Unnamed: 0'``.

    Returns
    -------
    A copy of ``row`` in which both alpha values are rounded to 3 decimals
    and a ``'CI'`` entry is set to ``'2.5%'``, ``'97.5%'``, or ``None`` when
    the label in ``'Unnamed: 0'`` names neither bound.
    """
    # Work on a copy: pandas does not support functions passed to ``apply``
    # mutating the object they receive.
    cleaned = row.copy()
    cleaned['est.alpha'] = round(cleaned['est.alpha'], 3)
    cleaned['ci.boot.alpha'] = round(cleaned['ci.boot.alpha'], 3)

    label = str(cleaned['Unnamed: 0'])
    # '97.5%' is tested first so it takes precedence if a label ever
    # contains both markers.
    if '97.5%' in label:
        cleaned['CI'] = '97.5%'
    elif '2.5%' in label:
        cleaned['CI'] = '2.5%'
    else:
        # Always emit the key so every returned row has the same index.
        cleaned['CI'] = None
    return cleaned
        

In [4]:
# Inspect the column labels of the loaded frame.
df.columns

Index(['Unnamed: 0', 'obs.agr.alpha', 'est.alpha', 'ci.boot.alpha', 'X', 'Y'], dtype='object')

In [5]:
# Run clean_df on every row (axis=1); `df` is rebound to the cleaned frame.
df = df.apply(clean_df, axis=1)

In [6]:
# Split the frame by the 'CI' tag and rename the bootstrap column so each
# half carries its own bound name.
df25 = (
    df.loc[df['CI'] == '2.5%']
    .rename(columns={'ci.boot.alpha': 'CI2.5'})
)
df97 = (
    df.loc[df['CI'] == '97.5%']
    .rename(columns={'ci.boot.alpha': 'CI97.5'})
)

In [7]:
# Remove the label and tag columns from both halves.
# `drop(..., errors='ignore')` keeps this cell re-runnable: a second
# execution is a no-op instead of raising KeyError.
df25 = df25.drop(columns=['Unnamed: 0', 'CI'], errors='ignore')
df97 = df97.drop(columns=['Unnamed: 0', 'CI'], errors='ignore')

In [8]:
# Join the lower- and upper-bound halves into one row per (X, Y) pair.
# The keys are spelled out rather than left to the implicit intersection of
# column names, and `validate` raises if a key combination is duplicated on
# either side instead of silently multiplying rows.
data = df25.merge(
    df97,
    on=['obs.agr.alpha', 'est.alpha', 'X', 'Y'],
    how='inner',
    validate='one_to_one',
)

In [9]:
# Preview the first rows of the upper-bound (97.5%) half.
df97.head()

Unnamed: 0,obs.agr.alpha,est.alpha,CI97.5,X,Y
1,0.098177,-0.024,-0.002,text_recessie_gold,text_recessie
3,0.293128,0.247,0.288,text_boukes_gold,text_boukes
5,0.427069,0.349,0.393,text_LIWC_gold,text_LIWC
7,0.373773,0.155,0.204,text_sentistrength_gold,text_sentistrength
9,0.330996,0.087,0.138,text_pattern_gold,text_pattern


In [10]:
# Preview the first rows of the lower-bound (2.5%) half.
df25.head()

Unnamed: 0,obs.agr.alpha,est.alpha,CI2.5,X,Y
0,0.098177,-0.024,-0.046,text_recessie_gold,text_recessie
2,0.293128,0.247,0.201,text_boukes_gold,text_boukes
4,0.427069,0.349,0.307,text_LIWC_gold,text_LIWC
6,0.373773,0.155,0.104,text_sentistrength_gold,text_sentistrength
8,0.330996,0.087,0.034,text_pattern_gold,text_pattern


In [11]:
# Display the merged frame (one row per X/Y pair with both bounds).
data

Unnamed: 0,obs.agr.alpha,est.alpha,CI2.5,X,Y,CI97.5
0,0.098177,-0.024,-0.046,text_recessie_gold,text_recessie,-0.002
1,0.293128,0.247,0.201,text_boukes_gold,text_boukes,0.288
2,0.427069,0.349,0.307,text_LIWC_gold,text_LIWC,0.393
3,0.373773,0.155,0.104,text_sentistrength_gold,text_sentistrength,0.204
4,0.330996,0.087,0.034,text_pattern_gold,text_pattern,0.138
5,0.398317,0.258,0.209,text_polyglot_gold,text_polyglot,0.303
6,0.365358,0.154,0.099,text_DANEW_gold,text_DANEW,0.200
7,0.424264,0.315,0.268,text_top4_gold,text_top4,0.360
8,0.432974,-0.010,-0.017,title_recessie_gold,title_recessie,-0.005
9,0.447629,0.073,0.055,title_boukes_gold,title_boukes,0.094


# Results for Table 1
Comparison of title vs. text results for each tool against the gold standard.

In [12]:
def format_data(row, decimals=2):
    """Add a ``'result'`` string of the form ``"est [low, high]"`` to a row.

    Intended to be used as ``data.apply(format_data, axis=1)``.

    Parameters
    ----------
    row : pandas.Series (or any mapping that offers ``.copy()``)
        Must provide numeric ``'est.alpha'``, ``'CI2.5'`` and ``'CI97.5'``.
    decimals : int, default 2
        Number of decimals printed for each of the three values.

    Returns
    -------
    A copy of ``row`` with an extra ``'result'`` entry.
    """
    def _fixed(value):
        # Fixed-point formatting keeps trailing zeros ("0.20", not "0.2"),
        # so every cell of the table has the same precision.
        return '{:.{p}f}'.format(value, p=decimals)

    # Work on a copy: pandas does not support functions passed to ``apply``
    # mutating the object they receive.
    formatted = row.copy()
    formatted['result'] = '{} [{}, {}]'.format(
        _fixed(row['est.alpha']),
        _fixed(row['CI2.5']),
        _fixed(row['CI97.5']),
    )
    return formatted

In [13]:
# Run format_data on every row (axis=1); `data` is rebound to the frame with the added 'result' column.
data = data.apply(format_data, axis=1)

In [14]:
# Rows whose X label mentions both 'gold' and 'title', restricted to the
# columns reported in the table.
data.loc[
    data['X'].str.contains('gold') & data['X'].str.contains('title'),
    ['Y', 'est.alpha', 'CI2.5', 'CI97.5', 'result'],
]

Unnamed: 0,Y,est.alpha,CI2.5,CI97.5,result
8,title_recessie,-0.01,-0.017,-0.005,"-0.01 [-0.02, -0.01]"
9,title_boukes,0.073,0.055,0.094,"0.07 [0.06, 0.09]"
10,title_LIWC,0.226,0.199,0.251,"0.23 [0.2, 0.25]"
11,title_sentistrength,0.179,0.15,0.205,"0.18 [0.15, 0.2]"
12,title_pattern,0.169,0.142,0.197,"0.17 [0.14, 0.2]"
13,title_polyglot,0.25,0.222,0.276,"0.25 [0.22, 0.28]"
14,title_DANEW,0.218,0.191,0.246,"0.22 [0.19, 0.25]"
15,title_top4,0.298,0.27,0.324,"0.3 [0.27, 0.32]"


In [15]:
# Rows whose X label mentions both 'gold' and 'text', restricted to the
# estimate and its two bounds.
data.loc[
    data['X'].str.contains('gold') & data['X'].str.contains('text'),
    ['Y', 'est.alpha', 'CI2.5', 'CI97.5'],
]

Unnamed: 0,Y,est.alpha,CI2.5,CI97.5
0,text_recessie,-0.024,-0.046,-0.002
1,text_boukes,0.247,0.201,0.288
2,text_LIWC,0.349,0.307,0.393
3,text_sentistrength,0.155,0.104,0.204
4,text_pattern,0.087,0.034,0.138
5,text_polyglot,0.258,0.209,0.303
6,text_DANEW,0.154,0.099,0.2
7,text_top4,0.315,0.268,0.36


# Table 4

In [16]:
# Keep every row whose X label contains 'title'.
titles = data.loc[data['X'].str.contains('title')]

In [17]:
# Labels for the gold standard and the individual tools.
# NOTE(review): this list is not referenced by any cell shown here, and
# 'top4' is absent — confirm whether that is intentional.
columns = [
    'gold',
    'recessie',
    'boukes',
    'LIWC',
    'sentistrength',
    'pattern',
    'polyglot',
    'DANEW',
]

In [18]:
# Write the title rows to an Excel file in the current working directory.
titles.to_excel('table4_titles.xlsx')

In [19]:
# Keep every row whose X label contains 'text'.
texts = data.loc[data['X'].str.contains('text')]

In [20]:
# Write the text rows to an Excel file in the current working directory.
texts.to_excel('table4_text.xlsx')