# Preparation

## Configure IPython

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

## Load libraries

In [2]:
from scripts import analyze # some analysis helper functions for this paper
import pandas 
import datetime
import os
import numpy
import statsmodels.formula.api as sm
import seaborn

ModuleNotFoundError: No module named 'pandas'

## prepare some visualization stuff

In [None]:
# Default figure size (in inches) for every seaborn/matplotlib figure below.
seaborn.set(rc={'figure.figsize': (11.7, 8.27)})


def corplot(cortable, title):
    """Draw an annotated heatmap of `cortable` on a fixed [-1, 1] colour scale and set `title`."""
    heatmap_axes = seaborn.heatmap(
        cortable,
        linewidth=.1,
        annot=True,
        cmap="RdBu_r",
        vmin=-1,
        vmax=1,
    )
    return heatmap_axes.set_title(title)


# Display order of the sentiment tools and the matching text/title column names.
tool_order = ["recessie", "boukes", "LIWC", "sentistrength", "pattern", "polyglot", "DANEW"]
text_order = ["text_" + name for name in tool_order]
title_order = ["title_" + name for name in tool_order]

# Preprocessing

In [None]:
# Directory holding the raw input files.
path_to_files = "./data"
# CSV file in which the computed sentiment scores are cached.
coded_files = "./results/data_with_sentiment.csv"
# Create the output directory up front; do nothing if it already exists.
os.makedirs('results', exist_ok=True)

## Do or Load Sentiment analysis

In [None]:
# Compute the sentiment scores from scratch unless a cached result file exists.
if not os.path.exists(coded_files):
    print("Starting from scratch")
    raw_data = analyze.load_files(path_to_files)
    print("Recoding")
    rec_data = analyze.recode_annotations(raw_data)
    print("Adding sentiment")
    sen_data = analyze.add_sentiments(analyze.add_sentiments(rec_data, 'text'), 'title')
    print("Adding linguistic features for error analysis")
    tex_data = analyze.add_text_properties(analyze.add_text_properties(sen_data, 'text'), 'title')
    print("Writing results to disk")
    tex_data.to_csv(coded_files)
    data = tex_data
else:
    print("Using previously stored computations")
    data = pandas.read_csv(coded_files)

# Index by article ID so the LIWC scores below can be joined on it.
data.index = data.ID


def _load_liwc_scores(filename, column):
    """Load one LIWC result file as a frame with the single column `column`.

    The column holds Posemo - Negemo and the frame is indexed by the ID taken
    from LIWC column `A` (cast to float64). Returns None when `filename` does
    not exist inside `path_to_files`.
    """
    liwc_path = os.path.join(path_to_files, filename)
    if not os.path.isfile(liwc_path):
        return None
    liwc = pandas.read_csv(liwc_path)
    # Rows labelled 0 and 1 are discarded (presumably header residue of the
    # LIWC export -- TODO confirm).
    scores = pandas.DataFrame({'ID': liwc.A, column: liwc.Posemo - liwc.Negemo}).loc[2:, :]
    scores.index = scores.ID.astype("float64")
    # Keyword form: the positional axis argument of drop() was removed in pandas 2.0.
    return scores.drop(columns='ID')


# Add LIWC results if available
for liwc_file, liwc_column in (
    ("LIWC2015 Results (ID_Text.csv).csv", 'text_LIWC'),
    ("LIWC2015 Results (ID_Title.csv).csv", 'title_LIWC'),
):
    liwc_scores = _load_liwc_scores(liwc_file, liwc_column)
    if liwc_scores is not None:
        data = data.join(liwc_scores)


def todate(d):
    """Parse a 'YYYY-MM-DD[T| ]HH:MM:SS[.fraction]' string into a naive datetime."""
    return datetime.datetime.strptime(d.replace('T', ' ').split('.')[0], '%Y-%m-%d %H:%M:%S')


# Set time-based index (saving as CSV breaks the time index, so it is rebuilt here).
data.index = data.date_y.map(todate)

# Filter out timepoints after which data becomes too sparse
cutoff = datetime.datetime(year=2015, month=7, day=7)
filtered = (data.index >= cutoff).sum()
data = data[data.index < cutoff]
print("filtered out", filtered, "observations")

# NOTE(review): pandas' dayofweek is Monday=0 ... Sunday=6, so this removes
# Mondays and Sundays. If the intent was to drop weekends it should be [5, 6]
# -- confirm before changing.
data = data[~data.index.dayofweek.isin([0, 6])]

## order columns appropriately

In [None]:
# Move the per-tool sentiment columns to the end of the frame: text tools
# first, then title tools, each in `tool_order`. Selecting columns directly
# avoids the copy/drop/join round trip (and the temporary integer index that
# round trip needed).
tool_columns = text_order + title_order
leading_columns = [col for col in data.columns if col not in tool_columns]
data = data[leading_columns + tool_columns]
# Rebuild the time index from date_y so it is in place whatever ran before.
data.index = data.date_y.map(todate)

### Option to toggle non-relevant human annotations to '0'

In [None]:
# Optional toggle: uncomment both lines to replace missing (NaN) gold annotations with 0.
#data.text_gold = data.text_gold.replace({numpy.nan:0})
#data.title_gold = data.title_gold.replace({numpy.nan:0})

## Resulting crosstabs for gold annotations

In [None]:
# Counts and rounded shares of each full-text gold label, split by the boolean
# `online` column, plus the overall counts.
print("Fulltext crosstab")
pandas.DataFrame(
    {
        'online': data["text_gold"][data.online].value_counts(),
        'online_%': data["text_gold"][data.online].value_counts(normalize=True).round(2),
        'offline': data["text_gold"][~data.online].value_counts(),
        'offline_%': data["text_gold"][~data.online].value_counts(normalize=True).round(2),
        'total': data["text_gold"].value_counts(),
    }
).sort_index()

In [None]:
# Counts and rounded shares of each headline gold label, split by the boolean
# `online` column, plus the overall counts.
print("Headline crosstab")
pandas.DataFrame(
    {
        'online': data["title_gold"][data.online].value_counts(),
        'online_%': data["title_gold"][data.online].value_counts(normalize=True).round(2),
        'offline': data["title_gold"][~data.online].value_counts(),
        'offline_%': data["title_gold"][~data.online].value_counts(normalize=True).round(2),
        'total': data["title_gold"].value_counts(),
    }
).sort_index()

## Computing Fleiss scores

In [None]:
# Load the semicolon-separated coder annotations; cells holding a single space
# are treated as missing.
annotations = pandas.read_csv(
    os.path.join(path_to_files, "Inhoudsanalyse_AllesMerged_noICR_toneOnly.csv"),
    delimiter=";",
).replace({" ": numpy.nan})
#annotations[["Toon_Kop","Posit_Nega"]] = annotations[["Toon_Kop","Posit_Nega"]].astype(float)

# Intercoder reliability per annotated variable (unit: ID, coder: Codeur).
print(
    "Headline Fleiss score, N: ",
    analyze.calculate_intercoder_reliability(annotations, "ID", "Codeur", "Toon_Kop"),
)
print(
    "Fulltext Fleiss score, N: ",
    analyze.calculate_intercoder_reliability(annotations, "ID", "Codeur", "Posit_Nega"),
)

## Add 'the best of the best' models

In [None]:
# Add the combined model for titles and then for texts; the tool list is
# handed to z_best in reversed order (DANEW first, LIWC last).
data = analyze.z_best(data, 'title', ["LIWC", "polyglot", "sentistrength", "DANEW"][::-1])
data = analyze.z_best(data, 'text', ["LIWC", "polyglot", "sentistrength", "DANEW"][::-1])

## Aggregations

In [None]:
# Article level: an independent copy of the full frame.
article_data = data.copy()
# Daily and weekly level: mean of every column per calendar day / week.
# resample() builds a new frame, so no defensive copy is needed first.
# NOTE(review): on pandas >= 2.0 mean() raises on non-numeric columns
# (e.g. date_y); numeric_only=True may be required there -- confirm.
daily_data = data.resample("1D").mean()
weekly_data = data.resample("1W").mean()

In [None]:
# Plot the number of non-null date_y values (i.e. articles) per day.
article_data.resample("1D").count().date_y.plot()

### Errors

In [None]:
# Error frames for each aggregation level; within a level the text errors are
# computed before the title errors.
article_text_errors, article_title_errors = (
    analyze.calculate_errors(article_data, field) for field in ('text', 'title')
)
daily_text_errors, daily_title_errors = (
    analyze.calculate_errors(daily_data, field) for field in ('text', 'title')
)
weekly_text_errors, weekly_title_errors = (
    analyze.calculate_errors(weekly_data, field) for field in ('text', 'title')
)

# Analysis

## Error analysis

### Predictors of title absolute errors

In [None]:
# Show the error-analysis result for titles, columns in tool_order.
analyze.analyze_errors(article_title_errors,'title')[tool_order]

### Predictors of text absolute errors

In [None]:
# Show the error-analysis result for texts, columns in tool_order.
# Uses the precomputed article-level text error frame, mirroring the title
# analysis which is fed article_title_errors.
# NOTE(review): assumes analyze_errors expects the output of
# calculate_errors -- confirm against scripts/analyze.py.
analyze.analyze_errors(article_text_errors, 'text')[tool_order]

## Article-level

### Distributions

In [None]:
# Compare headline scores of the gold annotation and every tool at article level.
standardized = analyze.compare_sentiment_means(
    article_data.loc[:, ["title_gold"] + title_order], field='title'
)
# Keyword form: the positional axis argument of drop() was removed in pandas 2.0.
plot = standardized.drop(columns='ID').plot(
    kind='box',
    figsize=(10, 15),
    title="Boxplot of headline sentiment scores at the Article level",
)
plot.set_xticklabels(['Human\nAnnotation','Recession','Damstra &\nBoukes','LIWC','Sentistrength','Pattern','Polyglot','DANEW'])

In [None]:
# Summary statistics of the headline gold column and all headline tool columns.
article_data.loc[:,["title_gold"]+title_order].describe()

In [None]:
from scipy.stats import kstest, kurtosis, skew

# For online and offline articles separately: Kolmogorov-Smirnov test against
# the standard normal CDF plus kurtosis and skewness of every text tool column.
row_format = "{:20.20s}: D={:0.3f} pval={:0.3f} kurtosis={:6.2f} skew={:6.2f}"
for heading, is_online in (("Online", True), ("Offline", False)):
    print(heading)
    subset = article_data[article_data.online == is_online]
    for col in text_order:
        values = subset[col].dropna()
        k, pval = kstest(values, cdf='norm')
        kurt = kurtosis(values)
        sk = skew(values)
        print(row_format.format(col, k, pval, kurt, sk))

In [None]:
# Violin plot with the individual observations overlaid, one group per column
# of `standardized` (ID excluded).
# Keyword form: the positional axis argument of drop() was removed in pandas 2.0.
seaborn.violinplot(x="variable", y="value", data=standardized.drop(columns='ID').melt())
seaborn.stripplot(x="variable", y="value", data=standardized.drop(columns='ID').melt(), color=".3", jitter=True)

In [None]:
# Compare full-text scores of the gold annotation and every tool at article level.
standardized = analyze.compare_sentiment_means(
    article_data.loc[:, ["text_gold"] + text_order], field='text'
)
# Keyword form: the positional axis argument of drop() was removed in pandas 2.0.
plot = standardized.drop(columns='ID').plot(
    kind='box',
    figsize=(10, 15),
    title="Boxplot of fulltext sentiment scores at the Article level",
)
plot.set_xticklabels(['Human\nAnnotation','Recession','Damstra &\nBoukes','LIWC','Sentistrength','Pattern','Polyglot','DANEW'])

In [None]:
# Same comparison restricted to rows where `online` is True.
standardized = analyze.compare_sentiment_means(
    article_data[article_data.online].loc[:, ["text_gold"] + text_order], field='text'
)
# Keyword form: the positional axis argument of drop() was removed in pandas 2.0.
plot = standardized.drop(columns='ID').plot(
    kind='box',
    figsize=(10, 10),
    title="Boxplot of online text sentiment scores at the Article level",
)
plot.set_xticklabels(['Manual\nAnnotation','Recession','Damstra &\nBoukes','LIWC','Sentistrength','Pattern','Polyglot','DANEW'])

In [None]:
# Same comparison restricted to rows where `online` is False.
standardized = analyze.compare_sentiment_means(
    article_data[~article_data.online].loc[:, ["text_gold"] + text_order], field='text'
)
# Keyword form: the positional axis argument of drop() was removed in pandas 2.0.
plot = standardized.drop(columns='ID').plot(
    kind='box',
    figsize=(10, 10),
    title="Boxplot of offline text sentiment scores at the Article level",
)
plot.set_xticklabels(['Manual\nAnnotation','Recession','Damstra &\nBoukes','LIWC','Sentistrength','Pattern','Polyglot','DANEW'])

In [None]:
# Summary statistics of the current `standardized` frame, transposed (one row per column).
standardized.describe().T

In [None]:
# Violin plot with the individual observations overlaid, one group per column
# of `standardized` (ID excluded).
# Keyword form: the positional axis argument of drop() was removed in pandas 2.0.
seaborn.violinplot(x="variable", y="value", data=standardized.drop(columns='ID').melt())
seaborn.stripplot(x="variable", y="value", data=standardized.drop(columns='ID').melt(), color=".3", jitter=True)

### Quality of models

In [None]:
# Article-level quality check for titles; rows shown in title_order.
analyze.check_quality(article_data,'title').loc[title_order,:]

In [None]:
# Article-level quality check for texts; rows shown in text_order.
analyze.check_quality(article_data,'text').loc[text_order,:]

### Correlations of results

In [None]:
# Article-level title correlation tests, restricted to gold + tools in title_order on both axes.
analyze.correlation_tests(article_data,'title').loc[['title_gold']+title_order,['title_gold']+title_order]

In [None]:
# Article-level text correlation tests, restricted to gold + tools in text_order on both axes.
analyze.correlation_tests(article_data,'text').loc[['text_gold']+text_order,['text_gold']+text_order]

### article-level correlations in online publications

In [None]:
# Title correlation tests for rows where `online` is True only.
analyze.correlation_tests(article_data[article_data.online],'title').loc[['title_gold']+title_order,['title_gold']+title_order]

In [None]:
# Summary statistics of the title_recessie scores for rows where `online` is True.
article_data.title_recessie[article_data.online].describe()

### article-level correlations in offline publications

In [None]:
# Title correlation tests for rows where `online` is False only.
analyze.correlation_tests(article_data[~article_data.online],'title').loc[['title_gold']+title_order,['title_gold']+title_order]

In [None]:
# Heatmap of article-level title correlations, computed with the title_recessie column dropped.
corplot(analyze.correlate_results(article_data.drop("title_recessie",axis=1),'title'),"Correlation heatmap of title sentiment scores at the article level")

In [None]:
# Article-level text correlation tests, restricted to gold + tools in text_order on both axes.
analyze.correlation_tests(article_data,'text').loc[['text_gold']+text_order,['text_gold']+text_order]

### Article-level body online correlations

In [None]:
# Text correlation tests for rows where `online` is True only.
analyze.correlation_tests(article_data[article_data.online],'text').loc[['text_gold']+text_order,['text_gold']+text_order]

### Article-level body offline correlations

In [None]:
# Text correlation tests for rows where `online` is False only.
analyze.correlation_tests(article_data[~article_data.online],'text').loc[['text_gold']+text_order,['text_gold']+text_order]

## Comparison table

In [None]:
def _gold_correlations(frame, field, order):
    """Return the `<field>_gold` column of the correlation-test table for `frame`.

    The table is restricted to the gold column plus `order` on both axes and
    the resulting series is relabelled "Human" followed by the tool names.
    """
    gold = field + "_gold"
    wanted = [gold] + order
    series = analyze.correlation_tests(frame, field).loc[wanted, wanted][gold]
    series.index = ["Human"] + tool_order
    return series


online_rows = article_data[article_data.online]
online_full = _gold_correlations(online_rows, 'text', text_order)
online_head = _gold_correlations(online_rows, 'title', title_order)
offline_rows = article_data[~article_data.online]
offline_full = _gold_correlations(offline_rows, 'text', text_order)
offline_head = _gold_correlations(offline_rows, 'title', title_order)
overall_full = _gold_correlations(article_data, 'text', text_order)
overall_head = _gold_correlations(article_data, 'title', title_order)

# One column per (subset, field) combination, one row per annotator/tool.
compared_cors = pandas.DataFrame({
    "online_fulltext": online_full,
    "online_headline": online_head,
    "offline_fulltext": offline_full,
    "offline_headline": offline_head,
    "all_fulltext": overall_full,
    "all_headline": overall_head,
})
compared_cors

In [None]:
def to_num(x):
    """Remove '*' characters and surrounding whitespace from `x` and parse it as a float."""
    return float(x.replace('*', '').strip())


# Sample sizes behind the correlations: the number of articles that HAVE a
# gold annotation. Counting the isnull() rows would give the number of
# unannotated articles instead.
# NOTE(review): assumes analyze.cor_compare(r1, r2, n1, n2) takes the number
# of observations behind r1 and r2 respectively -- confirm against
# scripts/analyze.py.
n_title = int(article_data.title_gold.notnull().sum())
n_text = int(article_data.text_gold.notnull().sum())

av_h, av_f = [], []
for classifier, headline, fulltext in zip(compared_cors.index, compared_cors.all_headline, compared_cors.all_fulltext):
    r_headline, r_fulltext = to_num(headline), to_num(fulltext)
    # Headline correlation is paired with the headline sample size, and
    # full-text with full-text.
    c_compare = analyze.cor_compare(r_headline, r_fulltext, n_title, n_text)
    av_h.append(r_headline)
    av_f.append(r_fulltext)
    print(classifier, round(c_compare['cordiff'], 2), c_compare['p_value'])

# Same comparison on the NaN-ignoring mean correlation of each field.
mean_h, mean_f = numpy.nanmean(av_h), numpy.nanmean(av_f)
print("average", mean_h, mean_f, analyze.cor_compare(mean_h, mean_f, n_title, n_text)['p_value'])

In [None]:
# Annotated heatmap of the article-level text correlations (lower colour bound fixed at -1).
seaborn.heatmap(
    analyze.correlate_results(article_data, 'text'),
    linewidth=.1,
    annot=True,
    cmap='RdBu_r',
    vmin=-1,
).set_title("Correlation heatmap of text sentiment scores at the article level")

### Correlations of errors

In [None]:
# Article-level correlation tests on the title errors (errors=True).
analyze.correlation_tests(article_data, 'title', errors=True)


In [None]:
# Heatmap of the article-level title error correlations.
corplot(analyze.correlate_results(article_data,'title',errors=True),"Heatmap of Article-level title error correlation coefficients")

In [None]:
# Article-level correlation tests on the text errors (errors=True).
analyze.correlation_tests(article_data, 'text',errors=True)

In [None]:
# Heatmap of the article-level text error correlations.
corplot(analyze.correlate_results(article_data,'text',errors=True),"Heatmap of Article-level text error correlation coefficients")

## Daily-level

### Distributions

In [None]:
# Box plot of the daily-level title scores returned by compare_sentiment_means.
standardized = analyze.compare_sentiment_means(daily_data, field='title')
# Keyword form: the positional axis argument of drop() was removed in pandas 2.0.
standardized.drop(columns='ID').plot(kind='box', figsize=(10, 10), title="Boxplot of title sentiment scores at the Daily level")

In [None]:
# Violin plot with the individual observations overlaid, one group per column
# of `standardized` (ID excluded).
# Keyword form: the positional axis argument of drop() was removed in pandas 2.0.
seaborn.violinplot(x="variable", y="value", data=standardized.drop(columns='ID').melt())
seaborn.stripplot(x="variable", y="value", data=standardized.drop(columns='ID').melt(), color=".3", jitter=True)

In [None]:
# Box plot of the daily-level text scores returned by compare_sentiment_means.
standardized = analyze.compare_sentiment_means(daily_data, field='text')
# Keyword form: the positional axis argument of drop() was removed in pandas 2.0.
standardized.drop(columns='ID').plot(kind='box', figsize=(10, 10), title="Boxplot of text sentiment scores at the Daily level")

In [None]:
# Violin plot with the individual observations overlaid, one group per column
# of `standardized` (ID excluded).
# Keyword form: the positional axis argument of drop() was removed in pandas 2.0.
seaborn.violinplot(x="variable", y="value", data=standardized.drop(columns='ID').melt())
seaborn.stripplot(x="variable", y="value", data=standardized.drop(columns='ID').melt(), color=".3", jitter=True)

### Quality of models

In [None]:
# Daily-level quality check for titles; rows shown in title_order.
analyze.check_quality(daily_data,'title').loc[title_order,:]

In [None]:
# Daily-level quality check for texts; rows shown in text_order.
analyze.check_quality(daily_data,'text').loc[text_order,:]

### Correlations of results

In [None]:
# Daily-level title correlation tests (full table).
analyze.correlation_tests(daily_data,'title')

In [None]:
# Heatmap of daily-level title correlations, computed with the title_recessie column dropped.
corplot(analyze.correlate_results(daily_data.drop("title_recessie",axis=1),'title'),"Correlation heatmap of title sentiment scores at the daily level")

In [None]:
# Daily-level text correlation tests (full table).
analyze.correlation_tests(daily_data,'text')

In [None]:
# Heatmap of daily-level text correlations. The title names the daily level
# because the plotted frame is daily_data.
corplot(analyze.correlate_results(daily_data, 'text'), "Correlation heatmap of text sentiment scores at the daily level")

### Correlations of errors

In [None]:
# Daily-level correlation tests on the title errors (errors=True).
analyze.correlation_tests(daily_data, 'title', errors=True)

In [None]:
# Heatmap of the daily-level title error correlations.
corplot(analyze.correlate_results(daily_data,'title',errors=True),"Heatmap of Daily-level title error correlation coefficients")

In [None]:
# Daily-level correlation tests on the text errors (errors=True).
analyze.correlation_tests(daily_data, 'text',errors=True)

In [None]:
# Heatmap of the daily-level text error correlations.
corplot(analyze.correlate_results(daily_data,'text',errors=True),"Heatmap of Daily-level text error correlation coefficients")

## Weekly-level

### Distributions

In [None]:
# Box plot of the weekly-level title scores returned by compare_sentiment_means.
standardized = analyze.compare_sentiment_means(weekly_data, field='title')
# Keyword form: the positional axis argument of drop() was removed in pandas 2.0.
standardized.drop(columns='ID').plot(kind='box', figsize=(10, 10), title="Boxplot of title sentiment scores at the week level")

In [None]:
# Violin plot with the individual observations overlaid, one group per column
# of `standardized` (ID excluded).
# Keyword form: the positional axis argument of drop() was removed in pandas 2.0.
seaborn.violinplot(x="variable", y="value", data=standardized.drop(columns='ID').melt())
seaborn.stripplot(x="variable", y="value", data=standardized.drop(columns='ID').melt(), color=".3", jitter=True)

In [None]:
# Box plot of the weekly-level text scores returned by compare_sentiment_means.
standardized = analyze.compare_sentiment_means(weekly_data, field='text')
# Keyword form: the positional axis argument of drop() was removed in pandas 2.0.
standardized.drop(columns='ID').plot(kind='box', figsize=(10, 10), title="Boxplot of text sentiment scores at the Week level")

In [None]:
# Violin plot with the individual observations overlaid, one group per column
# of `standardized` (ID excluded).
# Keyword form: the positional axis argument of drop() was removed in pandas 2.0.
seaborn.violinplot(x="variable", y="value", data=standardized.drop(columns='ID').melt())
seaborn.stripplot(x="variable", y="value", data=standardized.drop(columns='ID').melt(), color=".3", jitter=True)

### Quality of models

In [None]:
# Weekly-level quality check for titles; rows shown in title_order.
analyze.check_quality(weekly_data,'title').loc[title_order,:]

In [None]:
# Weekly-level quality check for texts; rows shown in text_order followed by the text_top3 row.
analyze.check_quality(weekly_data,'text').loc[text_order+["text_top3"],:]

### Correlations of results

In [None]:
# Weekly-level title correlation tests (full table).
analyze.correlation_tests(weekly_data,'title')

In [None]:
# Heatmap of weekly-level title correlations, computed with the title_recessie column dropped.
corplot(analyze.correlate_results(weekly_data.drop("title_recessie",axis=1),'title'),"Correlation heatmap of title sentiment scores at the Week level")

In [None]:
# Weekly-level text correlation tests, restricted to gold + tools in text_order on both axes.
analyze.correlation_tests(weekly_data,'text').loc[['text_gold']+text_order,["text_gold"]+text_order]

In [None]:
# Annotated heatmap of the weekly-level text correlations (lower colour bound fixed at -1).
seaborn.heatmap(
    analyze.correlate_results(weekly_data, 'text'),
    linewidth=.1,
    annot=True,
    cmap='RdBu_r',
    vmin=-1,
).set_title("Correlation heatmap of text sentiment scores at the Week level")

### Correlations of errors

In [None]:
# Weekly-level correlation tests on the title errors (errors=True).
analyze.correlation_tests(weekly_data, 'title', errors=True)


In [None]:
# Heatmap of the weekly-level title error correlations.
corplot(analyze.correlate_results(weekly_data,'title',errors=True),"Heatmap of Week-level title error correlation coefficients")

In [None]:
# Weekly-level correlation tests on the text errors (errors=True).
analyze.correlation_tests(weekly_data, 'text',errors=True)

In [None]:
# Heatmap of the weekly-level text error correlations.
corplot(analyze.correlate_results(weekly_data,'text',errors=True),"Heatmap of Week-level text error correlation coefficients")

## Method correlations with the baseline, compared across granularities

In [None]:
# Mean correlations for titles, computed on rows where all tool columns, the
# gold column and title_top3 are present; rows shown: tools, title_top3, N.
analyze.mean_correlations(
    data.loc[:, title_order + ["title_gold", "title_top3"]].dropna(),
    'title',
).loc[title_order + ['title_top3', 'N'], :]

In [None]:
# Mean correlations for texts, computed on rows where all tool columns, the
# gold column and text_top3 are present; rows shown: tools, text_top3, N.
analyze.mean_correlations(
    data.loc[:, text_order + ["text_gold", "text_top3"]].dropna(),
    'text',
).loc[text_order + ['text_top3', 'N'], :]

In [None]:
#min_max_norm = lambda x: (x-x.median())/(x.max()-x.min())
# Line plot of every weekly text error column except those of DANEW.
weekly_data[[
    name for name in weekly_data.columns
    if "text_" in name and "_err" in name and "text_DANEW" not in name
]].plot()

In [None]:
# Line plot of every weekly text column that is neither an error column nor a DANEW column.
weekly_data[[
    name for name in weekly_data.columns
    if "text_" in name and "_err" not in name and "text_DANEW" not in name
]].plot()

# Boukes bonus bonanza

In [None]:
# Run the complexity analysis on the article-level frame for the title field.
print("Predicting complexity of titles")
analyze.analyze_complexity(article_data, field='title')

In [None]:
# Run the complexity analysis on the article-level frame for the text field.
print("Predicting complexity of texts")
analyze.analyze_complexity(article_data, field='text')