In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats import pearsonr, spearmanr

In [2]:
def get_regression_df(subreddit_focus, year, political=True, data_dir='../data'):
    """Load one subreddit-year regression table and derive the model features.

    Reads ``<data_dir>/regression_data_<subreddit_focus>-<year>-8-1-61d.tsv``
    (tab-separated), drops every row that has a missing value, keeps only rows
    where both the parent and the child author have a non-zero prior leaning,
    and then keeps either the replies with ``child_comment_political > 0.5``
    (``political=True``) or those with ``child_comment_political <= 0.5``
    (``political=False``).

    Parameters
    ----------
    subreddit_focus : str
        Subreddit name as it appears in the data file name.
    year : int or str
        Year as it appears in the data file name.
    political : bool, default True
        Selects which side of the 0.5 ``child_comment_political`` threshold
        is kept.
    data_dir : str, default '../data'
        Directory holding the TSV files. The default reproduces the path
        that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the filtered rows with these columns added:
        binary indicators, log10-transformed and standardized numeric
        features, interaction terms, and the binary outcome columns
        ``more_cp_actv`` / ``more_sp_actv`` / ``more_np_actv``.
    """
    path = '{}/regression_data_{}-{}-8-1-61d.tsv'.format(data_dir, subreddit_focus, year)
    raw = pd.read_csv(path, sep='\t').dropna()

    # Both authors must have a non-zero leaning; the political flag only
    # decides which side of the 0.5 threshold is retained.
    both_leaning = (
        (raw['parent_author_leaning_before'] != 0) &
        (raw['child_author_leaning_before'] != 0)
    )
    if political:
        topic_mask = raw['child_comment_political'] > 0.5
    else:
        topic_mask = raw['child_comment_political'] <= 0.5

    # Explicit copy: columns are added below, and writing into a filtered
    # slice would otherwise raise pandas' SettingWithCopyWarning.
    df = raw[both_leaning & topic_mask].copy()

    parent_leaning = df['parent_author_leaning_before']
    child_leaning = df['child_author_leaning_before']

    # Binarize terms
    df['parent_author_left'] = (parent_leaning < 0).astype(int)
    df['child_author_left'] = (child_leaning < 0).astype(int)
    # cross_party: the two leanings have opposite signs; same_party: same sign.
    df['cross_party'] = (
        ((parent_leaning < 0) & (child_leaning > 0)) |
        ((parent_leaning > 0) & (child_leaning < 0))
    ).astype(int)
    df['same_party'] = (
        ((parent_leaning < 0) & (child_leaning < 0)) |
        ((parent_leaning > 0) & (child_leaning > 0))
    ).astype(int)

    # "Extreme" means a leaning value of exactly -2 or 2.
    df['parent_author_extreme'] = ((parent_leaning == -2) | (parent_leaning == 2)).astype(int)
    df['child_author_extreme'] = ((child_leaning == -2) | (child_leaning == 2)).astype(int)
    df['parent_author_extreme_left'] = df['parent_author_left'] * df['parent_author_extreme']

    polarity = df['child_comment_polarity']
    df['child_comment_pos'] = (polarity > 0).astype(int)
    df['child_comment_neg'] = (polarity < 0).astype(int)
    df['child_comment_extreme'] = ((polarity > 0.5) | (polarity < -0.5)).astype(int)
    df['child_comment_toxic'] = (df['child_comment_toxicity'] > 0.7).astype(int)
    df['child_comment_url'] = (df['child_comment_n_urls'] > 0).astype(int)

    # conflict: (child score - 1) and (parent score - 1) have opposite signs.
    df['conflict'] = (
        (df['child_comment_score'] - 1) * (df['parent_comment_score'] - 1) < 0
    ).astype(int)

    # Take logarithm (inputs are floored at 1, so the log is never negative)
    log_sources = [
        ('log_child_score', 'child_comment_score'),
        ('log_child_length', 'child_comment_length'),
        ('log_pa_cp_actv_before', 'parent_cp_actv_before'),
        ('log_pa_sp_actv_before', 'parent_sp_actv_before'),
        ('log_pa_np_actv_before', 'parent_np_actv_before'),
        ('log_ca_cp_actv_before', 'child_cp_actv_before'),
        ('log_ca_sp_actv_before', 'child_sp_actv_before'),
        ('log_ca_np_actv_before', 'child_np_actv_before'),
    ]
    for log_name, source_name in log_sources:
        df[log_name] = np.log10(np.maximum(df[source_name], 1))

    # Standardization: center, then divide by two population standard deviations.
    for log_name, _ in log_sources:
        df[log_name] = (df[log_name] - df[log_name].mean()) / (2 * df[log_name].std(ddof=0))

    # Interaction terms
    df['interaction_child_url'] = df['interaction'] * df['child_comment_url']
    df['interaction_child_neg'] = df['interaction'] * df['child_comment_neg']
    df['interaction_child_pos'] = df['interaction'] * df['child_comment_pos']
    df['interaction_child_toxic'] = df['interaction'] * df['child_comment_toxic']
    df['interaction_conflict'] = df['interaction'] * df['conflict']
    df['interaction_log_child_score'] = df['interaction'] * df['log_child_score']
    df['interaction_log_child_length'] = df['interaction'] * df['log_child_length']

    df['cp_interaction'] = df['cross_party'] * df['interaction']
    df['sp_interaction'] = df['same_party'] * df['interaction']

    df['cp_interaction_nested'] = df['cp_interaction'] * df['nested']
    df['sp_interaction_nested'] = df['sp_interaction'] * df['nested']
    df['cp_interaction_nonnested'] = df['cp_interaction'] * (1 - df['nested'])
    df['sp_interaction_nonnested'] = df['sp_interaction'] * (1 - df['nested'])

    # The conflict terms are built on the non-nested interactions only.
    df['cp_interaction_conflict'] = df['cp_interaction_nonnested'] * df['conflict']
    df['sp_interaction_conflict'] = df['sp_interaction_nonnested'] * df['conflict']

    df['cp_interaction_ca_extreme'] = df['cp_interaction'] * df['child_author_extreme']
    df['sp_interaction_ca_extreme'] = df['sp_interaction'] * df['child_author_extreme']

    # Outcome variable: 1.0 when the parent's activity count rose from
    # "before" to "after", else 0.0.
    for kind in ('cp', 'sp', 'np'):
        after = df['parent_' + kind + '_actv_after']
        before = df['parent_' + kind + '_actv_before']
        df['more_' + kind + '_actv'] = (after - before > 0).astype(float)

    return df


In [3]:
# Load the political-reply frame for each subreddit/year combination.
# NOTE(review): frames are keyed by year only, so listing more than one
# subreddit here would overwrite earlier entries for the same year.
subreddits = ('news', )
study_years = range(2014, 2019)

year2df = {}
for sub in subreddits:
    for year in study_years:
        year2df[year] = get_regression_df(sub, year, political=True)

In [4]:
# Stack the per-year frames into one table (original row indices are kept)
# and display the pooled row count.
all_years_df = pd.concat(list(year2df.values()))
len(all_years_df)

92642

In [5]:
def pvalue2asterisk(x):
    """Return the significance stars for p-value ``x``.

    '***' below 0.001, '**' below 0.01, '*' below 0.05, otherwise ''.
    """
    if x < 0.001:
        return '***'
    if x < 0.01:
        return '**'
    if x < 0.05:
        return '*'
    return ''


def add_results_to_table(res, df=None, method='fdr_bh', baseline_BIC=0):
    """Append one fitted model to the running results table.

    Parameters
    ----------
    res : fitted model result
        Must expose ``params`` and ``pvalues`` (pandas Series indexed by term
        name) and a scalar ``bic``.
    df : pandas.DataFrame or None
        Table built by earlier calls. ``None`` starts a new table; that first
        row is treated as the baseline and its 'BIC' cell is the string '0'.
    method : str
        Currently unused; kept so existing call sites keep working. The
        p-values used for the stars are the uncorrected ``res.pvalues``.
    baseline_BIC : float
        BIC of the baseline model; later rows report
        ``int(res.bic - baseline_BIC)`` as a string.

    Returns
    -------
    (pandas.DataFrame, float)
        The table with one extra row, and ``res.bic``. Each coefficient cell
        is the estimate with three decimals followed by significance stars.
    """
    if df is None:
        bic_cell = str(0)
    else:
        bic_cell = str(int(res.bic - baseline_BIC))
    main_stat = pd.Series(bic_cell, index=['BIC'])

    # Align p-values and coefficients on the term names before formatting.
    r = pd.concat([res.pvalues, res.params], axis=1)
    r.columns = ['pvalues_uncorrected', 'logits']
    coef_cells = r.apply(
        lambda term: '{:.3f}'.format(term['logits']) + pvalue2asterisk(term['pvalues_uncorrected']),
        axis=1,
    )

    new_row = pd.DataFrame([pd.concat([main_stat, coef_cells])])
    if df is None:
        return new_row, res.bic
    return pd.concat([df, new_row], ignore_index=True), res.bic
    

In [6]:
# Covariates of the baseline model: parent-author indicators and the three
# standardized log prior-activity measures.
base_model = [
    'parent_author_left',
    'parent_author_extreme',
    'log_pa_cp_actv_before',
    'log_pa_sp_actv_before',
    'log_pa_np_actv_before',
]

# Candidate feature groups, listed in the order they are tried on top of
# the current feature list.
feature_sets = [
    # cross-/same-party interaction, split by nested vs. non-nested
    [
        'cp_interaction_nested',
        'cp_interaction_nonnested',
        'sp_interaction_nested',
        'sp_interaction_nonnested',
    ],
    # reply-text properties
    [
        'interaction_log_child_length',
        'interaction_child_url',
        'interaction_child_pos',
        'interaction_child_neg',
        'interaction_child_toxic',
    ],
    # reply score
    [
        'interaction_log_child_score',
    ],
    # conflicting parent/child scores
    [
        'cp_interaction_conflict',
        'sp_interaction_conflict',
    ],
]

In [7]:
# Baseline: logit of more_cp_actv on the base covariates only. Its BIC is the
# reference that every later row of the table is reported against.
formula = "more_cp_actv ~ " + ' + '.join(base_model)
base_model_results = sm.logit(formula=formula, data=all_years_df).fit()
results_df, baseline_BIC = add_results_to_table(base_model_results)

# Try each feature group on top of the currently accepted features; a group
# is kept only if the resulting model's BIC beats the best BIC seen so far.
features = list(base_model)
best_BIC = baseline_BIC
for feature_set in feature_sets:
    new_features = features + feature_set
    formula = "more_cp_actv ~ " + ' + '.join(new_features)
    res = sm.logit(formula=formula, data=all_years_df).fit()
    results_df, BIC = add_results_to_table(
        res, df=results_df, baseline_BIC=baseline_BIC
    )
    if not BIC < best_BIC:
        continue
    best_BIC = BIC
    features = new_features
    


Optimization terminated successfully.
         Current function value: 0.449683
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.449021
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.448965
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.449021
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.449012
         Iterations 6


In [8]:
# Display labels for the model terms that appear in the results table.
varname2label = {
    # base covariates
    'parent_author_left': 'Left-Wing User',
    'parent_author_extreme': 'Hyper-Partisan User',
    'log_pa_cp_actv_before': 'User Out-Party Activity',
    'log_pa_sp_actv_before': 'User In-Party Activity',
    'log_pa_np_actv_before': 'User Non-Partisan Activity',
    # party interaction terms
    'sp_interaction_nonnested': 'Same-Party Interaction',
    'sp_interaction_nested': 'Nested Same-Party Interaction',
    'cp_interaction_nonnested': 'Cross-Party Interaction',
    'cp_interaction_nested': 'Nested Cross-Party Interaction',
    # reply properties
    'interaction_child_neg': 'Negative Reply Text',
    'interaction_child_pos': 'Positive Reply Text',
    'interaction_child_toxic': 'Toxic Reply Text',
    'interaction_child_url': 'Reply with URL',
    'interaction_log_child_length': 'Reply Length',
    'interaction_log_child_score': 'Reply Score',
    # score conflict
    'cp_interaction_conflict': 'Cross-Party Conflictual Scores',
    'sp_interaction_conflict': 'Same-Party Conflictual Scores',
}

# Fallback rules, checked in this order: strip the prefix, rename the
# remainder, then append the suffix.
PREFIX_SUFFIXES = (
    ('log_', ' (log)'),
    ('pa_', ' Subject'),
    ('parent_', ' Subject'),
    ('ca_', ' Interlocutor'),
    ('child_', ' Interlocutor'),
)


def rename(x):
    """Turn a variable name into a human-readable label.

    Names listed in ``varname2label`` map directly to their label. Otherwise
    the first matching prefix rule is applied recursively; a name matching no
    rule is returned with underscores replaced by spaces.
    """
    label = varname2label.get(x)
    if label is not None:
        return label
    for prefix, suffix in PREFIX_SUFFIXES:
        if x.startswith(prefix):
            return rename(x[len(prefix):]) + suffix
    return x.replace('_', ' ')

In [9]:
# Presentation table: blank out missing cells, put terms on the rows with
# readable labels, and name the model columns A, B, C, ...
results_table = results_df.fillna("").transpose()
results_table.index = results_table.index.map(rename)
n_models = len(results_table.columns)
results_table.columns = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ"[:n_models])
results_table

Unnamed: 0,A,B,C,D,E
BIC,0,-76,-30,-65,-55
Intercept,-1.132***,-1.252***,-1.252***,-1.252***,-1.252***
Left-Wing User,-0.686***,-0.688***,-0.688***,-0.688***,-0.688***
Hyper-Partisan User,-0.318***,-0.241***,-0.241***,-0.241***,-0.241***
User Out-Party Activity,0.208***,0.213***,0.213***,0.213***,0.213***
User In-Party Activity,0.115***,0.108***,0.108***,0.108***,0.108***
User Non-Partisan Activity,0.259***,0.189***,0.189***,0.189***,0.189***
Nested Cross-Party Interaction,,0.177***,0.156***,0.177***,0.177***
Cross-Party Interaction,,0.080,0.057,0.080,0.078
Nested Same-Party Interaction,,0.262***,0.242***,0.262***,0.262***


In [10]:
# print(results_table.to_latex())