In [None]:
# xgboost and torch are imported by later cells, so install them with the rest.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install econml numpy scikit-learn pandas pyarrow scipy matplotlib xgboost torch

In [None]:
# Dataset variant; used as the filename prefix when the parquet splits are loaded.
# NOTE(review): this name shadows the builtin `type` for the rest of the notebook.
# Renaming it requires updating every cell that reads it.
type = "summary_50"

In [None]:
import pandas as pd

# Feature (x) and label (y) parquet files for the train and test splits of the
# selected dataset variant.
train_x_path = f"df/{type}_train_df_x.gzip"
test_x_path = f"df/{type}_test_df_x.gzip"
train_y_path = f"df/{type}_train_df_y.gzip"
test_y_path = f"df/{type}_test_df_y.gzip"

type_train_df_x, type_test_df_x = pd.read_parquet(train_x_path), pd.read_parquet(test_x_path)
type_train_df_y, type_test_df_y = pd.read_parquet(train_y_path), pd.read_parquet(test_y_path)

# Put features and labels side by side for each split, then renumber the rows.
joined_train_df = pd.concat((type_train_df_x, type_train_df_y), axis="columns").reset_index(drop=True)
joined_test_df = pd.concat((type_test_df_x, type_test_df_y), axis="columns").reset_index(drop=True)

# Stack train on top of test into one frame with a fresh 0..n-1 index.
joined = pd.concat((joined_train_df, joined_test_df), ignore_index=True)

In [None]:
# Sanity check: (rows, columns) of the combined train + test frame.
joined.shape

In [None]:
import torch

# Use the GPU when torch can see one, otherwise fall back to the CPU.
# calculate_ate passes this string to the XGBoost classifiers via `device=`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

In [None]:
from econml.dr import DRLearner, ForestDRLearner
import xgboost as xgb
from sklearn.preprocessing import LabelBinarizer
import numpy as np
from scipy.special import expit

# Example treatment / topic / outcome column names for a single topic.
# NOTE(review): no filtering happens in this cell, and none of the visible cells
# read these globals; calculate_ate takes sentiment_col and topic_col as
# arguments and sets outcome_col locally. Probably leftovers; confirm before removing.
sentiment_col = 'sentiment Donald Trump'
topic_col = 'topic Donald Trump'
outcome_col = 'int_bias'

In [None]:
import time

def calculate_ate(sentiment_col, topic_col):
    """Estimate sentiment treatment effects on each bias class for one topic.

    Restricts the module-level ``joined`` frame to the rows where ``topic_col``
    equals True. For every distinct ``int_bias`` value in that subset it builds
    a one-vs-rest binary outcome, fits a ``DRLearner`` to obtain average
    treatment effects, and fits a ``ForestDRLearner`` whose effect modifiers
    are the topic indicator columns that are active at least once in the subset.

    Depends on the module-level names ``joined`` (DataFrame) and ``device``
    (string handed to XGBoost).

    Parameters
    ----------
    sentiment_col : str
        Treatment column; cast to int. The contrasts below assume the levels
        -1, 0 and 1 are all present in the subset.
    topic_col : str
        Topic indicator column used to select rows (compared against True).

    Returns
    -------
    tuple of (dict, dict)
        ``ate_results`` maps bias class -> {"+1 vs 0", "-1 vs 0", "+1 vs -1"}
        -> rounded value. ``cate_models`` maps bias class -> fitted
        ``ForestDRLearner``. A class is left out of both when its binary
        outcome has fewer than 5 positives or fewer than 5 negatives.
    """
    outcome_col = 'int_bias'

    start = time.perf_counter()
    print(f"investigating {sentiment_col}")
    train_sub = joined[joined[topic_col] == True].copy()

    # Treatment as an (n, 1) integer array.
    T = train_sub[[sentiment_col]].astype(int).values

    # Controls: every column except the treatment, the selecting topic and the outcome.
    W = train_sub[
        [col for col in train_sub.columns
         if col not in [sentiment_col, topic_col, outcome_col]]
    ].values

    # One-vs-rest outcomes are built by direct comparison per class.
    # LabelBinarizer returns a single column when only two classes are present,
    # which made indexing column 1 fail.
    outcome = train_sub[outcome_col].to_numpy()
    bias_classes = np.unique(outcome)

    # Nuisance models; they run on the GPU only when `device` resolved to 'cuda'.
    model_y = xgb.XGBClassifier(
        tree_method='hist',
        eval_metric='logloss',
        n_estimators=100,
        max_depth=8,
        random_state=0,
        device=device,
        objective='binary:logistic',
    )

    # NOTE(review): the treatment normally has three levels while the objective
    # says binary; XGBClassifier is expected to switch to a multiclass objective
    # on its own in that case. Confirm against the installed xgboost version.
    model_t = xgb.XGBClassifier(
        tree_method='hist',
        eval_metric='mlogloss',
        n_estimators=100,
        max_depth=8,
        random_state=0,
        device=device,
        objective='binary:logistic',
    )

    ate_results = {}
    cate_models = {}

    # Effect modifiers X: topic indicators that are active at least once in the subset.
    # Later cells rebuild this same column list to call effect(), so keep the rule in sync.
    total = [col for col in train_sub.columns if col.startswith("topic")]
    nonzero_columns = [col for col in total if (train_sub[col] != False).any()]
    print(f"effect modifier columns = {len(nonzero_columns)}, total columns = {len(total)}")

    X = train_sub[nonzero_columns].astype(int).values

    for cls in bias_classes:
        Y_bin = (outcome == cls).astype(int)

        # minlength=2 keeps a zero count visible when one of the two labels is absent.
        if np.bincount(Y_bin, minlength=2).min() < 5:
            print(f"Skipping class {cls} due to insufficient samples.")
            continue

        dr = DRLearner(model_propensity=model_t, model_regression=model_y, discrete_outcome=True)
        dr.fit(Y=Y_bin, T=T, W=W)

        # ATE for each categorical contrast that is reported.
        ate_pos_vs_neutral = dr.ate(T0=np.array([[0]]), T1=np.array([[1]]))
        ate_neg_vs_neutral = dr.ate(T0=np.array([[0]]), T1=np.array([[-1]]))
        ate_pos_vs_neg = dr.ate(T0=np.array([[-1]]), T1=np.array([[1]]))

        # NOTE(review): the values are passed through expit and re-centred at
        # expit(0) = 0.5. With discrete_outcome=True the estimate is presumably
        # already a difference in probabilities, so this transform may distort
        # it. Left unchanged here; confirm the intended scale.
        ate_results[cls] = {
            "+1 vs 0": round(expit(ate_pos_vs_neutral[0]) - expit(0), 4),
            "-1 vs 0": round(expit(ate_neg_vs_neutral[0]) - expit(0), 4),
            "+1 vs -1": round(expit(ate_pos_vs_neg[0]) - expit(0), 4),
        }

        # Heterogeneous effects over the topic indicators.
        forest_dr = ForestDRLearner(
            model_regression=model_y,
            model_propensity=model_t,
            discrete_outcome=True,
        )
        forest_dr.fit(Y=Y_bin, T=T, X=X, W=W)
        cate_models[cls] = forest_dr

    end = time.perf_counter()

    print(f"Elapsed time: {end - start:.4f} seconds")

    return (ate_results, cate_models)


In [None]:
# Sentiment columns to analyse. Each one is paired with the topic column of the
# same name ("sentiment X" -> "topic X") when the estimation loop runs.
gain_features = [
    'sentiment US Senate',
    'sentiment Healthcare',
    'sentiment Immigration',
    'sentiment Donald Trump',
    'sentiment GOP',
    'sentiment Elizabeth Warren',
    'sentiment Elections',
    'sentiment Politics',
    'sentiment Terrorism',
    'sentiment Joe Biden',
    'sentiment Hillary Clinton',
    'sentiment Impeachment',
    'sentiment Media Bias',
    'sentiment White House',
    'sentiment Justice Department',
]

In [None]:
# Per-feature result stores, keyed by sentiment column name.
# NOTE(review): `cate_feaures` is a misspelling of "cate_features". Other cells
# read it under this exact name, so it has to be renamed everywhere at once.
ate_features, cate_feaures = {}, {}

In [None]:
# Run the estimation once per sentiment feature, pairing it with the topic
# column of the same name, and file the results under the sentiment column.
# The loop variables `ate` and `cate` stay defined afterwards and hold the
# results of the last feature only.
for sentiment_feature in gain_features:
    topic_feature = sentiment_feature.replace("sentiment", "topic")
    ate, cate = calculate_ate(sentiment_feature, topic_feature)
    ate_features[sentiment_feature], cate_feaures[sentiment_feature] = ate, cate
    

In [None]:
# Show the ATE tables collected for every sentiment feature.
ate_features

In [None]:
# Look up the CATE model for the US Senate sentiment feature, bias class 0.
# The per-feature store must be used here: after the feature loop, `cate` only
# holds the last feature's models keyed by bias class, so indexing it with a
# feature name raises KeyError.
cate_feaures['sentiment US Senate'][0]

In [None]:
# Rows where the US Senate topic is active, and the topic columns that are
# active at least once within those rows. This mirrors how calculate_ate picks
# its effect-modifier columns, so the column order matches the fitted model.
train_sub = joined.loc[joined['topic US Senate'] == True].copy()
nonzero_columns = [
    col
    for col in train_sub.columns
    if col.startswith("topic") and (train_sub[col] != False).any()
]

In [None]:
# Pick the fitted CATE model for the US Senate feature, bias class 0.
# Read from the per-feature store: the loop variable `cate` holds only the last
# feature's models (keyed by bias class) and is rebound to an array further down.
model = cate_feaures['sentiment US Senate'][0]
# First effect-modifier column name.
# NOTE(review): `test` is reassigned later before it is read in the visible cells.
test = nonzero_columns[0]

In [None]:
# Inspect the treatment names reported by the fitted model.
model.cate_treatment_names()

In [None]:
# Effect-modifier matrix for every row of `joined` (not only the topic subset),
# restricted to the columns selected from the US Senate subset.
x_example = joined[nonzero_columns].values

In [None]:
# Predicted effect for each row of x_example, using the model's default contrast.
# NOTE: this rebinds `cate`, which the feature loop had set to the last
# feature's model dict; cells that index `cate` by name will not work after this.
cate = model.effect(X=x_example)
print(f"CATE for Bias = 0: {cate[0]:.4f}")

In [None]:
# Flatten the predicted effects to 1-D and show how many there are.
cate_values = cate.ravel()
len(cate_values)

In [None]:
# Topic indicator columns for every row, plus the predicted effect as a "cate" column.
df_real_cate = joined[nonzero_columns].assign(cate=cate_values)

In [None]:
# Mean predicted CATE over the rows where each topic is present.
real_cate_by_topic = {}
for topic in nonzero_columns:
    group_means = df_real_cate.groupby(topic)["cate"].mean()
    # Key the group means by truthiness so the lookup works for both boolean
    # and 0/1 indicator columns. Looking up the integer label 1 on a boolean
    # index only worked through pandas' deprecated positional fallback.
    means_by_flag = {bool(flag): value for flag, value in group_means.items()}
    real_cate_by_topic[topic] = {
        "Present": means_by_flag.get(True, np.nan)
    }

# One row per topic, with a single "Present" column.
real_cate_by_topic_df = pd.DataFrame(real_cate_by_topic).T

In [None]:
# Spot check: mean predicted CATE for each value of the 'topic Border' indicator.
df_real_cate.groupby('topic Border')["cate"].mean()

In [None]:
# Show the per-topic mean CATE table.
real_cate_by_topic_df

In [None]:
# Keep the topics whose mean CATE when present exceeds the threshold.
# The table is built with a column named "Present"; the previous label
# 'Present (1)' does not exist and raised KeyError.
# NOTE(review): a threshold of 3 looks large for an effect on a binary outcome
# and may select nothing; confirm the intended cutoff.
test = real_cate_by_topic_df[real_cate_by_topic_df['Present'] > 3]

In [None]:
# Show the filtered per-topic table.
test

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the filtered per-topic mean CATE, with a dashed zero reference line.
ax = test.plot(kind="bar", figsize=(14, 6))
ax.axhline(0, color="gray", linestyle="--")
ax.set_ylabel("Average CATE")
ax.set_title("CATE Variation by Topic Presence")
ax.tick_params(axis="x", labelrotation=90)
ax.grid(axis='y')
ax.figure.tight_layout()
plt.show()

In [None]:
# `ate` is the variable left behind by the feature loop, so these are the bias
# classes of the last feature processed only.
ate.keys()

In [None]:
# ATE table of the last feature processed by the feature loop.
ate

In [None]:
# Collects, per sentiment feature, the top per-topic CATE summaries for each bias class.
total_cate_results = {}

In [None]:
# For every sentiment feature and bias class: predict the CATE for all rows of
# `joined`, average it by presence/absence of each topic, and keep the ten
# topics with the largest absolute mean effect when present.
for sentiment, cate_models in cate_feaures.items():
    topic = sentiment.replace('sentiment', 'topic')
    # Rebuild the effect-modifier column list exactly as calculate_ate did, so
    # the column order matches what each model was fitted on.
    train_sub = joined[joined[topic] == True].copy()
    nonzero_columns = [col for col in train_sub.columns if col.startswith("topic") and (train_sub[col] != False).any()]

    X_df = joined[nonzero_columns].copy()
    top_cate_results = {}

    for cls, model in cate_models.items():
        print(f"{topic} - bias {cls}")
        cates = model.effect(X_df.values).ravel()
        df_real_cate = X_df.copy()
        df_real_cate["CATE"] = cates
        real_cate_by_topic = {}
        for col in nonzero_columns:
            group_means = df_real_cate.groupby(col)["CATE"].mean()
            # Look the groups up by truthiness instead of by position: when a
            # topic has a single value across `joined`, positional access either
            # raised IndexError or reported the "present" mean as "Absent".
            means_by_flag = {bool(flag): value for flag, value in group_means.items()}
            real_cate_by_topic[col] = {
                "Present": means_by_flag.get(True, float("nan")),
                "Absent": means_by_flag.get(False, float("nan")),
            }

        df = pd.DataFrame(real_cate_by_topic).T
        # Rank by |Present|; missing values sort last.
        top10 = df.loc[df['Present'].abs().sort_values(ascending=False).index].head(10)
        top_cate_results[cls] = top10.to_dict()

    total_cate_results[sentiment] = top_cate_results

In [None]:
# Debug view: `group_means` is whatever the most recently executed loop left
# behind (last topic column of the last model processed).
group_means

In [None]:
# Debug view: first entry of the leftover group means.
group_means.iloc[0]

In [None]:
# One row per sentiment feature, one column per bias class; each cell holds the
# nested dict of top-topic summaries. Rebinds `test`.
test = pd.DataFrame.from_dict(total_cate_results, orient='index')

In [None]:
import json

# Write the results next to the notebook, named after the dataset variant that
# was loaded. The prefix was previously hard-coded, so switching the variant
# would have overwritten the summary_50 files with results from other data.
pd.DataFrame.from_dict(ate_features, orient='index').to_csv(f"{type}_ate.csv")
pd.DataFrame.from_dict(total_cate_results, orient='index').to_csv(f"{type}_total_cate.csv")