In [1]:
!pip install econml numpy scikit-learn pandas pyarrow scipy matplotlib

[0m

In [4]:
# Dataset variant; it is interpolated into the parquet file names loaded in the next cell.
# NOTE(review): this name shadows the builtin `type()` for the rest of the kernel session.
# It is kept as-is because later cells reference `{type}` directly.
type = "summary_100"

In [5]:
import pandas as pd

# Features (x) and targets (y) are stored as separate parquet files per split;
# read them in the order train_x, test_x, train_y, test_y.
_parquet_paths = [
    f"df/{type}_train_df_x.gzip",
    f"df/{type}_test_df_x.gzip",
    f"df/{type}_train_df_y.gzip",
    f"df/{type}_test_df_y.gzip",
]
type_train_df_x, type_test_df_x, type_train_df_y, type_test_df_y = map(
    pd.read_parquet, _parquet_paths
)

# Put each split's targets next to its features, then renumber the rows from 0.
joined_train_df = pd.concat(
    [type_train_df_x, type_train_df_y], axis="columns"
).reset_index(drop=True)
joined_test_df = pd.concat(
    [type_test_df_x, type_test_df_y], axis="columns"
).reset_index(drop=True)

# Stack train on top of test into one frame with a fresh 0..n-1 index.
joined = pd.concat([joined_train_df, joined_test_df], axis="index", ignore_index=True)

In [6]:
# Sanity check: (rows, columns) of the combined train + test frame.
joined.shape

(45089, 5637)

In [7]:
import torch

# Pick the compute device once; it is handed to the XGBoost models later on.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [19]:


!pip install \
    --extra-index-url=https://pypi.nvidia.com \
    "cudf-cu12==25.4.*" "dask-cudf-cu12==25.4.*" "cuml-cu12==25.4.*" \
    "cugraph-cu12==25.4.*" "nx-cugraph-cu12==25.4.*" "cuspatial-cu12==25.4.*" \
    "cuproj-cu12==25.4.*" "cuxfilter-cu12==25.4.*" "cucim-cu12==25.4.*" \
    "pylibraft-cu12==25.4.*" "raft-dask-cu12==25.4.*" "cuvs-cu12==25.4.*" \
    "nx-cugraph-cu12==25.4.*"



Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
Collecting cudf-cu12==25.4.*
  Downloading https://pypi.nvidia.com/cudf-cu12/cudf_cu12-25.4.0-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dask-cudf-cu12==25.4.*
  Downloading https://pypi.nvidia.com/dask-cudf-cu12/dask_cudf_cu12-25.4.0-py3-none-any.whl (50 kB)
Collecting cuml-cu12==25.4.*
  Downloading https://pypi.nvidia.com/cuml-cu12/cuml_cu12-25.4.0-cp311-cp311-manylinux_2_28_x86_64.whl (9.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m69.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cugraph-cu12==25.4.*
  Downloading https://pypi.nvidia.com/cugraph-cu12/cugraph_cu12-25.4.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2

In [25]:
import cudf

# Copy the combined frame into a cuDF DataFrame.
# NOTE(review): `joined_cudf` is not referenced by any later cell visible in this notebook
# (the analysis below reads the pandas `joined` frame) — confirm whether this copy is needed.
joined_cudf = cudf.from_pandas(joined)

In [30]:
import time
from econml.dr import DRLearner, ForestDRLearner
import xgboost as xgb
from sklearn.preprocessing import LabelBinarizer
import numpy as np
from scipy.special import expit

def _make_xgb_classifier(eval_metric):
    """Build the XGBoost classifier shared by both nuisance models.

    The outcome and propensity models use the same hyper-parameters and run on the
    notebook-level `device`; only the evaluation metric differs.
    """
    return xgb.XGBClassifier(
        tree_method='hist',
        eval_metric=eval_metric,
        n_estimators=100,
        max_depth=8,
        random_state=0,
        device=device,
        objective='binary:logistic',
    )


def calculate_ate(sentiment_col, topic_col):
    """Estimate doubly-robust effects of one sentiment column on each bias class.

    Only rows of the notebook-level `joined` frame where `topic_col` is True are used.
    For every distinct value of the `int_bias` outcome a one-vs-rest binary target is
    built, a `DRLearner` is fitted to obtain average effects, and a `ForestDRLearner`
    is fitted with the topic indicator columns as effect modifiers.

    Parameters
    ----------
    sentiment_col : str
        Treatment column; cast to int (the notebook treats it as -1 / 0 / 1).
    topic_col : str
        Boolean column used to select the rows to analyse.

    Returns
    -------
    tuple(dict, dict)
        `ate_results` maps bias class -> {"+1 vs 0", "-1 vs 0", "+1 vs -1"} -> value
        rounded to 4 decimals; `cate_models` maps bias class -> fitted ForestDRLearner.
        Classes with fewer than 5 positive or 5 negative rows are skipped in both.
    """
    outcome_col = 'int_bias'

    start = time.time()
    print(f"investigating {sentiment_col}")
    train_sub = joined[joined[topic_col] == True].copy()

    # Treatment (categorical: -1, 0, 1), kept 2-D with one column.
    T = train_sub[[sentiment_col]].astype(int).values

    # Controls: every column except the treatment, the topic filter and the outcome.
    # NOTE(review): if the target frame carries other outcome-derived columns they would
    # end up in W as well — confirm the column set.
    W = train_sub[
        [col for col in train_sub.columns
         if col not in [sentiment_col, topic_col, outcome_col]]
    ].values

    # Distinct outcome classes, in the sorted order LabelBinarizer reports them.
    outcome_values = train_sub[outcome_col].values
    lb = LabelBinarizer()
    lb.fit(train_sub[outcome_col])
    bias_classes = lb.classes_

    # Nuisance models: outcome regression and treatment propensity.
    # NOTE(review): model_t is declared with a binary objective although the treatment
    # has three levels; XGBClassifier is expected to switch to a multiclass objective
    # on its own when it sees more than two classes — confirm for the installed version.
    model_y = _make_xgb_classifier('logloss')
    model_t = _make_xgb_classifier('mlogloss')

    # ATE estimates and fitted CATE models, keyed by bias class.
    ate_results = {}
    cate_models = {}

    # Effect modifiers X: topic indicator columns that are set at least once in the subset.
    # NOTE(review): these columns (apart from topic_col) are also part of W, so the
    # nuisance models presumably see them twice — confirm whether that is intended.
    total = [col for col in train_sub.columns if col.startswith("topic")]
    nonzero_columns = [col for col in total if (train_sub[col] != False).any()]
    print(f"effect modifier columns = {len(nonzero_columns)}, total columns = {len(total)}")

    X = train_sub[nonzero_columns].astype(int).values

    for cls in bias_classes:
        # One-vs-rest target built by direct comparison. Indexing the LabelBinarizer
        # output by class position fails when only two classes are present, because
        # the binarizer then returns a single column.
        Y_bin = (outcome_values == cls).astype(int)

        # minlength=2 makes a subset with a single outcome class count as 0 negatives,
        # so it is skipped instead of being fitted on a constant target.
        if np.bincount(Y_bin, minlength=2).min() < 5:
            print(f"Skipping class {cls} due to insufficient samples.")
            continue

        dr = DRLearner(model_propensity=model_t, model_regression=model_y, discrete_outcome=True)
        dr.fit(Y=Y_bin, T=T, W=W)

        # Average effects for the treatment contrasts that are reported.
        ate_pos_vs_neutral = dr.ate(T0=np.array([[0]]), T1=np.array([[1]]))
        ate_neg_vs_neutral = dr.ate(T0=np.array([[0]]), T1=np.array([[-1]]))
        ate_pos_vs_neg = dr.ate(T0=np.array([[-1]]), T1=np.array([[1]]))

        # NOTE(review): the estimates are passed through expit(.) - expit(0), which
        # treats them as log-odds shifts around 0. With discrete_outcome=True they may
        # already be on the probability scale — confirm against the EconML docs before
        # interpreting magnitudes.
        ate_results[cls] = {
            "+1 vs 0": round(expit(ate_pos_vs_neutral[0]) - expit(0), 4),
            "-1 vs 0": round(expit(ate_neg_vs_neutral[0]) - expit(0), 4),
            "+1 vs -1": round(expit(ate_pos_vs_neg[0]) - expit(0), 4),
        }

        # Heterogeneous effects: same nuisance models, forest final stage over X.
        forest_dr = ForestDRLearner(
            model_regression=model_y,
            model_propensity=model_t,
            discrete_outcome=True,
        )
        forest_dr.fit(Y=Y_bin, T=T, X=X, W=W)
        cate_models[cls] = forest_dr

    end = time.time()

    print(f"Elapsed time: {end - start:.4f} seconds")

    return (ate_results, cate_models)


In [31]:
# Sentiment columns to analyse; each one is paired with the topic column of the same
# name ("sentiment X" -> "topic X") by the driver loop.
# NOTE(review): the name suggests these were picked by a gain-based feature ranking —
# the selection step is not part of this notebook.
gain_features = [
    "sentiment US Senate",
    "sentiment Healthcare",
    "sentiment Immigration",
    "sentiment Donald Trump",
    "sentiment GOP",
    "sentiment Elizabeth Warren",
    "sentiment Elections",
    "sentiment Politics",
    "sentiment Terrorism",
    "sentiment Joe Biden",
    "sentiment Hillary Clinton",
    "sentiment Impeachment",
    "sentiment Media Bias",
    "sentiment White House",
    "sentiment Justice Department",
]

In [34]:
# Per-sentiment results, filled by the driver loop in the next cell:
#   ate_features[sentiment] -> ATE table returned by calculate_ate
#   cate_feaures[sentiment] -> fitted CATE models returned by calculate_ate
# NOTE(review): `cate_feaures` is a typo for "cate_features"; the spelling is kept
# because later cells refer to this exact name.
ate_features = {}
cate_feaures = {}

In [35]:
for sentiment_feature in gain_features:
    # Each sentiment column has a companion topic flag with the same suffix.
    topic_feature = sentiment_feature.replace("sentiment", "topic")
    ate, cate = calculate_ate(sentiment_feature, topic_feature)
    # Store both results under the sentiment column name.
    ate_features.update({sentiment_feature: ate})
    cate_feaures.update({sentiment_feature: cate})
    

investigating sentiment US Senate
effect modifier columns = 380, total columns = 2434
Elapsed time: 75.1341 seconds
investigating sentiment Healthcare
effect modifier columns = 313, total columns = 2434
Elapsed time: 124.0428 seconds
investigating sentiment Immigration
effect modifier columns = 365, total columns = 2434
Elapsed time: 134.4093 seconds
investigating sentiment Donald Trump
effect modifier columns = 1048, total columns = 2434
Elapsed time: 365.5475 seconds
investigating sentiment GOP
effect modifier columns = 143, total columns = 2434
Elapsed time: 32.2232 seconds
investigating sentiment Elizabeth Warren
effect modifier columns = 110, total columns = 2434
Elapsed time: 22.2809 seconds
investigating sentiment Elections
effect modifier columns = 658, total columns = 2434
Elapsed time: 335.3834 seconds
investigating sentiment Politics
effect modifier columns = 1120, total columns = 2434
Elapsed time: 525.3960 seconds
investigating sentiment Terrorism
effect modifier columns =

In [41]:
# sentiment column -> bias class -> top-10 topic table; filled by the next cell.
total_cate_results = {}

In [42]:
def _mean_or_nan(values):
    """Return the mean of a 1-D array, or NaN when the array is empty."""
    return float(values.mean()) if values.size else float("nan")


for sentiment, cate_models in cate_feaures.items():
    topic = sentiment.replace('sentiment', 'topic')

    # Rebuild the effect-modifier column list the same way calculate_ate did at fit
    # time, so the feature order matches what the fitted models expect.
    train_sub = joined[joined[topic] == True]
    nonzero_columns = [
        col for col in train_sub.columns
        if col.startswith("topic") and (train_sub[col] != False).any()
    ]

    # Effects are evaluated on every row of `joined`, not only on the topic subset
    # the models were fitted on.
    # NOTE(review): confirm that evaluating outside the fitted subset is intended.
    X_df = joined[nonzero_columns]
    features = X_df.values                             # built once, reused for every class
    present_flags = features.astype(bool, copy=False)  # True where the topic is set
    top_cate_results = {}

    for cls, model in cate_models.items():
        print(f"{topic} - bias {cls}")
        # effect() is called without T0/T1, so the library's default contrast applies
        # (presumably 0 -> 1; confirm against the EconML docs).
        cates = model.effect(features).ravel()

        # Mean CATE among rows with / without each topic. Masking by the flag itself,
        # rather than taking group positions 0 and 1, keeps "Present" and "Absent"
        # correct and yields NaN when a topic is set on all rows or on none.
        real_cate_by_topic = {}
        for position, col in enumerate(nonzero_columns):
            is_present = present_flags[:, position]
            real_cate_by_topic[col] = {
                "Present": _mean_or_nan(cates[is_present]),
                "Absent": _mean_or_nan(cates[~is_present]),
            }

        # Keep the ten topics with the largest absolute "Present" effect.
        cate_summary = pd.DataFrame(real_cate_by_topic).T
        ranking = cate_summary['Present'].abs().sort_values(ascending=False).index
        top10 = cate_summary.loc[ranking].head(10)
        top_cate_results[cls] = top10.to_dict()

    total_cate_results[sentiment] = top_cate_results

topic US Senate - bias 0
topic US Senate - bias 1
topic US Senate - bias 2
topic Healthcare - bias 0
topic Healthcare - bias 1
topic Healthcare - bias 2
topic Immigration - bias 0
topic Immigration - bias 1
topic Immigration - bias 2
topic Donald Trump - bias 0
topic Donald Trump - bias 1
topic Donald Trump - bias 2
topic GOP - bias 0
topic GOP - bias 1
topic GOP - bias 2
topic Elizabeth Warren - bias 0
topic Elizabeth Warren - bias 1
topic Elizabeth Warren - bias 2
topic Elections - bias 0
topic Elections - bias 1
topic Elections - bias 2
topic Politics - bias 0
topic Politics - bias 1
topic Politics - bias 2
topic Terrorism - bias 0
topic Terrorism - bias 1
topic Terrorism - bias 2
topic Joe Biden - bias 0
topic Joe Biden - bias 1
topic Joe Biden - bias 2
topic Hillary Clinton - bias 0
topic Hillary Clinton - bias 1
topic Hillary Clinton - bias 2
topic Impeachment - bias 0
topic Impeachment - bias 1
topic Impeachment - bias 2
topic Media Bias - bias 0
topic Media Bias - bias 1
topic 

In [43]:
# Display the full nested result (sentiment -> bias class -> Present/Absent -> topic).
# NOTE(review): this prints the whole dictionary; the output is long.
total_cate_results

{'sentiment US Senate': {0: {'Present': {'topic Ken Starr': 0.9300555559127949,
    'topic 2024 Senate Elections': -0.5348799056676445,
    'topic Vaccine Mandates': -0.5003540353707941,
    'topic Coronavirus': -0.47945845382484087,
    'topic Coronavirus Vaccine': -0.4703063980298561,
    'topic Coronavirus Stimulus': -0.46144784938342476,
    'topic Social Distancing': -0.4497557332736658,
    'topic Steven Mnuchin': -0.43661876757178947,
    'topic Senate Intelligence Committee': -0.4339993892090521,
    'topic Safety And Sanity During COVID-19': -0.42882040058017346},
   'Absent': {'topic Ken Starr': -0.06571937306357643,
    'topic 2024 Senate Elections': -0.06559854190722693,
    'topic Vaccine Mandates': -0.06507799585809215,
    'topic Coronavirus': -0.04311891712217433,
    'topic Coronavirus Vaccine': -0.06324361793416676,
    'topic Coronavirus Stimulus': -0.06529265120726505,
    'topic Social Distancing': -0.06535318727491746,
    'topic Steven Mnuchin': -0.06557603332257

In [44]:
# Name the outputs after the dataset variant selected at the top of the notebook, so
# running another variant does not overwrite these files. With type == "summary_100"
# the file names are unchanged.
# Rows are sentiment columns and columns are bias classes; each cell holds a nested
# dict, which to_csv writes out as its string representation.
pd.DataFrame.from_dict(ate_features, orient='index').to_csv(f"{type}_ate.csv")
pd.DataFrame.from_dict(total_cate_results, orient='index').to_csv(f"{type}_total_cate.csv")