In [56]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split # type: ignore
import matplotlib.pyplot as plt # type: ignore

In [57]:
# Get current working directory
# Report the directory the kernel started in, then switch to the project root so
# that the relative paths used later ("Scripts/g_comp", "Data/Processed/...") resolve.
# `cwd` keeps the directory from *before* the chdir.
# NOTE(review): the project root is a hard-coded absolute path on the N: drive,
# so this cell only works on a machine where that drive is mounted.
import os
cwd = os.getcwd()
print("Current working directory:", cwd)
os.chdir("n:/Incubator2025_ComputationalLifeCourse")

Current working directory: n:\Incubator2025_ComputationalLifeCourse


In [58]:
# Make the project-local `g_comp` module importable and bind it to the alias `gc`.
# The path is relative, so it depends on the os.chdir() call made earlier.
# NOTE(review): the alias `gc` is also the name of Python's standard-library
# garbage-collector module; nothing here imports that module, but the name is taken.
import sys
sys.path.append("Scripts/g_comp")  # relative path from your current working directory
import g_comp as gc

In [89]:
# Reload the module (if it's been edited and needs to be reloaded)
# Re-executes g_comp.py so edits made on disk are picked up without restarting the kernel.
import importlib
importlib.reload(gc)

<module 'g_comp' from 'n:\\Incubator2025_ComputationalLifeCourse\\Scripts/g_comp\\g_comp.py'>

In [60]:
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's built-in ``random``, NumPy's global generator and PyTorch
    (``torch.manual_seed`` plus ``torch.cuda.manual_seed_all``), and sets the
    cuDNN flags to deterministic mode with benchmarking switched off.

    Args:
        seed: Integer seed handed to each generator (default 42).

    Returns:
        None.
    """
    # The cuDNN flags are independent of the seeding calls below.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    seeders = (
        random.seed,                  # Python's built-in random
        np.random.seed,               # NumPy
        torch.manual_seed,            # PyTorch
        torch.cuda.manual_seed_all,   # PyTorch CUDA
    )
    for apply_seed in seeders:
        apply_seed(seed)


set_seed(2025)  # Call this at the beginning of your code

In [61]:
## Load the data
import pandas as pd

# Read the processed file and order its rows by mergeid, then by t_age.
df = pd.read_csv("Data/Processed/g_data.csv").sort_values(by=["mergeid", "t_age"])

In [62]:
## Check the number of dropped cases 
# Ask the helper which mergeids are complete with respect to the two listed columns
# (the self-rated-health outcome and the disease indicator). The helper also prints
# the number of original, complete and dropped mergeids under the given label.
# NOTE(review): the return value is used below as the set of mergeids to keep;
# its exact container type is defined in g_comp and not visible here.
complete_y_self_rated_65_75_dic_mergeids = gc.summarize_mergeid_completeness(df, ["y_self_rated_65_75_dic", "dt_n_years_disease_dic"], "self_rated 65–75")

[self_rated 65–75]
Original mergeids: 20806
Complete mergeids (no missing values in Y): 14961
Number of unique mergeids dropped: 5845



In [63]:
###############
## Data prep ##
###############

# N = number of individuals, T = number of time points (i.e., 33)
# Generate datasets for pooled and by regime 
T = 33
# Pooled analysis frame: restricted to the complete mergeids found above. The helper
# prints how many mergeids have a complete outcome and T unique ages.
df_self_rated_65_75 = gc.get_valid_df(df, complete_y_self_rated_65_75_dic_mergeids, "self_rated 65–75", T)
# One sub-frame per welfare regime, selected through the regime indicator columns.
df_self_rated_med_65_75 = df_self_rated_65_75[df_self_rated_65_75["mod_welfare_regime_mediterranean"] == 1.0]
df_self_rated_cor_65_75 = df_self_rated_65_75[df_self_rated_65_75["mod_welfare_regime_corporatist"] == 1.0]
df_self_rated_scan_65_75 = df_self_rated_65_75[df_self_rated_65_75["mod_welfare_regime_scandinavian"] == 1.0]
 

# Feature column lists per regime. According to the helper's printed log, the
# "outcome" context keeps dt_n_years_disease_dic as a feature, while the "tv_covar"
# context excludes it (it is the quantity being modelled there); each regime also
# excludes one country indicator.
# Mediterranean 
feature_cols_outcome_med_65_75 = gc.get_feature_cols(df_self_rated_med_65_75, context="outcome and med")
feature_cols_tv_covar_med_65_75 = gc.get_feature_cols(df_self_rated_med_65_75, context="tv_covar and med")

# Corporatist
feature_cols_outcome_cor_65_75 = gc.get_feature_cols(df_self_rated_cor_65_75, context="outcome and cor")
feature_cols_tv_covar_cor_65_75 = gc.get_feature_cols(df_self_rated_cor_65_75, context="tv_covar and cor")

# Scandinavian
feature_cols_outcome_scan_65_75 = gc.get_feature_cols(df_self_rated_scan_65_75, context="outcome and scan")
feature_cols_tv_covar_scan_65_75 = gc.get_feature_cols(df_self_rated_scan_65_75, context="tv_covar and scan")


# Convert to (N, T, D) tensor
# Treatment and covariates from the dataset 
# For each regime: N is the number of distinct mergeids; two arrays are built from
# the same rows, one with the outcome-model columns and one with the
# time-varying-covariate-model columns.
 
# Mediterranean 
N_self_rated_med_65_75 = df_self_rated_med_65_75['mergeid'].nunique()
X_self_rated_med_65_75 = gc.convert_df_to_X(df_self_rated_med_65_75, feature_cols_outcome_med_65_75, N_self_rated_med_65_75, T)
X_disease_med_65_75  = gc.convert_df_to_X(df_self_rated_med_65_75, feature_cols_tv_covar_med_65_75, N_self_rated_med_65_75, T)

# Corporatist
N_self_rated_cor_65_75 = df_self_rated_cor_65_75['mergeid'].nunique()
X_self_rated_cor_65_75 = gc.convert_df_to_X(df_self_rated_cor_65_75, feature_cols_outcome_cor_65_75, N_self_rated_cor_65_75, T)
X_disease_cor_65_75  = gc.convert_df_to_X(df_self_rated_cor_65_75, feature_cols_tv_covar_cor_65_75, N_self_rated_cor_65_75, T)

# Scandinavian
N_self_rated_scan_65_75 = df_self_rated_scan_65_75['mergeid'].nunique()
X_self_rated_scan_65_75 = gc.convert_df_to_X(df_self_rated_scan_65_75, feature_cols_outcome_scan_65_75, N_self_rated_scan_65_75, T)
X_disease_scan_65_75  = gc.convert_df_to_X(df_self_rated_scan_65_75, feature_cols_tv_covar_scan_65_75, N_self_rated_scan_65_75, T)

[self_rated 65–75]
Mergeids with complete outcome and 33 unique ages: 14961
[get_feature_cols] Context: outcome and med | Exclude: ['mod_country_italy']
⚠️ dt_n_years_disease_dic IS STILL INCLUDED
[get_feature_cols] Context: tv_covar and med | Exclude: ['mod_country_italy', 'dt_n_years_disease_dic']
✅ dt_n_years_disease_dic successfully excluded
[get_feature_cols] Context: outcome and cor | Exclude: ['mod_country_germany']
⚠️ dt_n_years_disease_dic IS STILL INCLUDED
[get_feature_cols] Context: tv_covar and cor | Exclude: ['mod_country_germany', 'dt_n_years_disease_dic']
✅ dt_n_years_disease_dic successfully excluded
[get_feature_cols] Context: outcome and scan | Exclude: ['mod_country_sweden']
⚠️ dt_n_years_disease_dic IS STILL INCLUDED
[get_feature_cols] Context: tv_covar and scan | Exclude: ['mod_country_sweden', 'dt_n_years_disease_dic']
✅ dt_n_years_disease_dic successfully excluded


In [64]:
###############
## Y and L_t ##
###############

# Binary outcomes  
# Self-rated-health outcome column, extracted once per regime sub-frame.
y_self_rated_med_65_75_dic = gc.extract_y_tensor(df_self_rated_med_65_75, "y_self_rated_65_75_dic")
y_self_rated_cor_65_75_dic = gc.extract_y_tensor(df_self_rated_cor_65_75, "y_self_rated_65_75_dic")
y_self_rated_scan_65_75_dic = gc.extract_y_tensor(df_self_rated_scan_65_75, "y_self_rated_65_75_dic")

# Binary outcomes for tv covar 
# Disease indicator (the time-varying covariate), extracted the same way for the
# Mediterranean, Corporatist and Scandinavian sub-frames.
# NOTE(review): presumably one value per mergeid, in the same row order as the
# feature matrices - confirm against g_comp.extract_y_tensor.
y_self_rated_disease_med_65_75_dic = gc.extract_y_tensor(df_self_rated_med_65_75, "dt_n_years_disease_dic")
y_self_rated_disease_cor_65_75_dic = gc.extract_y_tensor(df_self_rated_cor_65_75, "dt_n_years_disease_dic")
y_self_rated_disease_scan_65_75_dic = gc.extract_y_tensor(df_self_rated_scan_65_75, "dt_n_years_disease_dic")

In [65]:
#####################
## \vec{a} and L_0 ##
#####################

# Extract high-level treatment trajectory features with covariates for each X matrix
# Each regime gets two feature matrices: one built from the outcome-model columns
# and one from the time-varying-covariate-model columns.
# Mediterranean
X_med_features_self_rated_65_75_dic = gc.extract_features(X_self_rated_med_65_75, feature_cols_outcome_med_65_75)
X_med_features_disease_65_75_dic  = gc.extract_features(X_disease_med_65_75, feature_cols_tv_covar_med_65_75)

# Corporatist
X_cor_features_self_rated_65_75_dic = gc.extract_features(X_self_rated_cor_65_75, feature_cols_outcome_cor_65_75)
X_cor_features_disease_65_75_dic  = gc.extract_features(X_disease_cor_65_75, feature_cols_tv_covar_cor_65_75)

# Scandinavian
X_scan_features_self_rated_65_75_dic = gc.extract_features(X_self_rated_scan_65_75, feature_cols_outcome_scan_65_75)
X_scan_features_disease_65_75_dic  = gc.extract_features(X_disease_scan_65_75, feature_cols_tv_covar_scan_65_75)



# Feature Names
# Wrap the outcome-model feature matrices in named frames; the helper prints the
# resulting shape for each regime. Only the column names are used further down.
trt_features_med_self_rated_65_75_dic   = gc.create_feature_df(X_med_features_self_rated_65_75_dic, feature_cols_outcome_med_65_75, "self_rated 65–75")
trt_features_cor_self_rated_65_75_dic   = gc.create_feature_df(X_cor_features_self_rated_65_75_dic, feature_cols_outcome_cor_65_75, "self_rated 65–75")
trt_features_scan_self_rated_65_75_dic   = gc.create_feature_df(X_scan_features_self_rated_65_75_dic, feature_cols_outcome_scan_65_75, "self_rated 65–75")


self_rated 65–75: Feature dataframe shape = (4706, 48)
self_rated 65–75: Feature dataframe shape = (7382, 48)
self_rated 65–75: Feature dataframe shape = (2873, 48)


In [66]:
#############################
## Counterfactual analysis ##
#############################

import pandas as pd

# Medoid sequences: read the CSV and order rows by cluster, then by t_age.
medoid_df = pd.read_csv("Data/Processed/medoid_seq_data.csv").sort_values(by=["cluster", "t_age"])

# Put 'cluster' in front; every other column keeps its relative position.
cols = ["cluster"]
cols.extend(name for name in medoid_df.columns if name != "cluster")
medoid_df = medoid_df.loc[:, cols]

# One medoid per cluster, so the number of clusters plays the role of N below.
N_medoid = medoid_df["cluster"].nunique()

# The medoid arrays are built with the Mediterranean column lists.
X_medoid_self_rated_65_75_dic = gc.convert_df_to_X(
    medoid_df, feature_cols_outcome_med_65_75, N_medoid, T
)
X_medoid_disease_65_75_dic = gc.convert_df_to_X(
    medoid_df, feature_cols_tv_covar_med_65_75, N_medoid, T
)
X_medoid_features_65_75_dic = gc.extract_treatment_features_from_medoid(
    X_medoid_self_rated_65_75_dic, feature_cols_outcome_med_65_75
)

In [67]:
# Treatment columns are the ones whose name begins with 'trt_'
trt_cols = list(filter(lambda name: name.startswith('trt_'), medoid_df.columns))

# Mean of each treatment column within a cluster; 'cluster' comes back as a regular column
trt_mean_by_cluster = (
    medoid_df
    .groupby('cluster')[trt_cols]
    .mean()
    .reset_index()
)

print(trt_mean_by_cluster)


   cluster  trt_full_time_employment  trt_part_time_employment  \
0        1                  0.969697                  0.000000   
1        2                  0.939394                  0.000000   
2        3                  0.090909                  0.000000   
3        4                  0.212121                  0.757576   

   trt_in_education  trt_not_working  trt_cohabit_0_children  \
0          0.030303         0.000000                0.030303   
1          0.060606         0.000000                0.575758   
2          0.000000         0.909091                0.030303   
3          0.030303         0.000000                0.090909   

   trt_not_cohabit_0_children  trt_cohabit_with_children  \
0                    0.151515                   0.818182   
1                    0.424242                   0.000000   
2                    0.151515                   0.818182   
3                    0.151515                   0.757576   

   trt_not_cohabit_with_children  
0           

In [68]:
#######################
## Load Final models ##
#######################
# Load the six previously saved super learners: for each regime, one model for the
# disease indicator (time-varying covariate) and one for the self-rated-health outcome.
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code, so
# these .pkl files must come from a trusted source.
# NOTE(review): the paths are absolute ("N:/...") although the working directory has
# already been switched to the project root - consider a shared base-path constant.
from joblib import load
 
# Med
y_self_rated_disease_med_super_learner = load("N:/Incubator2025_ComputationalLifeCourse/Intermediate/self_rated_disease_65-75_(med)_super_learner.pkl")
y_self_rated_med_super_learner = load("N:/Incubator2025_ComputationalLifeCourse/Intermediate/self_rated_65-75_(med)_super_learner.pkl")

# Cor  
y_self_rated_disease_cor_super_learner = load("N:/Incubator2025_ComputationalLifeCourse/Intermediate/self_rated_disease_65-75_(cor)_super_learner.pkl")
y_self_rated_cor_super_learner =  load("N:/Incubator2025_ComputationalLifeCourse/Intermediate/self_rated_65-75_(cor)_super_learner.pkl")

# Scan 
y_self_rated_disease_scan_super_learner = load("N:/Incubator2025_ComputationalLifeCourse/Intermediate/self_rated_disease_65-75_(scan)_super_learner.pkl")
y_self_rated_scan_super_learner =  load("N:/Incubator2025_ComputationalLifeCourse/Intermediate/self_rated_65-75_(scan)_super_learner.pkl")

In [91]:
from sklearn.utils import resample

# Feature Names
# Same three calls as in the feature-extraction cell above; repeated here so this
# cell defines the frames whose column names the bootstrap function reads.
# Each call prints the resulting frame shape again.
trt_features_med_self_rated_65_75_dic   = gc.create_feature_df(X_med_features_self_rated_65_75_dic, feature_cols_outcome_med_65_75, "self_rated 65–75")
trt_features_cor_self_rated_65_75_dic   = gc.create_feature_df(X_cor_features_self_rated_65_75_dic, feature_cols_outcome_cor_65_75, "self_rated 65–75")
trt_features_scan_self_rated_65_75_dic   = gc.create_feature_df(X_scan_features_self_rated_65_75_dic, feature_cols_outcome_scan_65_75, "self_rated 65–75")


def run_mc_bootstrap_once_regime_fixed_model(seed):
    """Run one bootstrap replicate of the medoid counterfactual analysis for all regimes.

    For each welfare regime the outcome feature matrix, the time-varying-covariate
    feature matrix and the outcome vector are resampled with replacement
    (m-out-of-n, with m = 70% of the rows). A single index draw is shared by the
    three objects so that they stay row-aligned. The resampled matrices are then
    combined with the medoid treatment features via ``gc.generate_updated_list``
    and handed, together with the already-loaded super learners, to
    ``gc.counterfactual_y_under_X_medoid_features``.

    The function reads notebook-level globals: the per-regime feature matrices
    (``X_*_features_*_65_75_dic``), outcome vectors (``y_self_rated_*_65_75_dic``),
    the loaded super learners, ``X_medoid_features_65_75_dic`` and the
    ``trt_features_*`` frames (only their column names are used).

    Args:
        seed: Integer base seed. The resampling uses ``seed + 1`` for the
            Mediterranean, ``seed + 2`` for the Scandinavian and ``seed + 3`` for
            the Corporatist regime.

    Returns:
        A tuple ``(ate_result, stand_ate_result, risk_ratio_result)``. Each is a
        dict that starts with ``"seed"`` and is then extended with the output of
        ``gc.flatten_ates_dict`` for the Mediterranean, Corporatist and
        Scandinavian estimates, in that order.

    Raises:
        ValueError: If the two feature matrices and the outcome vector of a
            regime do not have the same number of rows.
    """

    # Step 1: Bootstrap sample (m-out-of-n) with one shared index draw per regime.
    def bootstrap_rows(X_outcome, X_tv_cov, y, regime_seed, frac=0.7):
        n_rows = len(X_outcome)
        # Resampling objects of different lengths with the same indices would
        # silently pair rows of different individuals, so fail loudly instead.
        if len(X_tv_cov) != n_rows or len(y) != n_rows:
            raise ValueError(
                "Bootstrap inputs are not row-aligned: "
                f"{n_rows} outcome-feature rows, {len(X_tv_cov)} covariate-feature rows, "
                f"{len(y)} outcome values"
            )
        m = int(n_rows * frac)
        idx = resample(np.arange(n_rows), replace=True, n_samples=m, random_state=regime_seed)
        return X_outcome[idx], X_tv_cov[idx], y[idx]

    X_mob_med, X_dis_med, y_mob_med = bootstrap_rows(
        X_med_features_self_rated_65_75_dic,
        X_med_features_disease_65_75_dic,
        y_self_rated_med_65_75_dic,
        seed + 1,
    )
    X_mob_scan, X_dis_scan, y_mob_scan = bootstrap_rows(
        X_scan_features_self_rated_65_75_dic,
        X_scan_features_disease_65_75_dic,
        y_self_rated_scan_65_75_dic,
        seed + 2,
    )
    X_mob_corp, X_dis_corp, y_mob_corp = bootstrap_rows(
        X_cor_features_self_rated_65_75_dic,
        X_cor_features_disease_65_75_dic,
        y_self_rated_cor_65_75_dic,
        seed + 3,
    )

    # Step 2: Update with medoid features
    # NOTE(review): cols_to_replace=30 is presumably the number of leading
    # treatment-feature columns overwritten by each medoid - confirm in g_comp.
    def update_with_medoids(X_dis, X_mob):
        X_dis_updated = gc.generate_updated_list(X_dis, X_medoid_features_65_75_dic, cols_to_replace=30)
        X_mob_updated = gc.generate_updated_list(X_mob, X_medoid_features_65_75_dic, cols_to_replace=30)
        return X_dis_updated, X_mob_updated

    X_dis_med_updated, X_mob_med_updated = update_with_medoids(X_dis_med, X_mob_med)
    X_dis_scan_updated, X_mob_scan_updated = update_with_medoids(X_dis_scan, X_mob_scan)
    X_dis_corp_updated, X_mob_corp_updated = update_with_medoids(X_dis_corp, X_mob_corp)

    # Step 3: Use the pre-trained super learners (notebook globals).
    def estimate_effects(tv_cov_model, y_model, y_dic, X_dis_updated, X_mob_updated, feature_df):
        # The helper returns five values; only the last three are kept here.
        _, _, ates, stand_ates, risk_ratios = gc.counterfactual_y_under_X_medoid_features(
            tv_cov_model=tv_cov_model,
            y_model=y_model,
            y_dic=y_dic,
            X_features_disease_65_75_dic_updated_list=X_dis_updated,
            X_features_outcome_65_75_dic_updated_list=X_mob_updated,
            tv_cov_name="dt_n_years_disease_dic",
            outcome_feature_names=feature_df.columns.tolist(),
        )
        return ates, stand_ates, risk_ratios

    ates_med, stand_ates_med, risk_ratios_med = estimate_effects(
        y_self_rated_disease_med_super_learner,
        y_self_rated_med_super_learner,
        y_mob_med,
        X_dis_med_updated,
        X_mob_med_updated,
        trt_features_med_self_rated_65_75_dic,
    )
    ates_cor, stand_ates_cor, risk_ratios_cor = estimate_effects(
        y_self_rated_disease_cor_super_learner,
        y_self_rated_cor_super_learner,
        y_mob_corp,
        X_dis_corp_updated,
        X_mob_corp_updated,
        trt_features_cor_self_rated_65_75_dic,
    )
    ates_scan, stand_ates_scan, risk_ratios_scan = estimate_effects(
        y_self_rated_disease_scan_super_learner,
        y_self_rated_scan_super_learner,
        y_mob_scan,
        X_dis_scan_updated,
        X_mob_scan_updated,
        trt_features_scan_self_rated_65_75_dic,
    )

    # Step 4: Format results - one flat dict per effect measure, keyed by regime.
    def flatten_by_regime(med, cor, scan):
        row = {"seed": seed}
        row.update(gc.flatten_ates_dict(med, "Mediterranean"))
        row.update(gc.flatten_ates_dict(cor, "Corporatist"))
        row.update(gc.flatten_ates_dict(scan, "Scandinavian"))
        return row

    ate_result = flatten_by_regime(ates_med, ates_cor, ates_scan)
    stand_ate_result = flatten_by_regime(stand_ates_med, stand_ates_cor, stand_ates_scan)
    risk_ratio_result = flatten_by_regime(risk_ratios_med, risk_ratios_cor, risk_ratios_scan)

    return ate_result, stand_ate_result, risk_ratio_result


self_rated 65–75: Feature dataframe shape = (4706, 48)
self_rated 65–75: Feature dataframe shape = (7382, 48)
self_rated 65–75: Feature dataframe shape = (2873, 48)


In [84]:
# Sequential (non-parallel) run over seeds 42-49; every element is the 3-tuple of
# result dicts returned by the bootstrap function.
results_self_rated_65_75_dic = list(map(run_mc_bootstrap_once_regime_fixed_model, range(42, 50)))
df_results_self_rated_65_75_dic = pd.DataFrame(data=results_self_rated_65_75_dic)


--- Medoid 0 ---
  Predicted dt_n_years_disease_dic: mean=0.004
  Potential outcome: mean=0.464

--- Medoid 1 ---
  Predicted dt_n_years_disease_dic: mean=0.008
  Potential outcome: mean=0.557

--- Medoid 2 ---
  Predicted dt_n_years_disease_dic: mean=0.007
  Potential outcome: mean=0.502

--- Medoid 3 ---
  Predicted dt_n_years_disease_dic: mean=0.009
  Potential outcome: mean=0.543
medoid_0: ATE=-0.038, Std ATE=-0.076, RR=0.924
medoid_1: ATE=0.056, Std ATE=0.112, RR=1.112
medoid_2: ATE=0.000, Std ATE=0.000, RR=1.000
medoid_3: ATE=0.041, Std ATE=0.082, RR=1.082

--- Medoid 0 ---
  Predicted dt_n_years_disease_dic: mean=0.000
  Potential outcome: mean=0.299

--- Medoid 1 ---
  Predicted dt_n_years_disease_dic: mean=0.000
  Potential outcome: mean=0.278

--- Medoid 2 ---
  Predicted dt_n_years_disease_dic: mean=0.002
  Potential outcome: mean=0.347

--- Medoid 3 ---
  Predicted dt_n_years_disease_dic: mean=0.000
  Potential outcome: mean=0.351
medoid_0: ATE=-0.048, Std ATE=-0.139, RR=0

In [92]:
# Fixed models
# Separate outcome models
# Runs the bootstrap replicates in parallel with joblib and times the whole run.

from joblib import Parallel, delayed
import pandas as pd
import time

start = time.time()

# Define number of bootstrap replicates and parallel jobs
n_bootstrap = 5  # or any number you like
n_jobs = 8
# NOTE(review): joblib dispatches `batch_size` tasks to a worker at once, so with
# n_bootstrap < batch_size all replicates may land on a single worker - confirm
# this is intended for small runs (batch_size="auto" lets joblib decide).
batch_size = 15

# Run bootstrap in parallel; replicate i uses seed 42 + i
bootstrap_results_self_rated_65_75_dic = Parallel(n_jobs=n_jobs, batch_size=batch_size)(
    delayed(run_mc_bootstrap_once_regime_fixed_model)(seed) for seed in range(42, 42 + n_bootstrap)
)

end = time.time()
# Report the number of replicates that actually ran (the message used to hard-code 1000).
print(f"Finished {n_bootstrap} bootstraps in {(end - start)/60:.2f} minutes")

# Convert to DataFrame
# Each row is one replicate; columns 0/1/2 hold the raw-ATE, standardized-ATE and
# risk-ratio dicts returned by the bootstrap function.
df_bootstrap_results_self_rated_65_75_dic = pd.DataFrame(bootstrap_results_self_rated_65_75_dic)


Finished 1000 bootstraps in 0.15 minutes


In [93]:
# Display the raw bootstrap table: one row per seed, one column per element of the
# returned tuple (0 = raw ATE dict, 1 = standardized ATE dict, 2 = risk-ratio dict).
df_bootstrap_results_self_rated_65_75_dic

Unnamed: 0,0,1,2
0,"{'seed': 42, 'Mediterranean_medoid_0': -0.0734...","{'seed': 42, 'Mediterranean_medoid_0': -0.1367...","{'seed': 42, 'Mediterranean_medoid_0': 0.86321..."
1,"{'seed': 43, 'Mediterranean_medoid_0': -0.0851...","{'seed': 43, 'Mediterranean_medoid_0': -0.1542...","{'seed': 43, 'Mediterranean_medoid_0': 0.84571..."
2,"{'seed': 44, 'Mediterranean_medoid_0': -0.0963...","{'seed': 44, 'Mediterranean_medoid_0': -0.1713...","{'seed': 44, 'Mediterranean_medoid_0': 0.82867..."
3,"{'seed': 45, 'Mediterranean_medoid_0': -0.0933...","{'seed': 45, 'Mediterranean_medoid_0': -0.1673...","{'seed': 45, 'Mediterranean_medoid_0': 0.83268..."
4,"{'seed': 46, 'Mediterranean_medoid_0': -0.0839...","{'seed': 46, 'Mediterranean_medoid_0': -0.1546...","{'seed': 46, 'Mediterranean_medoid_0': 0.84535..."


In [95]:
# Split every replicate's result tuple into its three parts:
# position 0 = raw ATE, position 1 = standardized ATE, position 2 = risk ratio
raw_ate_results_self_rated_65_75_dic = []
std_ate_results_self_rated_65_75_dic = []
risk_ratio_results_self_rated_65_75_dic = []
for _replicate in bootstrap_results_self_rated_65_75_dic:
    raw_ate_results_self_rated_65_75_dic.append(_replicate[0])
    std_ate_results_self_rated_65_75_dic.append(_replicate[1])
    risk_ratio_results_self_rated_65_75_dic.append(_replicate[2])

# One row per seed, one column per regime/medoid combination
df_raw_ate_self_rated_65_75_dic = pd.DataFrame(data=raw_ate_results_self_rated_65_75_dic)
df_std_ate_self_rated_65_75_dic = pd.DataFrame(data=std_ate_results_self_rated_65_75_dic)
df_risk_ratio_self_rated_65_75_dic = pd.DataFrame(data=risk_ratio_results_self_rated_65_75_dic)

# Percentile-CI summaries; the risk-ratio table is flagged as such for the helper
df_raw_ate_summary_self_rated_65_75_dic = gc.summarize_bootstrap_percentile_ci(
    df_raw_ate_self_rated_65_75_dic
)
df_std_ate_summary_self_rated_65_75_dic = gc.summarize_bootstrap_percentile_ci(
    df_std_ate_self_rated_65_75_dic
)
df_risk_ratio_summary_self_rated_65_75_dic = gc.summarize_bootstrap_percentile_ci(
    df_risk_ratio_self_rated_65_75_dic, risk_ratio=True
)

# Save: per-replicate tables first, then the summaries (same order as before)
_tables_to_save = [
    (df_raw_ate_self_rated_65_75_dic, "raw_ate_df"),
    (df_std_ate_self_rated_65_75_dic, "std_ate_df"),
    (df_risk_ratio_self_rated_65_75_dic, "risk_ratio_df"),
    (df_raw_ate_summary_self_rated_65_75_dic, "raw_ate_summary"),
    (df_std_ate_summary_self_rated_65_75_dic, "std_ate_summary"),
    (df_risk_ratio_summary_self_rated_65_75_dic, "risk_ratio_summary"),
]
for _table, _prefix in _tables_to_save:
    gc.save_results_df(_table, prefix=_prefix, label="self_rated 65–75")

Saved: Results\raw_ate_df_self_rated_65-75_all.csv
Saved: Results\std_ate_df_self_rated_65-75_all.csv
Saved: Results\risk_ratio_df_self_rated_65-75_all.csv
Saved: Results\raw_ate_summary_self_rated_65-75_all.csv
Saved: Results\std_ate_summary_self_rated_65-75_all.csv
Saved: Results\risk_ratio_summary_self_rated_65-75_all.csv


In [96]:
# Display the per-seed risk ratios: one row per bootstrap seed, one column per
# regime/medoid combination.
df_risk_ratio_self_rated_65_75_dic

Unnamed: 0,seed,Mediterranean_medoid_0,Mediterranean_medoid_1,Mediterranean_medoid_2,Mediterranean_medoid_3,Corporatist_medoid_0,Corporatist_medoid_1,Corporatist_medoid_2,Corporatist_medoid_3,Scandinavian_medoid_0,Scandinavian_medoid_1,Scandinavian_medoid_2,Scandinavian_medoid_3
0,42,0.863211,1.037974,0.933844,1.010518,0.723041,0.671872,0.839815,0.849975,0.80145,0.769646,0.851969,1.03655
1,43,0.845719,1.01467,0.914358,0.988692,0.749468,0.696763,0.870188,0.880825,0.821195,0.788504,0.87238,1.061545
2,44,0.828673,0.995153,0.895968,0.969558,0.764446,0.710535,0.887594,0.898658,0.766106,0.736825,0.815225,0.991486
3,45,0.832683,0.999932,0.900288,0.974087,0.739936,0.687769,0.85985,0.870059,0.820679,0.787119,0.870924,1.059715
4,46,0.845358,1.017892,0.91483,0.990917,0.735361,0.683673,0.852826,0.863342,0.78959,0.75917,0.840101,1.0219


In [None]:
import numpy as np
import pandas as pd

# Within-regime gaps between each medoid and the reference medoid (medoid_2),
# computed per bootstrap replicate from the risk-ratio table and then summarised
# with the mean and a 95% percentile interval.
#
# The computation works on its own frame (`rr_gap_df`). The earlier version checked
# the risk-ratio table for a 'seed' column but then operated on `df`, the raw
# g_data frame, which both used the wrong data and overwrote the raw data.

# 1. Start from the risk-ratio table without the seed column.
#    drop() returns a new frame, so df_risk_ratio_self_rated_65_75_dic is left intact;
#    errors='ignore' keeps this working if 'seed' has already been removed.
rr_gap_df = df_risk_ratio_self_rated_65_75_dic.drop(columns=['seed'], errors='ignore')

# 2. Define the reference medoid_2 column for each regime
ref_cols = {
    "Mediterranean": "Mediterranean_medoid_2",
    "Corporatist": "Corporatist_medoid_2",
    "Scandinavian": "Scandinavian_medoid_2"
}

# 3. Subtract medoid_2 values from other medoids (row by row)
for regime, ref_col in ref_cols.items():
    # Snapshot the column labels: the frame is modified inside the loop.
    for col in list(rr_gap_df.columns):
        if col.startswith(regime) and "medoid" in col and col != ref_col:
            rr_gap_df[col] = rr_gap_df[col] - rr_gap_df[ref_col]

    # Drop the reference column itself once its regime has been processed
    if ref_col in rr_gap_df.columns:
        rr_gap_df = rr_gap_df.drop(columns=[ref_col])

# 4. Compute summary stats: mean and 95% percentile-based CI for each column
summary_rows = []
for col in rr_gap_df.columns:
    mean_val = rr_gap_df[col].mean()
    ci_lower = np.percentile(rr_gap_df[col], 2.5)
    ci_upper = np.percentile(rr_gap_df[col], 97.5)

    summary_rows.append({
        "medoid_gap": col,
        "mean": mean_val,
        "ci_lower": ci_lower,
        "ci_upper": ci_upper
    })

# 5. Create summary DataFrame
summary_df = pd.DataFrame(summary_rows)
 
