In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from lifelines.fitters.kaplan_meier_fitter import KaplanMeierFitter
from tableone import TableOne
from scipy.stats import pearsonr
%matplotlib inline

In [None]:
# Names of the derived analysis columns that are added to `df` below.
treatment_col = 'W'
duration_col = 'Y'
event_col = 'D'
iv_col = 'Z'

# 26 * 7 = 182; presumably 26 weeks expressed in days -- TODO confirm.
T_max = 26*7

# Study arms to keep; rows whose `group` is not listed here are dropped.
included_groups = ['Control', 'JSI', 'HI']

df = pd.read_csv('example_data/illi_data.csv', index_col=0)

# Give the arm columns their display names (no inplace mutation).
df = df.rename({'control': 'Control', 'jsie': 'JSI', 'hie': 'HI'}, axis=1)

# Label each row with the arm column holding the largest value
# (presumably the arm columns are 0/1 indicators -- verify against the data).
df['group'] = df[['Control', 'JSI', 'HI']].idxmax(axis=1)

# .copy() gives an independent frame, so the column assignments below never
# write into a slice of the unfiltered data (no SettingWithCopyWarning).
df = df[df['group'].isin(included_groups)].copy()

# Assign the filled column back instead of `df['lagree'].fillna(0, inplace=True)`:
# the chained inplace form warns in pandas 2.x and does not update `df` under
# Copy-on-Write.
df['lagree'] = df['lagree'].fillna(0)

# Z: 1 for every non-Control arm, 0 for Control.
df[iv_col] = (df['group'] != 'Control').astype(int)

# W: `lagree` with missing values already replaced by 0.
df[treatment_col] = df['lagree']

# D: `rehirein` cast to int.
df[event_col] = df['rehirein'].astype(int)

# Y: rehire date minus claim date ...
df[duration_col] = df['rehiredt'] - df['claimdt']

# ... and, where no rehire date is recorded, 7 * `wkpaid` instead
# (presumably weeks paid converted to days -- TODO confirm units).
weeks_paid_idx = df[df['rehiredt'].isnull()].index
df.loc[weeks_paid_idx, duration_col] = 7*df.loc[weeks_paid_idx, 'wkpaid'].values

# Keep rows with a non-negative duration; rows whose duration is still NaN
# fail the comparison and are dropped as well.
df = df[df[duration_col] >= 0].copy()

df['other_race'] = 1 - df['black'] - df['white']

df

In [None]:
# Variables reported in the descriptive table, and the subset of them that
# is summarised as categorical.
columns = ['age', 'male', 'black', 'white', 'other_race', iv_col, treatment_col, event_col, duration_col]
categorical = ['male', 'black', 'other_race', 'white', iv_col, treatment_col, event_col]
groupby = ['group']

# Pass everything after the data by keyword: with positional arguments the
# meaning of the 4th argument depends on the TableOne signature of the
# installed tableone release, so `groupby` could be bound to a different
# parameter (NOTE(review): newer releases appear to place `continuous` before
# `groupby` -- confirm against the installed version).
mytable = TableOne(
    df,
    columns=columns,
    categorical=categorical,
    groupby=groupby,
    missing=False,
)

mytable.tableone

In [None]:
# Emit the descriptive table as LaTeX source; escape=False leaves special
# characters in the labels unescaped.
table_one_latex = mytable.tableone.to_latex(escape=False)
print(table_one_latex)

In [None]:
# Display the column index of the prepared frame.
df.columns

In [None]:

# Display names for the two result files read below, in the same order
# as the file prefixes ('jsie', 'hie').
group_names = ['JSI', 'HI']

fig_max = 7*24

# Values substituted into the result-file names.
mol = 18
node = 18

# Axis limits collected from every panel, used to put all panels on one scale.
all_lims = []

# Degree-1 np.polyfit coefficients (slope, intercept), keyed "<row>_<col>".
coefs = {}

# One entry per panel column:
# (x column, x-axis label, y column, y-axis label, title fragment).
panel_specs = [
    ('MEAN_HTE', 'MISTR', 'CSF', 'CSF', 'CSF vs. MISTR'),
    ('IV_MEAN_HTE', 'MISTR-IV', 'MEAN_HTE', 'MISTR', 'MISTR-IV vs. MISTR'),
    ('IV_MEAN_HTE', 'MISTR-IV', 'CSF', 'CSF', 'MISTR-IV vs. CSF'),
]

fig, axes = plt.subplots(2, 3, figsize=(12, 8))

# NOTE(review): the input and output paths below are absolute and
# machine-specific; the notebook only runs where they exist.
for idg, g in enumerate(['jsie', 'hie']):
    res_df = pd.read_csv(f'/Users/tomer/git/csf/output/illinois/{g}_train_imputed_with_htes_ci_18_node_{node}_mol_{mol}_nimp_2500_nt_2000_rep_1.csv', index_col=0)

    # Row `idg` of the grid: one scatter panel per entry of panel_specs.
    for idp, (x_col, x_label, y_col, y_label, title) in enumerate(panel_specs):
        ax = axes[idg, idp]
        x_vals = res_df[x_col].values
        y_vals = res_df[y_col].values

        ax.scatter(x_vals, y_vals, s=1)
        p = pearsonr(x_vals, y_vals)[0]  # correlation coefficient only
        ax.set_title(f'{group_names[idg]} - {title} - cor: {p:.3f}', fontsize=13)
        ax.set_xlabel(x_label, fontsize=14)
        ax.set_ylabel(y_label, fontsize=14)
        ax.tick_params(axis='both', which='both', labelsize=11)

        # Least-squares straight line through the scatter.
        coefs[f"{idg}_{idp}"] = np.polyfit(x_vals, y_vals, 1)

# Gather the auto-scaled limits of every panel ...
for kk in range(2):
    for rr in range(3):
        all_lims.append(axes[kk, rr].get_xlim())
        all_lims.append(axes[kk, rr].get_ylim())

# ... and use their overall extremes as the common square range.
min_lim = np.min(np.array(all_lims))
max_lim = np.max(np.array(all_lims))

# The same x grid serves every panel, so build it once.
x_line = np.linspace(min_lim, max_lim, 50)

for kk in range(2):
    for rr in range(3):
        ax = axes[kk, rr]

        slope, intercept = coefs[f'{kk}_{rr}']
        line_y = slope * x_line + intercept

        ax.plot(x_line, x_line, ls='--', color='r')  # identity line y = x
        ax.plot(x_line, line_y, ls='--', color='k')  # fitted line

        ax.set_xlim([min_lim, max_lim])
        ax.set_ylim([min_lim, max_lim])

fig.tight_layout()
fig.savefig('/Users/tomer/Downloads/jsie_usecase.png', dpi=300)

In [None]:
g = 'jsie'

# NOTE(review): absolute, machine-specific path; `node` and `mol` are the
# values set in the plotting cell.
res_df = pd.read_csv(f'/Users/tomer/git/csf/output/illinois/{g}_train_imputed_with_htes_ci_18_node_{node}_mol_{mol}_nimp_2500_nt_2000_rep_1.csv', index_col=0)

# (estimate column, method label used in the summary column names).
method_labels = [('CSF', 'CSF'), ('MEAN_HTE', 'MISTR'), ('IV_MEAN_HTE', 'MISTR-IV')]

# Sort ascending once per estimate column and reuse the result for both tails.
sorted_by = {score_col: res_df.sort_values(score_col) for score_col, _ in method_labels}

# NOTE(review): "top 10%" is the first tenth of the ascending sort (the
# smallest estimates) and "bottom 10%" is everything from the 90% position
# onward (the largest estimates) -- confirm this is the intended direction.
n_rows = len(res_df)
tail_slices = [
    ('top 10%', slice(None, int(np.floor(0.1*n_rows)))),
    ('bottom 10%', slice(int(np.floor(0.9*n_rows)), None)),
]

# Build one named Series per (tail, method) and concatenate once, instead of
# growing the frame with a concat per column.
summary_cols = []
for tail_name, tail_rows in tail_slices:
    for score_col, method in method_labels:
        tail_df = sorted_by[score_col].iloc[tail_rows]
        summary_cols.append(pd.Series({
            "Median Age (Years)": tail_df['age'].median(),
            "Male (%)": 100*tail_df['male'].mean(),
            "White (%)": 100*tail_df['white'].mean(),
        }, name=f'{method} ({tail_name})'))

summary_df = pd.concat(summary_cols, axis=1)

# Rounded copy kept under a group-specific name for the combined table.
summary_df_jsie = summary_df.round(1).copy()

In [None]:
g = 'hie'

# NOTE(review): absolute, machine-specific path; `node` and `mol` are the
# values set in the plotting cell.
res_df = pd.read_csv(f'/Users/tomer/git/csf/output/illinois/{g}_train_imputed_with_htes_ci_18_node_{node}_mol_{mol}_nimp_2500_nt_2000_rep_1.csv', index_col=0)

# (estimate column, method label used in the summary column names).
method_labels = [('CSF', 'CSF'), ('MEAN_HTE', 'MISTR'), ('IV_MEAN_HTE', 'MISTR-IV')]

# Sort ascending once per estimate column and reuse the result for both tails.
sorted_by = {score_col: res_df.sort_values(score_col) for score_col, _ in method_labels}

# NOTE(review): "top 10%" is the first tenth of the ascending sort (the
# smallest estimates) and "bottom 10%" is everything from the 90% position
# onward (the largest estimates) -- confirm this is the intended direction.
n_rows = len(res_df)
tail_slices = [
    ('top 10%', slice(None, int(np.floor(0.1*n_rows)))),
    ('bottom 10%', slice(int(np.floor(0.9*n_rows)), None)),
]

# Build one named Series per (tail, method) and concatenate once, instead of
# growing the frame with a concat per column.
summary_cols = []
for tail_name, tail_rows in tail_slices:
    for score_col, method in method_labels:
        tail_df = sorted_by[score_col].iloc[tail_rows]
        summary_cols.append(pd.Series({
            "Median Age (Years)": tail_df['age'].median(),
            "Male (%)": 100*tail_df['male'].mean(),
            "White (%)": 100*tail_df['white'].mean(),
        }, name=f'{method} ({tail_name})'))

summary_df = pd.concat(summary_cols, axis=1)

# Rounded copy kept under a group-specific name for the combined table.
summary_df_hie = summary_df.round(1).copy()

In [None]:
# Stack the two per-group summaries; `keys` adds an outer row-index level
# identifying the group each block of rows came from.
combined_summary = pd.concat(
    [summary_df_jsie, summary_df_hie],
    keys=['JSIE', 'HIE'],
)
combined_summary

In [None]:
# Same stacked table as LaTeX source, with floats formatted to one decimal.
stacked_summary = pd.concat(
    [summary_df_jsie, summary_df_hie],
    keys=['JSIE', 'HIE'],
)
stacked_latex = stacked_summary.to_latex(float_format="%.1f")
print(stacked_latex)