# Description

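Initial exploratory data analysis (EDA) for potential leakage in propensity score (PS) matching studies: within each clinical setting, every PS study estimate is paired with the RCT estimates, and the differences are compared by whether the PS study is dated after or before the RCT.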

# Imports

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pingouin as pg

%matplotlib inline

# Load data

In [10]:
# Read the study table and expose the row position as an explicit
# `study_id` column so each study can be referenced unambiguously later on.
psm_leak = (
    pd.read_csv("psm_leakage.csv")
    .reset_index()
    .rename(columns={"index": "study_id"})
)
psm_leak.head()

Unnamed: 0,study_id,study_type,year,clinical_setting,estimate,lower_ci,upper_ci
0,0,RCT,2001,APC in sepsis (inhospital),0.85,0.5,1.44
1,1,RCT,2001,APC in sepsis (inhospital),0.8,0.69,0.94
2,2,RCT,2005,APC in sepsis (inhospital),0.92,0.78,1.06
3,3,RCT,2009,APC in sepsis (inhospital),1.26,0.86,1.85
4,4,RCT,2012,APC in sepsis (inhospital),1.09,0.92,1.26


In [11]:
# Number of studies per study type (the values seen in this data are RCT and PS).
psm_leak['study_type'].value_counts()

RCT    62
PS     27
Name: study_type, dtype: int64

# Compute pairwise differences

In [44]:
# Pair every PS study with every RCT from the same clinical setting.
# Each output row is one (PS study, RCT) pair; `study_id` is the id of the
# PS study, and `year_diff` is positive when the PS study is dated after the RCT.
pair_columns = ["study_id", "rct_year", "ps_year", "year_diff", "rct_est", "ps_est"]

# Collect plain records and build the frame once at the end: growing a
# DataFrame inside a loop is quadratic, and `DataFrame.append` no longer
# exists in pandas >= 2.0.
pair_records = []
for clinic_set, group in psm_leak.groupby("clinical_setting"):
    ps_df = group[group['study_type'] == 'PS']
    rct_df = group[group['study_type'] == 'RCT']
    # itertuples keeps each column's dtype (iterrows coerces mixed rows to
    # object) and avoids reusing one index variable in both loops.
    for ps in ps_df.itertuples(index=False):
        for rct in rct_df.itertuples(index=False):
            pair_records.append({
                "study_id": ps.study_id,
                "rct_year": rct.year,
                "ps_year": ps.year,
                "year_diff": ps.year - rct.year,
                "rct_est": rct.estimate,
                "ps_est": ps.estimate,
            })

# Passing `columns` guarantees the expected schema even when no pairs exist,
# so downstream cells do not fail with a KeyError. The resulting index is a
# unique 0..N-1 range rather than restarting at 0 for every PS study.
pair_df = pd.DataFrame(pair_records, columns=pair_columns)

In [45]:
# raw_diff  : PS estimate minus RCT estimate for each pair
# norm_diff : raw_diff scaled by the sum of the two estimates
pair_df = pair_df.assign(
    raw_diff=lambda pairs: pairs['ps_est'] - pairs['rct_est'],
    norm_diff=lambda pairs: pairs['raw_diff'] / (pairs['ps_est'] + pairs['rct_est']),
)

In [46]:
# Keep only pairs whose PS study and RCT fall in different years.
no_zero_pairs = pair_df.loc[pair_df['year_diff'].ne(0)]

In [47]:
# Flag pairs where the PS study is dated after the RCT (1) versus before (0).
# `assign` returns a new frame instead of writing into a filtered slice of
# `pair_df`, which avoids pandas' SettingWithCopyWarning and keeps the cell
# safe to re-run.
no_zero_pairs = no_zero_pairs.assign(
    ps_after=no_zero_pairs['year_diff'].gt(0).astype(int)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [48]:
# Preview the first pairs, including the derived diff columns and ps_after flag.
no_zero_pairs.head()

Unnamed: 0,study_id,rct_year,ps_year,year_diff,rct_est,ps_est,raw_diff,norm_diff,ps_after
0,5,2001,2008,7,0.85,0.72,-0.13,-0.082803,1
1,5,2001,2008,7,0.8,0.72,-0.08,-0.052632,1
2,5,2005,2008,3,0.92,0.72,-0.2,-0.121951,1
3,5,2009,2008,-1,1.26,0.72,-0.54,-0.272727,0
4,5,2012,2008,-4,1.09,0.72,-0.37,-0.20442,0


In [52]:
# Summary statistics of both difference measures, one column per ps_after group.
(
    no_zero_pairs
    .groupby('ps_after')[['raw_diff', 'norm_diff']]
    .describe()
    .T
)

Unnamed: 0,ps_after,0,1
raw_diff,count,33.0,56.0
raw_diff,mean,0.119394,0.112679
raw_diff,std,0.526135,0.630764
raw_diff,min,-0.67,-1.7
raw_diff,25%,-0.22,-0.26
raw_diff,50%,0.07,0.015
raw_diff,75%,0.39,0.2175
raw_diff,max,1.61,2.05
norm_diff,count,33.0,56.0
norm_diff,mean,0.034064,0.044259


# Look at "closest" RCT before

In [53]:
# Pairs in which the RCT is dated before the PS study (positive year_diff).
before_pairs = pair_df.loc[pair_df['year_diff'].gt(0)]

In [60]:
# For each PS study keep the single pair with the smallest positive year gap,
# i.e. the first row per study_id after sorting by (study_id, year_diff).
close_before = (
    before_pairs
    .sort_values(["study_id", "year_diff"])
    .drop_duplicates(subset="study_id", keep="first")
)
close_before[['raw_diff', 'norm_diff']].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
raw_diff,21.0,-0.004762,0.627133,-0.88,-0.37,-0.07,0.06,2.05
norm_diff,21.0,-0.014938,0.233414,-0.301775,-0.156118,-0.049645,0.021429,0.580737


# Look at "closest" RCT after

In [61]:
# Pairs in which the RCT is dated after the PS study (negative year_diff).
after_pairs = pair_df.loc[pair_df['year_diff'].lt(0)]

In [66]:
# For each PS study keep the single pair whose negative year_diff is closest
# to zero, i.e. the first row per study_id after a descending sort.
close_after = (
    after_pairs
    .sort_values(["study_id", "year_diff"], ascending=False)
    .drop_duplicates(subset="study_id", keep="first")
)
close_after[['raw_diff', 'norm_diff']].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
raw_diff,13.0,0.07,0.5677,-0.67,-0.22,-0.07,0.22,1.61
norm_diff,13.0,0.030867,0.296692,-0.362162,-0.112245,-0.033175,0.108696,0.842932


In [70]:
# Two-sample t-test on the chosen difference measure:
# closest-RCT-after pairs (x) versus closest-RCT-before pairs (y).
target_col = 'norm_diff'
pg.ttest(x=close_after[target_col], y=close_before[target_col])

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.473302,21.098716,two-sided,0.640858,"[-0.16, 0.25]",0.176877,0.366,0.077506
