In [1]:
import pandas as pd
from sklearn import linear_model
from sklearn.neighbors import NearestNeighbors
import numpy as np

In [2]:
PATH = "/home/dtd/Documents/interpretable_machine_learning/Source Code/my_work/Experiment"
df = pd.read_csv(PATH + "/recovery.csv")

In [3]:
df.head()

Unnamed: 0,sex,age,severity,medication,const,recovery
0,0,24.518667,0.85895,0,1.0,92
1,1,11.080205,0.905123,1,1.0,40
2,0,37.014895,0.601475,0,1.0,65
3,0,35.657701,0.74984,0,1.0,116
4,0,36.735167,0.38546,0,1.0,47


In [4]:
features = ['sex', 'age', 'severity', 'const']
observed_common_causes = df[features]
treatment = df['medication']
outcome = df['recovery']

In [5]:
propensity_score_model = linear_model.LogisticRegression(solver="lbfgs")
propensity_score_model.fit(observed_common_causes, treatment)
df['propensity_score'] = propensity_score_model.predict_proba(observed_common_causes)[:,1]

## K-nearest neighbor

In [6]:
treated = df.loc[df['medication'] == 1]
control = df.loc[df['medication'] == 0]


control_neighbors = (
    NearestNeighbors(n_neighbors=1, algorithm='ball_tree')
    .fit(control['propensity_score'].values.reshape(-1, 1))
)
distances, indices = control_neighbors.kneighbors(treated['propensity_score'].values.reshape(-1, 1))

att = 0
numtreatedunits = treated.shape[0]
for i in range(numtreatedunits):
    treated_outcome = treated.iloc[i]['recovery'].item()
    control_outcome = control.iloc[indices[i]]['recovery'].item()
    att += treated_outcome - control_outcome

att /= numtreatedunits

control_neighbors = (
    NearestNeighbors(n_neighbors=1, algorithm='ball_tree')
    .fit(treated['propensity_score'].values.reshape(-1, 1))
)
distances, indices = control_neighbors.kneighbors(control['propensity_score'].values.reshape(-1, 1))

atc = 0
numcontrolunits = control.shape[0]
for i in range(numtreatedunits):
    treated_outcome = treated.iloc[i]['recovery'].item()
    control_outcome = control.iloc[indices[i]]['recovery'].item()
    atc += treated_outcome - control_outcome
atc /= numcontrolunits

numunits = numtreatedunits + numcontrolunits
est = (att*numtreatedunits + atc*numcontrolunits)/(numtreatedunits+numcontrolunits)

print(est)

-34.3663


## Inverse weighting

In [7]:
min_ps_score=0.05
max_ps_score=0.95

df['propensity_score'] = np.minimum(df['propensity_score'], max_ps_score)
df['propensity_score'] = np.maximum(df['propensity_score'], min_ps_score)

In [8]:
ipst_sum = sum(df['medication'] / df['propensity_score'])
ipsc_sum = sum((1 - df['medication']) / (1-df['propensity_score']))

$ips\_weight = \frac{1}{p}$ treated

$ips\_weight = \frac{1}{1- p}$ control

$treated_ips = \frac{1}{num\_treated} * (treatment + (1-treatment)*\frac{ps}{1-ps}$

$treated_ips = \frac{1}{num\_control} * (1 - treatment + treatment*\frac{1- ps}{ps}$


In [13]:
df['ips_weight'] = 1/(numtreatedunits + numcontrolunits)*(
    df['medication'] / df['propensity_score'] +
    (1 - df['medication']) / (1 - df['propensity_score'])
)
        
df['tips_weight'] = (1/numtreatedunits) * (
    df['medication'] +
    (1 - df['medication']) * df['propensity_score']/ (1 - df['propensity_score'])
)

df['cips_weight'] = (1/numcontrolunits) * (
    (1 - df['medication']) + 
    df['medication'] * (1 - df['propensity_score'])/ df['propensity_score']
    
)

In [14]:
df['d_y'] = (
            df['ips_weight'] *
            df['medication'] *
            df['recovery']
        )
df['dbar_y'] = (
            df['ips_weight'] *
            (1 - df['medication']) *
            df['recovery']
        )
est = df['d_y'].sum() - df['dbar_y'].sum()

In [15]:
est

-77.6328344814307