In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier, MLPRegressor
from lightgbm import LGBMRegressor,LGBMClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV

In [4]:
data_x = pd.read_csv("./data/practice/acic_practice_0001.csv")
data_y = pd.read_csv("./data/practice_year/acic_practice_year_0001.csv")

# propensity score

In [3]:
data_t = pd.merge(data_x, data_y[['id.practice','Z']].drop_duplicates(), on='id.practice')
df = data_t
ct = ['X2','X4']
for c in ct:
    df[c] = df[c].astype('category')

debias_m = LGBMClassifier(max_depth=3)

X = ['X1','X2','X3','X4','X5','X6','X7','X8','X9']
T = ['Z']
ps_res =  df[T] - cross_val_predict(debias_m, df[X], df[T].values.ravel(), cv=5, method='predict_proba')[:,1].reshape(-1,1) + df[T].mean()

### (1) 调整max_depth 和 num_leaves
确定树的大小及复杂度

In [56]:
parameters = {
    'max_depth': [4,6,8],
    'num_leaves': [20,30,40],
}

model = LGBMClassifier(objective = 'binary',
                         is_unbalance = True,
                         #metric = 'log_loss',
                         metric = 'binary_logloss,auc',
                         max_depth = 6,
                         num_leaves = 40,
                         learning_rate = 0.1,
                         #feature_fraction = 0.7,
                         min_child_samples=21,
                         min_child_weight=0.001,
                         #bagging = 1,
                         #subsample_freq = 2,
                         reg_alpha = 0.001,
                         reg_lambda = 8,
                         cat_smooth = 0,
                         n_estimators = 200,   
                        )
gsearch = GridSearchCV(model, param_grid=parameters, scoring='roc_auc', cv=10)
gsearch.fit(df[X], df[T].values.ravel())
print('参数的最佳取值:{0}'.format(gsearch.best_params_))
print('最佳模型得分:{0}'.format(gsearch.best_score_))
print(gsearch.cv_results_['mean_test_score'])
print(gsearch.cv_results_['params'])

参数的最佳取值:{'max_depth': 4, 'num_leaves': 20}
最佳模型得分:0.8248431855500822
[0.82484319 0.82484319 0.82484319 0.82257143 0.82257143 0.82257143
 0.81984072 0.81984072 0.81984072]
[{'max_depth': 4, 'num_leaves': 20}, {'max_depth': 4, 'num_leaves': 30}, {'max_depth': 4, 'num_leaves': 40}, {'max_depth': 6, 'num_leaves': 20}, {'max_depth': 6, 'num_leaves': 30}, {'max_depth': 6, 'num_leaves': 40}, {'max_depth': 8, 'num_leaves': 20}, {'max_depth': 8, 'num_leaves': 30}, {'max_depth': 8, 'num_leaves': 40}]


### (2) 调整min_data_in_leaf 和 min_sum_hessian_in_leaf
防止树过拟合

In [57]:
parameters = {
    #'min_child_samples': [18,19,20,21,22],
    #'min_child_weight': [0.001,0.002],
    'reg_alpha': [0.0005, 0.001, 0.0015, 0.002],
    'reg_lambda': [6, 7, 8, 9, 10]
}

model = LGBMClassifier(objective = 'binary',
                         is_unbalance = True,
                         #metric = 'log_loss',
                         metric = 'binary_logloss,auc',
                         max_depth = 4,
                         num_leaves = 20,
                         learning_rate = 0.1,
                         #feature_fraction = 0.7,
                         min_child_samples=21,
                         min_child_weight=0.001,
                         #bagging = 1,
                         #subsample_freq = 2,
                         reg_alpha = 0.0005,
                         reg_lambda = 8,
                         cat_smooth = 0,
                         n_estimators = 200,   
                        )
gsearch = GridSearchCV(model, param_grid=parameters, scoring='roc_auc', cv=10)
gsearch.fit(df[X], df[T].values.ravel())
print('参数的最佳取值:{0}'.format(gsearch.best_params_))
print('最佳模型得分:{0}'.format(gsearch.best_score_))
print(gsearch.cv_results_['mean_test_score'])
print(gsearch.cv_results_['params'])

参数的最佳取值:{'reg_alpha': 0.002, 'reg_lambda': 10}
最佳模型得分:0.8294532019704434
[0.8225     0.82811494 0.8243087  0.82628654 0.82663218 0.82482841
 0.82746552 0.82484319 0.8279532  0.82827422 0.82318637 0.82697291
 0.82634072 0.82629146 0.82911987 0.82451232 0.82746305 0.82816174
 0.8274532  0.8294532 ]
[{'reg_alpha': 0.0005, 'reg_lambda': 6}, {'reg_alpha': 0.0005, 'reg_lambda': 7}, {'reg_alpha': 0.0005, 'reg_lambda': 8}, {'reg_alpha': 0.0005, 'reg_lambda': 9}, {'reg_alpha': 0.0005, 'reg_lambda': 10}, {'reg_alpha': 0.001, 'reg_lambda': 6}, {'reg_alpha': 0.001, 'reg_lambda': 7}, {'reg_alpha': 0.001, 'reg_lambda': 8}, {'reg_alpha': 0.001, 'reg_lambda': 9}, {'reg_alpha': 0.001, 'reg_lambda': 10}, {'reg_alpha': 0.0015, 'reg_lambda': 6}, {'reg_alpha': 0.0015, 'reg_lambda': 7}, {'reg_alpha': 0.0015, 'reg_lambda': 8}, {'reg_alpha': 0.0015, 'reg_lambda': 9}, {'reg_alpha': 0.0015, 'reg_lambda': 10}, {'reg_alpha': 0.002, 'reg_lambda': 6}, {'reg_alpha': 0.002, 'reg_lambda': 7}, {'reg_alpha': 0.002, 'reg

### (3) 调整lambda_l1(reg_alpha)和lambda_l2(reg_lambda)
通过L1正则化和L2正则化降低过拟合

In [39]:
parameters = {
    'reg_alpha': [0.0005, 0.001, 0.0015, 0.0002],
    'reg_lambda': [6, 7, 8, 9, 10]
}

model = LGBMClassifier(objective = 'binary',
                         is_unbalance = True,
                         #metric = 'log_loss',
                         metric = 'binary_logloss,auc',
                         max_depth = 4,
                         num_leaves = 20,
                         learning_rate = 0.1,
                         #feature_fraction = 0.7,
                         min_child_samples=21,
                         min_child_weight=0.001,
                         #bagging = 1,
                         #subsample_freq = 2,
                         reg_alpha = 0.001,
                         reg_lambda = 8,
                         cat_smooth = 10,
                         n_estimators = 200,   
                        )
gsearch = GridSearchCV(model, param_grid=parameters, scoring='roc_auc', cv=10)
gsearch.fit(df[X], df[T].values.ravel())
print('参数的最佳取值:{0}'.format(gsearch.best_params_))
print('最佳模型得分:{0}'.format(gsearch.best_score_))
print(gsearch.cv_results_['mean_test_score'])
print(gsearch.cv_results_['params'])

参数的最佳取值:{'reg_alpha': 0.0015, 'reg_lambda': 10}
最佳模型得分:0.82911986863711
[0.8225     0.82811494 0.8243087  0.82628654 0.82663218 0.82482841
 0.82746552 0.82484319 0.8279532  0.82827422 0.82318637 0.82697291
 0.82634072 0.82629146 0.82911987 0.82216667 0.82611248 0.82697537
 0.82826929 0.82662726]
[{'reg_alpha': 0.0005, 'reg_lambda': 6}, {'reg_alpha': 0.0005, 'reg_lambda': 7}, {'reg_alpha': 0.0005, 'reg_lambda': 8}, {'reg_alpha': 0.0005, 'reg_lambda': 9}, {'reg_alpha': 0.0005, 'reg_lambda': 10}, {'reg_alpha': 0.001, 'reg_lambda': 6}, {'reg_alpha': 0.001, 'reg_lambda': 7}, {'reg_alpha': 0.001, 'reg_lambda': 8}, {'reg_alpha': 0.001, 'reg_lambda': 9}, {'reg_alpha': 0.001, 'reg_lambda': 10}, {'reg_alpha': 0.0015, 'reg_lambda': 6}, {'reg_alpha': 0.0015, 'reg_lambda': 7}, {'reg_alpha': 0.0015, 'reg_lambda': 8}, {'reg_alpha': 0.0015, 'reg_lambda': 9}, {'reg_alpha': 0.0015, 'reg_lambda': 10}, {'reg_alpha': 0.0002, 'reg_lambda': 6}, {'reg_alpha': 0.0002, 'reg_lambda': 7}, {'reg_alpha': 0.0002, 'r

### (4) 调整cat_smooth
cat_smooth为设置每个类别拥有最小的个数，主要用于去噪。

In [58]:
parameters = {
    'cat_smooth': [0,10,20],
}

model = LGBMClassifier(objective = 'binary',
                         is_unbalance = True,
                         #metric = 'log_loss',
                         metric = 'binary_logloss,auc',
                         max_depth = 4,
                         num_leaves = 20,
                         learning_rate = 0.1,
                         #feature_fraction = 0.7,
                         min_child_samples=19,
                         min_child_weight=0.001,
                         #bagging = 1,
                         #subsample_freq = 2,
                         reg_alpha = 0.002,
                         reg_lambda = 10,
                         cat_smooth = 0,
                         n_estimators = 200,   
                        )
gsearch = GridSearchCV(model, param_grid=parameters, scoring='roc_auc', cv=10)
gsearch.fit(df[X], df[T].values.ravel())
print('参数的最佳取值:{0}'.format(gsearch.best_params_))
print('最佳模型得分:{0}'.format(gsearch.best_score_))
print(gsearch.cv_results_['mean_test_score'])
print(gsearch.cv_results_['params'])

参数的最佳取值:{'cat_smooth': 0}
最佳模型得分:0.8254433497536946
[0.82544335 0.82544335 0.82544335]
[{'cat_smooth': 0}, {'cat_smooth': 10}, {'cat_smooth': 20}]


## fit the model

In [5]:
debias_m = LGBMClassifier(objective = 'binary',
                         is_unbalance = True,
                         #metric = 'log_loss',
                         metric = 'binary_logloss,auc',
                         max_depth = 4,
                         num_leaves = 20,
                         learning_rate = 0.1,
                         #feature_fraction = 0.7,
                         min_child_samples=21,
                         min_child_weight=0.001,
                         #bagging = 1,
                         #subsample_freq = 2,
                         reg_alpha = 0.002,
                         reg_lambda = 10,
                         cat_smooth = 0,
                         n_estimators = 200,   
                        )

ps = cross_val_predict(debias_m, df[X], df[T].values.ravel(), cv=10, method='predict_proba')[:,1]
#ps_res =  df[T] - cross_val_predict(debias_m, df[X], df[T].values.ravel(), cv=5, method='predict_proba')[:,1].reshape(-1,1) + df[T].mean()



In [94]:
ps = cross_val_predict(debias_m, df[X], df[T].values.ravel(), cv=5, method='predict_proba')[:,1]

In [60]:
tmp=df.assign(propensity_score=ps)
tp = sum((tmp['propensity_score'] >= 0.5) & (tmp['Z']==1))
tn = sum((tmp['propensity_score'] < 0.5) & (tmp['Z']==0))
fp = sum((tmp['propensity_score'] >= 0.5) & (tmp['Z']==0))
fn =sum((tmp['propensity_score'] < 0.5) & (tmp['Z']==1))
tp,tn,fp,fn

(152, 235, 61, 52)

In [62]:
# one-hot encoding
df = pd.get_dummies(data_t, columns=['X2','X4'])
X = ['X1','X2_A','X2_B','X2_C','X3','X4_A','X4_B','X4_C','X5','X6','X7','X8','X9']
T = ['Z']
df.dtypes

id.practice      int64
X1               int64
X3               int64
X5               int64
X6             float64
X7             float64
X8             float64
X9             float64
Z                int64
X2_A             uint8
X2_B             uint8
X2_C             uint8
X4_A             uint8
X4_B             uint8
X4_C             uint8
dtype: object

In [63]:
scaler = StandardScaler()
df[['X6','X7','X8','X9']] = scaler.fit_transform(df[['X6','X7','X8','X9']]) / 2
ps_model = LogisticRegression(C=1e6, max_iter=1000).fit(df[X], df[T].values.ravel())
#ps_model = MLPClassifier(hidden_layer_sizes=(50,), solver='lbfgs', random_state=1, max_iter=500, learning_rate='invscaling').fit(df[X], df[T].values.ravel())

In [6]:
tmp=df.assign(propensity_score=ps_model.predict_proba(df[X])[:, 1])
tp = sum((tmp['propensity_score'] >= 0.5) & (tmp['Z']==1))
tn = sum((tmp['propensity_score'] < 0.5) & (tmp['Z']==0))
fp = sum((tmp['propensity_score'] >= 0.5) & (tmp['Z']==0))
fn =sum((tmp['propensity_score'] < 0.5) & (tmp['Z']==1))
tp,tn,fp,fn

NameError: name 'ps_model' is not defined

# average outcome

In [8]:
data_o = pd.merge(data_x, data_y, on='id.practice')

In [None]:
df = pd.get_dummies(data_o, columns=['X2','X4'])
XV = df.columns.drop(['id.practice','post','Z','n.patients'])
Y = ['Y']
ao_model = MLPRegressor(hidden_layer_sizes=(50,), solver='lbfgs', alpha=0.1, random_state=1, max_iter=1000, learning_rate='invscaling', early_stopping=True).fit(df[XV], df[Y].values.ravel())

In [96]:
data_o.iloc[:,1:15].head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,year,Y,Z,post,n.patients
0,0,A,1,A,1,20.774076,14.153255,0.161126,43.431874,1,1025.523263,1,0,113
1,0,A,1,A,1,20.774076,14.153255,0.161126,43.431874,2,1613.777568,1,0,109
2,0,A,1,A,1,20.774076,14.153255,0.161126,43.431874,3,1189.200788,1,1,121
3,0,A,1,A,1,20.774076,14.153255,0.161126,43.431874,4,1619.829704,1,1,131
4,0,A,0,C,0,33.565928,3.284657,0.556784,12.721988,1,834.169421,1,0,264


In [68]:
df = data_o
ct = ['X2','X4']
for c in ct:
    df[c] = df[c].astype('category')
    
XV = df.columns.drop(['id.practice','year','post','Z','n.patients'])
Y = ['Y']

In [107]:
for i in range(1,5):
    print(df[(df['year']==i) & (df['Z']==0)]['Y'].mean())

870.6451336763548
1010.0760840822331
1134.3928342316492
1250.6285559851644


In [74]:
denoise_m = LGBMRegressor(max_depth = 6)
denoise_m.fit(df[XV], df[T], sample_weight=df['n.patients'])

LGBMRegressor(max_depth=6)

In [None]:
parameters = {
    'max_depth': [4,6,8],
    'num_leaves': [20,30,40],
}

model = LGBMRegressor(objective = 'binary',
                         is_unbalance = True,
                         #metric = 'log_loss',
                         metric = 'neg_mean_squared_error',
                         max_depth = 6,
                         num_leaves = 40,
                         learning_rate = 0.1,
                         #feature_fraction = 0.7,
                         min_child_samples=21,
                         min_child_weight=0.001,
                         #bagging = 1,
                         #subsample_freq = 2,
                         reg_alpha = 0.001,
                         reg_lambda = 8,
                         cat_smooth = 0,
                         n_estimators = 200,   
                        )
gsearch = GridSearchCV(model, param_grid=parameters, scoring='neg_mean_squared_error', cv=10)
gsearch.fit(df[X], df[T], weight=df['n.patients'])
print('参数的最佳取值:{0}'.format(gsearch.best_params_))
print('最佳模型得分:{0}'.format(gsearch.best_score_))
print(gsearch.cv_results_['mean_test_score'])
print(gsearch.cv_results_['params'])