# Lasso Logistic Regression and Random Forest

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import  make_scorer
from sklearn import metrics
from sklearn.utils import resample
import pickle
import pandas as pd

In [2]:
# Feature columns handed to every model in this notebook.
# The order is kept exactly as originally written because it determines the
# column order of the design matrix built with `data[predictorList]`.
predictorList = [
    'FCSStaus_lag',
    'urban',
    'NL_District_log',
    'FoodInsecureMonthly_lag',
    'precipitationMean',
    'NDVI.Anomaly.Mean',
    'Average..mm.',
    'X1.Month.Anomaly....',
    'X3.Months.Anomaly....',
    'fatalitiesMean_log',
    'temperatureMean_log',
    'NDVIMean_log',
    'Kind.Income.Ratio',
    'MaleRatio',
    'AvgAge',
    'SelfArg',
    'SelfHerd',
    'ShareToilet_Yes',
    'Salt_Yes',
    'HouseType_feq',
    'RoofType_feq',
    'WaterSource_feq',
    'IncomeStab_Somewhat stable',
    'IncomeStab_Very unstable',
    'SubjectivePoverty_Neither poor nor rich',
    'SubjectivePoverty_Poor',
    'SubjectivePoverty_Very poor',
    'RelLivStandard_Better off',
    'RelLivStandard_Same',
    'RelLivStandard_Worse off',
    'LivStandChange_Decreased',
    'LivStandChange_Increased',
    'LivStandChange_Stayed at the same',
    'DistDrinkingWaterBig3',
    'FamilySize',
    'SelfStapleTypes',
    'valueNow_MobilePhone',
    'valueNowTotal',
    'valueNow_Furniture',
    'valueNow_MobilePhone_new',
    'valueNow_ArgLand_new',
    'valueNow_ArgLand',
    'valueNow_Livestock',
    'valueNow_Furniture_new',
    'valueNow_Livestock_new',
    'valueNowTotal_new',
    'Income_new',
    'Income',
    'ValueAgoTotal_new',
    'valueNow_FixPhone',
    'valueNow_Refrigerator',
]

In [3]:
# Pickled dataset bundles, one per (period, administrative level) pair.
# The strings are written with Windows separators and are used verbatim as
# the keys of `data_dict`, so they must not be altered here.
file_paths = [
    r"bld\datasets\generated\data_before_county.pkl",
    r"bld\datasets\generated\data_before_district.pkl",
    r"bld\datasets\generated\data_before_subcounty.pkl",
    r"bld\datasets\generated\data_during_county.pkl",
    r"bld\datasets\generated\data_during_district.pkl",
    r"bld\datasets\generated\data_during_subcounty.pkl"
]

data_dict = {}

for path in file_paths:
    # Open with forward slashes so the notebook also runs on Linux/macOS
    # ('/' is accepted on Windows as well); the dictionary key keeps the
    # original backslash spelling so existing lookups continue to work.
    # NOTE: pickle.load can execute arbitrary code -- only load bundles that
    # were produced by this project's own pipeline.
    with open(path.replace('\\', '/'), 'rb') as file:
        data_dict[path] = pickle.load(file)

In [5]:
# Give each loaded bundle its own name; the keys are the exact path strings
# under which the bundles were stored in `data_dict`.
(
    data_before_county,
    data_before_district,
    data_before_subcounty,
    data_during_county,
    data_during_district,
    data_during_subcounty,
) = (
    data_dict[bundle_key]
    for bundle_key in (
        'bld\\datasets\\generated\\data_before_county.pkl',
        'bld\\datasets\\generated\\data_before_district.pkl',
        'bld\\datasets\\generated\\data_before_subcounty.pkl',
        'bld\\datasets\\generated\\data_during_county.pkl',
        'bld\\datasets\\generated\\data_during_district.pkl',
        'bld\\datasets\\generated\\data_during_subcounty.pkl',
    )
)

Here we extract the training and test sets (the `train_10` / `test_10` entries) from each loaded dataset bundle.

In [9]:
# Pull the 'train_10' / 'test_10' entries out of every loaded bundle.
# Each bundle is indexed first by a split/resampler key (e.g. 'StdTrain_county')
# and then by the 'train_10' or 'test_10' entry.
# NOTE(review): the "before" bundles are indexed with keys that end in
# '_during' (e.g. 'StdTrain_county_during') -- presumably this is how the
# pickles were generated; confirm against the data-preparation step.

# Training and test sets taken from the 'StdTrain_*' / 'StdTest_*' entries.
data_before_county_train = data_before_county['StdTrain_county_during']['train_10']
data_before_district_train = data_before_district['StdTrain_district_during']['train_10']
data_before_subcounty_train = data_before_subcounty['StdTrain_subcounty_during']['train_10']
data_during_county_train = data_during_county['StdTrain_county']['train_10']
data_during_district_train = data_during_district['StdTrain_district']['train_10']
data_during_subcounty_train = data_during_subcounty['StdTrain_subcounty']['train_10']
data_before_county_test = data_before_county['StdTest_county_during']['test_10']
data_before_district_test = data_before_district['StdTest_district_during']['test_10']
data_before_subcounty_test = data_before_subcounty['StdTest_subcounty_during']['test_10']
data_during_county_test = data_during_county['StdTest_county']['test_10']
data_during_district_test = data_during_district['StdTest_district']['test_10']
data_during_subcounty_test = data_during_subcounty['StdTest_subcounty']['test_10']

# "Before" training sets stored under the ADASYN / SMOTE / SMOTEENN / SMOTETOM keys.
data_before_county_train_ADA = data_before_county['ADASYN_Train_county_during']['train_10']
data_before_district_train_ADA = data_before_district['ADASYN_Train_district_during']['train_10']
data_before_subcounty_train_ADA = data_before_subcounty['ADASYN_Train_subcounty_during']['train_10']
data_before_county_train_SMOTE = data_before_county['SMOTE_Train_county_during']['train_10']
data_before_district_train_SMOTE = data_before_district['SMOTE_Train_district_during']['train_10']
data_before_subcounty_train_SMOTE = data_before_subcounty['SMOTE_Train_subcounty_during']['train_10']
data_before_county_train_SMOTEENN = data_before_county['SMOTEENN_Train_county_during']['train_10']
data_before_district_train_SMOTEENN = data_before_district['SMOTEENN_Train_district_during']['train_10']
data_before_subcounty_train_SMOTEENN = data_before_subcounty['SMOTEENN_Train_subcounty_during']['train_10']
data_before_county_train_SMOTETOM = data_before_county['SMOTETOM_Train_county_during']['train_10']
data_before_district_train_SMOTETOM = data_before_district['SMOTETOM_Train_district_during']['train_10']
data_before_subcounty_train_SMOTETOM = data_before_subcounty['SMOTETOM_Train_subcounty_during']['train_10']

# "During" training sets stored under the ADASYN / SMOTE / SMOTEENN / SMOTETOM keys.
data_during_county_train_ADA = data_during_county['ADASYN_Train_county']['train_10']
data_during_district_train_ADA = data_during_district['ADASYN_Train_district']['train_10']
data_during_subcounty_train_ADA = data_during_subcounty['ADASYN_Train_subcounty']['train_10']
data_during_county_train_SMOTE = data_during_county['SMOTE_Train_county']['train_10']
data_during_district_train_SMOTE = data_during_district['SMOTE_Train_district']['train_10']
data_during_subcounty_train_SMOTE = data_during_subcounty['SMOTE_Train_subcounty']['train_10']
data_during_county_train_SMOTEENN = data_during_county['SMOTEENN_Train_county']['train_10']
data_during_district_train_SMOTEENN = data_during_district['SMOTEENN_Train_district']['train_10']
data_during_subcounty_train_SMOTEENN = data_during_subcounty['SMOTEENN_Train_subcounty']['train_10']
data_during_county_train_SMOTETOM = data_during_county['SMOTETOM_Train_county']['train_10']
data_during_district_train_SMOTETOM = data_during_district['SMOTETOM_Train_district']['train_10']
data_during_subcounty_train_SMOTETOM = data_during_subcounty['SMOTETOM_Train_subcounty']['train_10']

## ML analysis

### Logistic regression 

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score, f1_score
from sklearn.utils import resample
import numpy as np
from scipy.stats import uniform, loguniform

def train_LR_and_bootstrap(train_data, test_data, predictor_list, scoring='roc_auc'):
    '''
    Tune and fit an L1-penalised (lasso) logistic regression, then score it
    on the test data.

    The inverse regularisation strength C is picked by a 5-fold randomised
    search over a log-uniform distribution on [0.001, 1000]; the estimator
    uses the liblinear solver with class_weight='balanced'.

    train_data: dataframe of training data, this case should be
        ...Train_...['train_10']; it must provide the columns named in
        predictor_list and the label column 'FCSStaus'
    test_data: testing data with the same columns; it is indexed with a list
        of column names, so it has to be dataframe-like (not a plain dict)
    predictor_list: list of feature column names
    scoring: scoring argument forwarded to RandomizedSearchCV

    Returns a tuple (output_df, AUC_LR, model):
        output_df: dict with 'Prob' (predicted probability of the class in
            column 1 of predict_proba) and 'Y' (the test labels)
        AUC_LR: ROC AUC of those probabilities on the test data
        model: the fitted LogisticRegression with the selected C

    Note: despite the function name, no bootstrap resampling happens here.
    '''
    X_train = train_data[predictor_list]
    Y_train = train_data['FCSStaus']

    lasso = LogisticRegression(penalty='l1', random_state=527, solver='liblinear',
                               max_iter=1000, class_weight='balanced')
    search_space = dict(C=loguniform(0.001, 1000))
    search = RandomizedSearchCV(lasso, search_space, cv=5, scoring=scoring, random_state=527)
    search.fit(X_train, Y_train)

    # RandomizedSearchCV refits by default (refit=True): best_estimator_ is a
    # clone of `lasso` with the winning C, already trained on the full
    # training set.  Reusing it avoids a second, identical fit and keeps the
    # hyper-parameters defined in exactly one place.
    best_model = search.best_estimator_

    # Evaluate on the held-out test data.
    X_test = test_data[predictor_list]
    Y_test = test_data['FCSStaus']
    y_LRpredprob_test = best_model.predict_proba(X_test)[:, 1]
    AUC_LR = roc_auc_score(Y_test, y_LRpredprob_test)
    output_df = {'Prob': y_LRpredprob_test, 'Y': Y_test}

    return output_df, AUC_LR, best_model

In [None]:
# Fit one logistic-regression model per (administrative level, period) pair.
# Each call returns (test-set scores dict, test ROC AUC, fitted model).
# NOTE: the `LR_l2_*` names are historical -- train_LR_and_bootstrap builds its
# estimators with penalty='l1'.
# First group: models trained on the 'StdTrain_*' training sets.
output_df_LR_district_before, AUC_LR_district_before, LR_l2_district_before = train_LR_and_bootstrap( data_before_district_train, data_before_district_test, predictorList)
output_df_LR_district_during, AUC_LR_district_during, LR_l2_district_during = train_LR_and_bootstrap( data_during_district_train, data_during_district_test, predictorList)
output_df_LR_subcounty_before, AUC_LR_subcounty_before, LR_l2_subcounty_before = train_LR_and_bootstrap( data_before_subcounty_train, data_before_subcounty_test, predictorList)
output_df_LR_subcounty_during, AUC_LR_subcounty_during, LR_l2_subcounty_during = train_LR_and_bootstrap( data_during_subcounty_train, data_during_subcounty_test, predictorList)
output_df_LR_county_before, AUC_LR_county_before, LR_l2_county_before = train_LR_and_bootstrap( data_before_county_train, data_before_county_test, predictorList)
output_df_LR_county_during, AUC_LR_county_during, LR_l2_county_during = train_LR_and_bootstrap( data_during_county_train, data_during_county_test, predictorList)

# Second group: models trained on the ADA / SMOTE / SMOTEENN / SMOTETOM
# training sets; every model is still evaluated on the same test set as the
# corresponding model in the first group.
output_df_LR_district_before_ADA, AUC_LR_district_before_ADA, LR_l2_district_before_ADA = train_LR_and_bootstrap( data_before_district_train_ADA, data_before_district_test, predictorList)
output_df_LR_district_during_ADA, AUC_LR_district_during_ADA, LR_l2_district_during_ADA = train_LR_and_bootstrap( data_during_district_train_ADA, data_during_district_test, predictorList)
output_df_LR_county_before_ADA, AUC_LR_county_before_ADA, LR_l2_county_before_ADA = train_LR_and_bootstrap( data_before_county_train_ADA, data_before_county_test, predictorList)
output_df_LR_county_during_ADA, AUC_LR_county_during_ADA, LR_l2_county_during_ADA = train_LR_and_bootstrap( data_during_county_train_ADA, data_during_county_test, predictorList)
output_df_LR_subcounty_before_ADA, AUC_LR_subcounty_before_ADA, LR_l2_subcounty_before_ADA = train_LR_and_bootstrap( data_before_subcounty_train_ADA, data_before_subcounty_test, predictorList)
output_df_LR_subcounty_during_ADA, AUC_LR_subcounty_during_ADA, LR_l2_subcounty_during_ADA = train_LR_and_bootstrap( data_during_subcounty_train_ADA, data_during_subcounty_test, predictorList)
output_df_LR_district_before_SMOTE, AUC_LR_district_before_SMOTE, LR_l2_district_before_SMOTE = train_LR_and_bootstrap( data_before_district_train_SMOTE, data_before_district_test, predictorList)
output_df_LR_district_during_SMOTE, AUC_LR_district_during_SMOTE, LR_l2_district_during_SMOTE = train_LR_and_bootstrap( data_during_district_train_SMOTE, data_during_district_test, predictorList)
output_df_LR_county_before_SMOTE, AUC_LR_county_before_SMOTE, LR_l2_county_before_SMOTE = train_LR_and_bootstrap( data_before_county_train_SMOTE, data_before_county_test, predictorList)
output_df_LR_county_during_SMOTE, AUC_LR_county_during_SMOTE, LR_l2_county_during_SMOTE = train_LR_and_bootstrap( data_during_county_train_SMOTE, data_during_county_test, predictorList)
output_df_LR_subcounty_before_SMOTE, AUC_LR_subcounty_before_SMOTE, LR_l2_subcounty_before_SMOTE = train_LR_and_bootstrap( data_before_subcounty_train_SMOTE, data_before_subcounty_test, predictorList)
output_df_LR_subcounty_during_SMOTE, AUC_LR_subcounty_during_SMOTE, LR_l2_subcounty_during_SMOTE = train_LR_and_bootstrap( data_during_subcounty_train_SMOTE, data_during_subcounty_test, predictorList)
output_df_LR_district_before_SMOTEENN, AUC_LR_district_before_SMOTEENN, LR_l2_district_before_SMOTEENN = train_LR_and_bootstrap( data_before_district_train_SMOTEENN, data_before_district_test, predictorList)
output_df_LR_district_during_SMOTEENN, AUC_LR_district_during_SMOTEENN, LR_l2_district_during_SMOTEENN = train_LR_and_bootstrap( data_during_district_train_SMOTEENN, data_during_district_test, predictorList)
output_df_LR_county_before_SMOTEENN, AUC_LR_county_before_SMOTEENN, LR_l2_county_before_SMOTEENN = train_LR_and_bootstrap( data_before_county_train_SMOTEENN, data_before_county_test, predictorList)
output_df_LR_county_during_SMOTEENN, AUC_LR_county_during_SMOTEENN, LR_l2_county_during_SMOTEENN = train_LR_and_bootstrap( data_during_county_train_SMOTEENN, data_during_county_test, predictorList)
output_df_LR_subcounty_before_SMOTEENN, AUC_LR_subcounty_before_SMOTEENN, LR_l2_subcounty_before_SMOTEENN = train_LR_and_bootstrap( data_before_subcounty_train_SMOTEENN, data_before_subcounty_test, predictorList)
output_df_LR_subcounty_during_SMOTEENN, AUC_LR_subcounty_during_SMOTEENN, LR_l2_subcounty_during_SMOTEENN = train_LR_and_bootstrap( data_during_subcounty_train_SMOTEENN, data_during_subcounty_test, predictorList)
output_df_LR_district_before_SMOTETOM, AUC_LR_district_before_SMOTETOM, LR_l2_district_before_SMOTETOM = train_LR_and_bootstrap( data_before_district_train_SMOTETOM, data_before_district_test, predictorList)
output_df_LR_district_during_SMOTETOM, AUC_LR_district_during_SMOTETOM, LR_l2_district_during_SMOTETOM = train_LR_and_bootstrap( data_during_district_train_SMOTETOM, data_during_district_test, predictorList)
output_df_LR_county_before_SMOTETOM, AUC_LR_county_before_SMOTETOM, LR_l2_county_before_SMOTETOM = train_LR_and_bootstrap( data_before_county_train_SMOTETOM, data_before_county_test, predictorList)
output_df_LR_county_during_SMOTETOM, AUC_LR_county_during_SMOTETOM, LR_l2_county_during_SMOTETOM = train_LR_and_bootstrap( data_during_county_train_SMOTETOM, data_during_county_test, predictorList)
output_df_LR_subcounty_before_SMOTETOM, AUC_LR_subcounty_before_SMOTETOM, LR_l2_subcounty_before_SMOTETOM = train_LR_and_bootstrap( data_before_subcounty_train_SMOTETOM, data_before_subcounty_test, predictorList)
output_df_LR_subcounty_during_SMOTETOM, AUC_LR_subcounty_during_SMOTETOM, LR_l2_subcounty_during_SMOTETOM = train_LR_and_bootstrap( data_during_subcounty_train_SMOTETOM, data_during_subcounty_test, predictorList)

In [None]:
# Write the test-set scores of every logistic-regression run to CSV.
# "before" results go to bld/single_case_before/, "during" results to
# bld/single_case/.  Each entry pairs a scores dict with its destination.
_LR_OUTPUT_FILES = [
    (output_df_LR_district_before, 'bld/single_case_before/output_df_LR_district_before.csv'),
    (output_df_LR_district_during, 'bld/single_case/output_df_LR_district_during.csv'),
    (output_df_LR_subcounty_before, 'bld/single_case_before/output_df_LR_subcounty_before.csv'),
    (output_df_LR_subcounty_during, 'bld/single_case/output_df_LR_subcounty_during.csv'),
    (output_df_LR_county_before, 'bld/single_case_before/output_df_LR_county_before.csv'),
    (output_df_LR_county_during, 'bld/single_case/output_df_LR_county_during.csv'),
    (output_df_LR_district_before_ADA, 'bld/single_case_before/output_df_LR_district_before_ADA.csv'),
    (output_df_LR_district_during_ADA, 'bld/single_case/output_df_LR_district_during_ADA.csv'),
    (output_df_LR_county_before_ADA, 'bld/single_case_before/output_df_LR_county_before_ADA.csv'),
    (output_df_LR_county_during_ADA, 'bld/single_case/output_df_LR_county_during_ADA.csv'),
    (output_df_LR_subcounty_before_ADA, 'bld/single_case_before/output_df_LR_subcounty_before_ADA.csv'),
    (output_df_LR_subcounty_during_ADA, 'bld/single_case/output_df_LR_subcounty_during_ADA.csv'),
    (output_df_LR_district_before_SMOTE, 'bld/single_case_before/output_df_LR_district_before_SMOTE.csv'),
    (output_df_LR_district_during_SMOTE, 'bld/single_case/output_df_LR_district_during_SMOTE.csv'),
    (output_df_LR_county_before_SMOTE, 'bld/single_case_before/output_df_LR_county_before_SMOTE.csv'),
    (output_df_LR_county_during_SMOTE, 'bld/single_case/output_df_LR_county_during_SMOTE.csv'),
    (output_df_LR_subcounty_before_SMOTE, 'bld/single_case_before/output_df_LR_subcounty_before_SMOTE.csv'),
    (output_df_LR_subcounty_during_SMOTE, 'bld/single_case/output_df_LR_subcounty_during_SMOTE.csv'),
    (output_df_LR_district_before_SMOTEENN, 'bld/single_case_before/output_df_LR_district_before_SMOTEENN.csv'),
    (output_df_LR_district_during_SMOTEENN, 'bld/single_case/output_df_LR_district_during_SMOTEENN.csv'),
    (output_df_LR_county_before_SMOTEENN, 'bld/single_case_before/output_df_LR_county_before_SMOTEENN.csv'),
    (output_df_LR_county_during_SMOTEENN, 'bld/single_case/output_df_LR_county_during_SMOTEENN.csv'),
    (output_df_LR_subcounty_before_SMOTEENN, 'bld/single_case_before/output_df_LR_subcounty_before_SMOTEENN.csv'),
    (output_df_LR_subcounty_during_SMOTEENN, 'bld/single_case/output_df_LR_subcounty_during_SMOTEENN.csv'),
    (output_df_LR_district_before_SMOTETOM, 'bld/single_case_before/output_df_LR_district_before_SMOTETOM.csv'),
    (output_df_LR_district_during_SMOTETOM, 'bld/single_case/output_df_LR_district_during_SMOTETOM.csv'),
    (output_df_LR_county_before_SMOTETOM, 'bld/single_case_before/output_df_LR_county_before_SMOTETOM.csv'),
    (output_df_LR_county_during_SMOTETOM, 'bld/single_case/output_df_LR_county_during_SMOTETOM.csv'),
    (output_df_LR_subcounty_before_SMOTETOM, 'bld/single_case_before/output_df_LR_subcounty_before_SMOTETOM.csv'),
    (output_df_LR_subcounty_during_SMOTETOM, 'bld/single_case/output_df_LR_subcounty_during_SMOTETOM.csv'),
]

for _lr_scores, _lr_csv_path in _LR_OUTPUT_FILES:
    pd.DataFrame(_lr_scores).to_csv(_lr_csv_path, index=False)

In [None]:
# Gather every fitted logistic-regression model under a string key equal to
# its variable name, so the whole collection can be stored in a single file.
LR_dict_models = {
    # models fitted on the 'StdTrain_*' training sets
    'LR_l2_district_before': LR_l2_district_before,
    'LR_l2_district_during': LR_l2_district_during,
    'LR_l2_subcounty_before': LR_l2_subcounty_before,
    'LR_l2_subcounty_during': LR_l2_subcounty_during,
    'LR_l2_county_before': LR_l2_county_before,
    'LR_l2_county_during': LR_l2_county_during,
    # ADA
    'LR_l2_district_before_ADA': LR_l2_district_before_ADA,
    'LR_l2_district_during_ADA': LR_l2_district_during_ADA,
    'LR_l2_county_before_ADA': LR_l2_county_before_ADA,
    'LR_l2_county_during_ADA': LR_l2_county_during_ADA,
    'LR_l2_subcounty_before_ADA': LR_l2_subcounty_before_ADA,
    'LR_l2_subcounty_during_ADA': LR_l2_subcounty_during_ADA,
    # SMOTE
    'LR_l2_district_before_SMOTE': LR_l2_district_before_SMOTE,
    'LR_l2_district_during_SMOTE': LR_l2_district_during_SMOTE,
    'LR_l2_county_before_SMOTE': LR_l2_county_before_SMOTE,
    'LR_l2_county_during_SMOTE': LR_l2_county_during_SMOTE,
    'LR_l2_subcounty_before_SMOTE': LR_l2_subcounty_before_SMOTE,
    'LR_l2_subcounty_during_SMOTE': LR_l2_subcounty_during_SMOTE,
    # SMOTEENN
    'LR_l2_district_before_SMOTEENN': LR_l2_district_before_SMOTEENN,
    'LR_l2_district_during_SMOTEENN': LR_l2_district_during_SMOTEENN,
    'LR_l2_county_before_SMOTEENN': LR_l2_county_before_SMOTEENN,
    'LR_l2_county_during_SMOTEENN': LR_l2_county_during_SMOTEENN,
    'LR_l2_subcounty_before_SMOTEENN': LR_l2_subcounty_before_SMOTEENN,
    'LR_l2_subcounty_during_SMOTEENN': LR_l2_subcounty_during_SMOTEENN,
    # SMOTETOM
    'LR_l2_district_before_SMOTETOM': LR_l2_district_before_SMOTETOM,
    'LR_l2_district_during_SMOTETOM': LR_l2_district_during_SMOTETOM,
    'LR_l2_county_before_SMOTETOM': LR_l2_county_before_SMOTETOM,
    'LR_l2_county_during_SMOTETOM': LR_l2_county_during_SMOTETOM,
    'LR_l2_subcounty_before_SMOTETOM': LR_l2_subcounty_before_SMOTETOM,
    'LR_l2_subcounty_during_SMOTETOM': LR_l2_subcounty_during_SMOTETOM,
}

# Directory prefix for the pickled model collection (note the trailing slash,
# it is concatenated directly with the file name below).
output_dir = 'bld/'

# Persist the whole model dictionary as a single pickle file.
_lr_models_path = output_dir + 'LR_dict_models.pkl'
with open(_lr_models_path, 'wb') as f:
    pickle.dump(LR_dict_models, f)

In [None]:
# Test-set ROC AUC of every "before" logistic-regression run, keyed by the
# name of the variable that holds it.  Key order defines the CSV column order.
AUC_LR_dict_before = {
    'AUC_LR_district_before': AUC_LR_district_before,
    'AUC_LR_subcounty_before': AUC_LR_subcounty_before,
    'AUC_LR_county_before': AUC_LR_county_before,
    # ADA
    'AUC_LR_district_before_ADA': AUC_LR_district_before_ADA,
    'AUC_LR_county_before_ADA': AUC_LR_county_before_ADA,
    'AUC_LR_subcounty_before_ADA': AUC_LR_subcounty_before_ADA,
    # SMOTE
    'AUC_LR_district_before_SMOTE': AUC_LR_district_before_SMOTE,
    'AUC_LR_county_before_SMOTE': AUC_LR_county_before_SMOTE,
    'AUC_LR_subcounty_before_SMOTE': AUC_LR_subcounty_before_SMOTE,
    # SMOTEENN
    'AUC_LR_district_before_SMOTEENN': AUC_LR_district_before_SMOTEENN,
    'AUC_LR_county_before_SMOTEENN': AUC_LR_county_before_SMOTEENN,
    'AUC_LR_subcounty_before_SMOTEENN': AUC_LR_subcounty_before_SMOTEENN,
    # SMOTETOM
    'AUC_LR_district_before_SMOTETOM': AUC_LR_district_before_SMOTETOM,
    'AUC_LR_county_before_SMOTETOM': AUC_LR_county_before_SMOTETOM,
    'AUC_LR_subcounty_before_SMOTETOM': AUC_LR_subcounty_before_SMOTETOM,
}

# One-row table (index=[0] is required because all values are scalars).
_auc_lr_before_table = pd.DataFrame(AUC_LR_dict_before, index=[0])
_auc_lr_before_table.to_csv('bld/single_case_before/AUC_LR_dict_before.csv', index=False)

In [None]:
# Test-set ROC AUC of every "during" logistic-regression run, keyed by the
# name of the variable that holds it.  Key order defines the CSV column order.
AUC_LR_dict_during = {
    'AUC_LR_district_during': AUC_LR_district_during,
    'AUC_LR_subcounty_during': AUC_LR_subcounty_during,
    'AUC_LR_county_during': AUC_LR_county_during,
    # ADA
    'AUC_LR_district_during_ADA': AUC_LR_district_during_ADA,
    'AUC_LR_county_during_ADA': AUC_LR_county_during_ADA,
    'AUC_LR_subcounty_during_ADA': AUC_LR_subcounty_during_ADA,
    # SMOTE
    'AUC_LR_district_during_SMOTE': AUC_LR_district_during_SMOTE,
    'AUC_LR_county_during_SMOTE': AUC_LR_county_during_SMOTE,
    'AUC_LR_subcounty_during_SMOTE': AUC_LR_subcounty_during_SMOTE,
    # SMOTEENN
    'AUC_LR_district_during_SMOTEENN': AUC_LR_district_during_SMOTEENN,
    'AUC_LR_county_during_SMOTEENN': AUC_LR_county_during_SMOTEENN,
    'AUC_LR_subcounty_during_SMOTEENN': AUC_LR_subcounty_during_SMOTEENN,
    # SMOTETOM
    'AUC_LR_district_during_SMOTETOM': AUC_LR_district_during_SMOTETOM,
    'AUC_LR_county_during_SMOTETOM': AUC_LR_county_during_SMOTETOM,
    'AUC_LR_subcounty_during_SMOTETOM': AUC_LR_subcounty_during_SMOTETOM,
}

# One-row table (index=[0] is required because all values are scalars).
_auc_lr_during_table = pd.DataFrame(AUC_LR_dict_during, index=[0])
_auc_lr_during_table.to_csv('bld/single_case/AUC_LR_dict_during.csv', index=False)

### Random Forest 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score, f1_score
from sklearn.utils import resample
from scipy.stats import uniform, loguniform
from skopt import BayesSearchCV, space 

def train_RF_and_bootstrap(train_data, test_data, predictor_list, scoring='roc_auc'):
    '''
    Tune and fit a class-weighted random forest, then score it on the test data.

    max_depth (3-10), max_features (3-7) and min_samples_leaf (50, 100, 150,
    200) are chosen by a 5-fold randomised search with 100 sampled settings.
    The search uses the estimator's default number of trees; the final model
    is then fitted with n_estimators=800 and the selected settings.

    train_data: dataframe of training data, this case should be
        ...Train_...['train_10']; it must provide the columns named in
        predictor_list and the label column 'FCSStaus'
    test_data: testing data with the same columns; it is indexed with a list
        of column names, so it has to be dataframe-like (not a plain dict)
    predictor_list: list of feature column names
    scoring: scoring argument forwarded to RandomizedSearchCV

    Returns a tuple (output_df, AUC_RF, model):
        output_df: dict with 'Prob' (predicted probability of the class in
            column 1 of predict_proba) and 'Y' (the test labels)
        AUC_RF: ROC AUC of those probabilities on the test data
        model: the fitted RandomForestClassifier

    Note: despite the function name, no bootstrap resampling of the data is
    done by this function itself.
    '''
    X_train = train_data[predictor_list]
    Y_train = train_data['FCSStaus']

    param_grid = dict(max_depth=range(3, 11),
                      max_features=range(3, 8),
                      min_samples_leaf=range(50, 201, 50))

    RF = RandomForestClassifier(random_state=527, class_weight='balanced')
    # random_state makes the sampled parameter settings reproducible (the
    # same seed the logistic-regression search uses).  refit=False skips the
    # automatic refit of the search estimator: only best_params_ is needed,
    # because the final 800-tree model is fitted explicitly below.
    crossval = RandomizedSearchCV(RF, param_grid, cv=5, scoring=scoring, n_iter=100,
                                  random_state=527, refit=False)
    crossval.fit(X_train, Y_train)

    RF_best = RandomForestClassifier(n_estimators=800, random_state=527,
                                     **crossval.best_params_, class_weight='balanced')
    RF_best.fit(X_train, Y_train)

    # Evaluate on the held-out test data; only probabilities are needed, so
    # no separate hard-label prediction is computed.
    X_test = test_data[predictor_list]
    Y_test = test_data['FCSStaus']

    y_RFpredprob_test = RF_best.predict_proba(X_test)[:, 1]
    AUC_RF = roc_auc_score(Y_test, y_RFpredprob_test)
    output_df = {'Prob': y_RFpredprob_test, 'Y': Y_test}

    return output_df, AUC_RF, RF_best

In [None]:
# Fit one random-forest model per (administrative level, period) pair.
# Each call returns (test-set scores dict, test ROC AUC, fitted model).
# NOTE: the `RF_l2_*` names mirror the logistic-regression naming; they hold
# the fitted estimators returned by train_RF_and_bootstrap.
# First group: models trained on the 'StdTrain_*' training sets.
output_df_RF_district_before, AUC_RF_district_before, RF_l2_district_before = train_RF_and_bootstrap( data_before_district_train, data_before_district_test, predictorList)
output_df_RF_district_during, AUC_RF_district_during, RF_l2_district_during = train_RF_and_bootstrap( data_during_district_train, data_during_district_test, predictorList)
output_df_RF_subcounty_before, AUC_RF_subcounty_before, RF_l2_subcounty_before = train_RF_and_bootstrap( data_before_subcounty_train, data_before_subcounty_test, predictorList)
output_df_RF_subcounty_during, AUC_RF_subcounty_during, RF_l2_subcounty_during = train_RF_and_bootstrap( data_during_subcounty_train, data_during_subcounty_test, predictorList)
output_df_RF_county_before, AUC_RF_county_before, RF_l2_county_before = train_RF_and_bootstrap( data_before_county_train, data_before_county_test, predictorList)
output_df_RF_county_during, AUC_RF_county_during, RF_l2_county_during = train_RF_and_bootstrap( data_during_county_train, data_during_county_test, predictorList)

# Second group: models trained on the ADA / SMOTE / SMOTEENN / SMOTETOM
# training sets; every model is still evaluated on the same test set as the
# corresponding model in the first group.
output_df_RF_district_before_ADA, AUC_RF_district_before_ADA, RF_l2_district_before_ADA = train_RF_and_bootstrap( data_before_district_train_ADA, data_before_district_test, predictorList)
output_df_RF_district_during_ADA, AUC_RF_district_during_ADA, RF_l2_district_during_ADA = train_RF_and_bootstrap( data_during_district_train_ADA, data_during_district_test, predictorList)
output_df_RF_county_before_ADA, AUC_RF_county_before_ADA, RF_l2_county_before_ADA = train_RF_and_bootstrap( data_before_county_train_ADA, data_before_county_test, predictorList)
output_df_RF_county_during_ADA, AUC_RF_county_during_ADA, RF_l2_county_during_ADA = train_RF_and_bootstrap( data_during_county_train_ADA, data_during_county_test, predictorList)
output_df_RF_subcounty_before_ADA, AUC_RF_subcounty_before_ADA, RF_l2_subcounty_before_ADA = train_RF_and_bootstrap( data_before_subcounty_train_ADA, data_before_subcounty_test, predictorList)
output_df_RF_subcounty_during_ADA, AUC_RF_subcounty_during_ADA, RF_l2_subcounty_during_ADA = train_RF_and_bootstrap( data_during_subcounty_train_ADA, data_during_subcounty_test, predictorList)
output_df_RF_district_before_SMOTE, AUC_RF_district_before_SMOTE, RF_l2_district_before_SMOTE = train_RF_and_bootstrap( data_before_district_train_SMOTE, data_before_district_test, predictorList)
output_df_RF_district_during_SMOTE, AUC_RF_district_during_SMOTE, RF_l2_district_during_SMOTE = train_RF_and_bootstrap( data_during_district_train_SMOTE, data_during_district_test, predictorList)
output_df_RF_county_before_SMOTE, AUC_RF_county_before_SMOTE, RF_l2_county_before_SMOTE = train_RF_and_bootstrap( data_before_county_train_SMOTE, data_before_county_test, predictorList)
output_df_RF_county_during_SMOTE, AUC_RF_county_during_SMOTE, RF_l2_county_during_SMOTE = train_RF_and_bootstrap( data_during_county_train_SMOTE, data_during_county_test, predictorList)
output_df_RF_subcounty_before_SMOTE, AUC_RF_subcounty_before_SMOTE, RF_l2_subcounty_before_SMOTE = train_RF_and_bootstrap( data_before_subcounty_train_SMOTE, data_before_subcounty_test, predictorList)
output_df_RF_subcounty_during_SMOTE, AUC_RF_subcounty_during_SMOTE, RF_l2_subcounty_during_SMOTE = train_RF_and_bootstrap( data_during_subcounty_train_SMOTE, data_during_subcounty_test, predictorList)
output_df_RF_district_before_SMOTEENN, AUC_RF_district_before_SMOTEENN, RF_l2_district_before_SMOTEENN = train_RF_and_bootstrap( data_before_district_train_SMOTEENN, data_before_district_test, predictorList)
output_df_RF_district_during_SMOTEENN, AUC_RF_district_during_SMOTEENN, RF_l2_district_during_SMOTEENN = train_RF_and_bootstrap( data_during_district_train_SMOTEENN, data_during_district_test, predictorList)
output_df_RF_county_before_SMOTEENN, AUC_RF_county_before_SMOTEENN, RF_l2_county_before_SMOTEENN = train_RF_and_bootstrap( data_before_county_train_SMOTEENN, data_before_county_test, predictorList)
output_df_RF_county_during_SMOTEENN, AUC_RF_county_during_SMOTEENN, RF_l2_county_during_SMOTEENN = train_RF_and_bootstrap( data_during_county_train_SMOTEENN, data_during_county_test, predictorList)
output_df_RF_subcounty_before_SMOTEENN, AUC_RF_subcounty_before_SMOTEENN, RF_l2_subcounty_before_SMOTEENN = train_RF_and_bootstrap( data_before_subcounty_train_SMOTEENN, data_before_subcounty_test, predictorList)
output_df_RF_subcounty_during_SMOTEENN, AUC_RF_subcounty_during_SMOTEENN, RF_l2_subcounty_during_SMOTEENN = train_RF_and_bootstrap( data_during_subcounty_train_SMOTEENN, data_during_subcounty_test, predictorList)
output_df_RF_district_before_SMOTETOM, AUC_RF_district_before_SMOTETOM, RF_l2_district_before_SMOTETOM = train_RF_and_bootstrap( data_before_district_train_SMOTETOM, data_before_district_test, predictorList)
output_df_RF_district_during_SMOTETOM, AUC_RF_district_during_SMOTETOM, RF_l2_district_during_SMOTETOM = train_RF_and_bootstrap( data_during_district_train_SMOTETOM, data_during_district_test, predictorList)
output_df_RF_county_before_SMOTETOM, AUC_RF_county_before_SMOTETOM, RF_l2_county_before_SMOTETOM = train_RF_and_bootstrap( data_before_county_train_SMOTETOM, data_before_county_test, predictorList)
output_df_RF_county_during_SMOTETOM, AUC_RF_county_during_SMOTETOM, RF_l2_county_during_SMOTETOM = train_RF_and_bootstrap( data_during_county_train_SMOTETOM, data_during_county_test, predictorList)
output_df_RF_subcounty_before_SMOTETOM, AUC_RF_subcounty_before_SMOTETOM, RF_l2_subcounty_before_SMOTETOM = train_RF_and_bootstrap( data_before_subcounty_train_SMOTETOM, data_before_subcounty_test, predictorList)
output_df_RF_subcounty_during_SMOTETOM, AUC_RF_subcounty_during_SMOTETOM, RF_l2_subcounty_during_SMOTETOM = train_RF_and_bootstrap( data_during_subcounty_train_SMOTETOM, data_during_subcounty_test, predictorList)

In [None]:
# Write the test-set scores of every random-forest run to CSV.
# "before" results go to bld/single_case_before/, "during" results to
# bld/single_case/.  Each entry pairs a scores dict with its destination.
_RF_OUTPUT_FILES = [
    (output_df_RF_district_before, 'bld/single_case_before/output_df_RF_district_before.csv'),
    (output_df_RF_district_during, 'bld/single_case/output_df_RF_district_during.csv'),
    (output_df_RF_subcounty_before, 'bld/single_case_before/output_df_RF_subcounty_before.csv'),
    (output_df_RF_subcounty_during, 'bld/single_case/output_df_RF_subcounty_during.csv'),
    (output_df_RF_county_before, 'bld/single_case_before/output_df_RF_county_before.csv'),
    (output_df_RF_county_during, 'bld/single_case/output_df_RF_county_during.csv'),
    (output_df_RF_district_before_ADA, 'bld/single_case_before/output_df_RF_district_before_ADA.csv'),
    (output_df_RF_district_during_ADA, 'bld/single_case/output_df_RF_district_during_ADA.csv'),
    (output_df_RF_county_before_ADA, 'bld/single_case_before/output_df_RF_county_before_ADA.csv'),
    (output_df_RF_county_during_ADA, 'bld/single_case/output_df_RF_county_during_ADA.csv'),
    (output_df_RF_subcounty_before_ADA, 'bld/single_case_before/output_df_RF_subcounty_before_ADA.csv'),
    (output_df_RF_subcounty_during_ADA, 'bld/single_case/output_df_RF_subcounty_during_ADA.csv'),
    (output_df_RF_district_before_SMOTE, 'bld/single_case_before/output_df_RF_district_before_SMOTE.csv'),
    (output_df_RF_district_during_SMOTE, 'bld/single_case/output_df_RF_district_during_SMOTE.csv'),
    (output_df_RF_county_before_SMOTE, 'bld/single_case_before/output_df_RF_county_before_SMOTE.csv'),
    (output_df_RF_county_during_SMOTE, 'bld/single_case/output_df_RF_county_during_SMOTE.csv'),
    (output_df_RF_subcounty_before_SMOTE, 'bld/single_case_before/output_df_RF_subcounty_before_SMOTE.csv'),
    (output_df_RF_subcounty_during_SMOTE, 'bld/single_case/output_df_RF_subcounty_during_SMOTE.csv'),
    (output_df_RF_district_before_SMOTEENN, 'bld/single_case_before/output_df_RF_district_before_SMOTEENN.csv'),
    (output_df_RF_district_during_SMOTEENN, 'bld/single_case/output_df_RF_district_during_SMOTEENN.csv'),
    (output_df_RF_county_before_SMOTEENN, 'bld/single_case_before/output_df_RF_county_before_SMOTEENN.csv'),
    (output_df_RF_county_during_SMOTEENN, 'bld/single_case/output_df_RF_county_during_SMOTEENN.csv'),
    (output_df_RF_subcounty_before_SMOTEENN, 'bld/single_case_before/output_df_RF_subcounty_before_SMOTEENN.csv'),
    (output_df_RF_subcounty_during_SMOTEENN, 'bld/single_case/output_df_RF_subcounty_during_SMOTEENN.csv'),
    (output_df_RF_district_before_SMOTETOM, 'bld/single_case_before/output_df_RF_district_before_SMOTETOM.csv'),
    (output_df_RF_district_during_SMOTETOM, 'bld/single_case/output_df_RF_district_during_SMOTETOM.csv'),
    (output_df_RF_county_before_SMOTETOM, 'bld/single_case_before/output_df_RF_county_before_SMOTETOM.csv'),
    (output_df_RF_county_during_SMOTETOM, 'bld/single_case/output_df_RF_county_during_SMOTETOM.csv'),
    (output_df_RF_subcounty_before_SMOTETOM, 'bld/single_case_before/output_df_RF_subcounty_before_SMOTETOM.csv'),
    (output_df_RF_subcounty_during_SMOTETOM, 'bld/single_case/output_df_RF_subcounty_during_SMOTETOM.csv'),
]

for _rf_scores, _rf_csv_path in _RF_OUTPUT_FILES:
    pd.DataFrame(_rf_scores).to_csv(_rf_csv_path, index=False)

# Collect every fitted random-forest model under its variable name so the
# whole set can be serialised in one file.
RF_dict_models = {
    # models trained on the original training data
    'RF_l2_district_before': RF_l2_district_before,
    'RF_l2_district_during': RF_l2_district_during,
    'RF_l2_subcounty_before': RF_l2_subcounty_before,
    'RF_l2_subcounty_during': RF_l2_subcounty_during,
    'RF_l2_county_before': RF_l2_county_before,
    'RF_l2_county_during': RF_l2_county_during,
    # ADA
    'RF_l2_district_before_ADA': RF_l2_district_before_ADA,
    'RF_l2_district_during_ADA': RF_l2_district_during_ADA,
    'RF_l2_county_before_ADA': RF_l2_county_before_ADA,
    'RF_l2_county_during_ADA': RF_l2_county_during_ADA,
    'RF_l2_subcounty_before_ADA': RF_l2_subcounty_before_ADA,
    'RF_l2_subcounty_during_ADA': RF_l2_subcounty_during_ADA,
    # SMOTE
    'RF_l2_district_before_SMOTE': RF_l2_district_before_SMOTE,
    'RF_l2_district_during_SMOTE': RF_l2_district_during_SMOTE,
    'RF_l2_county_before_SMOTE': RF_l2_county_before_SMOTE,
    'RF_l2_county_during_SMOTE': RF_l2_county_during_SMOTE,
    'RF_l2_subcounty_before_SMOTE': RF_l2_subcounty_before_SMOTE,
    'RF_l2_subcounty_during_SMOTE': RF_l2_subcounty_during_SMOTE,
    # SMOTEENN
    'RF_l2_district_before_SMOTEENN': RF_l2_district_before_SMOTEENN,
    'RF_l2_district_during_SMOTEENN': RF_l2_district_during_SMOTEENN,
    'RF_l2_county_before_SMOTEENN': RF_l2_county_before_SMOTEENN,
    'RF_l2_county_during_SMOTEENN': RF_l2_county_during_SMOTEENN,
    'RF_l2_subcounty_before_SMOTEENN': RF_l2_subcounty_before_SMOTEENN,
    'RF_l2_subcounty_during_SMOTEENN': RF_l2_subcounty_during_SMOTEENN,
    # SMOTETOM
    'RF_l2_district_before_SMOTETOM': RF_l2_district_before_SMOTETOM,
    'RF_l2_district_during_SMOTETOM': RF_l2_district_during_SMOTETOM,
    'RF_l2_county_before_SMOTETOM': RF_l2_county_before_SMOTETOM,
    'RF_l2_county_during_SMOTETOM': RF_l2_county_during_SMOTETOM,
    'RF_l2_subcounty_before_SMOTETOM': RF_l2_subcounty_before_SMOTETOM,
    'RF_l2_subcounty_during_SMOTETOM': RF_l2_subcounty_during_SMOTETOM,
}

# Directory prefix (with trailing slash) that receives the pickle file.
output_dir = 'bld/'

# Serialise the complete model dictionary into a single pickle file.
_rf_models_path = f'{output_dir}RF_dict_models.pkl'
with open(_rf_models_path, 'wb') as f:
    pickle.dump(RF_dict_models, f)

# Test-set AUC of every random-forest model fitted on the "before" data;
# each entry becomes one column of a single-row CSV.
AUC_RF_dict_before = dict(
    AUC_RF_district_before=AUC_RF_district_before,
    AUC_RF_subcounty_before=AUC_RF_subcounty_before,
    AUC_RF_county_before=AUC_RF_county_before,
    AUC_RF_district_before_ADA=AUC_RF_district_before_ADA,
    AUC_RF_county_before_ADA=AUC_RF_county_before_ADA,
    AUC_RF_subcounty_before_ADA=AUC_RF_subcounty_before_ADA,
    AUC_RF_district_before_SMOTE=AUC_RF_district_before_SMOTE,
    AUC_RF_county_before_SMOTE=AUC_RF_county_before_SMOTE,
    AUC_RF_subcounty_before_SMOTE=AUC_RF_subcounty_before_SMOTE,
    AUC_RF_district_before_SMOTEENN=AUC_RF_district_before_SMOTEENN,
    AUC_RF_county_before_SMOTEENN=AUC_RF_county_before_SMOTEENN,
    AUC_RF_subcounty_before_SMOTEENN=AUC_RF_subcounty_before_SMOTEENN,
    AUC_RF_district_before_SMOTETOM=AUC_RF_district_before_SMOTETOM,
    AUC_RF_county_before_SMOTETOM=AUC_RF_county_before_SMOTETOM,
    AUC_RF_subcounty_before_SMOTETOM=AUC_RF_subcounty_before_SMOTETOM,
)

# index=[0] turns the dict of scalars into a one-row frame.
_auc_rf_before_frame = pd.DataFrame(AUC_RF_dict_before, index=[0])
_auc_rf_before_frame.to_csv('bld/single_case_before/AUC_RF_dict_before.csv', index=False)

# Test-set AUC of every random-forest model fitted on the "during" data;
# each entry becomes one column of a single-row CSV.
AUC_RF_dict_during = dict(
    AUC_RF_district_during=AUC_RF_district_during,
    AUC_RF_subcounty_during=AUC_RF_subcounty_during,
    AUC_RF_county_during=AUC_RF_county_during,
    AUC_RF_district_during_ADA=AUC_RF_district_during_ADA,
    AUC_RF_county_during_ADA=AUC_RF_county_during_ADA,
    AUC_RF_subcounty_during_ADA=AUC_RF_subcounty_during_ADA,
    AUC_RF_district_during_SMOTE=AUC_RF_district_during_SMOTE,
    AUC_RF_county_during_SMOTE=AUC_RF_county_during_SMOTE,
    AUC_RF_subcounty_during_SMOTE=AUC_RF_subcounty_during_SMOTE,
    AUC_RF_district_during_SMOTEENN=AUC_RF_district_during_SMOTEENN,
    AUC_RF_county_during_SMOTEENN=AUC_RF_county_during_SMOTEENN,
    AUC_RF_subcounty_during_SMOTEENN=AUC_RF_subcounty_during_SMOTEENN,
    AUC_RF_district_during_SMOTETOM=AUC_RF_district_during_SMOTETOM,
    AUC_RF_county_during_SMOTETOM=AUC_RF_county_during_SMOTETOM,
    AUC_RF_subcounty_during_SMOTETOM=AUC_RF_subcounty_during_SMOTETOM,
)

# index=[0] turns the dict of scalars into a one-row frame.
_auc_rf_during_frame = pd.DataFrame(AUC_RF_dict_during, index=[0])
_auc_rf_during_frame.to_csv('bld/single_case/AUC_RF_dict_during.csv', index=False)

### XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score, f1_score
from sklearn.utils import resample
import numpy as np
from scipy.stats import uniform, loguniform
from skopt import BayesSearchCV, space 

def train_XGB_and_bootstrap(train_data, test_data, predictor_list, scoring='roc_auc', random_state=None):
    '''
    Tune, fit and evaluate an XGBoost classifier for the 'FCSStaus' target.

    Steps:
      1. Randomized hyper-parameter search (100 candidates, 5-fold CV) on the
         training data with a comparatively fast learner
         (learning_rate=0.05, 800 trees).
      2. Refit on the full training data with the best parameters found and a
         slower, larger learner (learning_rate=0.01, 4000 trees).
      3. Predict positive-class probabilities for the test data and compute
         the ROC AUC.

    NOTE: despite the function name, no bootstrap resampling is done here.

    train_data: table of training data; must support column selection with
        ``train_data[predictor_list]`` and contain a 'FCSStaus' column whose
        ``value_counts()`` has entries for labels 0 and 1 (i.e. a pandas
        DataFrame with a binary target).
    test_data: table of testing data, indexed the same way as train_data
        (``test_data[predictor_list]`` and ``test_data['FCSStaus']``).
    predictor_list: list of feature column names.
    scoring: scoring metric handed to RandomizedSearchCV (default 'roc_auc').
    random_state: seed / RandomState forwarded to RandomizedSearchCV so the
        sampled candidates can be reproduced. The default (None) keeps the
        previous unseeded behaviour.

    Returns (output_df, AUC_XGB, model):
        output_df: dict with 'Prob' (predicted probability of class 1 on the
            test data) and 'Y' (the test labels).
        AUC_XGB: ROC AUC of 'Prob' against 'Y'.
        model: the final fitted XGBClassifier.
    '''
    # Search space. scipy's uniform(loc, scale) samples from [loc, loc + scale].
    search_space = dict(
        max_depth=range(3, 8),                         # 3..7
        min_child_weight=uniform(loc=1, scale=9),      # [1, 10]
        gamma=uniform(loc=0.5, scale=1.5),             # [0.5, 2.0]
        subsample=uniform(loc=0.6, scale=0.4),         # [0.6, 1.0]
        colsample_bytree=uniform(loc=0.4, scale=0.4),  # [0.4, 0.8]
        reg_lambda=uniform(loc=100, scale=1400),       # [100, 1500]
    )

    X_train = train_data[predictor_list]
    Y_train = train_data['FCSStaus']

    # Ratio of label-0 to label-1 rows, used as scale_pos_weight to offset
    # class imbalance. Count once instead of calling value_counts() twice.
    label_counts = Y_train.value_counts()
    class_weight = label_counts[0] / label_counts[1]

    # NOTE(review): `nthread` / `seed` look like legacy spellings of the
    # scikit-learn wrapper's `n_jobs` / `random_state` -- confirm against the
    # installed xgboost version before renaming.
    search = RandomizedSearchCV(
        estimator=XGBClassifier(learning_rate=0.05, n_estimators=800, scale_pos_weight=class_weight,
                                objective='binary:logistic', nthread=4, seed=27),
        param_distributions=search_space, scoring=scoring, n_jobs=-1, cv=5, n_iter=100,
        random_state=random_state)
    search.fit(X_train, Y_train)

    # Final model: same tuned parameters, smaller learning rate, more trees.
    final_model = XGBClassifier(learning_rate=0.01, n_estimators=4000,
                                **search.best_params_,
                                objective='binary:logistic', nthread=4, seed=527,
                                scale_pos_weight=class_weight)
    final_model.fit(X_train, Y_train)

    # Evaluate on the held-out test data.
    X_test = test_data[predictor_list]
    Y_test = test_data['FCSStaus']

    # Column 1 of predict_proba is the probability of the positive class.
    test_prob = final_model.predict_proba(X_test)[:, 1]
    AUC_XGB = roc_auc_score(Y_test, test_prob)
    output_df = {'Prob': test_prob, 'Y': Y_test}

    return output_df, AUC_XGB, final_model

In [None]:
# Fit one XGBoost model per (aggregation level, period, resampling scheme).
# Every run is scored on the matching original (non-resampled) test set.

# --- original training data ---
(output_df_XGB_district_before, AUC_XGB_district_before, XGB_l2_district_before) = train_XGB_and_bootstrap(
    data_before_district_train, data_before_district_test, predictorList)
(output_df_XGB_district_during, AUC_XGB_district_during, XGB_l2_district_during) = train_XGB_and_bootstrap(
    data_during_district_train, data_during_district_test, predictorList)
(output_df_XGB_subcounty_before, AUC_XGB_subcounty_before, XGB_l2_subcounty_before) = train_XGB_and_bootstrap(
    data_before_subcounty_train, data_before_subcounty_test, predictorList)
(output_df_XGB_subcounty_during, AUC_XGB_subcounty_during, XGB_l2_subcounty_during) = train_XGB_and_bootstrap(
    data_during_subcounty_train, data_during_subcounty_test, predictorList)
(output_df_XGB_county_before, AUC_XGB_county_before, XGB_l2_county_before) = train_XGB_and_bootstrap(
    data_before_county_train, data_before_county_test, predictorList)
(output_df_XGB_county_during, AUC_XGB_county_during, XGB_l2_county_during) = train_XGB_and_bootstrap(
    data_during_county_train, data_during_county_test, predictorList)

# --- ADA-resampled training data ---
(output_df_XGB_district_before_ADA, AUC_XGB_district_before_ADA, XGB_l2_district_before_ADA) = train_XGB_and_bootstrap(
    data_before_district_train_ADA, data_before_district_test, predictorList)
(output_df_XGB_district_during_ADA, AUC_XGB_district_during_ADA, XGB_l2_district_during_ADA) = train_XGB_and_bootstrap(
    data_during_district_train_ADA, data_during_district_test, predictorList)
(output_df_XGB_county_before_ADA, AUC_XGB_county_before_ADA, XGB_l2_county_before_ADA) = train_XGB_and_bootstrap(
    data_before_county_train_ADA, data_before_county_test, predictorList)
(output_df_XGB_county_during_ADA, AUC_XGB_county_during_ADA, XGB_l2_county_during_ADA) = train_XGB_and_bootstrap(
    data_during_county_train_ADA, data_during_county_test, predictorList)
(output_df_XGB_subcounty_before_ADA, AUC_XGB_subcounty_before_ADA, XGB_l2_subcounty_before_ADA) = train_XGB_and_bootstrap(
    data_before_subcounty_train_ADA, data_before_subcounty_test, predictorList)
(output_df_XGB_subcounty_during_ADA, AUC_XGB_subcounty_during_ADA, XGB_l2_subcounty_during_ADA) = train_XGB_and_bootstrap(
    data_during_subcounty_train_ADA, data_during_subcounty_test, predictorList)
# --- SMOTE-resampled training data ---
(output_df_XGB_district_before_SMOTE, AUC_XGB_district_before_SMOTE, XGB_l2_district_before_SMOTE) = train_XGB_and_bootstrap(
    data_before_district_train_SMOTE, data_before_district_test, predictorList)
(output_df_XGB_district_during_SMOTE, AUC_XGB_district_during_SMOTE, XGB_l2_district_during_SMOTE) = train_XGB_and_bootstrap(
    data_during_district_train_SMOTE, data_during_district_test, predictorList)
(output_df_XGB_county_before_SMOTE, AUC_XGB_county_before_SMOTE, XGB_l2_county_before_SMOTE) = train_XGB_and_bootstrap(
    data_before_county_train_SMOTE, data_before_county_test, predictorList)
(output_df_XGB_county_during_SMOTE, AUC_XGB_county_during_SMOTE, XGB_l2_county_during_SMOTE) = train_XGB_and_bootstrap(
    data_during_county_train_SMOTE, data_during_county_test, predictorList)
(output_df_XGB_subcounty_before_SMOTE, AUC_XGB_subcounty_before_SMOTE, XGB_l2_subcounty_before_SMOTE) = train_XGB_and_bootstrap(
    data_before_subcounty_train_SMOTE, data_before_subcounty_test, predictorList)
(output_df_XGB_subcounty_during_SMOTE, AUC_XGB_subcounty_during_SMOTE, XGB_l2_subcounty_during_SMOTE) = train_XGB_and_bootstrap(
    data_during_subcounty_train_SMOTE, data_during_subcounty_test, predictorList)
# --- SMOTEENN-resampled training data ---
(output_df_XGB_district_before_SMOTEENN, AUC_XGB_district_before_SMOTEENN, XGB_l2_district_before_SMOTEENN) = train_XGB_and_bootstrap(
    data_before_district_train_SMOTEENN, data_before_district_test, predictorList)
(output_df_XGB_district_during_SMOTEENN, AUC_XGB_district_during_SMOTEENN, XGB_l2_district_during_SMOTEENN) = train_XGB_and_bootstrap(
    data_during_district_train_SMOTEENN, data_during_district_test, predictorList)
(output_df_XGB_county_before_SMOTEENN, AUC_XGB_county_before_SMOTEENN, XGB_l2_county_before_SMOTEENN) = train_XGB_and_bootstrap(
    data_before_county_train_SMOTEENN, data_before_county_test, predictorList)
(output_df_XGB_county_during_SMOTEENN, AUC_XGB_county_during_SMOTEENN, XGB_l2_county_during_SMOTEENN) = train_XGB_and_bootstrap(
    data_during_county_train_SMOTEENN, data_during_county_test, predictorList)
(output_df_XGB_subcounty_before_SMOTEENN, AUC_XGB_subcounty_before_SMOTEENN, XGB_l2_subcounty_before_SMOTEENN) = train_XGB_and_bootstrap(
    data_before_subcounty_train_SMOTEENN, data_before_subcounty_test, predictorList)
(output_df_XGB_subcounty_during_SMOTEENN, AUC_XGB_subcounty_during_SMOTEENN, XGB_l2_subcounty_during_SMOTEENN) = train_XGB_and_bootstrap(
    data_during_subcounty_train_SMOTEENN, data_during_subcounty_test, predictorList)
# --- SMOTETOM-resampled training data ---
(output_df_XGB_district_before_SMOTETOM, AUC_XGB_district_before_SMOTETOM, XGB_l2_district_before_SMOTETOM) = train_XGB_and_bootstrap(
    data_before_district_train_SMOTETOM, data_before_district_test, predictorList)
(output_df_XGB_district_during_SMOTETOM, AUC_XGB_district_during_SMOTETOM, XGB_l2_district_during_SMOTETOM) = train_XGB_and_bootstrap(
    data_during_district_train_SMOTETOM, data_during_district_test, predictorList)
(output_df_XGB_county_before_SMOTETOM, AUC_XGB_county_before_SMOTETOM, XGB_l2_county_before_SMOTETOM) = train_XGB_and_bootstrap(
    data_before_county_train_SMOTETOM, data_before_county_test, predictorList)
(output_df_XGB_county_during_SMOTETOM, AUC_XGB_county_during_SMOTETOM, XGB_l2_county_during_SMOTETOM) = train_XGB_and_bootstrap(
    data_during_county_train_SMOTETOM, data_during_county_test, predictorList)
(output_df_XGB_subcounty_before_SMOTETOM, AUC_XGB_subcounty_before_SMOTETOM, XGB_l2_subcounty_before_SMOTETOM) = train_XGB_and_bootstrap(
    data_before_subcounty_train_SMOTETOM, data_before_subcounty_test, predictorList)
(output_df_XGB_subcounty_during_SMOTETOM, AUC_XGB_subcounty_during_SMOTETOM, XGB_l2_subcounty_during_SMOTETOM) = train_XGB_and_bootstrap(
    data_during_subcounty_train_SMOTETOM, data_during_subcounty_test, predictorList)

In [None]:
# XGBoost prediction tables to be written to disk, keyed by the CSV base
# name. Dict order is the order in which the files are written.
_xgb_csv_tables = {
    # original training data
    'output_df_XGB_district_before': output_df_XGB_district_before,
    'output_df_XGB_district_during': output_df_XGB_district_during,
    'output_df_XGB_subcounty_before': output_df_XGB_subcounty_before,
    'output_df_XGB_subcounty_during': output_df_XGB_subcounty_during,
    'output_df_XGB_county_before': output_df_XGB_county_before,
    'output_df_XGB_county_during': output_df_XGB_county_during,
    # ADA-resampled training data
    'output_df_XGB_district_before_ADA': output_df_XGB_district_before_ADA,
    'output_df_XGB_district_during_ADA': output_df_XGB_district_during_ADA,
    'output_df_XGB_county_before_ADA': output_df_XGB_county_before_ADA,
    'output_df_XGB_county_during_ADA': output_df_XGB_county_during_ADA,
    'output_df_XGB_subcounty_before_ADA': output_df_XGB_subcounty_before_ADA,
    'output_df_XGB_subcounty_during_ADA': output_df_XGB_subcounty_during_ADA,
    # SMOTE-resampled training data
    'output_df_XGB_district_before_SMOTE': output_df_XGB_district_before_SMOTE,
    'output_df_XGB_district_during_SMOTE': output_df_XGB_district_during_SMOTE,
    'output_df_XGB_county_before_SMOTE': output_df_XGB_county_before_SMOTE,
    'output_df_XGB_county_during_SMOTE': output_df_XGB_county_during_SMOTE,
    'output_df_XGB_subcounty_before_SMOTE': output_df_XGB_subcounty_before_SMOTE,
    'output_df_XGB_subcounty_during_SMOTE': output_df_XGB_subcounty_during_SMOTE,
    # SMOTEENN-resampled training data
    'output_df_XGB_district_before_SMOTEENN': output_df_XGB_district_before_SMOTEENN,
    'output_df_XGB_district_during_SMOTEENN': output_df_XGB_district_during_SMOTEENN,
    'output_df_XGB_county_before_SMOTEENN': output_df_XGB_county_before_SMOTEENN,
    'output_df_XGB_county_during_SMOTEENN': output_df_XGB_county_during_SMOTEENN,
    'output_df_XGB_subcounty_before_SMOTEENN': output_df_XGB_subcounty_before_SMOTEENN,
    'output_df_XGB_subcounty_during_SMOTEENN': output_df_XGB_subcounty_during_SMOTEENN,
    # SMOTETOM-resampled training data
    'output_df_XGB_district_before_SMOTETOM': output_df_XGB_district_before_SMOTETOM,
    'output_df_XGB_district_during_SMOTETOM': output_df_XGB_district_during_SMOTETOM,
    'output_df_XGB_county_before_SMOTETOM': output_df_XGB_county_before_SMOTETOM,
    'output_df_XGB_county_during_SMOTETOM': output_df_XGB_county_during_SMOTETOM,
    'output_df_XGB_subcounty_before_SMOTETOM': output_df_XGB_subcounty_before_SMOTETOM,
    'output_df_XGB_subcounty_during_SMOTETOM': output_df_XGB_subcounty_during_SMOTETOM,
}

# "before" tables go to bld/single_case_before/, "during" tables to
# bld/single_case/; the file name is the table name plus ".csv".
for _csv_name, _csv_table in _xgb_csv_tables.items():
    _csv_dir = 'single_case_before' if '_before' in _csv_name else 'single_case'
    pd.DataFrame(_csv_table).to_csv(f'bld/{_csv_dir}/{_csv_name}.csv', index=False)

# Collect every fitted XGBoost model under its variable name so the whole
# set can be serialised in one file.
XGB_dict_models = {
    # models trained on the original training data
    'XGB_l2_district_before': XGB_l2_district_before,
    'XGB_l2_district_during': XGB_l2_district_during,
    'XGB_l2_subcounty_before': XGB_l2_subcounty_before,
    'XGB_l2_subcounty_during': XGB_l2_subcounty_during,
    'XGB_l2_county_before': XGB_l2_county_before,
    'XGB_l2_county_during': XGB_l2_county_during,
    # ADA
    'XGB_l2_district_before_ADA': XGB_l2_district_before_ADA,
    'XGB_l2_district_during_ADA': XGB_l2_district_during_ADA,
    'XGB_l2_county_before_ADA': XGB_l2_county_before_ADA,
    'XGB_l2_county_during_ADA': XGB_l2_county_during_ADA,
    'XGB_l2_subcounty_before_ADA': XGB_l2_subcounty_before_ADA,
    'XGB_l2_subcounty_during_ADA': XGB_l2_subcounty_during_ADA,
    # SMOTE
    'XGB_l2_district_before_SMOTE': XGB_l2_district_before_SMOTE,
    'XGB_l2_district_during_SMOTE': XGB_l2_district_during_SMOTE,
    'XGB_l2_county_before_SMOTE': XGB_l2_county_before_SMOTE,
    'XGB_l2_county_during_SMOTE': XGB_l2_county_during_SMOTE,
    'XGB_l2_subcounty_before_SMOTE': XGB_l2_subcounty_before_SMOTE,
    'XGB_l2_subcounty_during_SMOTE': XGB_l2_subcounty_during_SMOTE,
    # SMOTEENN
    'XGB_l2_district_before_SMOTEENN': XGB_l2_district_before_SMOTEENN,
    'XGB_l2_district_during_SMOTEENN': XGB_l2_district_during_SMOTEENN,
    'XGB_l2_county_before_SMOTEENN': XGB_l2_county_before_SMOTEENN,
    'XGB_l2_county_during_SMOTEENN': XGB_l2_county_during_SMOTEENN,
    'XGB_l2_subcounty_before_SMOTEENN': XGB_l2_subcounty_before_SMOTEENN,
    'XGB_l2_subcounty_during_SMOTEENN': XGB_l2_subcounty_during_SMOTEENN,
    # SMOTETOM
    'XGB_l2_district_before_SMOTETOM': XGB_l2_district_before_SMOTETOM,
    'XGB_l2_district_during_SMOTETOM': XGB_l2_district_during_SMOTETOM,
    'XGB_l2_county_before_SMOTETOM': XGB_l2_county_before_SMOTETOM,
    'XGB_l2_county_during_SMOTETOM': XGB_l2_county_during_SMOTETOM,
    'XGB_l2_subcounty_before_SMOTETOM': XGB_l2_subcounty_before_SMOTETOM,
    'XGB_l2_subcounty_during_SMOTETOM': XGB_l2_subcounty_during_SMOTETOM,
}

# Directory prefix (with trailing slash) that receives the pickle file.
output_dir = 'bld/'

# Serialise the complete model dictionary into a single pickle file.
_xgb_models_path = f'{output_dir}XGB_dict_models.pkl'
with open(_xgb_models_path, 'wb') as f:
    pickle.dump(XGB_dict_models, f)

# Test-set AUC of every XGBoost model fitted on the "before" data; each
# entry becomes one column of a single-row CSV.
AUC_XGB_dict_before = dict(
    AUC_XGB_district_before=AUC_XGB_district_before,
    AUC_XGB_subcounty_before=AUC_XGB_subcounty_before,
    AUC_XGB_county_before=AUC_XGB_county_before,
    AUC_XGB_district_before_ADA=AUC_XGB_district_before_ADA,
    AUC_XGB_county_before_ADA=AUC_XGB_county_before_ADA,
    AUC_XGB_subcounty_before_ADA=AUC_XGB_subcounty_before_ADA,
    AUC_XGB_district_before_SMOTE=AUC_XGB_district_before_SMOTE,
    AUC_XGB_county_before_SMOTE=AUC_XGB_county_before_SMOTE,
    AUC_XGB_subcounty_before_SMOTE=AUC_XGB_subcounty_before_SMOTE,
    AUC_XGB_district_before_SMOTEENN=AUC_XGB_district_before_SMOTEENN,
    AUC_XGB_county_before_SMOTEENN=AUC_XGB_county_before_SMOTEENN,
    AUC_XGB_subcounty_before_SMOTEENN=AUC_XGB_subcounty_before_SMOTEENN,
    AUC_XGB_district_before_SMOTETOM=AUC_XGB_district_before_SMOTETOM,
    AUC_XGB_county_before_SMOTETOM=AUC_XGB_county_before_SMOTETOM,
    AUC_XGB_subcounty_before_SMOTETOM=AUC_XGB_subcounty_before_SMOTETOM,
)

# index=[0] turns the dict of scalars into a one-row frame.
_auc_xgb_before_frame = pd.DataFrame(AUC_XGB_dict_before, index=[0])
_auc_xgb_before_frame.to_csv('bld/single_case_before/AUC_XGB_dict_before.csv', index=False)

# Test-set AUC of every XGBoost model fitted on the "during" data; each
# entry becomes one column of a single-row CSV.
AUC_XGB_dict_during = dict(
    AUC_XGB_district_during=AUC_XGB_district_during,
    AUC_XGB_subcounty_during=AUC_XGB_subcounty_during,
    AUC_XGB_county_during=AUC_XGB_county_during,
    AUC_XGB_district_during_ADA=AUC_XGB_district_during_ADA,
    AUC_XGB_county_during_ADA=AUC_XGB_county_during_ADA,
    AUC_XGB_subcounty_during_ADA=AUC_XGB_subcounty_during_ADA,
    AUC_XGB_district_during_SMOTE=AUC_XGB_district_during_SMOTE,
    AUC_XGB_county_during_SMOTE=AUC_XGB_county_during_SMOTE,
    AUC_XGB_subcounty_during_SMOTE=AUC_XGB_subcounty_during_SMOTE,
    AUC_XGB_district_during_SMOTEENN=AUC_XGB_district_during_SMOTEENN,
    AUC_XGB_county_during_SMOTEENN=AUC_XGB_county_during_SMOTEENN,
    AUC_XGB_subcounty_during_SMOTEENN=AUC_XGB_subcounty_during_SMOTEENN,
    AUC_XGB_district_during_SMOTETOM=AUC_XGB_district_during_SMOTETOM,
    AUC_XGB_county_during_SMOTETOM=AUC_XGB_county_during_SMOTETOM,
    AUC_XGB_subcounty_during_SMOTETOM=AUC_XGB_subcounty_during_SMOTETOM,
)

# index=[0] turns the dict of scalars into a one-row frame.
_auc_xgb_during_frame = pd.DataFrame(AUC_XGB_dict_during, index=[0])
_auc_xgb_during_frame.to_csv('bld/single_case/AUC_XGB_dict_during.csv', index=False)