In [7]:
import os
import pickle
# Names of the feature columns handed to the training functions as
# `predictor_list`. They are used to index the train/test dataframes directly,
# so each string has to match a column name exactly (including the dotted
# names and the 'FCSStaus' spelling).
predictorList = [
    'FCSStaus_lag',
    'NL_District_log',
    'FoodInsecureMonthly_lag',
    'precipitationMean',
    'NDVI.Anomaly.Mean',
    'Average..mm.',
    'X1.Month.Anomaly....',
    'X3.Months.Anomaly....',
    'fatalitiesMean_log',
    'temperatureMean_log',
    'NDVIMean_log',
    'MaleRatio',
    'ShareToilet_Yes',
    'WaterSource_feq',
    'IncomeStab_Somewhat stable',
    'IncomeStab_Very unstable',
    'SubjectivePoverty_Neither poor nor rich',
    'SubjectivePoverty_Poor',
    'SubjectivePoverty_Very poor',
    'RelLivStandard_Better off',
    'RelLivStandard_Same',
    'RelLivStandard_Worse off',
    'FamilySize',
    'SelfStapleTypes',
    'valueNow_Furniture',
    'valueNow_Furniture_new',
    'ValueAgoTotal_new',
]

## Here we load the previously generated original data for analysis

In [8]:
# Pickled dataset bundles to load. The backslash spelling is kept verbatim
# because these exact strings become the keys of `data_dict`, and other cells
# look the datasets up by the same spelling.
file_paths = [
    r"bld\datasets\generated\data_before_county.pkl",
    r"bld\datasets\generated\data_before_district.pkl",
    r"bld\datasets\generated\data_before_subcounty.pkl",
    r"bld\datasets\generated\data_during_county.pkl",
    r"bld\datasets\generated\data_during_district.pkl",
    r"bld\datasets\generated\data_during_subcounty.pkl"
]

data_dict = {}

for path in file_paths:
    # A backslash is a path separator only on Windows; elsewhere the raw string
    # would be read as a single file name. Forward slashes are accepted on both,
    # so normalise for the filesystem call and keep the original string as key.
    # NOTE(security): pickle.load can execute arbitrary code -- only load files
    # that this project produced itself.
    with open(path.replace('\\', '/'), 'rb') as file:
        data_dict[path] = pickle.load(file)

In [9]:
# Give each loaded bundle a short name. The keys are the path strings that were
# used when filling `data_dict`, so they must be spelled identically here.
(
    data_before_county,
    data_before_district,
    data_before_subcounty,
    data_during_county,
    data_during_district,
    data_during_subcounty,
) = (
    data_dict['bld\\datasets\\generated\\data_before_county.pkl'],
    data_dict['bld\\datasets\\generated\\data_before_district.pkl'],
    data_dict['bld\\datasets\\generated\\data_before_subcounty.pkl'],
    data_dict['bld\\datasets\\generated\\data_during_county.pkl'],
    data_dict['bld\\datasets\\generated\\data_during_district.pkl'],
    data_dict['bld\\datasets\\generated\\data_during_subcounty.pkl'],
)

In [10]:
# Pull the 'train_10' entry out of each training variant stored in the
# district-level bundle; the generator yields them in the order of the names
# on the left-hand side.
(
    data_during_district_std,
    data_during_district_ada,
    data_during_district_smote,
    data_during_district_smoteenn,
    data_during_district_smotetomek,
) = (
    data_during_district[variant_key]['train_10']
    for variant_key in (
        'StdTrain_district',
        'ADASYN_Train_district',
        'SMOTE_Train_district',
        'SMOTEENN_Train_district',
        'SMOTETOM_Train_district',
    )
)

# Single test set shared by every training variant above.
data_during_district_test = data_during_district['StdTest_district']['test_10']

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score, f1_score
from sklearn.utils import resample
import numpy as np
import pandas as pd
from scipy.stats import uniform, loguniform
from skopt import BayesSearchCV, space 

def train_LR_and_bootstrap(train_data, test_data, predictor_list, scoring='roc_auc'):
    '''
    Tune and fit a class-weighted, L1-penalised logistic regression, then score it on test data.

    train_data: dataframe of training data, this case should be ...Train_...['train_10'];
        needs the `predictor_list` columns and the 'FCSStaus' target column
    test_data: testing data indexed the same way (`predictor_list` columns plus 'FCSStaus')
    predictor_list: list of feature column names
    scoring: metric passed to RandomizedSearchCV when choosing C

    Returns (output_df, AUC, model): a dict holding the second column of
    predict_proba under 'Prob' and the test labels under 'Y', the ROC AUC on the
    test data, and the fitted LogisticRegression. Despite the function name, no
    bootstrapping happens here.
    '''
    target_col = 'FCSStaus'
    features_train = train_data[predictor_list]
    labels_train = train_data[target_col]

    # Settings shared by the search estimator and the final model.
    lr_settings = dict(penalty='l1', random_state=527, solver='liblinear',
                       max_iter=1000, class_weight='balanced')

    # Randomised search over C, drawn log-uniformly from [0.001, 1000].
    c_search = RandomizedSearchCV(
        LogisticRegression(**lr_settings),
        {'C': loguniform(0.001, 1000)},
        cv=5,
        scoring=scoring,
        random_state=527,
    )
    c_search.fit(features_train, labels_train)

    # Fit a fresh model on the full training data with the selected C.
    # NOTE(review): RandomizedSearchCV refits by default, so this second fit is
    # probably redundant with c_search.best_estimator_ -- confirm before removing.
    fitted_lr = LogisticRegression(**lr_settings, **c_search.best_params_)
    fitted_lr.fit(features_train, labels_train)

    # Score the fitted model on the test data.
    features_test = test_data[predictor_list]
    labels_test = test_data[target_col]
    prob_test = fitted_lr.predict_proba(features_test)[:, 1]
    output_df = {'Prob': prob_test, 'Y': labels_test}

    return output_df, roc_auc_score(labels_test, prob_test), fitted_lr

def train_LR_and_bootstrap_noWe(train_data, test_data, predictor_list, scoring='roc_auc'):
    '''
    Tune and fit an L1-penalised logistic regression WITHOUT class weights, then score it on test data.

    train_data: dataframe of training data, this case should be ...Train_...['train_10'];
        needs the `predictor_list` columns and the 'FCSStaus' target column
    test_data: testing data indexed the same way (`predictor_list` columns plus 'FCSStaus')
    predictor_list: list of feature column names
    scoring: metric passed to RandomizedSearchCV when choosing C

    Returns (output_df, AUC, model): a dict holding the second column of
    predict_proba under 'Prob' and the test labels under 'Y', the ROC AUC on the
    test data, and the fitted LogisticRegression. Despite the function name, no
    bootstrapping happens here.
    '''
    target_col = 'FCSStaus'
    features_train = train_data[predictor_list]
    labels_train = train_data[target_col]

    # Settings shared by the search estimator and the final model
    # (no class_weight: every sample counts equally).
    lr_settings = dict(penalty='l1', random_state=527, solver='liblinear', max_iter=1000)

    # Randomised search over C, drawn log-uniformly from [0.001, 1000].
    c_search = RandomizedSearchCV(
        LogisticRegression(**lr_settings),
        {'C': loguniform(0.001, 1000)},
        cv=5,
        scoring=scoring,
        random_state=527,
    )
    c_search.fit(features_train, labels_train)

    # Fit a fresh model on the full training data with the selected C.
    fitted_lr = LogisticRegression(**lr_settings, **c_search.best_params_)
    fitted_lr.fit(features_train, labels_train)

    # Score the fitted model on the test data.
    features_test = test_data[predictor_list]
    labels_test = test_data[target_col]
    prob_test = fitted_lr.predict_proba(features_test)[:, 1]
    output_df = {'Prob': prob_test, 'Y': labels_test}

    return output_df, roc_auc_score(labels_test, prob_test), fitted_lr

In [12]:
# Logistic regression on every training variant, always scored on the same
# district test set. The last call uses the variant without class weights.
output_df_LR_district, AUC_LR_district, LR_l2_district = train_LR_and_bootstrap(
    train_data=data_during_district_std,
    test_data=data_during_district_test,
    predictor_list=predictorList,
)
output_df_LR_district_ada, AUC_LR_district_ada, LR_l2_district_ada = train_LR_and_bootstrap(
    train_data=data_during_district_ada,
    test_data=data_during_district_test,
    predictor_list=predictorList,
)
output_df_LR_district_smote, AUC_LR_district_smote, LR_l2_district_smote = train_LR_and_bootstrap(
    train_data=data_during_district_smote,
    test_data=data_during_district_test,
    predictor_list=predictorList,
)
output_df_LR_district_smoteenn, AUC_LR_district_smoteenn, LR_l2_district_smoteenn = train_LR_and_bootstrap(
    train_data=data_during_district_smoteenn,
    test_data=data_during_district_test,
    predictor_list=predictorList,
)
output_df_LR_district_smotetomek, AUC_LR_district_smotetomek, LR_l2_district_smotetomek = train_LR_and_bootstrap(
    train_data=data_during_district_smotetomek,
    test_data=data_during_district_test,
    predictor_list=predictorList,
)
output_df_LR_district_noW, AUC_LR_district_noW, LR_l2_district_noW = train_LR_and_bootstrap_noWe(
    train_data=data_during_district_std,
    test_data=data_during_district_test,
    predictor_list=predictorList,
)

In [13]:
# Create the output folder if it is missing, so that a missing directory does
# not abort the run after the models have already been trained.
os.makedirs('bld/select_feature', exist_ok=True)

# One CSV per logistic-regression variant, holding the 'Prob' and 'Y' entries
# returned by the training function.
pd.DataFrame(output_df_LR_district).to_csv('bld/select_feature/output_df_LR_district.csv', index=False)
pd.DataFrame(output_df_LR_district_ada).to_csv('bld/select_feature/output_df_LR_district_ada.csv', index=False)
pd.DataFrame(output_df_LR_district_smote).to_csv('bld/select_feature/output_df_LR_district_smote.csv', index=False)
pd.DataFrame(output_df_LR_district_smoteenn).to_csv('bld/select_feature/output_df_LR_district_smoteenn.csv', index=False)
pd.DataFrame(output_df_LR_district_smotetomek).to_csv('bld/select_feature/output_df_LR_district_smotetomek.csv', index=False)
pd.DataFrame(output_df_LR_district_noW).to_csv('bld/select_feature/output_df_LR_district_noW.csv', index=False)

In [14]:
# Fitted logistic-regression models keyed by variant name.
# NOTE: despite the `_noW` suffix of the variable, this holds all six variants.
LR_dict_models_noW = dict([
    ('LR_l2_district', LR_l2_district),
    ('LR_l2_district_ada', LR_l2_district_ada),
    ('LR_l2_district_smote', LR_l2_district_smote),
    ('LR_l2_district_smoteenn', LR_l2_district_smoteenn),
    ('LR_l2_district_smotetomek', LR_l2_district_smotetomek),
    ('LR_l2_district_noW', LR_l2_district_noW),
])

# Folder that receives the pickled models
output_dir = 'bld/select_feature/'

# Write the whole dictionary into a single pickle file
with open(f'{output_dir}LR_dict_models.pkl', 'wb') as f:
    pickle.dump(LR_dict_models_noW, f)

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score, f1_score
from sklearn.utils import resample
import numpy as np
from scipy.stats import uniform, loguniform
from skopt import BayesSearchCV, space 

def train_RF_and_bootstrap(train_data, test_data, predictor_list, scoring='roc_auc'):
    '''
    Tune and fit a class-weighted random forest, then score it on test data.

    train_data: dataframe of training data, this case should be ...Train_...['train_10'];
        needs the `predictor_list` columns and the 'FCSStaus' target column
    test_data: testing data indexed the same way (`predictor_list` columns plus 'FCSStaus')
    predictor_list: list of feature column names
    scoring: metric passed to RandomizedSearchCV when choosing hyper-parameters

    Returns (output_df, AUC_RF, RF_best): a dict holding the second column of
    predict_proba under 'Prob' and the test labels under 'Y', the ROC AUC on the
    test data, and the fitted RandomForestClassifier. Despite the function name,
    no bootstrapping happens here.
    '''
    X_train = train_data[predictor_list]
    Y_train = train_data['FCSStaus']

    # 8 * 5 * 4 = 160 candidate combinations; the search samples 100 of them.
    param_grid = dict(max_depth = range(3, 11),
                    max_features = range(3, 8),
                    min_samples_leaf = range(50, 201, 50))

    RF = RandomForestClassifier(random_state=527, class_weight='balanced')
    # Seed the candidate sampling as well (same seed as the estimator), otherwise
    # a different subset of the grid is tried on every run and the selected
    # hyper-parameters are not reproducible.
    crossval = RandomizedSearchCV(RF, param_grid, cv=5, scoring=scoring, n_iter=100, random_state=527)
    crossval.fit(X_train, Y_train)

    # The search runs with the default number of trees; the final model is
    # refit on the full training data with 800 trees and the selected settings.
    RF_best = RandomForestClassifier(n_estimators=800, random_state=527, **crossval.best_params_, class_weight='balanced')
    RF_best.fit(X_train, Y_train)

    # Score the fitted model on the test data.
    X_test = test_data[predictor_list]
    Y_test = test_data['FCSStaus']

    y_RFpredprob_test = RF_best.predict_proba(X_test)[:, 1]
    AUC_RF = roc_auc_score(Y_test, y_RFpredprob_test)
    output_df = {'Prob': y_RFpredprob_test, 'Y': Y_test}

    return output_df, AUC_RF, RF_best


def train_RF_and_bootstrap_noWe(train_data, test_data, predictor_list, scoring='roc_auc'):
    '''
    Tune and fit a random forest WITHOUT class weights, then score it on test data.

    train_data: dataframe of training data, this case should be ...Train_...['train_10'];
        needs the `predictor_list` columns and the 'FCSStaus' target column
    test_data: testing data indexed the same way (`predictor_list` columns plus 'FCSStaus')
    predictor_list: list of feature column names
    scoring: metric passed to RandomizedSearchCV when choosing hyper-parameters

    Returns (output_df, AUC_RF, RF_best): a dict holding the second column of
    predict_proba under 'Prob' and the test labels under 'Y', the ROC AUC on the
    test data, and the fitted RandomForestClassifier. Despite the function name,
    no bootstrapping happens here.
    '''
    X_train = train_data[predictor_list]
    Y_train = train_data['FCSStaus']

    # 8 * 5 * 4 = 160 candidate combinations; the search samples 100 of them.
    param_grid = dict(max_depth = range(3, 11),
                    max_features = range(3, 8),
                    min_samples_leaf = range(50, 201, 50))

    RF = RandomForestClassifier(random_state=527)
    # Seed the candidate sampling as well (same seed as the estimator), otherwise
    # a different subset of the grid is tried on every run and the selected
    # hyper-parameters are not reproducible.
    crossval = RandomizedSearchCV(RF, param_grid, cv=5, scoring=scoring, n_iter=100, random_state=527)
    crossval.fit(X_train, Y_train)

    # The search runs with the default number of trees; the final model is
    # refit on the full training data with 800 trees and the selected settings.
    RF_best = RandomForestClassifier(n_estimators=800, random_state=527, **crossval.best_params_)
    RF_best.fit(X_train, Y_train)

    # Score the fitted model on the test data.
    X_test = test_data[predictor_list]
    Y_test = test_data['FCSStaus']

    y_RFpredprob_test = RF_best.predict_proba(X_test)[:, 1]
    AUC_RF = roc_auc_score(Y_test, y_RFpredprob_test)
    output_df = {'Prob': y_RFpredprob_test, 'Y': Y_test}

    return output_df, AUC_RF, RF_best

In [16]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score, f1_score
from sklearn.utils import resample
import numpy as np
from scipy.stats import uniform, loguniform
from skopt import BayesSearchCV, space 

def train_XGB_and_bootstrap(train_data, test_data, predictor_list, scoring='roc_auc'):
    '''
    Tune and fit an XGBoost classifier with `scale_pos_weight`, then score it on test data.

    train_data: dataframe of training data, this case should be ...Train_...['train_10'];
        needs the `predictor_list` columns and the 'FCSStaus' target column
    test_data: testing data indexed the same way (`predictor_list` columns plus 'FCSStaus')
    predictor_list: list of feature column names
    scoring: metric passed to RandomizedSearchCV when choosing hyper-parameters

    Returns (output_df, AUC_XGB, xgb4): a dict holding the second column of
    predict_proba under 'Prob' and the test labels under 'Y', the ROC AUC on the
    test data, and the fitted XGBClassifier. Despite the function name, no
    bootstrapping happens here.
    '''
    # scipy's uniform(loc, scale) draws from [loc, loc + scale].
    param_test6 = dict(max_depth = range(3, 8), min_child_weight = uniform(loc=1, scale=9),
                          gamma = uniform(loc=0.5, scale=1.5), subsample = uniform(loc=0.6, scale=0.4),
                            colsample_bytree = uniform(loc=0.4, scale=0.4), reg_lambda = uniform(loc=100, scale=1400))

    X_train = train_data[predictor_list]
    Y_train = train_data['FCSStaus']
    # Ratio of the count stored under key 0 to the count stored under key 1
    # (presumably negatives / positives -- assumes 0/1 labels, confirm upstream).
    label_counts = Y_train.value_counts()
    class_weight = label_counts[0] / label_counts[1]
    # random_state seeds the draws from the distributions above; without it
    # every run explores different candidates and the result is not reproducible.
    gsearch6 = RandomizedSearchCV(estimator=XGBClassifier(learning_rate=0.05, n_estimators=800,
                                                    objective='binary:logistic', scale_pos_weight=class_weight, nthread=4,seed=27),
                            param_distributions = param_test6, scoring=scoring, n_jobs=-1, cv=5, n_iter=100,
                            random_state=527)

    gsearch6.fit(X_train, Y_train)

    # Final model: selected hyper-parameters, but a smaller learning rate and
    # more trees than were used during the search.
    xgb4 = XGBClassifier(learning_rate=0.01, n_estimators=4000,
                         **gsearch6.best_params_, scale_pos_weight=class_weight,
                         objective='binary:logistic', nthread=4,seed=527)
    xgb4.fit(X_train, Y_train)

    # Score the fitted model on the test data.
    X_test = test_data[predictor_list]
    Y_test = test_data['FCSStaus']

    y_XGBpredprob_test = xgb4.predict_proba(X_test)[:, 1]
    AUC_XGB = roc_auc_score(Y_test, y_XGBpredprob_test)
    output_df = {'Prob': y_XGBpredprob_test, 'Y': Y_test}

    return output_df, AUC_XGB, xgb4
  
  
def train_XGB_and_bootstrap_noWe(train_data, test_data, predictor_list, scoring='roc_auc'):
    '''
    Tune and fit an XGBoost classifier WITHOUT `scale_pos_weight`, then score it on test data.

    train_data: dataframe of training data, this case should be ...Train_...['train_10'];
        needs the `predictor_list` columns and the 'FCSStaus' target column
    test_data: testing data indexed the same way (`predictor_list` columns plus 'FCSStaus')
    predictor_list: list of feature column names
    scoring: metric passed to RandomizedSearchCV when choosing hyper-parameters

    Returns (output_df, AUC_XGB, xgb4): a dict holding the second column of
    predict_proba under 'Prob' and the test labels under 'Y', the ROC AUC on the
    test data, and the fitted XGBClassifier. Despite the function name, no
    bootstrapping happens here.
    '''
    # scipy's uniform(loc, scale) draws from [loc, loc + scale].
    param_test6 = dict(max_depth = range(3, 8), min_child_weight = uniform(loc=1, scale=9),
                          gamma = uniform(loc=0.5, scale=1.5), subsample = uniform(loc=0.6, scale=0.4),
                            colsample_bytree = uniform(loc=0.4, scale=0.4), reg_lambda = uniform(loc=100, scale=1400))

    X_train = train_data[predictor_list]
    Y_train = train_data['FCSStaus']
    # random_state seeds the draws from the distributions above; without it
    # every run explores different candidates and the result is not reproducible.
    gsearch6 = RandomizedSearchCV(estimator=XGBClassifier(learning_rate=0.05, n_estimators=800,
                                                    objective='binary:logistic', nthread=4,seed=27),
                            param_distributions = param_test6, scoring=scoring, n_jobs=-1, cv=5, n_iter=100,
                            random_state=527)

    gsearch6.fit(X_train, Y_train)

    # Final model: selected hyper-parameters, but a smaller learning rate and
    # more trees than were used during the search.
    xgb4 = XGBClassifier(learning_rate=0.01, n_estimators=4000,
                         **gsearch6.best_params_,
                         objective='binary:logistic', nthread=4,seed=527)
    xgb4.fit(X_train, Y_train)

    # Score the fitted model on the test data.
    X_test = test_data[predictor_list]
    Y_test = test_data['FCSStaus']

    y_XGBpredprob_test = xgb4.predict_proba(X_test)[:, 1]
    AUC_XGB = roc_auc_score(Y_test, y_XGBpredprob_test)
    output_df = {'Prob': y_XGBpredprob_test, 'Y': Y_test}

    return output_df, AUC_XGB, xgb4

In [17]:
# Random forest on every training variant, always scored on the same district
# test set. The last call uses the variant without class weights.
output_df_RF_district, AUC_RF_district, RF_district = train_RF_and_bootstrap(
    train_data=data_during_district_std,
    test_data=data_during_district_test,
    predictor_list=predictorList,
)
output_df_RF_district_adasyn, AUC_RF_district_adasyn, RF_district_adasyn = train_RF_and_bootstrap(
    train_data=data_during_district_ada,
    test_data=data_during_district_test,
    predictor_list=predictorList,
)
output_df_RF_district_smote, AUC_RF_district_smote, RF_district_smote = train_RF_and_bootstrap(
    train_data=data_during_district_smote,
    test_data=data_during_district_test,
    predictor_list=predictorList,
)
output_df_RF_district_smoteenn, AUC_RF_district_smoteenn, RF_district_smoteenn = train_RF_and_bootstrap(
    train_data=data_during_district_smoteenn,
    test_data=data_during_district_test,
    predictor_list=predictorList,
)
output_df_RF_district_smotetomek, AUC_RF_district_smotetomek, RF_district_smotetomek = train_RF_and_bootstrap(
    train_data=data_during_district_smotetomek,
    test_data=data_during_district_test,
    predictor_list=predictorList,
)
output_df_RF_district_noW, AUC_RF_district_noW, RF_district_noW = train_RF_and_bootstrap_noWe(
    train_data=data_during_district_std,
    test_data=data_during_district_test,
    predictor_list=predictorList,
)

In [18]:
# One CSV per random-forest variant, holding the 'Prob' and 'Y' entries
# returned by the training function.
pd.DataFrame(data=output_df_RF_district).to_csv(
    path_or_buf='bld/select_feature/output_df_RF_district.csv', index=False)
pd.DataFrame(data=output_df_RF_district_adasyn).to_csv(
    path_or_buf='bld/select_feature/output_df_RF_district_adasyn.csv', index=False)
pd.DataFrame(data=output_df_RF_district_smote).to_csv(
    path_or_buf='bld/select_feature/output_df_RF_district_smote.csv', index=False)
pd.DataFrame(data=output_df_RF_district_smoteenn).to_csv(
    path_or_buf='bld/select_feature/output_df_RF_district_smoteenn.csv', index=False)
pd.DataFrame(data=output_df_RF_district_smotetomek).to_csv(
    path_or_buf='bld/select_feature/output_df_RF_district_smotetomek.csv', index=False)
pd.DataFrame(data=output_df_RF_district_noW).to_csv(
    path_or_buf='bld/select_feature/output_df_RF_district_noW.csv', index=False)


In [19]:
# Fitted random-forest models keyed by variant name.
# NOTE: despite the `_noW` suffix of the variable, this holds all six variants.
RF_dict_models_noW = dict([
    ('RF_district', RF_district),
    ('RF_district_adasyn', RF_district_adasyn),
    ('RF_district_smote', RF_district_smote),
    ('RF_district_smoteenn', RF_district_smoteenn),
    ('RF_district_smotetomek', RF_district_smotetomek),
    ('RF_district_noW', RF_district_noW),
])

# Folder that receives the pickled models
output_dir = 'bld/select_feature/'

# Write the whole dictionary into a single pickle file
with open(f'{output_dir}RF_dict_models.pkl', 'wb') as f:
    pickle.dump(RF_dict_models_noW, f)

In [20]:
# XGBoost on every training variant, always scored on the same district test
# set. The last call uses the variant without scale_pos_weight.
output_df_XGB_district, AUC_XGB_district, XGB_district = train_XGB_and_bootstrap(
    train_data=data_during_district_std,
    test_data=data_during_district_test,
    predictor_list=predictorList,
)
output_df_XGB_district_adasyn, AUC_XGB_district_adasyn, XGB_district_adasyn = train_XGB_and_bootstrap(
    train_data=data_during_district_ada,
    test_data=data_during_district_test,
    predictor_list=predictorList,
)
output_df_XGB_district_smote, AUC_XGB_district_smote, XGB_district_smote = train_XGB_and_bootstrap(
    train_data=data_during_district_smote,
    test_data=data_during_district_test,
    predictor_list=predictorList,
)
output_df_XGB_district_smoteenn, AUC_XGB_district_smoteenn, XGB_district_smoteenn = train_XGB_and_bootstrap(
    train_data=data_during_district_smoteenn,
    test_data=data_during_district_test,
    predictor_list=predictorList,
)
output_df_XGB_district_smotetomek, AUC_XGB_district_smotetomek, XGB_district_smotetomek = train_XGB_and_bootstrap(
    train_data=data_during_district_smotetomek,
    test_data=data_during_district_test,
    predictor_list=predictorList,
)
output_df_XGB_district_noW, AUC_XGB_district_noW, XGB_district_noW = train_XGB_and_bootstrap_noWe(
    train_data=data_during_district_std,
    test_data=data_during_district_test,
    predictor_list=predictorList,
)

In [21]:
# One CSV per XGBoost variant, holding the 'Prob' and 'Y' entries returned by
# the training function.
pd.DataFrame(data=output_df_XGB_district).to_csv(
    path_or_buf='bld/select_feature/output_df_XGB_district.csv', index=False)
pd.DataFrame(data=output_df_XGB_district_adasyn).to_csv(
    path_or_buf='bld/select_feature/output_df_XGB_district_adasyn.csv', index=False)
pd.DataFrame(data=output_df_XGB_district_smote).to_csv(
    path_or_buf='bld/select_feature/output_df_XGB_district_smote.csv', index=False)
pd.DataFrame(data=output_df_XGB_district_smoteenn).to_csv(
    path_or_buf='bld/select_feature/output_df_XGB_district_smoteenn.csv', index=False)
pd.DataFrame(data=output_df_XGB_district_smotetomek).to_csv(
    path_or_buf='bld/select_feature/output_df_XGB_district_smotetomek.csv', index=False)
pd.DataFrame(data=output_df_XGB_district_noW).to_csv(
    path_or_buf='bld/select_feature/output_df_XGB_district_noW.csv', index=False)

In [22]:
# Fitted XGBoost models keyed by variant name.
XGB_dict_models = dict([
    ('XGB_district', XGB_district),
    ('XGB_district_adasyn', XGB_district_adasyn),
    ('XGB_district_smote', XGB_district_smote),
    ('XGB_district_smoteenn', XGB_district_smoteenn),
    ('XGB_district_smotetomek', XGB_district_smotetomek),
    ('XGB_district_noW', XGB_district_noW),
])

# Folder that receives the pickled models
output_dir = 'bld/select_feature/'

# Write the whole dictionary into a single pickle file
with open(f'{output_dir}XGB_dict_models.pkl', 'wb') as f:
    pickle.dump(XGB_dict_models, f)

In [23]:
# Test-set ROC AUC of every logistic-regression variant, keyed by name.
AUC_LR_dict = dict(zip(
    ('AUC_LR_district', 'AUC_LR_district_ada', 'AUC_LR_district_smote',
     'AUC_LR_district_smoteenn', 'AUC_LR_district_smotetomek', 'AUC_LR_district_noW'),
    (AUC_LR_district, AUC_LR_district_ada, AUC_LR_district_smote,
     AUC_LR_district_smoteenn, AUC_LR_district_smotetomek, AUC_LR_district_noW),
))

# Same for the random-forest variants.
AUC_RF_dict = dict(zip(
    ('AUC_RF_district', 'AUC_RF_district_adasyn', 'AUC_RF_district_smote',
     'AUC_RF_district_smoteenn', 'AUC_RF_district_smotetomek', 'AUC_RF_district_noW'),
    (AUC_RF_district, AUC_RF_district_adasyn, AUC_RF_district_smote,
     AUC_RF_district_smoteenn, AUC_RF_district_smotetomek, AUC_RF_district_noW),
))

# Same for the XGBoost variants.
AUC_XGB_dict = dict(zip(
    ('AUC_XGB_district', 'AUC_XGB_district_adasyn', 'AUC_XGB_district_smote',
     'AUC_XGB_district_smoteenn', 'AUC_XGB_district_smotetomek', 'AUC_XGB_district_noW'),
    (AUC_XGB_district, AUC_XGB_district_adasyn, AUC_XGB_district_smote,
     AUC_XGB_district_smoteenn, AUC_XGB_district_smotetomek, AUC_XGB_district_noW),
))

# Each dict becomes a one-row table (index=[0] is needed because all values are
# scalars) and is written without the index column.
pd.DataFrame(data=AUC_LR_dict, index=[0]).to_csv(
    path_or_buf='bld/select_feature/AUC_LR_dict.csv', index=False)
pd.DataFrame(data=AUC_RF_dict, index=[0]).to_csv(
    path_or_buf='bld/select_feature/AUC_RF_dict.csv', index=False)
pd.DataFrame(data=AUC_XGB_dict, index=[0]).to_csv(
    path_or_buf='bld/select_feature/AUC_XGB_dict.csv', index=False)