In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_recall_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier)
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.preprocessing import label_binarize




# Tree Based model

In [2]:
# Define the function to calculate Precision, recall, F1, AUCPR, AUCROC
def calculate_AUCPR(y_test, y_score, n_classes):
    y_test_binarized=label_binarize(y_test, classes=[*range(n_classes)])
    precision = dict()
    recall = dict()
    area_pr = dict()
    precision_mean=dict()
    recall_mean=dict()
    
    for i in range(n_classes):
        precision[i], recall[i], _ = precision_recall_curve(y_test_binarized[:, i],
                                                            y_score[:, i])
        area_pr[i]= auc(recall[i], precision[i],)
        precision_mean[i]=precision[i].mean()
        recall_mean[i]=recall[i].mean()
        
    mean_AUCPR = np.mean(list(area_pr.values()))
    mean_precision = np.mean(list(precision_mean.values()))
    mean_recall = np.mean(list(recall_mean.values()))
    mean_f1=2 * (mean_precision * mean_recall) / (mean_precision + mean_recall)
    return mean_precision, mean_recall, mean_f1, mean_AUCPR

In [13]:
df_data=pd.read_csv('../v2_data_all/v2_data_coordinate_encoded_final.csv')
data_final=df_data.drop(columns=['lat','lon'])
X=data_final.drop(columns=['outcome'])
y=data_final['outcome']
object = MinMaxScaler()
X_scaled=object.fit_transform(X)

In [14]:

X_train, X_test, y_train, y_test = train_test_split(X_scaled,
                                                    y,
                                                    random_state = 42, 
                                                    test_size=0.2, stratify=y)

# lr
lr = LogisticRegression(random_state=42).fit(X_train, y_train)
y_score_lr = lr.predict_proba(X_test)
# knn
knn = KNeighborsClassifier().fit(X_train, y_train)
y_score_knn = lr.predict_proba(X_test)
# rf
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_score_rf = rf.predict_proba(X_test)
# gb
gb = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
y_score_gb  = gb.predict_proba(X_test)
# lgbm
lgbm = lgb.LGBMClassifier(random_state=42).fit(X_train, y_train)
y_score_lgbm = lgbm.predict_proba(X_test)
# # xgb
xgb = XGBClassifier(random_state=42).fit(X_train, y_train)
y_score_xgb = xgb.predict_proba(X_test)
# svc
svc = SVC(random_state=42, probability=True).fit(X_train, y_train)
y_score_svc = svc.predict_proba(X_test)

print("model prediction finished")

model prediction finished


In [15]:
lr_precision, lr_recall, lr_f1, lr_AUCPR=calculate_AUCPR(y_test, y_score_lr, n_classes=3)
knn_precision, knn_recall, knn_f1, knn_AUCPR=calculate_AUCPR(y_test, y_score_knn, n_classes=3)
rf_precision, rf_recall, rf_f1, rf_AUCPR=calculate_AUCPR(y_test, y_score_rf, n_classes=3)
gb_precision, gb_recall, gb_f1, gb_AUCPR=calculate_AUCPR(y_test, y_score_gb, n_classes=3)
lgbm_precision, lgbm_recall, lgbm_f1, lgbm_AUCPR=calculate_AUCPR(y_test, y_score_lgbm, n_classes=3)
xgb_precision, xgb_recall, xgb_f1, xgb_AUCPR=calculate_AUCPR(y_test, y_score_xgb, n_classes=3)
svc_precision, svc_recall, svc_f1, svc_AUCPR=calculate_AUCPR(y_test, y_score_svc, n_classes=3)

lr_AUCROC= roc_auc_score(y_test, y_score_lr, average='weighted', multi_class='ovo') 
knn_AUCROC=roc_auc_score(y_test, y_score_knn, average='weighted', multi_class='ovo') 
rf_AUCROC=roc_auc_score(y_test, y_score_rf, average='weighted', multi_class='ovo') 
gb_AUCROC=roc_auc_score(y_test, y_score_gb, average='weighted', multi_class='ovo') 
lgbm_AUCROC=roc_auc_score(y_test, y_score_lgbm, average='weighted', multi_class='ovo') 
xgb_AUCROC=roc_auc_score(y_test, y_score_xgb, average='weighted', multi_class='ovo') 
svc_AUCROC=roc_auc_score(y_test, y_score_svc, average='weighted', multi_class='ovo') 
print("none finished")

none finished


In [16]:
new_row=[{'metric_type': 'no_location_precision', 'lr': lr_precision, 'knn':knn_precision, 'rf':rf_precision, 'gb':gb_precision, 'lgbm':lgbm_precision, 'xg':xgb_precision, 'svc':svc_precision}, 
        {'metric_type': 'no_location_recall',  'lr': lr_recall, 'knn':knn_recall, 'rf':rf_recall, 'gb':gb_recall, 'lgbm':lgbm_recall, 'xg':xgb_recall, 'svc':svc_recall},
        {'metric_type': 'no_location_F1', 'lr':lr_f1 , 'knn':knn_f1,'rf':rf_f1, 'gb':gb_f1, 'lgbm':lgbm_f1, 'xg':xgb_f1, 'svc':svc_f1},
        {'metric_type': 'no_location_AUCROC', 'lr':lr_AUCROC , 'knn':knn_AUCROC, 'rf':rf_AUCROC, 'gb':gb_AUCROC, 'lgbm':lgbm_AUCROC, 'xg':xgb_AUCROC, 'svc':svc_AUCROC},
        {'metric_type': 'no_location_AUCPR', 'lr':lr_AUCPR, 'knn':knn_AUCPR, 'rf':rf_AUCPR, 'gb':gb_AUCPR, 'lgbm':lgbm_AUCPR, 'xg':xgb_AUCPR, 'svc':svc_AUCPR}]
df_comparison = pd.DataFrame(new_row)
df_comparison.set_index('metric_type', inplace=True)
df_comparison

Unnamed: 0_level_0,lr,knn,rf,gb,lgbm,xg,svc
metric_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
no_location_precision,0.396261,0.396261,0.384355,0.415245,0.417077,0.400817,0.405585
no_location_recall,0.560318,0.560318,0.501782,0.583676,0.579757,0.570271,0.57601
no_location_F1,0.464221,0.464221,0.435288,0.48526,0.485143,0.470759,0.476003
no_location_AUCROC,0.586767,0.586767,0.571112,0.622856,0.61605,0.599403,0.609888
no_location_AUCPR,0.410091,0.410091,0.386918,0.444254,0.447223,0.418701,0.419982


In [17]:
# df_comparison.to_csv('base_model_comparison.csv')