# GD2T2D: Model Building

In this notebook we will:
- Create the validation framework
- Test models
- Fine tune the models
- Find the best model

In [2]:
# Imports
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [19]:
# Relative path of the processed CSV that is loaded into `df` below
data_path = "../data/processed/decoded_data.csv"

In [20]:
# Load the processed dataset into a DataFrame
df = pd.read_csv(filepath_or_buffer=data_path)

In [21]:
# Normalise column names: lower-case, spaces -> underscores, parentheses removed.
# regex=False is passed explicitly so "(" and ")" are always matched literally,
# regardless of the default `regex` setting of the installed pandas version.
df.columns = (
    df.columns.str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace("(", "", regex=False)
    .str.replace(")", "", regex=False)
)

## Creating the validation framework

In [7]:
from sklearn.model_selection import train_test_split

In [22]:

# Hold out 20% as the test set, then take 25% of the remainder for validation
# (0.25 * 0.8 = 0.2 of the full data), giving a 60/20/20 split.
split_seed = 42
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=split_seed,
)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=split_seed,
)

In [23]:
# reset_index returns a new frame, so assign it back; otherwise the reset is
# only displayed and df_full_train keeps its shuffled index.
df_full_train = df_full_train.reset_index(drop=True)
df_full_train

Unnamed: 0,older_maternal_age,high_pre_pregnancy_bmi_or_overweight,family_history_of_diabetes,socioeconomic_factors_deprivation_quintile,presence_of_t2dm_associated_gene_variants,ethnicity,multiparity,insulin_treatment_during_pregnancy,pregnancy_complications_hypertensive_disorders,pregnancy_complications_preterm_delivery,...,perinatal_outcome_28_day_mortality,obesity_or_unhealthy_postpartum_weight_gain,physical_inactivity,unhealthy_diet,smoking,alcohol_intake,does_not_undergo_postpartum_glucose_screening,breastfeeding,history_of_recurrence_of_gdm,t2dm_risk
0,no,yes,no,1,no,asian,yes,yes,yes,yes,...,no,yes,yes,no,no,no,yes,no,no,1
1,yes,yes,no,3,no,asian,no,yes,yes,no,...,no,yes,yes,no,no,yes,yes,no,no,0
2,no,yes,no,5,no,white,no,yes,yes,no,...,no,no,yes,no,yes,yes,no,no,no,1
3,no,yes,no,3,no,black,no,no,no,no,...,no,yes,yes,no,no,yes,no,yes,no,0
4,no,no,no,3,no,white,no,no,no,no,...,no,no,no,no,no,yes,no,no,no,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4795,yes,no,no,3,no,other,no,no,no,no,...,no,yes,yes,yes,no,no,yes,yes,no,0
4796,no,no,no,2,no,asian,yes,no,no,no,...,no,yes,no,no,no,yes,no,yes,no,0
4797,yes,yes,no,4,no,mixed,yes,yes,yes,yes,...,no,yes,yes,yes,no,no,no,yes,yes,1
4798,no,yes,yes,2,no,white,yes,no,yes,yes,...,no,yes,yes,yes,no,yes,no,no,no,1


In [24]:
# Give every split a fresh 0..n-1 index, discarding the shuffled one
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [25]:
# Pull the target column out of each split as a plain array
y_train, y_val, y_test = (
    part["t2dm_risk"].values for part in (df_train, df_val, df_test)
)

In [26]:
# Remove the target from the feature frames so it cannot leak into the models.
# drop(..., errors="ignore") keeps the cell re-runnable: a second execution is a
# no-op instead of raising KeyError as `del` would.
df_train = df_train.drop(columns=['t2dm_risk'], errors="ignore")
df_val = df_val.drop(columns=['t2dm_risk'], errors="ignore")
df_test = df_test.drop(columns=['t2dm_risk'], errors="ignore")

## Initial Model Training

In this section we train each model on the `df_train` data and evaluate it on the validation data (`df_val`). The following models will be tested:
- Decision Tree Classifier
- Random Forest Classifier
- XGBoost Classifier



In [57]:
## Model imports
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb


from sklearn.feature_extraction import DictVectorizer

## Metrics imports
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score



### Decision Tree Classifier

In [50]:
# Create dictionaries for the feature matrix
train_dicts = df_train.to_dict(orient='records')

# Then vectorise those dictionaries (fitted on the training split only)
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

# Instantiate the decision tree classifier and train it.
# random_state is fixed so the scores below are reproducible between runs,
# in line with the seeded random forest and XGBoost models.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# Encode the validation split with the vectoriser fitted on the training data
val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)

# Second column of predict_proba: predicted probability of the second class
y_pred = dt.predict_proba(X_val)[:, 1]
# Hard labels at the 0.5 threshold, computed once and shared by the metrics below
y_pred_label = y_pred >= 0.5

# AUC is computed on the probabilities; the other metrics on the hard labels
scores = []
model = "decision_tree"
auc = roc_auc_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred_label)
acc = accuracy_score(y_val, y_pred_label)
prec = precision_score(y_val, y_pred_label)
rec = recall_score(y_val, y_pred_label)
scores.append((model, auc, f1, acc, prec, rec))


In [51]:
# Column order mirrors the tuples appended to `scores`
columns = [
    'model',
    'auc',
    'f1',
    'acc',
    'prec',
    'rec',
]
df_scores = pd.DataFrame(data=scores, columns=columns)

In [52]:
# Display the validation metrics collected so far
df_scores

Unnamed: 0,model,auc,f1,acc,prec,rec
0,decision_tree,0.79744,0.689542,0.841667,0.674121,0.705686


### Random Forest Classifier

In [54]:
# Create dictionaries for the feature matrix
train_dicts = df_train.to_dict(orient='records')

# Then vectorise those dictionaries
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

#Instantiate the decisiontree classifier and train it
rf = RandomForestClassifier(n_estimators=10, random_state=42)
rf.fit(X_train, y_train)

val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)


y_pred = rf.predict_proba(X_val)[:, 1]

scores = []
model = "random_forest"
auc = roc_auc_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred >= 0.5)
acc = accuracy_score(y_val, y_pred >= 0.5)
prec = precision_score(y_val, y_pred >= 0.5)
rec = recall_score(y_val, y_pred >= 0.5)
scores.append((model, auc, f1, acc, prec, rec))
df_scores.loc[len(df_scores)] = scores[0]



In [61]:
# Display the validation metrics collected so far
df_scores

Unnamed: 0,model,auc,f1,acc,prec,rec
0,decision_tree,0.79744,0.689542,0.841667,0.674121,0.705686
1,random_forest,0.907674,0.729904,0.86,0.702786,0.759197


### XGBoost 

In [62]:
# Names of the encoded columns, in the order the vectoriser emits them
features = [name for name in dv.get_feature_names_out()]

features

['abnormal_ogtt_results=abnormal',
 'abnormal_ogtt_results=normal',
 'alcohol_intake=no',
 'alcohol_intake=yes',
 'breastfeeding=no',
 'breastfeeding=yes',
 'does_not_undergo_postpartum_glucose_screening=no',
 'does_not_undergo_postpartum_glucose_screening=yes',
 'elevated_hba1c_during_pregnancy=elevated',
 'elevated_hba1c_during_pregnancy=normal',
 'ethnicity=asian',
 'ethnicity=black',
 'ethnicity=mixed',
 'ethnicity=other',
 'ethnicity=white',
 'family_history_of_diabetes=no',
 'family_history_of_diabetes=yes',
 'gestational_weight_gain=no',
 'gestational_weight_gain=yes',
 'high_pre_pregnancy_bmi_or_overweight=no',
 'high_pre_pregnancy_bmi_or_overweight=yes',
 'history_of_recurrence_of_gdm=no',
 'history_of_recurrence_of_gdm=yes',
 'instrumental_delivery=no',
 'instrumental_delivery=yes',
 'insulin_treatment_during_pregnancy=no',
 'insulin_treatment_during_pregnancy=yes',
 'large_for_gestational_age=no',
 'large_for_gestational_age=yes',
 'macrosomia_baby_birth_weightdelivered_a_ba

In [63]:
# Wrap the encoded train/validation matrices and their labels as DMatrix objects
dtrain = xgb.DMatrix(data=X_train, feature_names=features, label=y_train)
dval = xgb.DMatrix(data=X_val, feature_names=features, label=y_val)

In [68]:
# Training parameters passed to xgb.train
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'binary:logistic',
    'seed': 1,
    'verbosity': 1,
}

# Keep the trained booster under its own name: `model` is rebound to the label
# string below, which previously discarded the booster.
xgb_model = xgb.train(xgb_params, dtrain, num_boost_round=200)

y_pred = xgb_model.predict(dval)
# Hard labels at the 0.5 threshold, computed once and shared by the metrics below
y_pred_label = y_pred >= 0.5

# AUC is computed on the raw predictions; the other metrics on the hard labels
scores = []
model = "XGBoost"
auc = roc_auc_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred_label)
acc = accuracy_score(y_val, y_pred_label)
prec = precision_score(y_val, y_pred_label)
rec = recall_score(y_val, y_pred_label)
scores.append((model, auc, f1, acc, prec, rec))

# Drop any earlier row for this model first, so re-running the cell replaces
# the result instead of appending a duplicate row.
df_scores = df_scores[df_scores['model'] != model].reset_index(drop=True)
df_scores.loc[len(df_scores)] = scores[0]



In [69]:
# Display the validation metrics for all three models
df_scores

Unnamed: 0,model,auc,f1,acc,prec,rec
0,decision_tree,0.79744,0.689542,0.841667,0.674121,0.705686
1,random_forest,0.907674,0.729904,0.86,0.702786,0.759197
2,XGBoost,0.923598,0.731293,0.868333,0.743945,0.719064


Without any tuning, the decision tree's ROC AUC score is well below that of the other two models. Out of the box, both random forest and XGBoost perform well. The next step is to tune the models and see what impact this has on performance.