# IGP 5 Models - Final Models



In [37]:
# Read the `*_X_train_scaled.csv` feature tables and the first column of each
# `*_y_train.csv` label file for the male, female and combined cohorts.
# NOTE: this cell needs `pd` and `output_csv_path` to already exist in the kernel;
# `output_csv_path` is assigned in the setup cell that runs `final-model.py`,
# so that cell has to be executed first.
def _read_training_split(cohort):
    """Return (features DataFrame, first label column) for one cohort prefix."""
    features = pd.read_csv(output_csv_path + cohort + '_X_train_scaled.csv')
    labels = pd.read_csv(output_csv_path + cohort + '_y_train.csv').iloc[:, 0]
    return features, labels


male_X_train_scaled, male_y_train = _read_training_split('male')
female_X_train_scaled, female_y_train = _read_training_split('female')
both_X_train_scaled, both_y_train = _read_training_split('both')

In [3]:
# Run the project's helper script with the IPython `%run` magic so that the
# names it defines become available in this notebook.
# NOTE(review): later cells use names that are never defined in the notebook
# itself (e.g. pd, joblib, RandomizedSearchCV, make_scorer, matthews_corrcoef,
# SVC, RANDOM_STATE) — presumably they all come from this script; confirm.
%run ../code/final-model.py

# Directory prefix for the training CSVs. It is joined to file names by plain
# string concatenation, so the trailing slash is required.
output_csv_path = '../output/'

## Male Dataset

Final models:

* Three features: `('SVM rbf', SVC(kernel='rbf', random_state=RANDOM_STATE))`
* Two features: `('LightGBM', LGBMClassifier(verbose=-1, random_state=RANDOM_STATE))`


### Male Two Feature Model - LightGBM

In [10]:
# Keep only the two model features, picked by column position (2 and 3).
feature_positions = [2, 3]
male_X_train_scaled_2 = male_X_train_scaled.iloc[:, feature_positions]

# Hand independent copies of the features and labels to the search below.
X, y = male_X_train_scaled_2.copy(), male_y_train.copy()

In [21]:
# Imported in-cell, like the GradientBoosting/RandomForest cells further down,
# so the estimator can be built here rather than relying on kernel state.
from lightgbm import LGBMClassifier

# Candidate values for the randomised hyperparameter search.
# min_data_in_leaf, feature_fraction and bagging_fraction are LightGBM aliases
# of the scikit-learn-style names min_child_samples, colsample_bytree and
# subsample.
# NOTE(review): per the LightGBM docs bagging_fraction only takes effect when
# bagging_freq is non-zero; it is not set here, so this dimension may be
# inert — confirm before relying on the tuned value.
param_distributions = {
    'max_depth': [2, 4, 6, 8, 10],
    'learning_rate': [0.001, 0.01, 0.1],
    'n_estimators': [100, 200, 300, 500],
    'num_leaves': [20, 30, 40],
    'min_data_in_leaf': [20, 30, 40],
    'feature_fraction': [0.6, 0.8, 1.0],
    'bagging_fraction': [0.6, 0.8, 1.0],
    'reg_alpha': [0.0, 0.1, 1.0],
    'reg_lambda': [0.0, 0.1, 1.0]
}

# LightGBM classifier to tune. Defined in this cell (every other search cell
# builds its own estimator) so the cell no longer depends on an `lgbm_clf`
# left over in the kernel; the settings mirror the "Final models" note above.
lgbm_clf = LGBMClassifier(verbose=-1, random_state=RANDOM_STATE)

# custom scorer for MCC
mcc_scorer = make_scorer(matthews_corrcoef)

# Randomised search: 100 sampled parameter combinations, 5-fold CV each,
# scored on accuracy and MCC; the winner by MCC is refit on all of X, y.
random_search = RandomizedSearchCV(estimator=lgbm_clf,
                                   param_distributions=param_distributions,
                                   scoring={'accuracy': 'accuracy', 'mcc': mcc_scorer},
                                   refit='mcc',
                                   cv=5,
                                   n_iter=100,  # random parameter combinations
                                   n_jobs=-1,
                                   verbose=1,
                                   random_state=RANDOM_STATE)

# fit the RandomizedSearchCV
random_search.fit(X, y)

# print best parameters and the mean CV scores of the MCC-best candidate
print("Best parameters: ", random_search.best_params_)
print("Best accuracy score: ", random_search.cv_results_['mean_test_accuracy'][random_search.best_index_])
print("Best MCC score: ", random_search.cv_results_['mean_test_mcc'][random_search.best_index_])

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters:  {'reg_lambda': 0.0, 'reg_alpha': 0.1, 'num_leaves': 30, 'n_estimators': 500, 'min_data_in_leaf': 30, 'max_depth': 8, 'learning_rate': 0.01, 'feature_fraction': 1.0, 'bagging_fraction': 0.6}
Best accuracy score:  0.8706821480406386
Best MCC score:  0.7518275775629697


**Best parameters:**

* Using `accuracy` and `MCC`
* LightGBM exposes many hyperparameters; the following were tuned with a randomised search:
  * max_depth - maximum depth of each tree in the ensemble; risk of overfitting if too high
  * learning_rate - shrinkage rate; step size of each boosting iteration
  * n_estimators - number of decision trees to include
  * num_leaves - max number of terminal nodes per tree; too many can lead to overfitting
  * min_data_in_leaf - min number of data points in a leaf; can help prevent overfitting
  * feature_fraction - fraction of features randomly selected for each tree; a value less than 1 can reduce overfitting and improve generalisation
  * bagging_fraction - fraction of data instances randomly sampled; can help reduce overfitting
  * reg_alpha - L1 (Lasso) regularisation; penalty added to the objective function to reduce overfitting
  * reg_lambda - L2 (ridge) regularisation; penalty added to the objective function to reduce overfitting

In [22]:
# Persist the estimator that the randomised search refit with its best
# (by MCC) parameters, so it can be reloaded without repeating the search.
model_file = 'male_2_feature_lgbm_model.pkl'
best_estimator = random_search.best_estimator_
joblib.dump(best_estimator, model_file)

['male_2_feature_lgbm_model.pkl']

In [None]:
# load the saved model
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load model files that you produced yourself or otherwise trust.
loaded_model = joblib.load('male_2_feature_lgbm_model.pkl')

# loaded model for prediction
# TODO: replace with unseen data holding the same two scaled feature columns.
# The training features are used here only as a placeholder so the cell runs;
# predictions on them are a smoke test, not an evaluation.
new_data = male_X_train_scaled_2
predictions = loaded_model.predict(new_data)

### Male Three Feature Model - SVC rbf

In [18]:
# Keep only the three model features, picked by column position (2, 3 and 9).
feature_positions = [2, 3, 9]
male_X_train_scaled_3 = male_X_train_scaled.iloc[:, feature_positions]

# Hand independent copies of the features and labels to the search below.
X, y = male_X_train_scaled_3.copy(), male_y_train.copy()

In [35]:
# Candidate values for the search. Only the RBF kernel is explored, so the
# kernel-specific `degree` / `coef0` parameters are not part of the space.
param_distributions = {
    'C': [0.1, 1, 10, 100, 500],
    'gamma': [0.001, 0.01, 0.1, 1, 10, 100],
    'kernel': ['rbf'],
}

# Every entry is a finite list, so the space has a fixed number of
# combinations (5 * 6 * 1 = 30). Asking for more iterations than that makes
# scikit-learn warn and fall back to the full space, so cap n_iter instead.
n_candidates = 1
for _values in param_distributions.values():
    n_candidates *= len(_values)

# custom scorer for MCC
mcc_scorer = make_scorer(matthews_corrcoef)

# SVC classifier
svc = SVC(random_state=RANDOM_STATE)

# RandomizedSearchCV object: 5-fold CV, scored on accuracy and MCC, with the
# MCC-best candidate refit on all of X, y.
random_search = RandomizedSearchCV(estimator=svc,
                                   param_distributions=param_distributions,
                                   scoring={'accuracy': 'accuracy', 'mcc': mcc_scorer},
                                   refit='mcc',  # refit for MCC
                                   cv=5,
                                   n_iter=min(100, n_candidates),
                                   n_jobs=-1,
                                   verbose=1,
                                   random_state=RANDOM_STATE)

# fit the RandomizedSearchCV
random_search.fit(X, y)

# print the best parameters and the mean CV scores of the MCC-best candidate
print("Best parameters: ", random_search.best_params_)
print("Best accuracy score: ", random_search.cv_results_['mean_test_accuracy'][random_search.best_index_])
print("Best MCC score: ", random_search.cv_results_['mean_test_mcc'][random_search.best_index_])



Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best parameters:  {'kernel': 'rbf', 'gamma': 1, 'C': 100}
Best accuracy score:  0.8936865021770684
Best MCC score:  0.7895879188980575


In [36]:
# Write the refit, MCC-best SVC from the randomised search to disk.
model_file = 'male_3_feature_svc_model.pkl'
best_estimator = random_search.best_estimator_
joblib.dump(best_estimator, model_file)

['male_3_feature_svc_model.pkl']

**Best Parameters**
* C - regularisation parameter in SVM; manages trade-off between good margin and minimising training error
* gamma - kernel coefficient, which controls the influence of each training example on the decision boundary
* kernel - which kernel function to use
  * rbf - radial basis function 
  * poly - polynomial
  * sigmoid - sigmoid


## Female Dataset

Two and three feature: ('Gradient Boosting', GradientBoostingClassifier(random_state=RANDOM_STATE))



### Female Two Feature model - Gradient Boosting

In [44]:
# Keep only the two model features, picked by column position (2 and 3).
feature_positions = [2, 3]
female_X_train_scaled_2 = female_X_train_scaled.iloc[:, feature_positions]

# Hand independent copies of the features and labels to the search below.
X, y = female_X_train_scaled_2.copy(), female_y_train.copy()

In [45]:
from sklearn.ensemble import GradientBoostingClassifier

# Score candidates on Matthews correlation coefficient as well as accuracy.
mcc_scorer = make_scorer(matthews_corrcoef)

# Base gradient-boosting model whose hyperparameters are tuned.
gbc = GradientBoostingClassifier(random_state=RANDOM_STATE)

# Candidate values sampled by the randomised search.
param_distributions = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [3, 5, 7, 9],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'max_leaf_nodes': [10, 20, 30, None],
}

# 100 sampled combinations, 5-fold CV each; the MCC-best one is refit on X, y.
random_search = RandomizedSearchCV(
    gbc,
    param_distributions,
    n_iter=100,
    scoring={'accuracy': 'accuracy', 'mcc': mcc_scorer},
    refit='mcc',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=RANDOM_STATE,
)
random_search.fit(X, y)

# Report the winning parameters and that candidate's mean CV scores.
best_row = random_search.best_index_
cv_results = random_search.cv_results_
print("Best parameters: ", random_search.best_params_)
print("Best accuracy score: ", cv_results['mean_test_accuracy'][best_row])
print("Best MCC score: ", cv_results['mean_test_mcc'][best_row])

Fitting 5 folds for each of 100 candidates, totalling 500 fits


Best parameters:  {'subsample': 0.8, 'n_estimators': 300, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_leaf_nodes': None, 'max_features': 'log2', 'max_depth': 3, 'learning_rate': 0.05}
Best accuracy score:  0.8215384615384614
Best MCC score:  0.6076134037640286


In [46]:
# Write the refit, MCC-best gradient-boosting model from the randomised
# search to disk.
model_file = 'female_2_feature_gradboost_model.pkl'
best_estimator = random_search.best_estimator_
joblib.dump(best_estimator, model_file)

['female_2_feature_gradboost_model.pkl']

### Female three features - Gradient Boosting

In [47]:
# Keep only the three model features, picked by column position (2, 3 and 5).
feature_positions = [2, 3, 5]
female_X_train_scaled_3 = female_X_train_scaled.iloc[:, feature_positions]

# Hand independent copies of the features and labels to the search below.
X, y = female_X_train_scaled_3.copy(), female_y_train.copy()

In [48]:
from sklearn.ensemble import GradientBoostingClassifier

# Candidate values sampled by the randomised search.
param_distributions = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [3, 5, 7, 9],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'max_leaf_nodes': [10, 20, 30, None],
}

# MCC is tracked alongside accuracy and decides which candidate is refit.
mcc_scorer = make_scorer(matthews_corrcoef)

# Base gradient-boosting model whose hyperparameters are tuned.
gbc = GradientBoostingClassifier(random_state=RANDOM_STATE)

# 100 sampled combinations evaluated with 5-fold cross-validation.
random_search = RandomizedSearchCV(
    estimator=gbc,
    param_distributions=param_distributions,
    n_iter=100,
    cv=5,
    scoring={'accuracy': 'accuracy', 'mcc': mcc_scorer},
    refit='mcc',
    random_state=RANDOM_STATE,
    n_jobs=-1,
    verbose=1,
)
random_search.fit(X, y)

# Report the winning parameters, then that candidate's mean CV scores.
print("Best parameters: ", random_search.best_params_)
for label, column in (("Best accuracy score: ", 'mean_test_accuracy'),
                      ("Best MCC score: ", 'mean_test_mcc')):
    print(label, random_search.cv_results_[column][random_search.best_index_])

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters:  {'subsample': 0.8, 'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_leaf_nodes': 10, 'max_features': 'sqrt', 'max_depth': 9, 'learning_rate': 0.01}
Best accuracy score:  0.8584615384615384
Best MCC score:  0.6834695166779801


In [49]:
# Keep the refit, MCC-best gradient-boosting model from the randomised search
# and serialise it for later reuse.
model_file = 'female_3_feature_gradboost_model.pkl'
best_estimator = random_search.best_estimator_
joblib.dump(best_estimator, model_file)

['female_3_feature_gradboost_model.pkl']

## Both Dataset

### Both dataset - two features - Gradient Boosting

In [54]:
# Keep only the two model features, picked by column position (2 and 3).
feature_positions = [2, 3]
both_X_train_scaled_2 = both_X_train_scaled.iloc[:, feature_positions]

# Hand independent copies of the features and labels to the search below.
X, y = both_X_train_scaled_2.copy(), both_y_train.copy()

In [56]:


# Imported in-cell, like the other Gradient Boosting search cells, so this
# cell does not depend on a different section having been run first.
from sklearn.ensemble import GradientBoostingClassifier

# parameter distributions: candidate values sampled by the randomised search
param_distributions = {
    'n_estimators': [50, 100, 200, 300, 500],
    'max_depth': [3, 5, 7, 9],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.4, 0.6, 0.8, 1.0],
    'max_leaf_nodes': [10, 20, 30, 40, None]
}

#  scorer for MCC
mcc_scorer = make_scorer(matthews_corrcoef)

# Gradient Boosting Classifier
gbc = GradientBoostingClassifier(random_state=RANDOM_STATE)

# RandomizedSearchCV: 100 sampled combinations, 5-fold CV each, scored on
# accuracy and MCC; the MCC-best candidate is refit on all of X, y.
random_search = RandomizedSearchCV(estimator=gbc,
                                   param_distributions=param_distributions,
                                   scoring={'accuracy': 'accuracy', 'mcc': mcc_scorer},
                                   refit='mcc',  # refit MCC metric
                                   cv=5,
                                   n_iter=100,
                                   n_jobs=-1,
                                   verbose=1,
                                   random_state=RANDOM_STATE)

# fit RandomizedSearchCV
random_search.fit(X, y)

# print best parameters and the mean CV scores of the MCC-best candidate
print("Best parameters: ", random_search.best_params_)
print("Best accuracy score: ", random_search.cv_results_['mean_test_accuracy'][random_search.best_index_])
print("Best MCC score: ", random_search.cv_results_['mean_test_mcc'][random_search.best_index_])

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters:  {'subsample': 0.4, 'n_estimators': 100, 'min_samples_split': 15, 'min_samples_leaf': 1, 'max_leaf_nodes': None, 'max_features': 'sqrt', 'max_depth': 5, 'learning_rate': 0.01}
Best accuracy score:  0.8048239895697524
Best MCC score:  0.5992459582168063


In [57]:
# Serialise the refit, MCC-best gradient-boosting model found by the
# randomised search.
model_file = 'both_2_feature_gradboost_model.pkl'
best_estimator = random_search.best_estimator_
joblib.dump(best_estimator, model_file)

['both_2_feature_gradboost_model.pkl']

### Both dataset - three features - Random Forest

In [58]:
# Keep only the three model features, picked by column position (2, 3 and 6).
feature_positions = [2, 3, 6]
both_X_train_scaled_3 = both_X_train_scaled.iloc[:, feature_positions]

# Hand independent copies of the features and labels to the search below.
X, y = both_X_train_scaled_3.copy(), both_y_train.copy()

In [59]:

from sklearn.ensemble import RandomForestClassifier

# Score candidates on Matthews correlation coefficient as well as accuracy.
mcc_scorer = make_scorer(matthews_corrcoef)

# Base random-forest model whose hyperparameters are tuned.
rf_clf = RandomForestClassifier(random_state=RANDOM_STATE)

# Candidate values sampled by the randomised search.
param_distributions = {
    'n_estimators': [100, 200, 300, 500, 1000],
    'max_depth': [3, 5, 7, 9, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
    'max_leaf_nodes': [5, 8, 10, 20, 30, None],
}

# 100 sampled combinations, 5-fold CV each; the MCC-best one is refit on X, y.
random_search = RandomizedSearchCV(
    rf_clf,
    param_distributions,
    n_iter=100,
    scoring={'accuracy': 'accuracy', 'mcc': mcc_scorer},
    refit='mcc',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=RANDOM_STATE,
)
random_search.fit(X, y)

# Report the winning parameters and that candidate's mean CV scores.
best_row = random_search.best_index_
cv_results = random_search.cv_results_
print("Best parameters: ", random_search.best_params_)
print("Best accuracy score: ", cv_results['mean_test_accuracy'][best_row])
print("Best MCC score: ", cv_results['mean_test_mcc'][best_row])

Fitting 5 folds for each of 100 candidates, totalling 500 fits


Best parameters:  {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_leaf_nodes': 20, 'max_features': 'log2', 'max_depth': 7, 'criterion': 'gini', 'bootstrap': True}
Best accuracy score:  0.8285528031290743
Best MCC score:  0.6474264277249798


In [60]:
# Serialise the refit, MCC-best random forest found by the randomised search.
model_file = 'both_3_feature_rf_model.pkl'
best_estimator = random_search.best_estimator_
joblib.dump(best_estimator, model_file)

['both_3_feature_rf_model.pkl']