In [1]:
import sys
sys.path.append('/Users/tompease/Documents/Coding/titanic')
from utils.data_loader import TitanicLoader
from sklearn.model_selection import ShuffleSplit, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import randint
from random import uniform
import pandas as pd
from sklearn.preprocessing import LabelEncoder

loader = TitanicLoader()
X, y = loader.load('Survived')

cv_split = ShuffleSplit(n_splits = 5, test_size = .3, train_size = .7, random_state = 42)

In [2]:
log_regression_param_dist = {
  'penalty': ['l1', 'l2', 'elasticnet'],
  'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
  'fit_intercept': [True, False],
  'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
  'warm_start': [True, False]
}

random_forest_param_dist = {
  'n_estimators': randint(1,1000),
  'criterion': ['gini', 'entropy', 'log_loss'],
  'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
  'min_samples_split': randint(1,20),
  'min_samples_leaf': randint(1,20),
  'min_weight_fraction_leaf': [0.0, 0.0001, 0.001, 0.1],
  'max_features': ['log2', 'sqrt', None],
  'bootstrap': [True, False],
 }

decision_tree_param_dist = {
  'criterion': ['gini', 'entropy', 'log_loss'],
  'splitter' : ['best', 'random'],
  'max_depth' : [1, 2, 4, 8, 16, None],
  'min_samples_split' : randint(2,10),
  'min_samples_leaf' : randint(1, 10),
  'min_weight_fraction_leaf': [0.0, 0.0001, 0.001, 0.1],
  'max_features': ['sqrt', 'log2', None]
}

grad_boost_param_dist = {
  'loss' : ['log_loss', 'exponential'],
  'learning_rate' : [0.001, 0.01, 0.1, 1, 10],
  'n_estimators': randint(1,1000),
  'min_samples_split': randint(1,10),
  'min_samples_leaf': randint(1,20),
  'min_weight_fraction_leaf': [0.0, 0.0001, 0.001, 0.1],
  'max_depth': randint(1,5),
  'max_features': ['sqrt', 'log2', None]
}

In [3]:
classification_algs = [
  [LogisticRegression(max_iter=10000), log_regression_param_dist],
  [RandomForestClassifier(), random_forest_param_dist],
  [DecisionTreeClassifier(), decision_tree_param_dist],
  [GradientBoostingClassifier(), grad_boost_param_dist]
]

In [4]:

ITERATIONS = 100

columns = ['MLA name', 'Parameters', 'Train accuracy', 'Test accuracy']
MLA_compare = pd.DataFrame(columns = columns)

row_index = 0

for el in classification_algs:
  alg = el[0]
  param_grid = el[1]
  
  name = alg.__class__.__name__
  MLA_compare.loc[row_index, 'MLA name'] = name

  tuned_model = RandomizedSearchCV(alg, param_distributions=param_grid, n_iter=ITERATIONS, scoring='accuracy', cv=cv_split, return_train_score=True)
  tuned_model.fit(X,y)

  MLA_compare.loc[row_index, 'Parameters'] = str(tuned_model.best_params_)
  MLA_compare.loc[row_index, 'Train accuracy'] = tuned_model.cv_results_["mean_train_score"][tuned_model.best_index_]
  MLA_compare.loc[row_index, 'Test accuracy'] = tuned_model.cv_results_["mean_test_score"][tuned_model.best_index_]

  row_index += 1

300 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/tompease/opt/miniconda3/envs/ds-env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/tompease/opt/miniconda3/envs/ds-env/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/tompease/opt/miniconda3/envs/ds-env/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 64, in _check_solver
    raise ValueError(
ValueError: Only 'saga

In [5]:
MLA_compare.sort_values(by = ['Test accuracy'], ascending = False, inplace = True)

MLA_compare.loc[3, 'Parameters']

"{'learning_rate': 0.01, 'loss': 'log_loss', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 19, 'min_samples_split': 6, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 925}"

In [6]:
MLA_compare

Unnamed: 0,MLA name,Parameters,Train accuracy,Test accuracy
3,GradientBoostingClassifier,"{'learning_rate': 0.01, 'loss': 'log_loss', 'm...",0.866453,0.83209
1,RandomForestClassifier,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.8687,0.826119
2,DecisionTreeClassifier,"{'criterion': 'gini', 'max_depth': 16, 'max_fe...",0.858427,0.818657
0,LogisticRegression,"{'warm_start': True, 'solver': 'liblinear', 'p...",0.807384,0.810448
