# Machine Learning

## Initial Setup

Configuration for Jupyter to automatically apply changes in code base

In [1]:
%load_ext autoreload
%autoreload 2

Add parent directory (`server`) to system path, to be able to import modules from `src`

In [2]:
import sys
sys.path.append("..")

Import required libraries

In [7]:
from typing import Dict, Iterable, Any, Tuple, Union
import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from src_backend.util import (
    MODELS_DIR, MODEL_LOGREG_FN, MODEL_KNN_FN, MODEL_RF_FN, MODEL_GBM_FN
)
from src_backend.data import TrainingPipeline

## Ingest Data

In [6]:
pipeline = TrainingPipeline(test_size=0.2, random_state=0)
pipeline.ingest()
X_train, X_test, y_train, y_test = pipeline.output()

## Model Training

In [10]:
# Model Types
ModelTypes = Union[
    LogisticRegression, KNeighborsClassifier,
    RandomForestClassifier, GradientBoostingClassifier
]

In [11]:
# Function to perform hyperparameter tuning and return trained model
def tune_hyperparam(
    clf_class, hyperparam_grid: Dict[str, Iterable], X_train, y_train,
    fixed_hyperparams: Dict[str, Any] = {}, num_folds: int = 5
) -> Tuple[ModelTypes, Dict[str, Any]]:
    # Perform grid search over defined hyperparameters
    clf = clf_class(**fixed_hyperparams)
    gs_estimator = GridSearchCV(clf, hyperparam_grid, scoring="accuracy", cv=num_folds)
    gs_estimator.fit(X_train, y_train)
    print(f'Best hyperparameters found: {gs_estimator.best_params_}')
    
    # Retrain model on entire training set
    clf_new = clf_class(**{**fixed_hyperparams, **gs_estimator.best_params_})
    clf_new.fit(X_train, y_train)
    return clf_new

# Function to compute training and test scores
def compute_scores(clf: ModelTypes, X_train, X_test, y_train, y_test) -> None:
    # Compute training accuracy
    predictions_train = clf.predict(X_train)
    acc_train = accuracy_score(y_train, predictions_train)

    # Compute test accuracy
    predictions_test = clf.predict(X_test)
    acc_test = accuracy_score(y_test, predictions_test)

    print(f"Training accuracy: {acc_train}")
    print(f"Test accuarcy: {acc_test}")

### Logistic Regression (One-vs-rest)

In [22]:
%%time
# Define hyperparameters to be searched during tuning
hyperparam_grid = {
    "C": [10 ** i for i in range(-2, 3)],
    "l1_ratio": [0.0, 0.25, 0.5, 0.75, 1.0]
}

# Define hyperparameters to be fixed
fixed_hyperparams = {
    "penalty": "elasticnet",
    "solver": "saga",
    "tol": 1e-3,
    "max_iter": 500
}

clf_logreg = tune_hyperparam(LogisticRegression, hyperparam_grid, X_train, y_train, fixed_hyperparams)
compute_scores(clf_logreg, X_train, X_test, y_train, y_test)

Best hyperparameters found: {'C': 1, 'l1_ratio': 0.0}
Training accuracy: 0.9833333333333333
Test accuarcy: 1.0
Wall time: 1.39 s


### k-Nearest Neighbors

In [23]:
%%time
# Define hyperparameters to be searched during tuning
hyperparam_grid = {
    'n_neighbors': [1, 3, 5, 9],
    'weights': ['uniform', 'distance']
}

clf_knn = tune_hyperparam(KNeighborsClassifier, hyperparam_grid, X_train, y_train)
compute_scores(clf_knn, X_train, X_test, y_train, y_test)

Best hyperparameters found: {'n_neighbors': 3, 'weights': 'uniform'}
Training accuracy: 0.95
Test accuarcy: 1.0
Wall time: 265 ms


### Random Forest

In [24]:
%%time
# Define hyperparameters to be searched during tuning
hyperparam_grid = {
    'n_estimators': [50, 100, 500],
    'max_features': ['sqrt', 'log2', 1/3]
}

# Define hyperparameters to be fixed
fixed_hyperparams = {
    'random_state': 0
}

clf_rf = tune_hyperparam(RandomForestClassifier, hyperparam_grid, X_train, y_train, fixed_hyperparams)
compute_scores(clf_rf, X_train, X_test, y_train, y_test)

Best hyperparameters found: {'max_features': 'sqrt', 'n_estimators': 50}
Training accuracy: 1.0
Test accuarcy: 0.9333333333333333
Wall time: 13.3 s


### Gradient Boosting

In [25]:
%%time
# Define hyperparameters to be searched during tuning
hyperparam_grid = {
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.5, 0.1, 0.05, 0.01],
    'max_features': ['sqrt', 'log2']
}

# Define hyperparameters to be fixed
fixed_hyperparams = {
    'random_state': 0
}

clf_gbm = tune_hyperparam(GradientBoostingClassifier, hyperparam_grid, X_train, y_train, fixed_hyperparams)
compute_scores(clf_gbm, X_train, X_test, y_train, y_test)

Best hyperparameters found: {'learning_rate': 0.5, 'max_features': 'sqrt', 'n_estimators': 500}
Training accuracy: 1.0
Test accuarcy: 1.0
Wall time: 3min 45s


## Save Models

In [26]:
# Logistic Regression
joblib.dump(clf_logreg, MODELS_DIR / MODEL_LOGREG_FN)

# K-Nearest Neighbors
joblib.dump(clf_knn, MODELS_DIR / MODEL_KNN_FN)

# Random Forest
joblib.dump(clf_rf, MODELS_DIR / MODEL_RF_FN)

# Gradient Boosting
joblib.dump(clf_gbm, MODELS_DIR / MODEL_GBM_FN)

['C:\\Users\\thefo\\Documents\\git_repos\\app-iris-ml-react-fastapi\\models\\clf_gbm.pkl']