In [2]:
import numpy as np
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import optuna
from sklearn.svm import SVC

## Exercise 5: Classification

In [3]:
DATA_PATH = Path("../data/classification")

In [4]:
X_train = np.load(DATA_PATH/"X_train.npy")
y_train = np.load(DATA_PATH/"y_train.npy")
X_test = np.load(DATA_PATH/"X_test.npy")
y_test = np.load(DATA_PATH/"y_test.npy")

In [5]:
X_train.shape

(500, 50)

### SVC

For this exercise, the data of the problem are better scaled, with 500 samples for 50 features. Since we need to perform a classification, we will compare SVC and RandomForestClassifier, starting with SVC.

In [9]:
svc_linear_pipeline = make_pipeline(
    StandardScaler(),
    SVC(kernel="linear", random_state=0)
)

In [10]:
svc_linear_pipeline.fit(X_train, y_train)

In [13]:
scores = cross_val_score(svc_linear_pipeline, X_train, y_train, cv=5, n_jobs=-1, scoring="accuracy")
print(f"Cross-validation scores: {np.mean(scores)}")

Cross-validation scores: 0.8379999999999999


In [14]:
svc_linear_pipeline.score(X_test, y_test)

0.778

Using a linear kernel gives an accuracy of 0.778 on the test set. Let's try with a polynomial kernel.

In [15]:
svc_poly_pipeline = make_pipeline(
    StandardScaler(),
    SVC(kernel="poly", random_state=0)
)
svc_poly_pipeline.fit(X_train, y_train)
scores = cross_val_score(svc_poly_pipeline, X_train, y_train, cv=5, n_jobs=-1, scoring="accuracy")
print(f"Cross-validation scores: {np.mean(scores)}")

Cross-validation scores: 0.7899999999999998


In [17]:
svc_poly_pipeline.score(X_test, y_test)

0.862

This time with a polynomial kernel, we get an accuracy of 0.862 on the test set, but a lower accuracy on the training set (0.789). Let's use Optuna to find the best hyperparameters.

In [18]:
def objective_svc(trial):
    C = trial.suggest_loguniform("C", 1e-3, 1e3)
    degree = trial.suggest_int("degree", 1, 5)
    kernel = trial.suggest_categorical("kernel", ["linear", "poly", "rbf"])
    svc_poly_pipeline = make_pipeline(
        StandardScaler(),
        SVC(kernel=kernel, degree=degree, C=C, random_state=0)
    )
    scores = cross_val_score(svc_poly_pipeline, X_train, y_train, cv=5, n_jobs=-1, scoring="accuracy")
    return np.mean(scores)

study = optuna.create_study(direction="maximize")
study.optimize(objective_svc, n_trials=100)

[I 2023-06-19 10:56:17,447] A new study created in memory with name: no-name-8fd013cb-0d95-4787-b16a-42b706d19c11
  C = trial.suggest_loguniform("C", 1e-3, 1e3)
[I 2023-06-19 10:56:17,534] Trial 0 finished with value: 0.788 and parameters: {'C': 0.1618288917185804, 'degree': 2, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.788.
  C = trial.suggest_loguniform("C", 1e-3, 1e3)
[I 2023-06-19 10:56:17,563] Trial 1 finished with value: 0.73 and parameters: {'C': 0.14291228751357968, 'degree': 3, 'kernel': 'rbf'}. Best is trial 0 with value: 0.788.
  C = trial.suggest_loguniform("C", 1e-3, 1e3)
[I 2023-06-19 10:56:17,594] Trial 2 finished with value: 0.828 and parameters: {'C': 101.2348680432446, 'degree': 2, 'kernel': 'rbf'}. Best is trial 2 with value: 0.828.
  C = trial.suggest_loguniform("C", 1e-3, 1e3)
[I 2023-06-19 10:56:17,639] Trial 3 finished with value: 0.5039999999999999 and parameters: {'C': 0.012583991540460358, 'degree': 4, 'kernel': 'sigmoid'}. Best is trial 2 with value:

In [19]:
svc_linear_pipeline = make_pipeline(
    StandardScaler(),
    SVC(kernel=str(study.best_params["kernel"]), C=study.best_params["C"], degree=study.best_params["degree"], random_state=0)
)

In [20]:
scores = cross_val_score(svc_linear_pipeline, X_train, y_train, cv=5, n_jobs=-1, scoring="accuracy")
print(f"Cross-validation scores: {np.mean(scores)}")

Cross-validation scores: 0.8399999999999999


In [21]:
svc_linear_pipeline.fit(X_train, y_train)
svc_linear_pipeline.score(X_test, y_test)

0.786

In [22]:
study.best_params

{'C': 983.8854842875078, 'degree': 4, 'kernel': 'linear'}

Interestingly, the best hyperparameters found by Optuna provide a lower accuracy on the test set (0.786) than the default ones with a polynomial kernel. Let's rerun the optimization with the polynomial kernel to find the best hyperparameters.

In [23]:
def objective_svc_poly(trial):
    C = trial.suggest_loguniform("C", 1e-3, 1e3)
    degree = trial.suggest_int("degree", 1, 5)
    svc_poly_pipeline = make_pipeline(
        StandardScaler(),
        SVC(kernel="poly", degree=degree, C=C, random_state=0)
    )
    scores = cross_val_score(svc_poly_pipeline, X_train, y_train, cv=5, n_jobs=-1, scoring="accuracy")
    return np.mean(scores)

study = optuna.create_study(direction="maximize")
study.optimize(objective_svc_poly, n_trials=100)

[I 2023-06-19 11:19:40,353] A new study created in memory with name: no-name-ce20dff6-18b7-4bd1-b0ec-6fe1dc761c42
  C = trial.suggest_loguniform("C", 1e-3, 1e3)
[I 2023-06-19 11:19:41,108] Trial 0 finished with value: 0.526 and parameters: {'C': 17.767831989121536, 'degree': 4}. Best is trial 0 with value: 0.526.
  C = trial.suggest_loguniform("C", 1e-3, 1e3)
[I 2023-06-19 11:19:41,351] Trial 1 finished with value: 0.734 and parameters: {'C': 0.04807130459011856, 'degree': 1}. Best is trial 1 with value: 0.734.
  C = trial.suggest_loguniform("C", 1e-3, 1e3)
[I 2023-06-19 11:19:41,371] Trial 2 finished with value: 0.5039999999999999 and parameters: {'C': 0.0014899629338372702, 'degree': 5}. Best is trial 1 with value: 0.734.
  C = trial.suggest_loguniform("C", 1e-3, 1e3)
[I 2023-06-19 11:19:41,387] Trial 3 finished with value: 0.508 and parameters: {'C': 2.6691674566251398, 'degree': 4}. Best is trial 1 with value: 0.734.
  C = trial.suggest_loguniform("C", 1e-3, 1e3)
[I 2023-06-19 11:1

In [24]:
svc_poly_pipeline = make_pipeline(
    StandardScaler(),
    SVC(kernel="poly", degree=study.best_params["degree"], C=study.best_params["C"], random_state=0)
)

In [25]:
scores = cross_val_score(svc_poly_pipeline, X_train, y_train, cv=5, n_jobs=-1, scoring="accuracy")
print(f"Cross-validation scores: {np.mean(scores)}")

Cross-validation scores: 0.8480000000000001


In [26]:
svc_poly_pipeline.fit(X_train, y_train)
svc_poly_pipeline.score(X_test, y_test)

0.778

In [27]:
study.best_params

{'C': 98.07986314469444, 'degree': 1}

Once again, it looks like going with default parameter for the polynomial kernel is the best choice.

### RandomForestClassifier

Let's try with RandomForestClassifier.

In [28]:
rfc_pipeline = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(random_state=0)
)

In [29]:
scores = cross_val_score(rfc_pipeline, X_train, y_train, cv=5, n_jobs=-1, scoring="accuracy")
print(f"Cross-validation scores: {np.mean(scores)}")

Cross-validation scores: 0.734


In [30]:
rfc_pipeline.fit(X_train, y_train)
rfc_pipeline.score(X_test, y_test)

0.846

Suprisingly, the default hyperparameters allow us to almost beat the threshold (0.85). Intuitively, we could think about restricting the depth of the trees to avoid overfitting. Let's try with a max_depth of 5.

In [60]:
rfc_pipeline = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(max_depth=5, random_state=0)
)
scores = cross_val_score(rfc_pipeline, X_train, y_train, cv=5, n_jobs=-1, scoring="accuracy")
print(f"Cross-validation scores: {np.mean(scores)}")
rfc_pipeline.fit(X_train, y_train)
rfc_pipeline.score(X_test, y_test)

Cross-validation scores: 0.7380000000000001


0.852

We beat the 0.85 mark with this small change. Let's try to optimize the hyperparameters with Optuna.

In [47]:
def objective_rfc(trial):
    n_estimators = trial.suggest_int("n_estimators", 1, 100)
    max_depth = trial.suggest_int("max_depth", 3, 7)
    rfc_pipeline = make_pipeline(
        StandardScaler(),
        RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=0)
    )
    scores = cross_val_score(rfc_pipeline, X_train, y_train, cv=5, n_jobs=-1, scoring="accuracy")
    return np.mean(scores)

study = optuna.create_study(direction="maximize")
study.optimize(objective_rfc, n_trials=500)

[I 2023-06-19 11:25:13,839] A new study created in memory with name: no-name-7ae27b57-f097-4e5d-b796-7861976c4668
[I 2023-06-19 11:25:13,871] Trial 0 finished with value: 0.6679999999999999 and parameters: {'n_estimators': 6, 'max_depth': 5}. Best is trial 0 with value: 0.6679999999999999.
[I 2023-06-19 11:25:13,952] Trial 1 finished with value: 0.742 and parameters: {'n_estimators': 62, 'max_depth': 3}. Best is trial 1 with value: 0.742.
[I 2023-06-19 11:25:13,985] Trial 2 finished with value: 0.728 and parameters: {'n_estimators': 13, 'max_depth': 6}. Best is trial 1 with value: 0.742.
[I 2023-06-19 11:25:14,035] Trial 3 finished with value: 0.7020000000000001 and parameters: {'n_estimators': 11, 'max_depth': 5}. Best is trial 1 with value: 0.742.
[I 2023-06-19 11:25:14,159] Trial 4 finished with value: 0.752 and parameters: {'n_estimators': 56, 'max_depth': 7}. Best is trial 4 with value: 0.752.
[I 2023-06-19 11:25:14,197] Trial 5 finished with value: 0.7219999999999999 and paramete

In [57]:
rfc_pipeline = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(n_estimators=study.best_params["n_estimators"], max_depth=study.best_params["max_depth"], random_state=0)
)

In [58]:
scores = cross_val_score(rfc_pipeline, X_train, y_train, cv=5, n_jobs=-1, scoring="accuracy")
print(f"Cross-validation scores: {np.mean(scores)}")

Cross-validation scores: 0.772


In [59]:
rfc_pipeline.fit(X_train, y_train)
rfc_pipeline.score(X_test, y_test)

0.836

With the RandomForestClassifier too, setting the max_depth to 5 and leaving the other hyperparameters to their default values gives the best accuracy on the test set (0.856) than the optimized hyperparameters. Let's rerun the optimization with the max_depth set to 5.

In [66]:
def objective_rfc(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 150)
    max_features = trial.suggest_categorical("max_features", ["sqrt", "log2"])
    rfc_pipeline = make_pipeline(
        StandardScaler(),
        RandomForestClassifier(n_estimators=n_estimators, max_depth=5, max_features=max_features, random_state=0)
    )
    scores = cross_val_score(rfc_pipeline, X_train, y_train, cv=5, n_jobs=-1, scoring="accuracy")
    return np.mean(scores)

study = optuna.create_study(direction="maximize")
study.optimize(objective_rfc, n_trials=200)

[I 2023-06-19 11:34:23,149] A new study created in memory with name: no-name-050839c6-a80a-454a-a275-e4ff3924bb04
[I 2023-06-19 11:34:23,922] Trial 0 finished with value: 0.73 and parameters: {'n_estimators': 100, 'max_features': 'log2'}. Best is trial 0 with value: 0.73.
[I 2023-06-19 11:34:24,305] Trial 1 finished with value: 0.734 and parameters: {'n_estimators': 101, 'max_features': 'log2'}. Best is trial 1 with value: 0.734.
[I 2023-06-19 11:34:24,707] Trial 2 finished with value: 0.736 and parameters: {'n_estimators': 86, 'max_features': 'log2'}. Best is trial 2 with value: 0.736.
[I 2023-06-19 11:34:25,027] Trial 3 finished with value: 0.728 and parameters: {'n_estimators': 57, 'max_features': 'log2'}. Best is trial 2 with value: 0.736.
[I 2023-06-19 11:34:25,107] Trial 4 finished with value: 0.728 and parameters: {'n_estimators': 61, 'max_features': 'log2'}. Best is trial 2 with value: 0.736.
[I 2023-06-19 11:34:25,248] Trial 5 finished with value: 0.7380000000000001 and parame

In [68]:
rfc_pipeline = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(n_estimators=study.best_params["n_estimators"], max_depth=5, max_features=study.best_params["max_features"], random_state=0)
)
scores = cross_val_score(rfc_pipeline, X_train, y_train, cv=5, n_jobs=-1, scoring="accuracy")
print(f"Cross-validation scores: {np.mean(scores)}")
rfc_pipeline.fit(X_train, y_train)
rfc_pipeline.score(X_test, y_test)

Cross-validation scores: 0.754


0.85

In [69]:
study.best_params

{'n_estimators': 132, 'max_features': 'sqrt'}

Optimizing the number of estimators and max features (which turns out to be the default value) gives a flat 0.85, which is less than the default value, but it preforms better on the training set.

### Conclusion

In this exercise, we compared SVC and RandomForestClassifier. We found that SVC with a polynomial kernel and RandomForestClassifier with a max_depth of 5 both beat the 0.85 mark. We also found that optimizing the hyperparameters with Optuna did not improve the accuracy on the test set.