<a href="https://colab.research.google.com/github/valsson-group/UNT-ChemicalApplicationsOfMachineLearning-Spring2026/blob/main/Lecture-11_February-26-2026/Lecture-11_Classification_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Lecture 11 - Classification



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Here we create an example dataset for classification with 4 classes.

In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# --- Dataset configuration ---
n_classes=4      # number of distinct target labels
n_samples=1000   # total number of generated samples

# Set to True to generate classes of very different sizes.
uneven_classes = False

if uneven_classes:
  # Relative class sizes, normalised so that the fractions sum to 1.
  weights = np.array([4, 1, 0.4, 0.1])
  weights = weights/np.sum(weights)
else:
  # No explicit class proportions are passed to make_classification.
  weights=None

# Synthetic classification problem: 12 features, of which 4 are informative
# and 1 is a redundant combination; no feature is repeated.
features, target = make_classification(
    n_samples=n_samples,
    n_features=12,
    n_informative=4,
    n_redundant=1,
    n_repeated=0,
    n_classes=n_classes,
    shuffle=True,
    weights=weights
)

# One label per feature column. The feature matrix is called `features`
# in this notebook (there is no variable `X`).
feature_names = [f"feature {i}" for i in range(features.shape[1])]

# Report how many samples ended up in each class (count and percentage).
print("Size of each class")
for i in range(n_classes):
  num = np.sum(target == i)
  perc = num/n_samples
  print("- {:d}: {:d} ({:.1f}%)".format(i,num,perc*100))

In [None]:
# Show the shape of the label array returned by make_classification.
print(target.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Random forest with trees limited to depth 2, evaluated on a single
# train/test split.
model = RandomForestClassifier(max_depth=2)

# Hold out 20% of the samples for testing.
features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    test_size=0.2,
)

model.fit(features_train, target_train)
target_test_predicted = model.predict(features_test)

# (format string, score) pairs; printed in this order.
report_rows = [
    ("Accuracy:                 {:.4f}",
     metrics.accuracy_score(target_test, target_test_predicted)),
    ("Precision (micro):        {:.4f}",
     metrics.precision_score(target_test, target_test_predicted, average='micro')),
    ("Precision (macro):        {:.4f}",
     metrics.precision_score(target_test, target_test_predicted, average='macro')),
    ("Recall (micro):           {:.4f}",
     metrics.recall_score(target_test, target_test_predicted, average='micro')),
    ("Recall (macro):           {:.4f}",
     metrics.recall_score(target_test, target_test_predicted, average='macro')),
]
for line_format, value in report_rows:
    print(line_format.format(value))

# Confusion matrix of true vs. predicted test labels.
cfm = metrics.ConfusionMatrixDisplay.from_predictions(target_test, target_test_predicted)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import cross_validate,ShuffleSplit

# Metrics evaluated on every cross-validation split.
# zero_division=np.nan marks undefined precision/recall values as NaN instead
# of 0, which is why the summaries below use the NaN-aware mean/std.
scoring = {'accuracy':'accuracy',
            'recall_micro': metrics.make_scorer(metrics.recall_score, zero_division=np.nan, average='micro'),
            'precision_micro': metrics.make_scorer(metrics.precision_score, zero_division=np.nan,average='micro'),
            'recall_macro': metrics.make_scorer(metrics.recall_score, zero_division=np.nan, average='macro'),
            'precision_macro': metrics.make_scorer(metrics.precision_score, zero_division=np.nan,average='macro'),
            'roc_auc_ovr': 'roc_auc_ovr',
            'roc_auc_ovo': 'roc_auc_ovo'
}

# 100 random train/test splits, each holding out 20% of the data.
NumSplits=100
cv_random = ShuffleSplit(n_splits=NumSplits, test_size=0.2)

# No max_depth is passed here, so the classifier's own default is used.
model = RandomForestClassifier(n_estimators=100)

# Evaluate the model using cross-validation.
scores_random = cross_validate(
    model,
    features, target,
    scoring=scoring,
    cv=cv_random,
    return_train_score=True,
    return_estimator=True,
    return_indices=True
)

# Report the depth setting actually used by this model. The loop variable `d`
# of the hyperparameter search does not exist yet at this point.
print("max_depth={}".format(model.max_depth))
print("- Accuracy:               {:.3f} +- {:.3f}".format(scores_random['test_accuracy'].mean(), scores_random['test_accuracy'].std()))
print("- ROC AUC (OVR, Macro):   {:.3f} +- {:.3f}".format(scores_random['test_roc_auc_ovr'].mean(), scores_random['test_roc_auc_ovr'].std()))
print("- ROC AUC (OVO, Macro):   {:.3f} +- {:.3f}".format(scores_random['test_roc_auc_ovo'].mean(), scores_random['test_roc_auc_ovo'].std()))
print("- Precision (Micro):      {:.3f} +- {:.3f}".format(np.nanmean(scores_random['test_precision_micro']), np.nanstd(scores_random['test_precision_micro'])))
print("- Precision (Macro):      {:.3f} +- {:.3f}".format(np.nanmean(scores_random['test_precision_macro']), np.nanstd(scores_random['test_precision_macro'])))
# Recall rows read the recall scorers (not the precision ones).
print("- Recall (Micro):         {:.3f} +- {:.3f}".format(np.nanmean(scores_random['test_recall_micro']), np.nanstd(scores_random['test_recall_micro'])))
print("- Recall (Macro):         {:.3f} +- {:.3f}".format(np.nanmean(scores_random['test_recall_macro']), np.nanstd(scores_random['test_recall_macro'])))
print(" ")


### Y-scrambling / Y-randomization

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Y-scrambling: train on shuffled labels, then score against the true test labels.
model = RandomForestClassifier()

features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    test_size=0.2,
)

# Set to False to train on the real (unshuffled) training labels.
scramble = True

# This works for numpy arrays: drawing every label once without replacement
# is a random permutation of the training labels.
rng = np.random.default_rng()
if scramble:
    target_train_used = rng.choice(target_train, size=target_train.size, replace=False)
else:
    target_train_used = target_train.copy()

# If you are using a pandas DataFrame/Series you should use something like
# target_train_used = target_train.sample(frac=1.0).reset_index(drop=True) if scramble else target_train.copy()

model.fit(features_train, target_train_used)
target_test_predicted = model.predict(features_test)

# (format string, score) pairs; printed in this order.
report_rows = [
    ("Accuracy:                 {:.4f}",
     metrics.accuracy_score(target_test, target_test_predicted)),
    ("Precision (micro):        {:.4f}",
     metrics.precision_score(target_test, target_test_predicted, average='micro')),
    ("Precision (macro):        {:.4f}",
     metrics.precision_score(target_test, target_test_predicted, average='macro')),
    ("Recall (micro):           {:.4f}",
     metrics.recall_score(target_test, target_test_predicted, average='micro')),
    ("Recall (macro):           {:.4f}",
     metrics.recall_score(target_test, target_test_predicted, average='macro')),
]
for line_format, value in report_rows:
    print(line_format.format(value))

# Confusion matrix of true vs. predicted test labels.
cfm = metrics.ConfusionMatrixDisplay.from_predictions(target_test, target_test_predicted)


## Hyperparameter search by hand



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import cross_validate,ShuffleSplit

# Metrics evaluated on every cross-validation split.
# zero_division=np.nan marks undefined precision/recall values as NaN instead
# of 0, which is why the summaries below use the NaN-aware mean/std.
scoring = {'accuracy':'accuracy',
            'recall_micro': metrics.make_scorer(metrics.recall_score, zero_division=np.nan, average='micro'),
            'precision_micro': metrics.make_scorer(metrics.precision_score, zero_division=np.nan,average='micro'),
            'recall_macro': metrics.make_scorer(metrics.recall_score, zero_division=np.nan, average='macro'),
            'precision_macro': metrics.make_scorer(metrics.precision_score, zero_division=np.nan,average='macro'),
            'roc_auc_ovr': 'roc_auc_ovr',
            'roc_auc_ovo': 'roc_auc_ovo'
}

# Candidate tree depths to compare by hand.
max_depths = [1, 2, 3]

# 100 random train/test splits, each holding out 20% of the data.
NumSplits=100
cv_random = ShuffleSplit(n_splits=NumSplits, test_size=0.2)

for d in max_depths:
    model = RandomForestClassifier(n_estimators=100, max_depth=d)

    # Evaluate the model using cross-validation.
    scores_random = cross_validate(
        model,
        features, target,
        scoring=scoring,
        cv=cv_random,
        return_train_score=True,
        return_estimator=True,
        return_indices=True
    )

    print("max_depth={:d}".format(d))
    print("- Accuracy:               {:.3f} +- {:.3f}".format(scores_random['test_accuracy'].mean(), scores_random['test_accuracy'].std()))
    print("- ROC AUC (OVR, Macro):   {:.3f} +- {:.3f}".format(scores_random['test_roc_auc_ovr'].mean(), scores_random['test_roc_auc_ovr'].std()))
    print("- ROC AUC (OVO, Macro):   {:.3f} +- {:.3f}".format(scores_random['test_roc_auc_ovo'].mean(), scores_random['test_roc_auc_ovo'].std()))
    print("- Precision (Micro):      {:.3f} +- {:.3f}".format(np.nanmean(scores_random['test_precision_micro']), np.nanstd(scores_random['test_precision_micro'])))
    print("- Precision (Macro):      {:.3f} +- {:.3f}".format(np.nanmean(scores_random['test_precision_macro']), np.nanstd(scores_random['test_precision_macro'])))
    # Recall rows read the recall scorers (not the precision ones).
    print("- Recall (Micro):         {:.3f} +- {:.3f}".format(np.nanmean(scores_random['test_recall_micro']), np.nanstd(scores_random['test_recall_micro'])))
    print("- Recall (Macro):         {:.3f} +- {:.3f}".format(np.nanmean(scores_random['test_recall_macro']), np.nanstd(scores_random['test_recall_macro'])))
    print(" ")



## Hyperparameter search using GridSearchCV

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import cross_validate,ShuffleSplit
from sklearn.model_selection import train_test_split, GridSearchCV

def _averaged_scorer(metric, average):
    """Wrap a precision/recall metric as a scorer with the given averaging mode."""
    return metrics.make_scorer(metric, zero_division=np.nan, average=average)


# Same metric table as in the cross-validation cells. Note that the grid
# search below is driven by plain accuracy only (scoring='accuracy').
scoring = {
    'accuracy': 'accuracy',
    'recall_micro': _averaged_scorer(metrics.recall_score, 'micro'),
    'precision_micro': _averaged_scorer(metrics.precision_score, 'micro'),
    'recall_macro': _averaged_scorer(metrics.recall_score, 'macro'),
    'precision_macro': _averaged_scorer(metrics.precision_score, 'macro'),
    'roc_auc_ovr': 'roc_auc_ovr',
    'roc_auc_ovo': 'roc_auc_ovo',
}

# 100 random train/test splits, each holding out 20% of the data.
NumSplits = 100
cv_random = ShuffleSplit(n_splits=NumSplits, test_size=0.2)

# Hyperparameter grid: only the tree depth is varied.
parameters = {'max_depth': [2, 3, 4, 5]}

# Forest of 100 trees; max_depth is set by the grid search.
model = RandomForestClassifier(n_estimators=100)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    cv=cv_random,
    scoring='accuracy',
)
grid_search.fit(features, target)

# Best depth setting and its cross-validated accuracy.
print("Best Params:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


In [None]:
# Show the grid-search results as a table, one row per parameter setting.
# The per-split columns ("split0_...", "split1_...", ...) are left out:
# with 100 splits they swamp the summary columns (mean/std/rank).
pd.DataFrame(grid_search.cv_results_).filter(regex=r"^(?!split)")