# Classification

We then build a logistic regression classifier on top of the extracted features of the image patches.

In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from algutils import SIFT_feature

In [2]:
# Fix the random seed so that the results are reproducible.
# `seed` seeds NumPy's global random number generator here and is also passed
# as `random_state` to the logistic regression classifier defined further down.

seed = 12345
np.random.seed(seed)

In [3]:
# Load the training and test datasets from their pickle files.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.

df_train = pd.read_pickle("dataset_train.pkl")
df_test = pd.read_pickle("dataset_test.pkl")

## If you want to use our precomputed SIFT features, you can replace the above statements with:

#     df_train = pd.read_pickle("dataset_train_with_SIFT.pkl")
#     df_test = pd.read_pickle("dataset_test_with_SIFT.pkl")

In [4]:
# Load the fitted SIFT feature extractor; `SIFT_feature` (from algutils) is given
# the path of the saved extractor file.
sift_feature_extractor = SIFT_feature(filename = "feature_extractor.pkl")

## Training

In [5]:
# Define the model: features are standardised first and then classified by an
# L1-penalised logistic regression.

lr = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    C=1,
    class_weight="balanced",
    max_iter=1000,
    random_state=seed,   # same seed as the NumPy RNG, for reproducibility
    verbose=1,
)

# The step names ('scaler', 'classifier') identify the stages inside the pipeline.
pipeline_steps = [("scaler", StandardScaler()), ("classifier", lr)]
clf = Pipeline(pipeline_steps)

In [6]:
# If the feature vectors of the training set have not been computed before, compute them now
# and write the augmented table back to disk so that later runs can load it directly.
if "feature_SIFT" not in df_train:

    # The extractor loaded earlier is bound to `sift_feature_extractor`; the bare
    # name `sift` is not defined anywhere in this notebook and would raise a NameError.
    df_train.loc[:, "feature_SIFT"] = list( sift_feature_extractor.transform(df_train["filedir"]) )
    df_train.to_csv("dataset_train.csv")
    df_train.to_pickle("dataset_train.pkl")

In [7]:
# Train the pipeline (scaler + logistic regression) on the training set.
X_train = np.stack(df_train["feature_SIFT"])  # per-sample feature vectors stacked into one array
y_train = np.stack(df_train["label"])         # matching class labels
clf = clf.fit(X_train, y_train)

[LibLinear]

## Testing

In [8]:
# Compute the feature vectors of the test set unless the loaded table already
# has them, then save the augmented table as both CSV and pickle.
if "feature_SIFT" not in df_test:
    test_features = sift_feature_extractor.transform(df_test["filedir"])
    df_test.loc[:, "feature_SIFT"] = list(test_features)
    df_test.to_csv("dataset_test.csv")
    df_test.to_pickle("dataset_test.pkl")

100%|██████████████████████████████████████████████████████████████████████████████| 1457/1457 [06:13<00:00,  3.90it/s]


Obtain the evaluation metrics in **Supplementary Fig. S15e**.

In [8]:
# Evaluate the fitted pipeline on the test set.
# The sklearn.metrics functions used here are imported in the imports cell at the
# top of the notebook, together with all other dependencies.

X_test = np.stack(df_test.feature_SIFT) # stacked once and reused by both prediction calls
y_test = np.stack(df_test.label) # the ground truth
y_pred = clf.predict(X_test) # the predicted label
y_prob = clf.predict_proba(X_test) # the predicted probability for each label

# Precision, recall and F1 are macro-averaged over the classes; the AUC is the
# macro-averaged one-vs-rest ROC AUC computed from the predicted probabilities.
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="macro")
recall = recall_score(y_test, y_pred, average="macro")
f1 = f1_score(y_test, y_pred, average="macro")
auc = roc_auc_score(y_test, y_prob, average="macro", multi_class="ovr")

print({
        "Accuracy" : acc, 
        "Precision" : precision, 
        "Recall": recall, 
        "F1 score": f1, 
        "AUC" : auc
    })

{'Accuracy': 0.9910775566231984, 'Precision': 0.9902713205261477, 'Recall': 0.9891710718864298, 'F1 score': 0.9897069872471881, 'AUC': 0.9988634195451765}


Obtain the confusion matrix in **Supplementary Fig. S15f**.

In [9]:
# Confusion matrix of the test-set predictions; rows and columns follow the
# explicit class order given below (rows: ground truth, columns: prediction).
label_order = ["low", "optimal", "high"]
confusion_mat = confusion_matrix(y_test, y_pred, labels=label_order)

print("confusion matrix:", confusion_mat)

confusion matrix: [[561   4   0]
 [  2 331   6]
 [  0   1 552]]
