# 2022-08-23: AutoML Experiments

### Authors

* Kevin Chu (kevin@velexi.com)


### Overview

This Jupyter notebook explores the use of AutoML to quickly assess multiple common ML models for texture classification. The images used to generate the features used in this notebook were obtained from the texture dataset made available by the Signal and Image Processing Institute at the University of Southern California (https://sipi.usc.edu/database/database.php?volume=textures).


### Function Signature

* __input__: grayscale image (stored as a 2D numpy array)

* __output__: grayscale image of LBP image, LGP histogram

* __data processing__: compute the LBP image and LGP histogram for the input image

## History

### 2022-08-23

- Initial version of notebook.

## Experimentation & Development

### Imports

In [1]:
# --- Imports

# Standard library
import json
import os

# External packages
import pandas as pd
from pycaret import classification

### Parameters

In [2]:
# --- Dataset parameters

# Directory that holds metadata.csv and the per-image feature files
data_dir = os.path.join(os.pardir, "data", "final", "texture-classification")

# --- AutoML parameters

# Name under which the AutoML experiment is logged
experiment_name = "automl-test"

# Number of top-ranked models to keep from the model comparison
num_best_models = 7

### Prepare Data

In [3]:
# --- Prepare dataset
#
# Builds `data_df`: one row per entry of metadata.csv, holding the texture
# feature vector read from that entry's JSON file (columns "texture-<i>")
# followed by the metadata columns other than "file".

# Load metadata
metadata_path = os.path.join(data_dir, "metadata.csv")
metadata_df = pd.read_csv(metadata_path)
if metadata_df.empty:
    # Without at least one record the number of feature columns is undefined.
    raise ValueError(f"No records found in '{metadata_path}'")

# Load the "texture" feature vector of every record (each file is read once)
texture_vectors = []
for file_name in metadata_df["file"]:
    with open(os.path.join(data_dir, file_name), 'r') as file_:
        texture_vectors.append(json.load(file_)["texture"])

# Construct columns for texture features. The first record defines the
# expected number of features.
num_features = len(texture_vectors[0])
feature_columns = [f"texture-{i}" for i in range(num_features)]

# Fail loudly on inconsistent feature files: zip() would silently truncate a
# longer vector and a shorter one would be padded with NaN in the DataFrame.
for file_name, vector in zip(metadata_df["file"], texture_vectors):
    if len(vector) != num_features:
        raise ValueError(
            f"'{file_name}' has {len(vector)} texture features; "
            f"expected {num_features}"
        )

# Assemble the feature table (row order matches metadata_df)
records = [dict(zip(feature_columns, vector)) for vector in texture_vectors]
features_df = pd.DataFrame.from_records(records, columns=feature_columns)

# Construct DataFrame for model training and testing. Rows are aligned by
# position: both frames carry the default 0..n-1 index. The "file" column is
# only needed to locate the feature files, so it is dropped.
data_df = features_df.merge(
    metadata_df, left_index=True, right_index=True
).drop(columns="file")

# --- Check DataFrame

print(f"Number of records: {len(data_df.index)}")
print(f"Columns: {list(data_df.columns)}")
data_df.head()

Number of records: 180
Columns: ['texture-0', 'texture-1', 'texture-2', 'texture-3', 'texture-4', 'texture-5', 'texture-6', 'texture-7', 'texture-8', 'texture-9', 'texture-10', 'target']


Unnamed: 0,texture-0,texture-1,texture-2,texture-3,texture-4,texture-5,texture-6,texture-7,texture-8,texture-9,texture-10,target
0,0.095673,0.086304,0.064758,0.061218,0.067444,0.064316,0.050446,0.051987,0.081924,0.08342,0.292511,bark
1,0.095963,0.08812,0.065109,0.059372,0.066528,0.064468,0.046875,0.052628,0.082672,0.084244,0.294022,bark
2,0.12085,0.095428,0.041687,0.0383,0.034439,0.039825,0.047562,0.057449,0.085831,0.101822,0.336807,fabric
3,0.08876,0.088852,0.053574,0.04631,0.057465,0.078674,0.071365,0.073822,0.083038,0.078018,0.280121,foam
4,0.128555,0.097183,0.033188,0.029572,0.042633,0.041458,0.026047,0.036942,0.093079,0.110519,0.360825,straw


### Perform AutoML Evaluation

In [4]:
# --- Perform AutoML Evaluation

# Seed for PyCaret's random number generation. Without an explicit
# `session_id`, setup() draws a new random seed on every run, so the
# train/test split and the scores change between runs. The value is pinned to
# the session id shown in the saved setup output of this notebook.
automl_session_id = 8892

# Set up the dataset for AutoML
#   - target: column of data_df holding the class label
#   - log_experiment / experiment_name: log the run under `experiment_name`
#   - silent / html: run without interactive confirmation or HTML widgets
classification.setup(data=data_df,
                     target="target",
                     session_id=automl_session_id,
                     log_experiment=True,
                     experiment_name=experiment_name,
                     silent=True,
                     html=False)

# Automatically train, test, and evaluate models, keeping the
# `num_best_models` top-ranked models as a list
best_models = classification.compare_models(n_select=num_best_models,
                                            verbose=False)

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,8892
1,Target,target
2,Target Type,Multiclass
3,Label Encoded,"bark: 0, brick: 1, fabric: 2, foam: 3, grass: ..."
4,Original Data,"(180, 12)"
5,Missing Values,False
6,Numeric Features,11
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


### Analyze Results

In [5]:
# Show each selected model's parameters, one blank line after each
for estimator in best_models:
    print(estimator, end="\n\n")

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform')

GaussianNB(priors=None, var_smoothing=1e-09)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=8892, verbose=0,
                       warm_start=False)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
                           solver='svd', store_covariance=False, tol=0.0001)

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=Non

In [6]:
# Display score table
#
# The call is the last expression of the cell, so the notebook renders its
# return value as a table.
# NOTE(review): pull() presumably returns the most recent score grid, here the
# compare_models() results, as the output below suggests - confirm against the
# PyCaret documentation.
classification.pull()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.161
nb,Naive Bayes,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.152
rf,Random Forest Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.039
lda,Linear Discriminant Analysis,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.004
et,Extra Trees Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.034
lightgbm,Light Gradient Boosting Machine,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.013
gbc,Gradient Boosting Classifier,0.9603,0.9992,0.95,0.9505,0.9501,0.9531,0.9566,0.08
dt,Decision Tree Classifier,0.9423,0.9657,0.95,0.9385,0.9354,0.933,0.9362,0.004
qda,Quadratic Discriminant Analysis,0.8737,0.9217,0.8875,0.8089,0.8314,0.8499,0.8687,0.142
ada,Ada Boost Classifier,0.8051,0.968,0.8208,0.7592,0.7668,0.7755,0.8029,0.015
