<a href="https://colab.research.google.com/github/woodRock/fishy-business/blob/main/code/identification/species/RO1_SO1_Identification_Species_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Research objective - Identification, Sub-objective - Species

This notebook contributes to the first research objective - fish identification. It addresses the first sub-objective - multi-class classification of fish species.

## Classification methods

This notebook explores traditional machine learning methods for the binary fish species classification task (Hoki vs. Mackerel), to identify which techniques are suitable for this task.

In [None]:
!pip install skfeature-chappers

Collecting skfeature-chappers
  Downloading skfeature_chappers-1.1.0-py3-none-any.whl (66 kB)
[?25l[K     |█████                           | 10 kB 27.7 MB/s eta 0:00:01[K     |█████████▉                      | 20 kB 24.9 MB/s eta 0:00:01[K     |██████████████▉                 | 30 kB 17.5 MB/s eta 0:00:01[K     |███████████████████▊            | 40 kB 15.2 MB/s eta 0:00:01[K     |████████████████████████▊       | 51 kB 7.2 MB/s eta 0:00:01[K     |█████████████████████████████▋  | 61 kB 8.4 MB/s eta 0:00:01[K     |████████████████████████████████| 66 kB 3.0 MB/s 
Installing collected packages: skfeature-chappers
Successfully installed skfeature-chappers-1.1.0


In [None]:
import scipy.io
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Reproducibility: the global NumPy seed is derived from the run index,
# so each numbered run gets its own repeatable seed.
run = 1
seed = run * 1617
np.random.seed(seed)

In [None]:
from google.colab import drive
import os

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Mount Google Drive so the dataset stored there is visible to the runtime.
drive.mount('/content/drive')

# Absolute path under the mount point, so loading does not depend on the
# notebook's current working directory.
path = os.path.join('/content', 'drive', 'MyDrive', 'AI', 'fish', 'REIMS_data.xlsx')

# Load the dataset (a single read; `raw` keeps the unfiltered sheet).
print("[INFO] Reading the dataset.")
raw = pd.read_excel(path)

# The 'm/z' column holds the per-row label string. Rows whose label contains
# 'HM' or 'QC' are excluded from the classification task.
# NOTE(review): presumably 'HM' marks Hoki/Mackerel mixtures and 'QC' marks
# quality-control samples - confirm against the data dictionary.
labels = raw['m/z']
excluded = labels.str.contains('HM') | labels.str.contains('QC')
data = raw[~excluded]

X = data.drop('m/z', axis=1)  # X contains only the features.
# Binary encodings for class labels (1 for Hoki, 0 for Mackerel):
# a label containing the letter 'H' is treated as Hoki.
y = data['m/z'].str.contains('H', regex=False).astype(int).to_numpy()

Mounted at /content/drive
[INFO] Reading the dataset.


In [None]:
# Sanity check: (rows, columns) of the feature matrix after filtering.
X.shape

(234, 1023)

In [None]:
from prettytable import PrettyTable

def show_results(results, label='Technique'):
    """Print a summary table of train/test scores for each technique.

    Args:
        results: Mapping from a technique name to a sequence of
            ``(train_score, test_score)`` pairs, one pair per evaluation.
        label: Header text for the first (name) column.

    Each row reports the mean and standard deviation (``np.std``) of the
    train scores and of the test scores, formatted to four decimal places.
    A technique with no recorded pairs is listed with ``n/a`` cells rather
    than failing when the empty list is unpacked.
    """
    table = PrettyTable([label, 'Train Mean', 'Train Std', 'Test Mean', 'Test Std'])

    for name, result in results.items():
        if len(result) == 0:
            # Nothing to aggregate; keep the row so the omission is visible.
            table.add_row([name] + ['n/a'] * 4)
            continue
        # Transpose [(train, test), ...] into (trains...), (tests...).
        train, test = zip(*result)
        stats = (np.mean(train), np.std(train), np.mean(test), np.std(test))
        table.add_row([name] + ['%.4f' % value for value in stats])

    print('\n')  # tqdm messes with the table border.
    print(table)

In [None]:
# NOTE: This feature-selection experiment is disabled - every line below is
# commented out, so running this cell does nothing. As written it would compare
# ReliefF, mRMR and chi2 (top 500 features via SelectKBest) feeding an
# L1-penalised LinearSVC under stratified 10-fold cross-validation.
# NOTE(review): if re-enabled, `X[train]` / `X[test]` would index the DataFrame
# `X` by column label, not by row position; the other cells in this notebook
# use `X.iloc[train]` - confirm and switch before running.

# from sklearn.preprocessing import MinMaxScaler

# from skfeature.function.similarity_based import reliefF
# from skfeature.function.information_theoretical_based import MRMR
# from sklearn.feature_selection import chi2
# from sklearn.feature_selection import SelectKBest
# from sklearn.model_selection import StratifiedKFold
# from sklearn.svm import LinearSVC as svm
# from sklearn.metrics import balanced_accuracy_score

# def normalize(X_train, X_test):
#     scaler = MinMaxScaler(feature_range=(0, 1))
#     scaler = scaler.fit(X_train)
#     X_train = scaler.transform(X_train)
#     X_test = scaler.transform(X_test)
#     return X_train, X_test

# runs=30
# methods = { "reliefF" : reliefF.reliefF, "mrmr": MRMR.mrmr, "chi2": chi2 } #, "pso": pso}
# results = { "reliefF" : [], "mrmr": [], "chi2": [] }# , "pso": []}
# penalty = 'l1'

# for i in tqdm(range(runs)):
#     for name, fs_method in methods.items():
#         train_accs = []
#         test_accs = []
#         skf = StratifiedKFold(n_splits=10, random_state=1234, shuffle=True)

#         for train, test in skf.split(X, y):
#             X_train, X_test = (X[train], X[test])
#             y_train, y_test = y[train], y[test]
#             X_train, X_test = normalize(X_train, X_test)

#             fs = SelectKBest(fs_method, k=500)
#             X_train = fs.fit_transform(X_train, y_train)
#             X_test = fs.transform(X_test)

#             model = svm(penalty='l1', dual=(penalty=='l2'), max_iter=10_000)
#             clf = model.fit(X_train, y_train)

#             y_predict = model.predict(X_train)
#             train_acc = balanced_accuracy_score(y_train, y_predict)
#             train_accs.append(train_acc)
#             y_predict = model.predict(X_test)
#             test_acc = balanced_accuracy_score(y_test, y_predict)
#             test_accs.append(test_acc)

#             results[name].append((train_acc, test_acc))

# show_results(results, label='FS Method')

100%|██████████| 30/30 [39:31<00:00, 79.06s/it]



+-----------+------------+-----------+-----------+----------+
| FS Method | Train Mean | Train Std | Test Mean | Test Std |
+-----------+------------+-----------+-----------+----------+
|  reliefF  |   0.9782   |   0.0104  |   0.8028  |  0.0558  |
|    mrmr   |   0.9744   |   0.0097  |   0.7879  |  0.1321  |
|    chi2   |   0.9632   |   0.0088  |   0.6486  |  0.1901  |
+-----------+------------+-----------+-----------+----------+





In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.svm import LinearSVC as svm
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.tree import DecisionTreeClassifier as dt
from sklearn.naive_bayes import GaussianNB as nb
from sklearn.metrics import balanced_accuracy_score

def normalize(X_train, X_test):
    """Min-max scale both splits using statistics from the training split.

    The scaler is fitted on ``X_train`` only and then applied to both
    splits, so the test split never influences the scaling parameters.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled)``.
    """
    fitted = MinMaxScaler(feature_range=(0, 1)).fit(X_train)
    return fitted.transform(X_train), fitted.transform(X_test)

# Classifier constructors to compare, keyed by the name shown in the table.
models = {'knn': knn, 'rf': rf, 'dt': dt, 'nb': nb, 'svm': svm}
# One list of (train_acc, test_acc) pairs per classifier, one pair per fold.
results = {name: [] for name in models}

# Repeated stratified 10-fold cross-validation. The shuffle seed is offset by
# the repetition index: with a single fixed random_state every repetition
# would reproduce the same ten folds, so repeating would add no information.
for repetition in tqdm(range(30)):
    skf = StratifiedKFold(n_splits=10, random_state=1234 + repetition, shuffle=True)
    for train, test in skf.split(X, y):
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y[train], y[test]
        # Scaling is fitted on the training fold only (see normalize).
        X_train, X_test = normalize(X_train, X_test)

        for name, make_model in models.items():
            # Fresh, default-configured estimator for every fold.
            model = make_model().fit(X_train, y_train)
            train_acc = balanced_accuracy_score(y_train, model.predict(X_train))
            test_acc = balanced_accuracy_score(y_test, model.predict(X_test))
            results[name].append((train_acc, test_acc))

show_results(results, label='Classifier')

100%|██████████| 30/30 [02:55<00:00,  5.84s/it]



+------------+------------+-----------+-----------+----------+
| Classifier | Train Mean | Train Std | Test Mean | Test Std |
+------------+------------+-----------+-----------+----------+
|    knn     |   0.9689   |   0.0056  |   0.9422  |  0.0497  |
|     rf     |   1.0000   |   0.0000  |   0.9872  |  0.0244  |
|     dt     |   1.0000   |   0.0000  |   0.9942  |  0.0147  |
|     nb     |   0.9316   |   0.0124  |   0.8926  |  0.0621  |
|    svm     |   1.0000   |   0.0000  |   0.9913  |  0.0174  |
+------------+------------+-----------+-----------+----------+





In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC as svc
from sklearn.svm import LinearSVC as lsvc
from sklearn.svm import NuSVC as nusvc
from sklearn.svm import LinearSVR as lsvr

# SVM implementations to compare, keyed by the name shown in the table.
models = {'svc': svc, 'nusvc': nusvc, 'lsvc': lsvc}
# One list of (train_acc, test_acc) pairs per SVM type, one pair per fold.
results = {name: [] for name in models}

# Repeated stratified 10-fold cross-validation. The shuffle seed is offset by
# the repetition index: with a single fixed random_state every repetition
# would reproduce the same ten folds, so repeating would add no information.
for repetition in tqdm(range(30)):
    skf = StratifiedKFold(n_splits=10, random_state=1234 + repetition, shuffle=True)
    for train, test in skf.split(X, y):
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y[train], y[test]
        # Scaling is fitted on the training fold only (see normalize).
        X_train, X_test = normalize(X_train, X_test)

        for name, make_model in models.items():
            # Fresh, default-configured estimator for every fold.
            model = make_model().fit(X_train, y_train)
            train_acc = balanced_accuracy_score(y_train, model.predict(X_train))
            test_acc = balanced_accuracy_score(y_test, model.predict(X_test))
            results[name].append((train_acc, test_acc))

show_results(results, label='SVM Type')

100%|██████████| 30/30 [16:12<00:00, 32.43s/it]



+----------+------------+-----------+-----------+----------+
| SVM Type | Train Mean | Train Std | Test Mean | Test Std |
+----------+------------+-----------+-----------+----------+
|   svc    |   0.7325   |   0.0354  |   0.4903  |  0.1214  |
|  nusvc   |   0.9031   |   0.0197  |   0.6236  |  0.1518  |
|   lsvc   |   1.0000   |   0.0000  |   0.8714  |  0.0854  |
+----------+------------+-----------+-----------+----------+





In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC as svm

# SVC kernels to compare; each name doubles as the key in `results`.
kernels = ['poly', 'rbf', 'sigmoid', 'linear']
# One list of (train_acc, test_acc) pairs per kernel, one pair per fold.
results = {kernel: [] for kernel in kernels}

# Repeated stratified 10-fold cross-validation. The splitter is built inside
# this cell so it does not depend on an `skf` object left over from another
# cell, and its shuffle seed is offset by the repetition index so that each
# repetition draws a different set of folds.
for repetition in tqdm(range(30)):
    skf = StratifiedKFold(n_splits=10, random_state=1234 + repetition, shuffle=True)
    for train, test in skf.split(X, y):
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y[train], y[test]
        # Scaling is fitted on the training fold only (see normalize).
        X_train, X_test = normalize(X_train, X_test)

        for kernel in kernels:
            # `svm` is sklearn.svm.SVC in this cell (see the import above).
            model = svm(kernel=kernel).fit(X_train, y_train)
            train_acc = balanced_accuracy_score(y_train, model.predict(X_train))
            test_acc = balanced_accuracy_score(y_test, model.predict(X_test))
            results[kernel].append((train_acc, test_acc))

show_results(results, label='kernel')

100%|██████████| 30/30 [01:52<00:00,  3.74s/it]



+---------+------------+-----------+-----------+----------+
|  kernel | Train Mean | Train Std | Test Mean | Test Std |
+---------+------------+-----------+-----------+----------+
|   poly  |   0.7063   |   0.0227  |   0.5389  |  0.0694  |
|   rbf   |   0.7325   |   0.0354  |   0.4903  |  0.1214  |
| sigmoid |   0.3747   |   0.0178  |   0.3347  |  0.0859  |
|  linear |   1.0000   |   0.0000  |   0.8736  |  0.1077  |
+---------+------------+-----------+-----------+----------+



