<a href="https://colab.research.google.com/github/woodRock/fishy-business/blob/main/Part.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install skfeature-chappers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import scipy.io
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
import pickle

In [None]:
run = 1
seed = 1617 * run
np.random.seed(seed)

In [None]:
from sklearn import preprocessing

def load_data(dataset="Fish"):
    folder = '' # folder = '../data/matlab/'
    mat = scipy.io.loadmat(folder + dataset + '.mat')
    X = mat['X']
    X = X.astype(float)
    y = mat['Y']
    y = y[:, 0]
    return X,y 

def get_labels(y):
    le = preprocessing.LabelEncoder()
    le.fit(y)
    y_ = le.transform(y)
    labels = le.inverse_transform(np.unique(y_))
    return y_, labels

dataset = "Part"
X,y = load_data(dataset=dataset)
y_, labels = get_labels(y)

In [None]:
inc = 50
no_features = X.shape[1] + inc
j = np.arange(inc,no_features,inc) # [50,4800]

In [None]:
from skfeature.function.similarity_based import reliefF
from skfeature.function.information_theoretical_based import MRMR
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC as svm

methods = { "reliefF" : reliefF.reliefF, "mrmr": MRMR.mrmr, "chi2": chi2}
results = { "reliefF" : [], "mrmr": [], "chi2": [], "pso": []}
penalty = 'l1'

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import balanced_accuracy_score

def normalize(X_train, X_test):
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler = scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test

In [None]:
from Problem import FeatureSelection
from PSO import Swarm

def pso(X,y):
    prob = FeatureSelection(minimized=True, X=X, y=y)
    pop_size = 30
    n_iterations = 100
    no_fea = X.shape[1]
    swarm = Swarm(n_particle=pop_size, length=no_fea, n_iterations=n_iterations,
                        max_pos=1.0, min_pos=0.0, max_vel=0.2, min_vel=-0.2,
                        problem=prob)
    best_sol, best_fit = swarm.iterate()
    sel_fea = np.where(best_sol > prob.threshold)[0]
    return sel_fea

In [None]:
runs = 15
name = "pso"
folds = 10

for k in tqdm(range(runs)):
    train_accs = []
    test_accs = []
    skf = StratifiedKFold(n_splits=folds, random_state=1234, shuffle=True)

    # DEBUG: Fold counter
    f = 1

    for train, test in skf.split(X, y):
        X_train, X_test = (X[train], X[test])
        y_train, y_test = y[train], y[test]
        X_train, X_test = normalize(X_train, X_test)

        sel_fea = pso(X_train, y_train)

        # DEBUG: Measure progress.
        print(f"Run {k}, fold {f} ")

        model = svm(penalty='l1', dual=False, tol=1e-3, max_iter=5_000)
        model.fit(X_train[:, sel_fea], y_train)

        y_predict = model.predict(X_train[:, sel_fea])
        train_acc = balanced_accuracy_score(y_train, y_predict)
        train_accs.append(train_acc)

        y_predict = model.predict(X_test[:, sel_fea])
        test_acc = balanced_accuracy_score(y_test, y_predict)
        test_accs.append(test_acc)

        # DEBUG: Increment fold counter
        f += 1

    no_fea = len(sel_fea)
    results[name].append((no_fea, np.mean(train_accs), np.mean(test_accs)))

  0%|          | 0/15 [00:00<?, ?it/s]

Run 0, fold 1 
Run 0, fold 2 
Run 0, fold 3 
Run 0, fold 4 
Run 0, fold 5 
Run 0, fold 6 
Run 0, fold 7 
Run 0, fold 8 
Run 0, fold 9 


  7%|▋         | 1/15 [1:01:29<14:20:58, 3689.87s/it]

Run 0, fold 10 
Run 1, fold 1 
Run 1, fold 2 
Run 1, fold 3 
Run 1, fold 4 
Run 1, fold 5 
Run 1, fold 6 
Run 1, fold 7 
Run 1, fold 8 
Run 1, fold 9 


 13%|█▎        | 2/15 [2:00:38<13:01:31, 3607.08s/it]

Run 1, fold 10 
Run 2, fold 1 
Run 2, fold 2 
Run 2, fold 3 
Run 2, fold 4 
Run 2, fold 5 
Run 2, fold 6 
Run 2, fold 7 
Run 2, fold 8 
Run 2, fold 9 


 20%|██        | 3/15 [3:00:39<12:00:51, 3604.28s/it]

Run 2, fold 10 
Run 3, fold 1 
Run 3, fold 2 
Run 3, fold 3 
Run 3, fold 4 
Run 3, fold 5 
Run 3, fold 6 
Run 3, fold 7 
Run 3, fold 8 
Run 3, fold 9 


 27%|██▋       | 4/15 [4:01:14<11:03:00, 3616.43s/it]

Run 3, fold 10 
Run 4, fold 1 
Run 4, fold 2 
Run 4, fold 3 
Run 4, fold 4 
Run 4, fold 5 
Run 4, fold 6 
Run 4, fold 7 
Run 4, fold 8 
Run 4, fold 9 


 33%|███▎      | 5/15 [5:01:18<10:01:58, 3611.83s/it]

Run 4, fold 10 
Run 5, fold 1 
Run 5, fold 2 
Run 5, fold 3 
Run 5, fold 4 
Run 5, fold 5 
Run 5, fold 6 
Run 5, fold 7 
Run 5, fold 8 
Run 5, fold 9 


 40%|████      | 6/15 [6:00:15<8:57:58, 3586.48s/it] 

Run 5, fold 10 
Run 6, fold 1 
Run 6, fold 2 
Run 6, fold 3 
Run 6, fold 4 
Run 6, fold 5 
Run 6, fold 6 
Run 6, fold 7 
Run 6, fold 8 
Run 6, fold 9 


 47%|████▋     | 7/15 [6:59:46<7:57:30, 3581.27s/it]

Run 6, fold 10 
Run 7, fold 1 
Run 7, fold 2 
Run 7, fold 3 
Run 7, fold 4 
Run 7, fold 5 
Run 7, fold 6 
Run 7, fold 7 
Run 7, fold 8 
Run 7, fold 9 


 53%|█████▎    | 8/15 [7:59:57<6:58:55, 3590.83s/it]

Run 7, fold 10 
Run 8, fold 1 
Run 8, fold 2 
Run 8, fold 3 
Run 8, fold 4 
Run 8, fold 5 
Run 8, fold 6 
Run 8, fold 7 
Run 8, fold 8 
Run 8, fold 9 


 60%|██████    | 9/15 [8:57:26<5:54:38, 3546.47s/it]

Run 8, fold 10 
Run 9, fold 1 
Run 9, fold 2 
Run 9, fold 3 
Run 9, fold 4 
Run 9, fold 5 
Run 9, fold 6 
Run 9, fold 7 
Run 9, fold 8 
Run 9, fold 9 


 67%|██████▋   | 10/15 [9:57:30<4:57:01, 3564.29s/it]

Run 9, fold 10 
Run 10, fold 1 
Run 10, fold 2 
Run 10, fold 3 
Run 10, fold 4 
Run 10, fold 5 
Run 10, fold 6 
Run 10, fold 7 
Run 10, fold 8 
Run 10, fold 9 


 73%|███████▎  | 11/15 [10:57:39<3:58:31, 3577.76s/it]

Run 10, fold 10 
Run 11, fold 1 
Run 11, fold 2 
Run 11, fold 3 
Run 11, fold 4 
Run 11, fold 5 
Run 11, fold 6 
Run 11, fold 7 
Run 11, fold 8 
Run 11, fold 9 


 80%|████████  | 12/15 [11:57:47<2:59:21, 3587.00s/it]

Run 11, fold 10 
Run 12, fold 1 
Run 12, fold 2 
Run 12, fold 3 
Run 12, fold 4 
Run 12, fold 5 
Run 12, fold 6 
Run 12, fold 7 
Run 12, fold 8 
Run 12, fold 9 


 87%|████████▋ | 13/15 [12:57:08<1:59:18, 3579.06s/it]

Run 12, fold 10 
Run 13, fold 1 
Run 13, fold 2 
Run 13, fold 3 
Run 13, fold 4 
Run 13, fold 5 
Run 13, fold 6 
Run 13, fold 7 
Run 13, fold 8 
Run 13, fold 9 


 93%|█████████▎| 14/15 [13:57:50<59:58, 3598.18s/it]  

Run 13, fold 10 
Run 14, fold 1 
Run 14, fold 2 
Run 14, fold 3 
Run 14, fold 4 
Run 14, fold 5 
Run 14, fold 6 
Run 14, fold 7 
Run 14, fold 8 
Run 14, fold 9 


100%|██████████| 15/15 [14:56:33<00:00, 3586.26s/it]

Run 14, fold 10 





In [None]:
with open('results-pso-cloud.pkl', 'wb+') as f:
    pickle.dump(results, f)

In [None]:
for k in tqdm(j):
    for name, fs_method in methods.items(): 
        if name == "pso":
          continue

        train_accs = []
        test_accs = []
        skf = StratifiedKFold(n_splits=folds, random_state=1234, shuffle=True)

        for train, test in skf.split(X, y):
            X_train, X_test = (X[train], X[test])
            y_train, y_test = y[train], y[test]
            X_train, X_test = normalize(X_train, X_test)

            fs = SelectKBest(fs_method, k=k)
            X_train = fs.fit_transform(X_train, y_train)
            X_test = fs.transform(X_test)

            model = svm(penalty='l1', dual=False, tol=1e-3, max_iter=5_000)
            clf = model.fit(X_train, y_train)

            y_predict = model.predict(X_train)
            train_acc = balanced_accuracy_score(y_train, y_predict)
            train_accs.append(train_acc)
            y_predict = model.predict(X_test)
            test_acc = balanced_accuracy_score(y_test, y_predict)
            test_accs.append(test_acc)

        no_fea = k 
        results[name].append((no_fea, np.mean(train_accs), np.mean(test_accs)))

  8%|▊         | 8/96 [10:19<1:53:56, 77.69s/it]

In [None]:
with open('results-full-cloud.pkl', 'wb+') as f:
    pickle.dump(results, f)

In [None]:
for name, result in results.items():
  k, train, test = zip(*result)
  if name == "pso":
    plt.scatter(k, train, label=name)
  else:
    plt.plot(k, train, label=name)

plt.title("Train: Accuracy vs. No. Features")
plt.xlabel("No. Features")
plt.ylabel("Accuracy")
plt.legend()
plt.savefig(fname=f"accuracy-features-{dataset}-train", dpi=500)
plt.show()

In [None]:
for name, result in results.items():
  k, train, test = zip(*result)
  if name == "pso":
    plt.scatter(k, test, label=name)
  else:
    plt.plot(k, test, label=name)

plt.title("Test: Accuracy vs. No. Features")
plt.xlabel("No. Features")
plt.ylabel("Accuracy")
plt.legend()
plt.savefig(fname=f"accuracy-features-{dataset}-test", dpi=500)
plt.show()

In [50]:
print(results)

{'reliefF': [(50, 0.812469586315087, 0.5847222222222221), (100, 0.9155893685010833, 0.6527777777777778), (150, 0.9307986567918654, 0.6777777777777778), (200, 0.9489223465963705, 0.6888888888888889), (250, 0.9585619228912947, 0.7805555555555556), (300, 0.9720060455798996, 0.7583333333333333), (350, 0.9726293413559967, 0.773611111111111), (400, 0.9730705960077778, 0.773611111111111), (450, 0.9747890865972358, 0.7736111111111111), (500, 0.9782143163382552, 0.8027777777777778), (550, 0.9774904106143495, 0.7972222222222223), (600, 0.9795582594479029, 0.763888888888889), (650, 0.9795582594479029, 0.7847222222222222), (700, 0.9810218197824311, 0.7888888888888889), (750, 0.9810218197824311, 0.7888888888888889), (800, 0.9824366471734892, 0.7888888888888889), (850, 0.9857473432685657, 0.7916666666666667), (900, 0.9843325158775074, 0.7944444444444445), (950, 0.9839442243601836, 0.8125), (1000, 0.9839929573036533, 0.8263888888888887), (1050, 0.9840416902471232, 0.8055555555555556), (1100, 0.983992

In [51]:
results['pso']

[(1256, 0.9906190655851097, 0.8305555555555555),
 (1218, 0.9831157643211974, 0.7944444444444444),
 (1274, 0.985580079230334, 0.7986111111111112),
 (1236, 0.9888954914167138, 0.8125),
 (1261, 0.9853559076903728, 0.8013888888888889),
 (1220, 0.9856467333207573, 0.7972222222222223),
 (1170, 0.9862151795258758, 0.8180555555555555),
 (1223, 0.9858268880085518, 0.8208333333333334),
 (1186, 0.9849676161730491, 0.8069444444444442),
 (1302, 0.9846459787461486, 0.8013888888888889),
 (1184, 0.990112242973024, 0.7763888888888889),
 (1243, 0.9866732691944916, 0.8319444444444443),
 (1213, 0.9870923725083317, 0.8180555555555555),
 (1212, 0.9873492422813305, 0.8097222222222221),
 (1223, 0.9831465761177137, 0.8430555555555556)]

In [None]:
for method, result in results.items():
    k, train, test = list(zip(*result))
    best_k = np.argmax(test)
    print(f"{method} performed best at {k[best_k]} features, with {train[best_k]} training accuracy, and {test[best_k]} test accuracy.")

k, train, test = results['mrmr'][-1]
print(f"Full-dataset with {k} features, with {train} training accuracy, and {test} test accuracy.")

In [None]:
from prettytable import PrettyTable

def show_results(results, label='Method'):
    table = PrettyTable([label, 'Best K', 'Train', 'Test'])

    for name, result in results.items():
        k, train, test = list(zip(*result))
        best_k = np.argmax(test)
        vals = [k[best_k], train[best_k], test[best_k]]
        row = ['%.4f' % elem if i != 0 else elem for i, elem in enumerate(vals) ]
        table.add_row(np.concatenate([[name], row]))

    k, train, test = results['mrmr'][-1]
    vals = [k, train, test]
    row = ['%.4f' % elem if i != 0 else elem for i, elem in enumerate(vals) ]
    table.add_row(np.concatenate([['full'], row]))

    print('\n') # tqdm messses with table border.
    print(table)

show_results(results)