# Imports

In [2]:
#imports
import os
import argparse
import ast
import pickle as pkl
from itertools import tee
import random

import numpy as np
from scipy.stats import randint

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_predict, cross_val_score, train_test_split
from sklearn.decomposition import PCA

from sklearn.neural_network import MLPClassifier

# Parameters

In [2]:
# root_path = "../../local_data/latent_spaces/edge_prediction/"
root_path = "../../../../../../vol/aimspace/users/wyo/latent_spaces/vertices_prediction/"
organ = "liver"
output = False
save = False
n_estimators = 344
max_leaf_nodes = 770
SEED = 42

In [3]:
path = os.path.join(root_path, organ)
with open(f'{path}/{1000071}', "rb") as fp:
        x = pkl.load(fp)
fp.close()
pca = PCA(n_components=64)

X_pca = pca.fit_transform(x.T)
X_pca = X_pca.T
X_pca = X_pca.flatten()
print(x.shape)
# needed_padding = 2000 - len(x)
# x = np.pad(x, ((0, needed_padding), (0, 0)), 'constant')
# print(x.shape)
# x = x.flatten()
print(X_pca.shape)

(1087, 256)
(16384,)


# Data

In [None]:
path = os.path.join(root_path, organ)
dirs = next(os.walk(path))[2]
female_ids = np.loadtxt("../data/female_mesh_ids.csv")
x = []
y = []
padded_x = []

for dir in dirs:
    with open(f'{path}/{dir}', "rb") as fp:
        latent_space = pkl.load(fp)
    fp.close()

    x.append(latent_space)
    if(int(dir) in female_ids):
        y.append(0)
    else:
        y.append(1)

max_vertices = max(len(inner_array) for inner_array in x)
for space in x:
    needed_padding = max_vertices - len(space)
    space = np.pad(space, ((0, needed_padding), (0, 0)), 'constant')
    space = space.flatten()
    padded_x.append(space)


print(len(x))
print(len(y))

x_train, x_test, y_train, y_test = train_test_split(padded_x, y, test_size=0.05, random_state=SEED)

np.asarray(padded_x).shape

(988, 74688)

In [4]:
path = os.path.join(root_path, organ)
dirs = next(os.walk(path))[2]
female_ids = np.loadtxt("../data/female_mesh_ids.csv")
x = []
y = []
padded_x = []
dirs = dirs[:30]

for dir in dirs:
    with open(f'{path}/{dir}', "rb") as fp:
        latent_space = pkl.load(fp)
    fp.close()

    x.append(latent_space)
    if(int(dir) in female_ids):
        y.append(0)
    else:
        y.append(1)

max_vertices = max(len(inner_array) for inner_array in x)
for space in x:
    pca = PCA(n_components=64)

    space_pca = pca.fit_transform(space.T)
    # space_pca = space_pca.T
    space_pca = space_pca.flatten()
    padded_x.append(space_pca)


print(len(x))
print(len(y))

x_train, x_test, y_train, y_test = train_test_split(padded_x, y, test_size=0.05, random_state=SEED)

np.asarray(padded_x).shape

30
30


(30, 16384)

# Model

## Random Forest

In [41]:
rnd_clf = RandomForestClassifier(criterion='gini', n_estimators=n_estimators, max_leaf_nodes=max_leaf_nodes, random_state=SEED, n_jobs=2)
rnd_clf.fit(x_train, y_train)
# scores = cross_val_score(rnd_clf, x_train, y_train,                         
#                          scoring="accuracy", cv=10) 
# scores

In [42]:
y_pred = rnd_clf.predict(x_test)
# y_pred = cross_val_predict(rnd_clf, x_train, y_train, cv=10) 

In [43]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.7

In [44]:
conf_mx = confusion_matrix(y_test, y_pred)
conf_mx

array([[15,  4],
       [11, 20]], dtype=int64)

### Fine Tuning

In [None]:
param_distribs = {
        'n_estimators': randint(low=1, high=1000),
        'max_leaf_nodes' : randint(low=1, high=1000),
    }

sweep_rnd_clf = RandomForestClassifier(random_state=SEED)
rnd_search = RandomizedSearchCV(sweep_rnd_clf, param_distributions=param_distribs,
                                n_iter=20, cv=5, scoring='accuracy', random_state=SEED)
rnd_search.fit(x_train, y_train)
print("Best hyperparameters:", rnd_search.best_params_)

In [None]:
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

## MLP

In [26]:
clf = MLPClassifier(max_iter=200,
                    hidden_layer_sizes=(6,5),
                    random_state=42,
                    verbose=True,
                    learning_rate_init=0.01)

clf.fit(x_train, y_train)

Iteration 1, loss = 1.19819237
Iteration 2, loss = 0.96469556
Iteration 3, loss = 0.94940585
Iteration 4, loss = 0.93540136
Iteration 5, loss = 0.92172763
Iteration 6, loss = 0.90828881
Iteration 7, loss = 0.89512583
Iteration 8, loss = 0.88139960
Iteration 9, loss = 1.74258955
Iteration 10, loss = 1.67969643
Iteration 11, loss = 1.18433993
Iteration 12, loss = 1.54358569
Iteration 13, loss = 1.66922518
Iteration 14, loss = 1.59069332
Iteration 15, loss = 1.41943498
Iteration 16, loss = 1.21095104
Iteration 17, loss = 1.01003017
Iteration 18, loss = 0.96888724
Iteration 19, loss = 0.93885386
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


In [27]:
y_pred = clf.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
accuracy

0.5

### Fine Tuning

In [5]:
parameter_space = {
        'hidden_layer_sizes': [(10), (10,20,30,10), (1,5,10,20,30), (10,10,10), (10,20,30,20,10),(6,5)],
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam'],
        'learning_rate_init': [0.0001, 0.001, 0.01, 0.1]
    }

In [6]:
clf = MLPClassifier(max_iter=500, verbose=False)
random_search = RandomizedSearchCV(clf, parameter_space, n_iter=100, cv=5, verbose=1, random_state=42, scoring='accuracy')

random_search.fit(x_train, y_train)



Fitting 5 folds for each of 96 candidates, totalling 480 fits




In [7]:
print('Best parameters found:\n', random_search.best_params_)

Best parameters found:
 {'solver': 'adam', 'learning_rate_init': 0.0001, 'hidden_layer_sizes': 10, 'activation': 'tanh'}


In [8]:
y_pred = random_search.best_estimator_.predict(x_test)
print('Accuracy: {:.2f}'.format(accuracy_score(y_test, y_pred)))

Accuracy: 1.00
