## Description of Notebook

This notebook tests a machine-learning approach on the VDF statistical moments, anisotropies, and particle fraction numbers. We also vary the labels according to the threshold (five label categories were defined during the data-processing phase).

In [15]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, make_scorer
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import ADASYN

In [16]:
def outputclass_analysis(test_labels, predicted_labels, output_score=''):
    """Print a binary-classification summary and optionally return one score.

    The confusion matrix is built for the explicit label set ``[0, 1]`` (1 is
    the positive class), so it is always 2x2 even when one of the classes is
    missing from both ``test_labels`` and ``predicted_labels``.

    Parameters
    ----------
    test_labels : array-like
        True labels (0 or 1).
    predicted_labels : array-like
        Predicted labels (0 or 1).
    output_score : str, optional
        Score to return: 'TSS', 'HSS', 'precision', 'recall' or 'accuracy'.
        Any other value (including the default '') makes the function
        return None.

    Returns
    -------
    float or None
        The requested score. A score whose denominator is zero is reported as
        the sentinel value -999.9.
    """
    undefined = -999.9  # sentinel used for every score with a zero denominator

    tn, fp, fn, tp = confusion_matrix(test_labels, predicted_labels, labels=[0, 1]).ravel()
    print("------------ SUMMARY OF CLASSIFICATION RESULTS ----------------")
    print("TP = "+str(tp))
    print("TN = "+str(tn))
    print("FP = "+str(fp))
    print("FN = "+str(fn))

    total = tp + fn + fp + tn
    hss_denominator = (tp+fn)*(fn+tn) + (tp+fp)*(fp+tn)

    precision = tp/(tp+fp) if ((tp+fp) != 0) else undefined
    recall = tp/(tp+fn) if ((tp+fn) != 0) else undefined
    acc = (tp+tn)/total if (total != 0) else undefined
    # TSS: recall minus the false-positive rate.
    tss = tp/(tp+fn) - fp/(fp+tn) if (((tp+fn) != 0) and ((fp+tn) != 0)) else undefined
    # HSS: guarded like the other scores so a degenerate matrix cannot divide by zero.
    hss = 2*(tp*tn - fp*fn)/hss_denominator if (hss_denominator != 0) else undefined

    print("Precision = "+str(precision))
    print("Recall = "+str(recall))
    print("Accuracy = "+str(acc))
    print("TSS = "+str(tss))
    print("HSS = "+str(hss))

    scores = {'TSS': tss, 'HSS': hss, 'precision': precision,
              'recall': recall, 'accuracy': acc}
    # Unknown keys (including the default '') yield None, as before.
    return scores.get(output_score)

def outputclass_analysis_scorereturn(test_labels, predicted_labels):
    """Return confusion-matrix counts, accuracy and TSS for binary labels.

    The confusion matrix is built for the explicit label set ``[0, 1]`` (1 is
    the positive class), so it is always 2x2.

    Parameters
    ----------
    test_labels : array-like
        True labels (0 or 1).
    predicted_labels : array-like
        Predicted labels (0 or 1).

    Returns
    -------
    tuple
        ``(tp, tn, fp, fn, acc, tss)``. ``tss`` is recall minus the
        false-positive rate, or the sentinel -999.9 when either true class has
        no samples.
    """
    tn, fp, fn, tp = confusion_matrix(test_labels, predicted_labels, labels=[0, 1]).ravel()
    acc = (tp+tn)/(tp+fn+fp+tn)
    tss = tp/(tp+fn) - fp/(fp+tn) if (((tp+fn) != 0) and ((fp+tn) != 0)) else -999.9
    return tp, tn, fp, fn, acc, tss

## Test 1. Preparation and testing of the data set.

Various thresholds are tried here.

In [17]:
def label_union(indstr):
    """Load the 'an' and 'me' label files for one threshold tag and merge them.

    The result starts as a copy of the 'me' labels; every position where the
    'an' labels equal 1 is then set to 1. The sample count and the number of
    entries equal to 1 are printed.

    Parameters
    ----------
    indstr : str
        Threshold tag embedded in the file names (e.g. '001').

    Returns
    -------
    numpy.ndarray
        The merged label array.
    """
    prefix = './mldata_vdfmoments/allsimulations.labels_allmoments_'
    from_an = np.load(prefix + 'an_' + indstr + '_all.npy')
    from_me = np.load(prefix + 'me_' + indstr + '_all.npy')

    merged = np.copy(from_me)
    merged[np.nonzero(from_an == 1)] = 1

    n_positive = np.nonzero(merged == 1)[0].size
    print(f'------------------------- THRES = {indstr}----------------------------------')
    print(f'The total number of data points is: {len(merged)}')
    print(f'Among them unstable (positive) samples: {n_positive}')
    return merged

# Feature vectors and time array shared by all thresholds.
featurevector_allmoments = np.load('./mldata_vdfmoments/allsimulations.featurevector_allmoments_all.npy')
times_allmoments = np.load('./mldata_vdfmoments/allsimulations.timep_array_all.npy')

# One merged label array per threshold tag, loaded in the order listed.
(labels_allmoments_001,
 labels_allmoments_005,
 labels_allmoments_01,
 labels_allmoments_05,
 labels_allmoments_10) = (label_union(tag) for tag in ('001', '005', '01', '05', '10'))

------------------------- THRES = 001----------------------------------
The total number of data points is: 1253
Among them unstable (positive) samples: 1083
------------------------- THRES = 005----------------------------------
The total number of data points is: 1253
Among them unstable (positive) samples: 594
------------------------- THRES = 01----------------------------------
The total number of data points is: 1253
Among them unstable (positive) samples: 398
------------------------- THRES = 05----------------------------------
The total number of data points is: 1253
Among them unstable (positive) samples: 108
------------------------- THRES = 10----------------------------------
The total number of data points is: 1253
Among them unstable (positive) samples: 61


In [18]:
# Standardise the feature columns; the fitted scaler is kept for later use.
# NOTE(review): the scaler is fitted on the full data set, i.e. before the
# train/test splits are drawn - confirm this is intended.
scaler = StandardScaler()
featurevector_allmoments = scaler.fit_transform(featurevector_allmoments)

In [19]:
# Ten random train/test partitions (33% held out for testing); the fixed seed
# makes the partitions the same every time the splitter is used.
data_split = ShuffleSplit(
    n_splits=10,
    test_size=0.33,
    random_state=0,
)

## Test 2. Random Forest on the data sets of various thresholds

Because Random Forest performed well in earlier tests, we select it for the comparison across label thresholds.

In [20]:
def evaluate_RFclassifier(featurevector_allmoments, labels_allmoments, data_split, random_state=None):
    """Tune a Random Forest by accuracy, then report per-split test scores.

    A grid search over ``n_estimators``, ``max_depth`` and ``class_weight`` is
    run with ``data_split`` as the cross-validation scheme and accuracy as the
    selection metric. The best parameter set is then refitted on every
    train/test partition produced by ``data_split`` and the mean and standard
    deviation of TP, TN, FP, FN, accuracy and TSS over the partitions are
    printed.

    NOTE(review): hyperparameters are selected with the same splitter that is
    used for the reported scores, so the printed numbers may be optimistic.

    Parameters
    ----------
    featurevector_allmoments : numpy.ndarray
        Feature matrix, indexed by sample along the first axis.
    labels_allmoments : numpy.ndarray
        Binary labels (0 or 1), one per sample.
    data_split : scikit-learn splitter
        Object with a ``split`` method (e.g. ``ShuffleSplit``). Any number of
        splits is supported.
    random_state : int or None, optional
        Seed passed to every ``RandomForestClassifier``. The default ``None``
        keeps the original unseeded behaviour.

    Returns
    -------
    None
    """
    # parameter grid
    param_grid = {'n_estimators': [10, 50, 100, 200],
                  'max_depth': [None, 2, 5, 10, 25],
                  'class_weight': [None, 'balanced']}

    # classifier and gridsearch
    clf = RandomForestClassifier(random_state=random_state)
    grid_search = GridSearchCV(clf, param_grid, cv=data_split, verbose=1, scoring='accuracy')
    grid_search.fit(featurevector_allmoments, labels_allmoments)
    print("Best parameters found: ", grid_search.best_params_)
    best_params = grid_search.best_params_

    # Scores are accumulated in lists so the number of splits is not hard-coded.
    # Key order matches the tuple returned by outputclass_analysis_scorereturn.
    scores = {name: [] for name in ('TP', 'TN', 'FP', 'FN', 'Acc', 'TSS')}

    # Labels are passed to split() so that stratified splitters also work.
    for train_index, test_index in data_split.split(featurevector_allmoments, labels_allmoments):
        X_train, X_test = featurevector_allmoments[train_index], featurevector_allmoments[test_index]
        f_train, f_test = labels_allmoments[train_index], labels_allmoments[test_index]
        clf = RandomForestClassifier(random_state=random_state, **best_params)
        clf.fit(X_train, f_train)
        f_predicted = clf.predict(X_test)
        fold_scores = outputclass_analysis_scorereturn(f_test, f_predicted)
        for name, value in zip(scores, fold_scores):
            scores[name].append(value)

    for name, values in scores.items():
        print(name + " = " + str(np.mean(values)) + "+/-" + str(np.std(values)))
    

# Run the accuracy-tuned evaluation once per label set, in the original order.
for _thres_text, _thres_labels in (("0.0001", labels_allmoments_001),
                                   ("0.0005", labels_allmoments_005),
                                   ("0.001", labels_allmoments_01),
                                   ("0.005", labels_allmoments_05),
                                   ("0.01", labels_allmoments_10)):
    print("-------------------------------------")
    print("Testing for threshold THRES = " + _thres_text + ":")
    evaluate_RFclassifier(featurevector_allmoments, _thres_labels, data_split)

-------------------------------------
Testing for threshold THRES = 0.0001:
Fitting 10 folds for each of 40 candidates, totalling 400 fits
Best parameters found:  {'class_weight': None, 'max_depth': 25, 'n_estimators': 100}
TP = 346.0+/-6.164414002968976
TN = 49.3+/-6.649060083951716
FP = 8.1+/-2.1656407827707715
FN = 10.6+/-3.411744421846396
Acc = 0.9548309178743961+/-0.007087633213102043
TSS = 0.8270774989723977+/-0.039068593915542814
-------------------------------------
Testing for threshold THRES = 0.0005:
Fitting 10 folds for each of 40 candidates, totalling 400 fits
Best parameters found:  {'class_weight': None, 'max_depth': 10, 'n_estimators': 10}
TP = 180.6+/-9.614572273377531
TN = 208.6+/-10.49952379872535
FP = 10.7+/-4.075536774462967
FN = 14.1+/-3.534119409414459
Acc = 0.940096618357488+/-0.011058476465951301
TSS = 0.8783239828589811+/-0.02211962374035656
-------------------------------------
Testing for threshold THRES = 0.001:
Fitting 10 folds for each of 40 candidates, t

Same as above, but the grid search now selects the hyperparameters by TSS instead of accuracy (the class weight is still searched over `None` and `'balanced'`).

In [21]:
def tss_score(test_labels, predicted_labels):
    """Return the TSS (recall minus false-positive rate) for binary labels.

    The confusion matrix is built for the explicit label set ``[0, 1]`` with 1
    as the positive class. If either true class has no samples, the sentinel
    -999.9 is returned instead.
    """
    cm = confusion_matrix(test_labels, predicted_labels, labels=[0, 1])
    (tn, fp), (fn, tp) = cm

    n_positive = tp + fn
    n_negative = fp + tn
    if n_positive == 0 or n_negative == 0:
        return -999.9
    return tp/n_positive - fp/n_negative


def evaluate_RFclassifier_score(featurevector_allmoments, labels_allmoments, data_split, tss_scorer,
                                random_state=None):
    """Tune a Random Forest with a custom scorer, then report per-split scores.

    A grid search over ``n_estimators``, ``max_depth`` and ``class_weight`` is
    run with ``data_split`` as the cross-validation scheme and ``tss_scorer``
    as the selection metric. The best parameter set is then refitted on every
    train/test partition produced by ``data_split`` and the mean and standard
    deviation of TP, TN, FP, FN, accuracy and TSS over the partitions are
    printed.

    NOTE(review): hyperparameters are selected with the same splitter that is
    used for the reported scores, so the printed numbers may be optimistic.

    Parameters
    ----------
    featurevector_allmoments : numpy.ndarray
        Feature matrix, indexed by sample along the first axis.
    labels_allmoments : numpy.ndarray
        Binary labels (0 or 1), one per sample.
    data_split : scikit-learn splitter
        Object with a ``split`` method (e.g. ``ShuffleSplit``). Any number of
        splits is supported.
    tss_scorer : callable or str
        Value forwarded to ``GridSearchCV(scoring=...)``.
    random_state : int or None, optional
        Seed passed to every ``RandomForestClassifier``. The default ``None``
        keeps the original unseeded behaviour.

    Returns
    -------
    None
    """
    # parameter grid
    param_grid = {'n_estimators': [10, 50, 100, 200],
                  'max_depth': [None, 2, 5, 10, 25],
                  'class_weight': [None, 'balanced']}

    # classifier and gridsearch
    clf = RandomForestClassifier(random_state=random_state)
    grid_search = GridSearchCV(clf, param_grid, cv=data_split, verbose=1, scoring=tss_scorer)
    grid_search.fit(featurevector_allmoments, labels_allmoments)
    print("Best parameters found: ", grid_search.best_params_)
    best_params = grid_search.best_params_

    # Scores are accumulated in lists so the number of splits is not hard-coded.
    # Key order matches the tuple returned by outputclass_analysis_scorereturn.
    scores = {name: [] for name in ('TP', 'TN', 'FP', 'FN', 'Acc', 'TSS')}

    # Labels are passed to split() so that stratified splitters also work.
    for train_index, test_index in data_split.split(featurevector_allmoments, labels_allmoments):
        X_train, X_test = featurevector_allmoments[train_index], featurevector_allmoments[test_index]
        f_train, f_test = labels_allmoments[train_index], labels_allmoments[test_index]
        clf = RandomForestClassifier(random_state=random_state, **best_params)
        clf.fit(X_train, f_train)
        f_predicted = clf.predict(X_test)
        fold_scores = outputclass_analysis_scorereturn(f_test, f_predicted)
        for name, value in zip(scores, fold_scores):
            scores[name].append(value)

    for name, values in scores.items():
        print(name + " = " + str(np.mean(values)) + "+/-" + str(np.std(values)))
    

# Wrap tss_score as a scorer for GridSearchCV (higher TSS is better).
tss_scorer = make_scorer(tss_score, greater_is_better=True)

# Run the TSS-tuned evaluation once per label set, in the original order.
for _thres_text, _thres_labels in (("0.0001", labels_allmoments_001),
                                   ("0.0005", labels_allmoments_005),
                                   ("0.001", labels_allmoments_01),
                                   ("0.005", labels_allmoments_05),
                                   ("0.01", labels_allmoments_10)):
    print("-------------------------------------")
    print("Testing for threshold THRES = " + _thres_text + ":")
    evaluate_RFclassifier_score(featurevector_allmoments, _thres_labels, data_split, tss_scorer)

-------------------------------------
Testing for threshold THRES = 0.0001:
Fitting 10 folds for each of 40 candidates, totalling 400 fits
Best parameters found:  {'class_weight': 'balanced', 'max_depth': 5, 'n_estimators': 100}
TP = 329.1+/-10.202450685987166
TN = 53.7+/-6.527633568147036
FP = 3.7+/-2.147091055358389
FN = 27.5+/-9.233092656309694
Acc = 0.9246376811594204+/-0.021272501310438646
TSS = 0.8574655583462052+/-0.039053590136864644
-------------------------------------
Testing for threshold THRES = 0.0005:
Fitting 10 folds for each of 40 candidates, totalling 400 fits
Best parameters found:  {'class_weight': 'balanced', 'max_depth': 25, 'n_estimators': 10}
TP = 180.2+/-9.987992791347018
TN = 208.5+/-10.180864403379509
FP = 10.8+/-3.8157568056677826
FN = 14.5+/-3.4713109915419564
Acc = 0.9388888888888888+/-0.010250770615421715
TSS = 0.8757483498397696+/-0.020944497229009578
-------------------------------------
Testing for threshold THRES = 0.001:
Fitting 10 folds for each of 