In [28]:
from scipy.io import loadmat
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.metrics import balanced_accuracy_score

from pyod.models.iforest import IForest
from pyod.models.loda import LODA
from pyod.models.dif import DIF

In [29]:
data = loadmat('shuttle.mat')

In [30]:
train_data, test_data, train_labels, test_labels = train_test_split(data['X'], data['y'], test_size=0.4)

In [31]:
normalizer = preprocessing.Normalizer()
train_data = normalizer.fit_transform(train_data)
test_data = normalizer.transform(test_data) # normalize test data using the same normalizer as train data


In [32]:
print(train_data.shape)
print(train_data[0])
print(test_data.shape)
print(test_data[0])

(29458, 9)
[0.33136248 0.         0.68959218 0.         0.23284931 0.02686723
 0.36718545 0.46569862 0.08955743]
(19639, 9)
[0.43186628 0.         0.6810199  0.         0.44847652 0.19101778
 0.2408485  0.23254338 0.        ]


In [33]:
def compute_metrics(model_type, model_args, train_data, test_data, train_labels, test_labels):
    model = model_type(**model_args)
    model.fit(train_data)
    
    y_pred = model.predict(test_data)
    ba = balanced_accuracy_score(test_labels, y_pred)
    roc_auc = roc_auc_score(test_labels, y_pred)
    
    print(f'Model {model_type.__name__} BA: {ba}, ROC AUC: {roc_auc}')

In [35]:
for n_split in range(10):
    train_data, test_data, train_labels, test_labels = train_test_split(data['X'], data['y'], test_size=0.4)
    
    normalizer = preprocessing.Normalizer()
    train_data = normalizer.fit_transform(train_data)
    test_data = normalizer.transform(test_data)
    
    print(f'Split {n_split}')
    compute_metrics(IForest, {'contamination': 0.02}, train_data, test_data, train_labels, test_labels)
    compute_metrics(LODA, {'contamination': 0.02, 'n_bins': 10}, train_data, test_data, train_labels, test_labels)
    compute_metrics(DIF, {'contamination': 0.02}, train_data, test_data, train_labels, test_labels)
    
    
    
    

Split 0
Model IForest BA: 0.5998333516463392, ROC AUC: 0.5998333516463392
Model LODA BA: 0.617981540490056, ROC AUC: 0.6179815404900562
Model DIF BA: 0.5105940739112919, ROC AUC: 0.5105940739112919
Split 1
Model IForest BA: 0.604398227685085, ROC AUC: 0.604398227685085
Model LODA BA: 0.6064795523782491, ROC AUC: 0.6064795523782491
Model DIF BA: 0.5088589123484015, ROC AUC: 0.5088589123484014
Split 2
Model IForest BA: 0.5975364102578582, ROC AUC: 0.5975364102578581
Model LODA BA: 0.5948096064620105, ROC AUC: 0.5948096064620104
Model DIF BA: 0.5075146706483272, ROC AUC: 0.5075146706483272
Split 3
Model IForest BA: 0.607963763963556, ROC AUC: 0.607963763963556
Model LODA BA: 0.6175132577726942, ROC AUC: 0.6175132577726942
Model DIF BA: 0.5078648792774819, ROC AUC: 0.5078648792774819
Split 4
Model IForest BA: 0.6048049832491118, ROC AUC: 0.6048049832491118
Model LODA BA: 0.617036479150722, ROC AUC: 0.617036479150722
Model DIF BA: 0.5076126656515761, ROC AUC: 0.5076126656515761
Split 5
Mode