In [1]:
# Standard library
import os
import warnings

# Third-party
import numpy as np
import pandas as pd
from pyod.models.knn import KNN

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone `joblib` package and fall back to the vendored
# copy only on old installations where the standalone package is missing.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Suppress every warning for the rest of the notebook.
# NOTE(review): this also hides pandas/scikit-learn warnings that can point at
# real problems; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')



In [None]:
# 1. NoFS (no feature selection)

In [2]:
# KNN-based outlier filtering ("AD" in the file names below).
# Fits a pyod KNN detector on the training features, labels every train and
# test sample as inlier (0) or outlier (1), saves the fitted model, and writes
# the inlier-only subsets of both sets to CSV.

# Load the normalized train and test sets.
# NOTE(review): the "2. FS" cell reads these same two file names and writes the
# same two output CSVs, so whichever cell runs last overwrites the other's
# results. Confirm the intended input/output paths for each variant.
train_set = pd.read_csv('train_set_norm.csv', header=0)
print(train_set.shape)
test_set = pd.read_csv('test_set_norm.csv', header=0)
print(test_set.shape)

# Feature matrices: every column except the sample identifier and the label.
# Built once here instead of being rebuilt for each fit/predict call.
non_feature_cols = ['Sample_id', 'CLASS']
ad_features_tr = train_set.drop(non_feature_cols, axis=1).values
ad_features_te = test_set.drop(non_feature_cols, axis=1).values

# Fit the model.
# NOTE(review): random_state is never passed to KNN (KNN is constructed without
# it); it is kept only in case a later cell reads it.
random_state = 47
clf = KNN(contamination=0.05, n_neighbors=20, method='largest', radius=1.0,
          algorithm='kd_tree', metric='minkowski', n_jobs=16)
clf.fit(ad_features_tr)

# Predicted labels on the training set: 1 = outlier, 0 = inlier.
y_pred = clf.predict(ad_features_tr)
n_outliers = np.count_nonzero(y_pred == 1)
n_inliers = len(y_pred) - n_outliers

print('OUTLIERS : ', n_outliers, 'INLIERS : ', n_inliers)

# Save the fitted model.
file_name = 'AD_KNN_Chemical_NoFS_model.pkl'
_ = joblib.dump(clf, file_name, compress=9)

print(clf.threshold_)
print(clf.get_params())

# ---- Train ----
# Take an explicit copy so that adding a column below modifies an independent
# frame rather than a slice of train_set (avoids SettingWithCopy behaviour).
dfx_Tr = train_set[['Sample_id', 'CLASS']].copy()

# Attach the outlier labels.
# NOTE(review): the column is called 'Outliers_LOF' although the labels come
# from the KNN detector; the name is kept so later cells that read it still work.
dfx_Tr['Outliers_LOF'] = y_pred.tolist()

print(dfx_Tr.shape)

dfx_Tr_sorted = dfx_Tr[dfx_Tr.Outliers_LOF == 0]

# Keep inlier rows by position. Filtering through Sample_id.isin(...) would
# also keep an outlier row whenever its Sample_id appears on an inlier row.
train_set_AD = train_set.loc[y_pred == 0]
print(train_set_AD.shape)
print('class composition original')
print(train_set.CLASS.value_counts())
print('class composition in AD')
print(train_set_AD.CLASS.value_counts())

train_set_AD.to_csv('train_set_norm_within_AD.csv', header=True, index=True, index_label=['Index'])

# ---- Test ----
y_pred_te = clf.predict(ad_features_te)

n_outliers_te = np.count_nonzero(y_pred_te == 1)
n_inliers_te = len(y_pred_te) - n_outliers_te

print('\nOUTLIERS : ', n_outliers_te, 'INLIERS : ', n_inliers_te)

# Independent copy, for the same reason as dfx_Tr above.
dfx_Te = test_set[['Sample_id', 'CLASS']].copy()

# Attach the outlier labels (same column name as for the training frame).
dfx_Te['Outliers_LOF'] = y_pred_te.tolist()

print(dfx_Te.shape)

dfx_Te_sorted = dfx_Te[dfx_Te.Outliers_LOF == 0]

# Positional inlier mask, as for the training set.
test_set_AD = test_set.loc[y_pred_te == 0]
print(test_set_AD.shape)
print('class composition original')
print(test_set.CLASS.value_counts())
print('class composition in AD')
print(test_set_AD.CLASS.value_counts())

test_set_AD.to_csv('test_set_norm_within_AD.csv', header=True, index=True, index_label=['Index'])

(277, 983)
(69, 983)
OUTLIERS :  14 INLIERS :  263
48.91164529716072
{'algorithm': 'kd_tree', 'contamination': 0.05, 'leaf_size': 30, 'method': 'largest', 'metric': 'minkowski', 'metric_params': None, 'n_jobs': 16, 'n_neighbors': 20, 'p': 2, 'radius': 1.0}
(277, 3)
(263, 983)
class composition original
1    215
0     62
Name: CLASS, dtype: int64
class composition in AD
1    205
0     58
Name: CLASS, dtype: int64
OUTLIERS :  0 INLIERS :  69
(69, 3)
(69, 983)
class composition original
1    54
0    15
Name: CLASS, dtype: int64
class composition in AD
1    54
0    15
Name: CLASS, dtype: int64


In [None]:
# 2. FS (with feature selection)

In [3]:
# KNN-based outlier filtering ("AD" in the file names below), FS variant.
# Fits a pyod KNN detector on the training features, labels every train and
# test sample as inlier (0) or outlier (1), saves the fitted model, and writes
# the inlier-only subsets of both sets to CSV.

# Load the normalized train and test sets.
# NOTE(review): these are the same input file names, and below the same output
# file names, as in the "1. NoFS" cell, yet the recorded output of this cell
# shows 14 columns where that cell shows 983. The files on disk must have
# differed between the two runs, and this cell overwrites the CSVs written by
# the earlier one. Confirm the intended paths for the FS variant.
train_set = pd.read_csv('train_set_norm.csv', header=0)
print(train_set.shape)
test_set = pd.read_csv('test_set_norm.csv', header=0)
print(test_set.shape)

# Feature matrices: every column except the sample identifier and the label.
# Built once here instead of being rebuilt for each fit/predict call.
non_feature_cols = ['Sample_id', 'CLASS']
ad_features_tr = train_set.drop(non_feature_cols, axis=1).values
ad_features_te = test_set.drop(non_feature_cols, axis=1).values

# Fit the model.
# NOTE(review): random_state is never passed to KNN (KNN is constructed without
# it); it is kept only in case a later cell reads it.
random_state = 47
clf = KNN(contamination=0.05, n_neighbors=20, method='largest', radius=1.0,
          algorithm='kd_tree', metric='minkowski', n_jobs=16)
clf.fit(ad_features_tr)

# Predicted labels on the training set: 1 = outlier, 0 = inlier.
y_pred = clf.predict(ad_features_tr)
n_outliers = np.count_nonzero(y_pred == 1)
n_inliers = len(y_pred) - n_outliers

print('OUTLIERS : ', n_outliers, 'INLIERS : ', n_inliers)

# Save the fitted model.
file_name = 'AD_KNN_Chemical_FS_model.pkl'
_ = joblib.dump(clf, file_name, compress=9)

print(clf.threshold_)
print(clf.get_params())

# ---- Train ----
# Take an explicit copy so that adding a column below modifies an independent
# frame rather than a slice of train_set (avoids SettingWithCopy behaviour).
dfx_Tr = train_set[['Sample_id', 'CLASS']].copy()

# Attach the outlier labels.
# NOTE(review): the column is called 'Outliers_LOF' although the labels come
# from the KNN detector; the name is kept so later cells that read it still work.
dfx_Tr['Outliers_LOF'] = y_pred.tolist()

print(dfx_Tr.shape)

dfx_Tr_sorted = dfx_Tr[dfx_Tr.Outliers_LOF == 0]

# Keep inlier rows by position. Filtering through Sample_id.isin(...) would
# also keep an outlier row whenever its Sample_id appears on an inlier row.
train_set_AD = train_set.loc[y_pred == 0]
print(train_set_AD.shape)
print('class composition original')
print(train_set.CLASS.value_counts())
print('class composition in AD')
print(train_set_AD.CLASS.value_counts())

train_set_AD.to_csv('train_set_norm_within_AD.csv', header=True, index=True, index_label=['Index'])

# ---- Test ----
y_pred_te = clf.predict(ad_features_te)

n_outliers_te = np.count_nonzero(y_pred_te == 1)
n_inliers_te = len(y_pred_te) - n_outliers_te

print('\nOUTLIERS : ', n_outliers_te, 'INLIERS : ', n_inliers_te)

# Independent copy, for the same reason as dfx_Tr above.
dfx_Te = test_set[['Sample_id', 'CLASS']].copy()

# Attach the outlier labels (same column name as for the training frame).
dfx_Te['Outliers_LOF'] = y_pred_te.tolist()

print(dfx_Te.shape)

dfx_Te_sorted = dfx_Te[dfx_Te.Outliers_LOF == 0]

# Positional inlier mask, as for the training set.
test_set_AD = test_set.loc[y_pred_te == 0]
print(test_set_AD.shape)
print('class composition original')
print(test_set.CLASS.value_counts())
print('class composition in AD')
print(test_set_AD.CLASS.value_counts())

test_set_AD.to_csv('test_set_norm_within_AD.csv', header=True, index=True, index_label=['Index'])

(277, 14)
(69, 14)
OUTLIERS :  14 INLIERS :  263
4.107583863684451
{'algorithm': 'kd_tree', 'contamination': 0.05, 'leaf_size': 30, 'method': 'largest', 'metric': 'minkowski', 'metric_params': None, 'n_jobs': 16, 'n_neighbors': 20, 'p': 2, 'radius': 1.0}
(277, 3)
(263, 14)
class composition original
1    215
0     62
Name: CLASS, dtype: int64
class composition in AD
1    204
0     59
Name: CLASS, dtype: int64

OUTLIERS :  1 INLIERS :  68
(69, 3)
(68, 14)
class composition original
1    54
0    15
Name: CLASS, dtype: int64
class composition in AD
1    53
0    15
Name: CLASS, dtype: int64
