In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
# Names of the CSV columns that the notebook loads.
columns = [
    'Triaxial_y',
    'Track Position',
]

In [3]:
# Read only the columns listed in `columns` from the CSV file.
df = pd.read_csv(
    'D:/DR_train/Monitoracao_diagnostico_maquinas/Python/df.csv',
    usecols=columns,
)

In [4]:
# Reset to matplotlib's default style, then override settings one rc group at a time.
plt.style.use('default')

# Text and axes appearance.
plt.rc('font', size=16, family='Arial')
plt.rc('axes', linewidth=2, titlesize=20, labelsize=20, edgecolor='black', grid=True)

# Line/marker defaults and figure size.
plt.rc('lines', linewidth=1.5, markersize=6)
plt.rc('figure', figsize=(15, 6))

# Tick label sizes.
plt.rc('xtick', labelsize=14)
plt.rc('ytick', labelsize=14)

# Legend box: opaque, framed, rounded corners, no shadow.
plt.rc(
    'legend',
    fontsize=13,
    framealpha=1,
    edgecolor='black',
    shadow=False,
    fancybox=True,
    frameon=True,
)


In [9]:
from scipy import stats
import matplotlib.font_manager
from pyod.models.abod import ABOD
from pyod.models.knn import KNN
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.lof import LOF
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import optuna

In [6]:
# Hold out the last 30% of the rows as the test split; with shuffle disabled
# the original row order is preserved in both splits.
X_train, X_test = train_test_split(
    df,
    test_size=0.3,
    shuffle=False,
)

In [7]:
# Fit the scaler on the training split only, then apply the same fitted
# transformation to both splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
def objective(trial):
    """Optuna objective: fit a pyod ``KNN`` detector and score it on the test split.

    Hyperparameters are sampled from ``trial``; the detector is fitted on the
    notebook-level ``X_train_scaled`` and evaluated on ``X_test_scaled``.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample ``contamination``, ``n_neighbors``, ``method``,
        ``radius``, ``algorithm`` and ``leaf_size``.

    Returns
    -------
    float
        Mean of the outlier scores that the fitted detector assigns to
        ``X_test_scaled``.

    Notes
    -----
    Only the work needed for the returned value is performed: no label
    prediction or silhouette computation is done per trial, since neither
    feeds the objective and both are costly on large inputs.
    """
    # Search space (parameter names are what ends up in ``study.best_params``).
    contamination = trial.suggest_float('contamination', 0.01, 0.2)
    n_neighbors = trial.suggest_int('n_neighbors', 1, 1000)
    method = trial.suggest_categorical('method', ['largest', 'mean', 'median'])
    radius = trial.suggest_float('radius', 0.1, 10)
    algorithm = trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute'])
    leaf_size = trial.suggest_int('leaf_size', 5, 100)

    model = KNN(
        contamination=contamination,
        n_neighbors=n_neighbors,
        method=method,
        radius=radius,
        algorithm=algorithm,
        leaf_size=leaf_size,
    )
    model.fit(X_train_scaled)

    # Raw outlier scores for the held-out data.
    y_test_scores = model.decision_function(X_test_scaled)

    # NOTE(review): the study optimizes the mean outlier score; confirm this is
    # the intended objective (a silhouette-based objective would additionally
    # need model.predict() and silhouette_score here).
    return np.mean(y_test_scores)
    
# In-memory Optuna study that maximizes the value returned by `objective`
# over 20 trials.
study_knn = optuna.create_study(direction='maximize')
study_knn.optimize(func=objective, n_trials=20)

# Last expression of the cell: displays the best hyperparameters found.
study_knn.best_params

[32m[I 2022-08-03 11:02:05,553][0m A new study created in memory with name: no-name-b69c5b50-b12a-4711-95ee-933fc77eebc3[0m
[33m[W 2022-08-03 11:07:32,910][0m Trial 0 failed because of the following error: MemoryError((147988948, 928), dtype('float64'))[0m
Traceback (most recent call last):
  File "C:\Users\vinic\anaconda3\envs\tf-gpu\lib\site-packages\optuna\study\_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\vinic\AppData\Local\Temp\ipykernel_14032\447020629.py", line 18, in objective
    model.fit(X_train_scaled)
  File "C:\Users\vinic\anaconda3\envs\tf-gpu\lib\site-packages\pyod\models\knn.py", line 210, in fit
    dist_arr, _ = self.neigh_.kneighbors(n_neighbors=self.n_neighbors,
  File "C:\Users\vinic\anaconda3\envs\tf-gpu\lib\site-packages\sklearn\neighbors\_base.py", line 776, in kneighbors
    chunked_results = Parallel(n_jobs, **parallel_kwargs)(
  File "C:\Users\vinic\anaconda3\envs\tf-gpu\lib\site-packages\joblib\parallel.py

MemoryError: Unable to allocate 1023. GiB for an array with shape (147988948, 928) and data type float64

In [11]:
# Fixed KNN detector configuration used for the final fit.
knn_params = {
    'contamination': 0.01168569852132198,
    'n_neighbors': 406,
    'method': 'median',
    'radius': 6.826488748721495,
    'algorithm': 'kd_tree',
    'leaf_size': 49,
}
model = KNN(**knn_params)
print(model)
model.fit(X_train_scaled)

# Results for the training data, read from the fitted detector.
y_train_scores = model.decision_scores_  # raw outlier scores
y_train_pred = model.labels_  # binary labels (0: inliers, 1: outliers)

# Results for the test data.
y_test_scores = model.decision_function(X_test_scaled)  # outlier scores
y_test_pred = model.predict(X_test_scaled)  # outlier labels (0 or 1)

  warn('algorithm parameter is deprecated and will be removed '


KNN(algorithm='kd_tree', contamination=0.01168569852132198, leaf_size=49,
  method='median', metric='minkowski', metric_params=None, n_jobs=1,
  n_neighbors=406, p=2, radius=6.826488748721495)


MemoryError: Unable to allocate 449. GiB for an array with shape (147988948, 407) and data type float64

In [None]:
# Count how many test samples received each predicted label.
unique, counts = np.unique(y_test_pred, return_counts=True)
total = dict(zip(unique, counts))

# Use a default of 0 so that a label missing from the predictions
# (e.g. no sample flagged as outlier) is reported as zero instead of
# raising KeyError.
n_inliers = total.get(0, 0)
n_outliers = total.get(1, 0)

print('Inliers: ', n_inliers, 'Outliers: ', n_outliers)

In [None]:
# Wrap the training labels in a Series that shares X_train's index, so the
# column-wise concat below pairs each label with its own row regardless of
# how X_train is indexed (pd.concat with axis=1 aligns on index labels).
y_train_pred2 = pd.Series(y_train_pred, index=X_train.index, name='IsOutlier?')

df_train = pd.concat([X_train, y_train_pred2], axis=1)

# Store the label as text so it is treated as a category rather than a number
# when used for colouring plots.
df_train['IsOutlier?'] = df_train['IsOutlier?'].astype('str')

In [None]:
# Silhouette score of the inlier/outlier labelling on the scaled training data.
# NOTE(review): silhouette_score works on pairwise distances, so this can be
# very expensive for large inputs — consider its `sample_size` option.
silhouette_avg = silhouette_score(X_train_scaled, y_train_pred)

# Name of the detector for the report line, taken from the fitted model's class.
clf_name = type(model).__name__

# n_outliers / n_inliers are the counts computed from the test-set predictions.
print('OUTLIERS: ', n_outliers, 'INLIERS: ', n_inliers, clf_name, 'SILHOUETTE SCORE: ', silhouette_avg)

# Colour points by the outlier flag; colouring is driven by `hue` alone.
sns.scatterplot(data=df_train, x=df_train.index, y=df_train['Triaxial_y'], hue='IsOutlier?')