In [47]:
import hashlib
import io
import itertools

import jax
import jax.numpy as jnp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import requests
import seaborn as sns
from sklearn import tree
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import TargetEncoder
from sklearn.tree import DecisionTreeClassifier
from sympy import Integer

Następnie wczytujemy dane i wybieramy ich kodowanie (encoding) wraz z PCA.

In [48]:
from data_encoder import encode_data

# Read the raw dataset from the CSV file located in the working directory.
network_data = pd.read_csv('dataset_sdn.csv')

# encode_data is a project helper; it receives the raw frame and the literal 2.
# NOTE(review): the 2 is presumably the number of PCA components — confirm in data_encoder.
encoded_splits = encode_data(network_data, 2)

# The helper's result unpacks into train/test features and train/test labels.
X_train, X_test, y_train, y_test = encoded_splits

Następnie za pomocą biblioteki sklearn uczymy model Isolation Forest na naszych danych. Jest to model nienadzorowany.

In [49]:
# Fit an Isolation Forest on the training features only; no labels are passed to fit().
# The forest is built from random feature/split choices, so without a fixed seed the
# predictions (and the accuracy reported below) change on every kernel restart.
# random_state=42 matches the seed used by the hyperparameter search in this notebook.
iso_forest = IsolationForest(random_state=42)
iso_forest.fit(X_train)

Teraz każemy modelowi przewidzieć, które próbki ze zbioru testowego są anomaliami.

In [50]:
# Score the held-out test features; predict() yields 1 for inliers and -1 for outliers.
y_pred = iso_forest.predict(X_test)

Isolation Forest zwraca wartości -1 i 1 odpowiednio dla anomalii i danych uznanych za prawidłowe. Z tego powodu, używając np.where, zamieniamy 1 na 0 (dane prawidłowe), a -1 na 1 (anomalia).

In [51]:
# Binarise the forest output: the inlier label 1 becomes 0, and every other value
# (i.e. the -1 outlier label) becomes 1.
y_pred_binary = (y_pred != 1).astype(int)

Teraz możemy ocenić skuteczność naszego modelu, porównując jego predykcje ze znanymi etykietami, które mówią, które próbki są anomaliami.

In [52]:
# Share of test samples whose binarised prediction agrees with the known label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_binary)
print('Dokładność: ', accuracy)

Dokładność:  0.6006034925526451


Próba dobrania hiperparametrów ręcznie

In [53]:
# Candidate values for each IsolationForest hyperparameter explored by the manual search.
n_estimators_values = [50, 100, 200]
max_samples_values = ['auto', 0.5, 0.75, 1.0]
contamination_values = [0.1, 0.2, 0.3]
max_features_values = [1.0, 0.5, 0.75]

# One dict per evaluated combination; converted to a DataFrame once, after the loop.
results = []

# itertools.product enumerates the full Cartesian grid in the same order as four
# nested loops would (the right-most list varies fastest), without the deep nesting.
for n_estimators, max_samples, contamination, max_features in itertools.product(
    n_estimators_values,
    max_samples_values,
    contamination_values,
    max_features_values,
):
    clf = IsolationForest(
        n_estimators=n_estimators,
        max_samples=max_samples,
        contamination=contamination,
        max_features=max_features,
        random_state=42,  # fixed seed so that combinations are compared on equal footing
    )
    clf.fit(X_train)

    # predict() returns 1 for inliers and -1 for outliers; remap to 0 (inlier) / 1 (outlier)
    # so the predictions can be compared against the binary labels.
    y_train_pred = np.where(clf.predict(X_train) == 1, 0, 1)
    y_test_pred = np.where(clf.predict(X_test) == 1, 0, 1)

    # Same three metrics on both splits.
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    train_precision = precision_score(y_train, y_train_pred, average='binary')
    test_precision = precision_score(y_test, y_test_pred, average='binary')
    train_f1 = f1_score(y_train, y_train_pred, average='binary')
    test_f1 = f1_score(y_test, y_test_pred, average='binary')

    results.append({
        'n_estimators': n_estimators,
        'max_samples': max_samples,
        'contamination': contamination,
        'max_features': max_features,
        'train_accuracy': train_accuracy,
        'test_accuracy': test_accuracy,
        'train_precision': train_precision,
        'test_precision': test_precision,
        'train_f1': train_f1,
        'test_f1': test_f1
    })

# Build the results table in one go (one row per hyperparameter combination).
results_df = pd.DataFrame(results)

# Print the full results table.
print(results_df)

# Pick the row with the highest test accuracy.
# NOTE(review): the winner is selected on the test split, so its test metrics are an
# optimistic estimate; a separate validation split would be needed for an unbiased score.
best_result = results_df.loc[results_df['test_accuracy'].idxmax()]
print("\nBest hyperparameters for test accuracy:")
print(best_result)

     n_estimators max_samples  contamination  max_features  train_accuracy  \
0              50        auto            0.1          1.00        0.586762   
1              50        auto            0.1          0.50        0.596035   
2              50        auto            0.1          0.75        0.604056   
3              50        auto            0.2          1.00        0.579540   
4              50        auto            0.2          0.50        0.590546   
..            ...         ...            ...           ...             ...   
103           200         1.0            0.2          0.50        0.612613   
104           200         1.0            0.2          0.75        0.606064   
105           200         1.0            0.3          1.00        0.603093   
106           200         1.0            0.3          0.50        0.618474   
107           200         1.0            0.3          0.75        0.609944   

     test_accuracy  train_precision  test_precision  train_f1  

Najlepsze hiperparametry i ich wyniki:

n_estimators             50

max_samples            0.75

contamination           0.3

max_features           0.75

train_accuracy     0.627114

test_accuracy      0.629141

train_precision    0.532973

test_precision     0.521488

train_f1           0.461667

test_f1            0.462126
