In [35]:
# import neophodnih biblioteka
import joblib
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import time

# Učitavanje podataka

In [48]:
# Load the previously split datasets; each "unsw_corr_*" pickle unpacks
# into a (features, target) pair.
X_train, y_train = joblib.load("unsw_corr_train.pkl")
X_validation, y_validation = joblib.load("unsw_corr_val.pkl")
X_test, y_test = joblib.load("unsw_corr_test.pkl")
print(f"Training set:{X_train.shape}\nTest set:{X_test.shape}")

# Load the PCA-transformed datasets. Only the last two of the four objects
# stored in "unsw_trv_unprocessed.pkl" are kept (the PCA train / validation
# targets); the first two are discarded.
_, _, y_train_pca, y_validation_pca = joblib.load("unsw_trv_unprocessed.pkl")
X_train_pca, X_validation_pca, X_test_pca = joblib.load("unsw_pca.pkl")
print(f"PCA Training set:{X_train_pca.shape}\nPCA Test set:{X_test_pca.shape}")

Training set:(162955, 18)
Test set:(82328, 18)
PCA Training set:(140254, 16)
PCA Test set:(82328, 16)


In [38]:
# Check the class distribution in the training set - after balancing
y_train.value_counts()

attack_cat
Normal            44799
Generic           31982
Exploits          26709
Fuzzers           14565
DoS               12800
Reconnaissance    10500
Analysis           8600
Worms              5200
Shellcode          4500
Backdoor           3300
Name: count, dtype: int64

In [39]:
from sklearn.preprocessing import LabelEncoder

def train_xgboost_classifier(X_train, X_test, y_train, y_test):
    """Train an XGBoost classifier with fixed hyperparameters and print test metrics.

    The target labels are integer-encoded with a LabelEncoder fitted on
    ``y_train``; predictions are decoded back to the original labels before
    they are compared with ``y_test``.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices used for fitting and for prediction, respectively.
    y_train, y_test : array-like
        Target labels (the attack category) for the two sets.

    Returns
    -------
    None
        The training time, accuracy, classification report and confusion
        matrix are printed; nothing is returned.
    """
    # Encode the target variable (attack_cat) as integers for XGBoost
    le = LabelEncoder()
    y_train_encoded = le.fit_transform(y_train)

    # Build the model. 'mlogloss' is the multi-class log loss; the binary
    # 'logloss' metric does not match a multi-class target.
    xgb_model = xgb.XGBClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric='mlogloss'
    )

    print("Počinje treniranje XGBoost modela...")
    start_time = time.time()
    xgb_model.fit(X_train, y_train_encoded)
    training_time = time.time() - start_time
    print(f"Treniranje završeno za {training_time:.2f} sekundi")

    # Predict, then map the integer predictions back to the original labels
    print("Pravi predviđanja...")
    y_pred_encoded = xgb_model.predict(X_test)
    y_pred = le.inverse_transform(y_pred_encoded)

    # Evaluation
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nTačnost modela: {accuracy:.4f} ({accuracy*100:.2f}%)")

    print("\nDetaljni classification report:")
    print(classification_report(y_test, y_pred))

    print("\nConfusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    print(cm)


In [44]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, make_scorer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import xgboost as xgb
import numpy as np
import pandas as pd
import time

def xgboost_grid_search(X_train, X_test, y_train, y_test, param_grid=None, cv=3, scoring='macro_f1'):
    """Tune an XGBoost classifier with a grid search and evaluate it on the test set.

    The target labels are integer-encoded with a LabelEncoder fitted on
    ``y_train``. A GridSearchCV over ``param_grid`` is run with stratified,
    shuffled K-fold cross-validation, the best estimator is refitted, and its
    test-set accuracy, macro F1, classification report and confusion matrix
    are printed.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices used for the search and for the final evaluation.
    y_train, y_test : array-like
        Target labels for the two sets.
    param_grid : dict, optional
        Hyperparameter grid passed to GridSearchCV. When None, a default grid
        over n_estimators, max_depth, learning_rate, subsample and
        colsample_bytree is used.
    cv : int, default 3
        Number of stratified folds.
    scoring : str or callable, default 'macro_f1'
        Metric used to select (and refit) the best candidate. 'macro_f1' is a
        local alias for macro-averaged F1; any other string is passed to
        scikit-learn as a scorer name (e.g. 'accuracy', 'f1_weighted'); a
        callable is used directly as the scorer, so it must follow the
        scorer signature scikit-learn expects.

    Returns
    -------
    dict
        'best_model', 'best_params', 'cv_best_score', 'label_encoder' and
        'all_results' (the cv_results_ table as a DataFrame).
    """
    if param_grid is None:
        param_grid = {
            'n_estimators': [200, 300],
            'max_depth': [6, 8],
            'learning_rate': [0.05, 0.1],
            'subsample': [0.8, 0.9],
            'colsample_bytree': [0.8, 0.9]
        }

    le = LabelEncoder()
    y_train_enc = le.fit_transform(y_train)

    # Resolve the requested metric instead of silently ignoring `scoring`.
    # 'macro_f1' is not a built-in scikit-learn scorer name, so it is mapped
    # to an explicit scorer; this keeps the default behaviour unchanged.
    if callable(scoring):
        metric_name = getattr(scoring, '__name__', 'custom_score')
        scorer = scoring
    elif scoring == 'macro_f1':
        metric_name = 'macro_f1'
        scorer = make_scorer(f1_score, average='macro')
    else:
        metric_name = scoring
        scorer = scoring
    scoring_dict = {metric_name: scorer}

    cv_strategy = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    # NOTE(review): n_jobs=-1 is set both here and on GridSearchCV below;
    # this may oversubscribe the CPU - confirm on the target machine.
    model = xgb.XGBClassifier(
        random_state=42,
        n_jobs=-1,
        eval_metric='mlogloss',
        tree_method='hist'
    )

    grid = GridSearchCV(
        model,
        param_grid,
        scoring=scoring_dict,
        refit=metric_name,
        cv=cv_strategy,
        verbose=1,
        n_jobs=-1,
        return_train_score=True
    )

    start = time.time()
    grid.fit(X_train, y_train_enc)
    print(f"Grid Search završio za {(time.time()-start):.2f} sekundi")

    best_model = grid.best_estimator_

    # Predict on the test set and map the integer predictions back to labels
    y_pred = best_model.predict(X_test)
    y_pred_decoded = le.inverse_transform(y_pred)

    # Pin the matrix rows/columns to the encoder's classes so its shape always
    # matches the DataFrame index/columns, even when a class is absent from
    # the test labels and the predictions.
    cm = confusion_matrix(y_test, y_pred_decoded, labels=le.classes_)

    print("Test set metrics:")
    print("Accuracy:", accuracy_score(y_test, y_pred_decoded))
    print("Macro F1:", f1_score(y_test, y_pred_decoded, average='macro'))
    print(classification_report(y_test, y_pred_decoded))
    print("Confusion Matrix:\n", pd.DataFrame(cm, index=le.classes_, columns=le.classes_))

    return {
        'best_model': best_model,
        'best_params': grid.best_params_,
        'cv_best_score': grid.best_score_,
        'label_encoder': le,
        'all_results': pd.DataFrame(grid.cv_results_)
    }

# Treniranje i evaluacija pomoću XGBoost-a

Izabrani model je jedan od najčešće primenjivanih u naučnim radovima i pokazao je jedan od najboljih rezultata. U radovima se uglavnom koriste klasični ML klasifikatori, dok pojedini radovi primenjuju deep learning ili kombinovane metode za feature selection.

 Model ćemo trenirati na tri različita dataseta:
   * na podacima nad kojima je izvršen feature selection pomoću Decision Tree-a i analize korelacije
   * na podacima redukovanim pomoću PCA metode radi zadržavanja najveće varijanse
   * na podacima na kojima je primenjena metoda redukcije pomoću autoenkodera


In [41]:
# Stack the training and validation splits row-wise into one training set,
# then fit the fixed-hyperparameter model and print its test-set metrics.
X_combined = np.concatenate((X_train, X_validation))
y_combined = np.concatenate((y_train, y_validation))

train_xgboost_classifier(
    X_train=X_combined,
    X_test=X_test,
    y_train=y_combined,
    y_test=y_test,
)

Počinje treniranje XGBoost modela...
Treniranje završeno za 11.38 sekundi
Pravi predviđanja...

Tačnost modela: 0.7551 (75.51%)

Detaljni classification report:
                precision    recall  f1-score   support

      Analysis       0.06      0.27      0.09       677
      Backdoor       0.62      0.05      0.10       583
           DoS       0.47      0.12      0.19      4089
      Exploits       0.59      0.87      0.70     11128
       Fuzzers       0.32      0.57      0.41      6062
       Generic       1.00      0.96      0.98     18871
        Normal       0.96      0.73      0.83     37000
Reconnaissance       0.93      0.79      0.85      3496
     Shellcode       0.18      0.87      0.30       378
         Worms       0.41      0.70      0.52        44

      accuracy                           0.76     82328
     macro avg       0.56      0.59      0.50     82328
  weighted avg       0.84      0.76      0.77     82328


Confusion Matrix:
[[  184     0    15   419     2  

In [42]:
# Grid search on the combined train+validation set; the returned dict is kept
# in `results`.
results = xgboost_grid_search(
    X_train=X_combined,
    X_test=X_test,
    y_train=y_combined,
    y_test=y_test,
    scoring='macro_f1',
)

Fitting 3 folds for each of 32 candidates, totalling 96 fits
Grid Search završio za 2021.95 sekundi
Test set metrics:
Accuracy: 0.7514211446895346
Macro F1: 0.5158149862457757
                precision    recall  f1-score   support

      Analysis       0.06      0.29      0.10       677
      Backdoor       0.09      0.15      0.11       583
           DoS       0.46      0.18      0.25      4089
      Exploits       0.63      0.81      0.71     11128
       Fuzzers       0.28      0.51      0.36      6062
       Generic       1.00      0.96      0.98     18871
        Normal       0.96      0.74      0.83     37000
Reconnaissance       0.92      0.80      0.85      3496
     Shellcode       0.25      0.83      0.38       378
         Worms       0.54      0.59      0.57        44

      accuracy                           0.75     82328
     macro avg       0.52      0.59      0.52     82328
  weighted avg       0.83      0.75      0.77     82328

Confusion Matrix:
                 An

* Model je treniran na podacima redukovanim pomoću PCA metode.

In [49]:
# Stack the PCA training and validation splits row-wise, then run the same
# grid search; the PCA test features are scored against `y_test`.
X_pca_combined = np.concatenate((X_train_pca, X_validation_pca))
y_pca_combined = np.concatenate((y_train_pca, y_validation_pca))

results_pca = xgboost_grid_search(
    X_train=X_pca_combined,
    X_test=X_test_pca,
    y_train=y_pca_combined,
    y_test=y_test,
    scoring='macro_f1',
)

Fitting 3 folds for each of 32 candidates, totalling 96 fits
Grid Search završio za 2069.19 sekundi
Test set metrics:
Accuracy: 0.7414123991837528
Macro F1: 0.4410337631849277
                precision    recall  f1-score   support

      Analysis       0.00      0.00      0.00       677
      Backdoor       0.02      0.06      0.03       583
           DoS       0.30      0.12      0.17      4089
      Exploits       0.60      0.79      0.68     11128
       Fuzzers       0.29      0.56      0.38      6062
       Generic       1.00      0.95      0.98     18871
        Normal       0.94      0.74      0.83     37000
Reconnaissance       0.71      0.77      0.74      3496
     Shellcode       0.41      0.49      0.45       378
         Worms       0.44      0.09      0.15        44

      accuracy                           0.74     82328
     macro avg       0.47      0.46      0.44     82328
  weighted avg       0.80      0.74      0.76     82328

Confusion Matrix:
                 An

In [52]:
# Load the autoencoder-reduced train and test sets (each pickle unpacks into
# a (features, target) pair) and run the fixed-hyperparameter model on them.
# NOTE(review): `y_test_eutoenc` looks like a typo for `y_test_autoenc`; the
# name is kept as-is in case other cells reference it.
X_test_autoenc, y_test_eutoenc = joblib.load("unsw_test_reduced_autoenc.pkl")
X_train_autoenc, y_train_autoenc = joblib.load("unsw_train_reduced_autoenc.pkl")

train_xgboost_classifier(
    X_train=X_train_autoenc,
    X_test=X_test_autoenc,
    y_train=y_train_autoenc,
    y_test=y_test_eutoenc,
)

Počinje treniranje XGBoost modela...
Treniranje završeno za 11.05 sekundi
Pravi predviđanja...

Tačnost modela: 0.7136 (71.36%)

Detaljni classification report:
                precision    recall  f1-score   support

      Analysis       0.00      0.00      0.00       677
      Backdoor       0.02      0.04      0.02       583
           DoS       0.28      0.06      0.09      4089
      Exploits       0.55      0.83      0.66     11128
       Fuzzers       0.24      0.57      0.34      6062
       Generic       1.00      0.90      0.95     18871
        Normal       0.95      0.70      0.81     37000
Reconnaissance       0.63      0.75      0.68      3496
     Shellcode       0.40      0.38      0.39       378
         Worms       0.75      0.07      0.12        44

      accuracy                           0.71     82328
     macro avg       0.48      0.43      0.41     82328
  weighted avg       0.79      0.71      0.73     82328


Confusion Matrix:
[[    0    31    82   542    10  