In [None]:
# import neophodnih biblioteka
import joblib
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
import pandas as pd
import time

# Učitavanje podataka

In [3]:
# Load the previously split feature sets together with their label vectors.
# NOTE(review): joblib.load unpickles the file, so only load trusted artifacts.

unsw_trset, unsw_testset, y_train, y_test = joblib.load("unsw_dt_corr.pkl")
print("Training set:", unsw_trset.shape, "\nTest set:", unsw_testset.shape, sep="")

# Load the PCA-transformed versions of the same train/test split.

X_train_pca, X_test_pca = joblib.load("unsw_pca.pkl")
print("PCA Training set:", X_train_pca.shape, "\nPCA Test set:", X_test_pca.shape, sep="")

Training set:(140254, 18)
Test set:(35064, 18)
PCA Training set:(140254, 16)
PCA Test set:(35064, 16)


In [4]:
# Check the class distribution of the training labels - the dataset is imbalanced.
y_train.value_counts()

attack_cat
Normal            44799
Generic           31982
Exploits          26709
Fuzzers           14565
DoS                9797
Reconnaissance     8438
Analysis           1610
Backdoor           1358
Shellcode           892
Worms               104
Name: count, dtype: int64

In [4]:
from sklearn.preprocessing import LabelEncoder

def train_xgboost_classifier(X_train, X_test, y_train, y_test):
    """Train an XGBoost classifier with fixed hyperparameters and print test metrics.

    The class labels are integer-encoded for XGBoost and decoded again before
    evaluation, so every printed metric uses the original label names.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices used for fitting and for evaluation.
    y_train, y_test : array-like
        Class labels matching the rows of ``X_train`` / ``X_test``.

    Returns
    -------
    None
        Training time, accuracy, classification report and confusion matrix
        are printed rather than returned.

    Raises
    ------
    ValueError
        If ``y_test`` contains a label that does not occur in ``y_train``
        (raised by ``LabelEncoder.transform``).
    """
    # Encode the target labels (attack_cat) as integers, as XGBoost requires.
    le = LabelEncoder()
    y_train_encoded = le.fit_transform(y_train)
    # The result is intentionally discarded: this call only verifies, before
    # the costly fit, that every test label was seen during training.
    le.transform(y_test)

    # 'logloss' is the binary log loss; a target with more than two classes
    # needs the multiclass variant 'mlogloss'.
    eval_metric = 'mlogloss' if len(le.classes_) > 2 else 'logloss'

    # Build the model
    xgb_model = xgb.XGBClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric=eval_metric
    )

    print("Počinje treniranje XGBoost modela...")
    start_time = time.time()
    xgb_model.fit(X_train, y_train_encoded)
    training_time = time.time() - start_time
    print(f"Treniranje završeno za {training_time:.2f} sekundi")

    # Predict and map the integer predictions back to the original labels.
    print("Pravi predviđanja...")
    y_pred_encoded = xgb_model.predict(X_test)
    y_pred = le.inverse_transform(y_pred_encoded)

    # Evaluation on the held-out test set
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nTačnost modela: {accuracy:.4f} ({accuracy*100:.2f}%)")

    print("\nDetaljni classification report:")
    print(classification_report(y_test, y_pred))

    print("\nConfusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    print(cm)


In [None]:
from sklearn.model_selection import GridSearchCV
def xgboost_grid_search(X_train, X_test, y_train, y_test, param_grid=None, cv=3, verbose=True, scoring='accuracy'):
    """Tune an XGBoost classifier with an exhaustive grid search and evaluate the winner.

    The class labels are integer-encoded for XGBoost; test-set predictions are
    decoded again so the printed metrics use the original label names.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices used for the cross-validated search and for the
        final evaluation.
    y_train, y_test : array-like
        Class labels matching the rows of ``X_train`` / ``X_test``.
    param_grid : dict or list of dict, optional
        Grid passed to ``GridSearchCV``. When ``None``, a default grid with
        3 values for each of 5 hyperparameters (243 combinations) is used.
    cv : int, default 3
        Cross-validation setting forwarded to ``GridSearchCV``.
    verbose : bool, default True
        Whether ``GridSearchCV`` reports its progress.
    scoring : str, default 'accuracy'
        Metric used to rank the candidates.

    Returns
    -------
    tuple
        ``(best_model, best_params, best_score)`` where ``best_score`` is the
        mean cross-validated score of the best candidate.
    """
    # Default search space
    if param_grid is None:
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 6, 9],
            'learning_rate': [0.01, 0.1, 0.2],
            'subsample': [0.8, 0.9, 1.0],
            'colsample_bytree': [0.8, 0.9, 1.0]
        }

    # GridSearchCV accepts a single dict or a list of dicts; normalise to a
    # list so the combination count below works for both forms.
    if isinstance(param_grid, dict):
        grids = [param_grid]
    else:
        param_grid = list(param_grid)
        grids = param_grid
    n_candidates = sum(
        int(np.prod([len(values) for values in grid.values()])) for grid in grids
    )

    # Encode the string labels as integers, as XGBoost requires.
    le = LabelEncoder()
    y_train_encoded = le.fit_transform(y_train)

    # 'logloss' is the binary log loss; a target with more than two classes
    # needs the multiclass variant 'mlogloss'.
    eval_metric = 'mlogloss' if len(le.classes_) > 2 else 'logloss'

    # Base model; the searched hyperparameters are set by GridSearchCV.
    xgb_model = xgb.XGBClassifier(
        random_state=42,
        n_jobs=-1,
        eval_metric=eval_metric
    )

    print("Počinje Grid Search...")
    print(f"Ukupno kombinacija: {n_candidates}")

    start_time = time.time()

    # Grid Search
    grid_search = GridSearchCV(
        estimator=xgb_model,
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        verbose=1 if verbose else 0
    )

    grid_search.fit(X_train, y_train_encoded)

    search_time = time.time() - start_time
    print(f"Grid Search završen za {search_time:.2f} sekundi")

    # Best estimator found by the search
    best_model = grid_search.best_estimator_

    print(f"\nNajbolji score (CV): {grid_search.best_score_:.4f}")
    print("Najbolji parametri:")
    for param, value in grid_search.best_params_.items():
        print(f"  {param}: {value}")

    # Evaluate the best model on the held-out test set.
    print("\nEvaluacija najboljeg modela na test setu:")
    y_pred_encoded = best_model.predict(X_test)
    y_pred = le.inverse_transform(y_pred_encoded)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Tačnost: {accuracy:.4f} ({accuracy*100:.2f}%)")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print("\nConfusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

    return best_model, grid_search.best_params_, grid_search.best_score_


# Treniranje i evaluacija pomoću XGBoost-a

Izabrani model je jedan od najčešće primenjivanih u naučnim radovima i pokazao je jedan od najboljih rezultata. U radovima se uglavnom koriste klasični ML klasifikatori, dok pojedini radovi primenjuju deep learning ili kombinovane metode za feature selection.

 Model ćemo trenirati na dva različita dataseta:
   * na podacima nad kojima je izvršen feature selection pomoću Decision Tree-a i analize korelacije
   * na podacima redukovanim pomoću PCA metode radi zadržavanja najveće varijanse


In [6]:

# Baseline run on the split loaded from "unsw_dt_corr.pkl" (the feature-selected dataset).
train_xgboost_classifier(unsw_trset, unsw_testset, y_train, y_test)

Počinje treniranje XGBoost modela...
Treniranje završeno za 14.33 sekundi
Pravi predviđanja...

Tačnost modela: 0.8239 (82.39%)

Detaljni classification report:
                precision    recall  f1-score   support

      Analysis       0.65      0.08      0.14       390
      Backdoor       0.95      0.10      0.17       388
           DoS       0.46      0.05      0.09      2467
      Exploits       0.61      0.94      0.74      6668
       Fuzzers       0.73      0.72      0.73      3615
       Generic       1.00      0.98      0.99      8018
        Normal       0.93      0.92      0.93     11198
Reconnaissance       0.90      0.74      0.81      2053
     Shellcode       0.66      0.63      0.65       241
         Worms       0.67      0.31      0.42        26

      accuracy                           0.82     35064
     macro avg       0.76      0.55      0.57     35064
  weighted avg       0.82      0.82      0.80     35064


Confusion Matrix:
[[   30     0     9   293     0  

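Model XGBoost ostvario je tačnost od 82,39%. Klase Generic i Normal prepoznaje odlično (F1 0,99 i 0,93), a klasu Exploits uz visok odziv (0,94), ali nisku preciznost (0,61), jer se u nju pogrešno svrstava veliki deo drugih napada. Slabo prepoznaje retke klase Analysis i Backdoor, kao i klasu DoS (odziv 0,05), iako ona nije retka (2467 uzoraka u test skupu). Rezultati pokazuju da je model pouzdan za najčešće tipove saobraćaja, ali da je za praktičnu primenu potrebno dodatno balansiranje i unapređenje performansi na slabo prepoznatim klasama.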

In [16]:
# Tune hyperparameters over the function's default grid (no param_grid passed),
# ranking candidates by 'f1_macro' instead of the default 'accuracy'.
best_model, best_params, best_score = xgboost_grid_search(
    unsw_trset, unsw_testset, y_train, y_test, scoring='f1_macro'
)

Počinje Grid Search...
Ukupno kombinacija: 243
Fitting 3 folds for each of 243 candidates, totalling 729 fits
Grid Search završen za 6679.00 sekundi

Najbolji score (CV): 0.6011
Najbolji parametri:
  colsample_bytree: 0.9
  learning_rate: 0.2
  max_depth: 9
  n_estimators: 300
  subsample: 1.0

Evaluacija najboljeg modela na test setu:
Tačnost: 0.8285 (82.85%)

Classification Report:
                precision    recall  f1-score   support

      Analysis       0.57      0.15      0.24       390
      Backdoor       0.67      0.11      0.19       388
           DoS       0.37      0.14      0.20      2467
      Exploits       0.62      0.89      0.73      6668
       Fuzzers       0.77      0.74      0.76      3615
       Generic       1.00      0.98      0.99      8018
        Normal       0.94      0.93      0.94     11198
Reconnaissance       0.90      0.74      0.81      2053
     Shellcode       0.70      0.65      0.67       241
         Worms       0.62      0.38      0.48       

* Model je treniran na podacima redukovanim pomoću PCA metode.

In [5]:
# Same fixed-hyperparameter model, trained on the PCA-transformed features for comparison.
train_xgboost_classifier(X_train_pca, X_test_pca, y_train, y_test)

Počinje treniranje XGBoost modela...
Treniranje završeno za 16.48 sekundi
Pravi predviđanja...

Tačnost modela: 0.8062 (80.62%)

Detaljni classification report:
                precision    recall  f1-score   support

      Analysis       0.53      0.10      0.17       390
      Backdoor       0.62      0.06      0.11       388
           DoS       0.46      0.03      0.05      2467
      Exploits       0.60      0.93      0.72      6668
       Fuzzers       0.65      0.72      0.68      3615
       Generic       1.00      0.98      0.99      8018
        Normal       0.94      0.89      0.91     11198
Reconnaissance       0.79      0.70      0.74      2053
     Shellcode       0.59      0.49      0.54       241
         Worms       0.50      0.04      0.07        26

      accuracy                           0.81     35064
     macro avg       0.67      0.49      0.50     35064
  weighted avg       0.81      0.81      0.78     35064


Confusion Matrix:
[[  40   11    8  291    0    0  

| Klasa          | F1-score (PCA) | F1-score (Original) |
| -------------- | -------------- | ------------------- |
| Analysis       | 0.17           | 0.14                |
| Backdoor       | 0.11           | 0.17                |
| DoS            | 0.05           | 0.09                |
| Exploits       | 0.72           | 0.74                |
| Fuzzers        | 0.68           | 0.73                |
| Generic        | 0.99           | 0.99                |
| Normal         | 0.91           | 0.93                |
| Reconnaissance | 0.74           | 0.81                |
| Shellcode      | 0.54           | 0.65                |
| Worms          | 0.07           | 0.42                |
| **Accuracy**   | 80.62%         | 82.39%              |

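Originalni dataset (nakon selekcije atributa) daje veću tačnost i veći ili jednak F1-score za sve klase osim klase Analysis (0,14 naspram 0,17 kod PCA). Razlika je najizraženija kod retke klase Worms (0,42 naspram 0,07).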
