# Vari metodi di classificazione

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import label_binarize
from utility import Dataset, oh_encoder, t_encoder, l_encoder, scaler, get_best_features, cfs, rfe, sfs, pca, eval_metric
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif

from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier, Pool, metrics, cv
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

from sklearn.metrics import roc_curve, auc

# Column names assigned to the raw NSL-KDD files, which are read without a header row.
columns = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
           'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
           'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
           'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
           'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
           'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
           'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
           'dst_host_srv_rerror_rate', 'label', 'score']

# Feature groups: nominal features are label-encoded below, numeric features are scaled.
nominal_features = ['protocol_type', 'service', 'flag']
binary_features = ['land', 'logged_in', 'root_shell', 'su_attempted', 'is_host_login', 'is_guest_login']
# Every column that is not nominal, binary, 'label', 'score' or 'num_outbound_cmds' counts as numeric.
# The exclusions are collected in a set once so each membership test is O(1).
_non_numeric = set(nominal_features + binary_features + ['label', 'score', 'num_outbound_cmds'])
numeric_features = [feature for feature in columns if feature not in _non_numeric]

train_df = pd.read_csv('kaggle/nsl-kdd/KDDTrain+.txt', header=None)
test_df = pd.read_csv('kaggle/nsl-kdd/KDDTest+.txt', header=None)

# Dataset / get_label5 come from the project's `utility` module.
# NOTE(review): presumably this names the columns and maps the labels onto the five classes seen in
# the reports (dos, normal, probe, r2l, u2r) - confirm against utility.py.
train_df = Dataset(train_df, columns).get_label5()
test_df = Dataset(test_df, columns).get_label5()

# Alternative encodings that were tried for the nominal features:
# train_df, test_df = t_encoder(train_df, test_df, ['service', ])
# train_df, test_df = l_encoder(train_df, test_df, ['service', 'protocol_type', 'flag'])
# train_df, test_df = oh_encoder(train_df, test_df, ['protocol_type', 'flag'])
train_df, test_df = l_encoder(train_df, test_df, ['service', 'protocol_type', 'flag'])
train_df, test_df = scaler(train_df, test_df, numeric_features, MinMaxScaler())

# The targets do not depend on k or on the selection method, so they are read once.
y_train = train_df['label']
y_test = test_df['label']

# Sweep the number of selected features and report the test-set metrics for each setting.
# X_train / X_test keep the values of the last iteration and are reused by later cells.
for k in range(25, 30):
    print(f'k={k}')
    for method in [f_classif,]:
        print(f'method={method.__name__}')
        X_train, X_test = get_best_features(train_df, test_df, method, k)

        # classifier = RandomForestClassifier(n_estimators=100, random_state=42)
        # Seeded like the other models in this notebook so the sweep is reproducible.
        classifier = LinearSVC(random_state=42)
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        # zero_division=0 reports the same 0.00 for classes that are never predicted,
        # without emitting an UndefinedMetricWarning for every such class.
        print(classification_report(y_test, y_pred, target_names=classifier.classes_, zero_division=0))

# k = 30
# X_train, X_test = get_best_features(train_df, test_df, f_classif, k)
# X_train, X_test = get_best_features(train_df, test_df, mutual_info_classif, k)
# X_train, X_test = cfs(train_df, test_df)
# X_train, X_test = rfe(train_df, test_df, k)
# X_train, X_test = sfs(train_df, test_df, k)
# X_train, X_test = pca(train_df, test_df, k)



# eval_metric(classifier, X_train, y_train, X_test, y_test)

# linear svc
# rf = LinearSVC()
# rf.fit(X_train, y_train)
# y_pred = rf.predict(X_test)

# model  = CatBoostClassifier(
#     custom_loss=[metrics.Accuracy()],
#     random_seed=42,
#     logging_level='Silent'
# )

# model.fit(
#     X_train, y_train,
#     cat_features=['protocol_type', 'service', 'flag'],
#     eval_set=(X_test, y_test),
# #     logging_level='Verbose',  # you can uncomment this for text output
#     plot=True
# );
# classifier = GaussianNB()
# classifier.fit(X_train, y_train)
# y_pred = classifier.predict(X_test)

# evaluate the model
# accuracy = accuracy_score(y_test, y_pred)
# precision = precision_score(y_test, y_pred, average='weighted')  # 'weighted' accounts for class imbalance
# recall = recall_score(y_test, y_pred, average='weighted')
# f1 = f1_score(y_test, y_pred, average='weighted')
# print(f"Accuracy: {accuracy:.2f}")
# print(f"Precision: {precision:.2f}")
# print(f"Recall: {recall:.2f}")
# print(f"F1-Score: {f1:.2f}")

# cm = confusion_matrix(y_test, y_pred)

# classes = classifier.classes_
# Visualize with a heatmap
# plt.figure(figsize=(10, 7))
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=classes, yticklabels=classes)
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# plt.title('Confusion Matrix')
# plt.show()

# Compute the ROC curves
# y_score = rf.predict_proba(X_test)
# RocCurveDisplay.from_predictions(y_test, y_score[:, 1], pos_label='normal')
# plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('ROC Curve')
# plt.legend(loc="lower right")
# plt.show()
# # Plot the ROC curve
# plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])

k=25
method=f_classif
Index(['protocol_type', 'service', 'flag', 'hot', 'logged_in', 'root_shell',
       'is_guest_login', 'count', 'serror_rate', 'srv_serror_rate',
       'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
       'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
       'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
       'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
       'dst_host_serror_rate', 'dst_host_srv_serror_rate',
       'dst_host_rerror_rate', 'dst_host_srv_rerror_rate'],
      dtype='object')


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         dos       0.97      0.76      0.85      7460
      normal       0.65      0.97      0.78      9711
       probe       0.77      0.73      0.75      2421
         r2l       0.35      0.00      0.00      2885
         u2r       0.00      0.00      0.00        67

    accuracy                           0.75     22544
   macro avg       0.55      0.49      0.48     22544
weighted avg       0.73      0.75      0.70     22544

k=26
method=f_classif
Index(['duration', 'protocol_type', 'service', 'flag', 'hot', 'logged_in',
       'root_shell', 'is_guest_login', 'count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_r

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         dos       0.91      0.76      0.83      7460
      normal       0.65      0.94      0.77      9711
       probe       0.79      0.73      0.76      2421
         r2l       0.38      0.00      0.01      2885
         u2r       0.00      0.00      0.00        67

    accuracy                           0.73     22544
   macro avg       0.55      0.49      0.47     22544
weighted avg       0.71      0.73      0.69     22544

k=27
method=f_classif
Index(['duration', 'protocol_type', 'service', 'flag', 'hot', 'logged_in',
       'root_shell', 'num_shells', 'is_guest_login', 'count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_hos



              precision    recall  f1-score   support

         dos       0.94      0.76      0.84      7460
      normal       0.63      0.97      0.77      9711
       probe       0.86      0.60      0.71      2421
         r2l       0.43      0.00      0.00      2885
         u2r       0.50      0.01      0.03        67

    accuracy                           0.73     22544
   macro avg       0.67      0.47      0.47     22544
weighted avg       0.73      0.73      0.69     22544

k=28
method=f_classif
Index(['duration', 'protocol_type', 'service', 'flag', 'wrong_fragment', 'hot',
       'logged_in', 'root_shell', 'num_shells', 'is_guest_login', 'count',
       'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
       'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
       'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate



              precision    recall  f1-score   support

         dos       0.96      0.77      0.86      7460
      normal       0.64      0.97      0.78      9711
       probe       0.81      0.63      0.71      2421
         r2l       0.38      0.00      0.00      2885
         u2r       0.00      0.00      0.00        67

    accuracy                           0.74     22544
   macro avg       0.56      0.47      0.47     22544
weighted avg       0.73      0.74      0.69     22544

k=29
method=f_classif
Index(['duration', 'protocol_type', 'service', 'flag', 'wrong_fragment', 'hot',
       'num_failed_logins', 'logged_in', 'root_shell', 'num_shells',
       'is_guest_login', 'count', 'serror_rate', 'srv_serror_rate',
       'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
       'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
       'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
       'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',




              precision    recall  f1-score   support

         dos       0.97      0.72      0.83      7460
      normal       0.63      0.98      0.76      9711
       probe       0.79      0.62      0.70      2421
         r2l       0.44      0.00      0.00      2885
         u2r       0.50      0.01      0.03        67

    accuracy                           0.73     22544
   macro avg       0.67      0.47      0.46     22544
weighted avg       0.74      0.73      0.68     22544



In [None]:
# Hyper-parameter grid searched for the XGBoost classifier.
param_grid = {
    "n_estimators": [50, 64, 100, 128],
    "max_depth": [2, 3, 4, 5, 6],
    # 0 is deliberately not part of the grid: with a zero learning rate boosting cannot learn.
    "learning_rate": [0.01, 0.03, 0.05, 0.1],
    "subsample": [0.5, 0.8],
    "colsample_bytree": [0.5, 0.8],
}

XGBoost_model = XGBClassifier(random_state=42)

# The target has five classes, so the binary-only "f1" scorer cannot be used here;
# "f1_weighted" averages the per-class F1 weighted by class support.
# NOTE(review): the labels are strings (dos, normal, ...); recent XGBoost releases expect
# integer-encoded classes - confirm against the installed version and encode y if needed.
XGB_grid_model = GridSearchCV(XGBoost_model,
                              param_grid,
                              scoring="f1_weighted",
                              n_jobs=-1,
                              return_train_score=True).fit(X_train, y_train)

print(XGB_grid_model.best_score_)
print(XGB_grid_model.best_params_)

# Baseline models with default hyper-parameters, fitted on the same features.
# NOTE(review): XGBoost_model is fitted with its defaults, not with the grid-search winner;
# the tuned estimator is available as XGB_grid_model.best_estimator_.
Logistic_model = LogisticRegression(random_state=42)
RandomForest_model = RandomForestClassifier(random_state=42)
XGBoost = XGBoost_model.fit(X_train, y_train)
Logistic = Logistic_model.fit(X_train, y_train)
RandomForest = RandomForest_model.fit(X_train, y_train)

# eval_metric is the project's helper from `utility`; it receives each fitted model plus both splits.
eval_metric(Logistic_model, X_train, y_train, X_test, y_test)
eval_metric(XGBoost_model, X_train, y_train, X_test, y_test)
eval_metric(RandomForest_model, X_train, y_train, X_test, y_test)

# Feature Importance

In [None]:
# Rank the features by the importance assigned by the fitted XGBoost model.
model = XGBoost_model
importances = model.feature_importances_

# Each importance is labelled with the matching column of the training matrix.
# The full `columns` list cannot serve as the index: it also holds 'label' and 'score'
# and is longer than the set of features the model was trained on.
# NOTE(review): X_train must still be the matrix XGBoost_model was fitted on when this cell runs.
feature_names = getattr(X_train, 'columns', None)
if feature_names is None:
    # X_train carries no column labels (e.g. a plain array): fall back to positional names.
    feature_names = [f'f{i}' for i in range(len(importances))]

feats = pd.DataFrame(index=feature_names, data=importances, columns=['XGB_importance'])
print(feats.sort_values("XGB_importance", ascending=False))

# One-class classification

In [2]:
import numpy as np
from sklearn import svm

# Split each set by label: rows labelled 'normal' versus all other rows,
# dropping the label column from every resulting feature matrix.
is_normal_train = train_df['label'] == 'normal'
is_normal_test = test_df['label'] == 'normal'
is_other_train = train_df['label'] != 'normal'
is_other_test = test_df['label'] != 'normal'

X_train = train_df.loc[is_normal_train].drop(columns='label')
X_test = test_df.loc[is_normal_test].drop(columns='label')

X_train_outliers = train_df.loc[is_other_train].drop(columns='label')
X_test_outliers = test_df.loc[is_other_test].drop(columns='label')

# The one-class model is fitted on the 'normal' training rows only.
clf = svm.OneClassSVM(kernel="rbf", gamma=0.1, nu=0.1)
clf.fit(X_train)

y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_train_outliers)
y_pred_outliers_test = clf.predict(X_test_outliers)

for predictions in (y_pred_train, y_pred_test, y_pred_outliers, y_pred_outliers_test):
    print(predictions)

# Errors: a 'normal' row predicted as -1, or a non-normal row predicted as 1.
n_error_train = np.count_nonzero(y_pred_train == -1)
n_error_test = np.count_nonzero(y_pred_test == -1)
n_error_outliers = np.count_nonzero(y_pred_outliers == 1)
n_error_outliers_test = np.count_nonzero(y_pred_outliers_test == 1)

for error_count in (n_error_train, n_error_test, n_error_outliers, n_error_outliers_test):
    print(error_count)

