# Vari metodi di classificazione

In [37]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import label_binarize
from utility import Dataset, oh_encoder, t_encoder, l_encoder, scaler, get_best_features, cfs, rfe, sfs, pca, eval_metric
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif

from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier, Pool, metrics, cv
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

from sklearn.metrics import roc_curve, auc

columns = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
           'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
           'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
           'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
           'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
           'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
           'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
           'dst_host_srv_rerror_rate', 'label', 'score']

nominal_features = ['protocol_type', 'service', 'flag']
binary_features = ['land', 'logged_in', 'root_shell', 'su_attempted', 'is_host_login', 'is_guest_login']
numeric_features = [feature for feature in columns if feature not in nominal_features + binary_features + ['label', 'score', 'num_outbound_cmds']]

train_df = pd.read_csv(f'kaggle/nsl-kdd/KDDTrain+.txt', header=None)
test_df = pd.read_csv(f'kaggle/nsl-kdd/KDDTest+.txt', header=None)

train_df = Dataset(train_df, columns).get_label5()
test_df = Dataset(test_df, columns).get_label5()

# train_df, test_df = t_encoder(train_df, test_df, ['service', ])
train_df, test_df = l_encoder(train_df, test_df, ['service', 'protocol_type', 'flag', 'label'])
# train_df, test_df = oh_encoder(train_df, test_df, ['protocol_type', 'flag', 'service'])
# train_df, test_df = l_encoder(train_df, test_df, ['label', ])
train_df, test_df = scaler(train_df, test_df, numeric_features, MinMaxScaler())

models = {
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'LinearSVC': LinearSVC(),
    'Logistic Regression': LogisticRegression(),
    'XGB': XGBClassifier()
}

y_train = train_df['label']
y_test = test_df['label']
X_train = train_df.drop(columns=['label'])
X_test = test_df.drop(columns=['label'])

for name, model in models.items():
    print(f"Training {name}...")
    
    # Addestra il modello
    model.fit(X_train, y_train)
    
    # Fai le predizioni sul test set
    y_pred = model.predict(X_test)

    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"Precision: {precision_score(y_test, y_pred, average='weighted')}")
    print(f"Recall: {recall_score(y_test, y_pred, average='weighted')}")
    print(f"F1: {f1_score(y_test, y_pred, average='weighted')}")
    # print(classification_report(y_test, y_pred, target_names=model.classes_))
    # print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}\n")
    

# XBG_param = {
#     "n_estimators": [50,64,100,128],
#     "max_depth": [2, 3, 4,5,6],
#     "learning_rate": [0.01,0,0.03, 0.05, 0.1],
#     "subsample": [0.5, 0.8],
#     "colsample_bytree": [0.5, 0.8]
# }

    # for method in [f_classif,]:
    #     print(f'method={method.__name__} classifier={classifier}')
        # k = 30
        # X_train, X_test = get_best_features(train_df, test_df, method, k)
        # X_train, X_test = get_best_features(train_df, test_df, chi2, k)
        # X_train, X_test = get_best_features(train_df, test_df, mutual_info_classif, k)
        # X_train, X_test = cfs(train_df, test_df)
        # X_train, X_test = rfe(train_df, test_df, k)
        # X_train, X_test = sfs(train_df, test_df, k)
        # X_train, X_test = pca(train_df, test_df, k)
        # y_train = train_df['label']
        # y_test = test_df['label']
        # discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
        # X_train = discretizer.fit_transform(X_train)
        # X_test = discretizer.transform(X_test)

        
        # classifier = RandomForestClassifier(random_state=42)
        # classifier = LinearSVC()
        # classifier = GaussianNB()
        # classifier = LogisticRegression(random_state=42)
        # classifier = XGBClassifier(random_state = 42)
        # XGB_grid_model = GridSearchCV(XGBoost_model, param_grid, scoring="f1", n_jobs=-1, return_train_score=True).fit(X_train, y_train)
        # print(XGB_grid_model.best_score_)
        # print(XGB_grid_model.best_params_)
        # classifier.fit(X_train, y_train)
        # y_pred = classifier.predict(X_test)
        # print(classification_report(y_test, y_pred, target_names=classifier.classes_))


# model  = CatBoostClassifier(custom_loss=[metrics.Accuracy()], random_seed=42, logging_level='Silent')
# model.fit(X_train, y_train, cat_features=['protocol_type', 'service', 'flag'], eval_set=(X_test, y_test),plot=True);

# cm = confusion_matrix(y_test, y_pred)
# classes = classifier.classes_
        
# plt.figure(figsize=(10, 7))
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=classes, yticklabels=classes)
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# plt.title('Confusion Matrix')
# plt.show()

# Calcolare le curve ROC
# y_score = rf.predict_proba(X_test)
# RocCurveDisplay.from_predictions(y_test, y_score[:, 1], pos_label='normal')
# plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('ROC Curve')
# plt.legend(loc="lower right")
# plt.show()
# # Plot della curva ROC
# plt.plot([0, 1], [0, 1], 'k--')  # Linea diagonale
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])


Training Random Forest...
Accuracy: 0.7571859474804826
Precision: 0.81985026114373
Recall: 0.7571859474804826
F1: 0.7102230828042662
Training Decision Tree...
Accuracy: 0.7756831085876508
Precision: 0.7750172111699729
Recall: 0.7756831085876508
F1: 0.7435214351070989
Training LinearSVC...




Accuracy: 0.7390436479772888
Precision: 0.7277343162071214
Recall: 0.7390436479772888
F1: 0.691510728106857
Training Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.7375354861603974
Precision: 0.7527820477027148
Recall: 0.7375354861603974
F1: 0.6892005212449521
Training XGB...
Accuracy: 0.7753282469836764
Precision: 0.8275059908620176
Recall: 0.7753282469836764
F1: 0.7354562024323941


In [None]:
train_df = pd.read_csv(f'kaggle/nsl-kdd/KDDTrain+.txt', header=None)
test_df = pd.read_csv(f'kaggle/nsl-kdd/KDDTest+.txt', header=None)

train_df = Dataset(train_df, columns).get_label5()
test_df = Dataset(test_df, columns).get_label5()

# train_df, test_df = t_encoder(train_df, test_df, ['service', ])
# train_df, test_df = l_encoder(train_df, test_df, ['service', 'protocol_type', 'flag'])
# train_df, test_df = oh_encoder(train_df, test_df, ['protocol_type', 'flag'])
train_df, test_df = l_encoder(train_df, test_df, ['service', 'protocol_type', 'flag'])
train_df, test_df = scaler(train_df, test_df, numeric_features, StandardScaler())

# XBG_param = {
#     "n_estimators": [50,64,100,128],
#     "max_depth": [2, 3, 4,5,6],
#     "learning_rate": [0.01,0,0.03, 0.05, 0.1],
#     "subsample": [0.5, 0.8],
#     "colsample_bytree": [0.5, 0.8]
# }

for k in range(35, 40):
    for method in [f_classif,]:
        print(f'k={k}   method={method.__name__}')
        X_train, X_test = get_best_features(train_df, test_df, method, k)
        # X_train, X_test = get_best_features(train_df, test_df, f_classif, k)
        # X_train, X_test = get_best_features(train_df, test_df, mutual_info_classif, k)
        # X_train, X_test = cfs(train_df, test_df)
        # X_train, X_test = rfe(train_df, test_df, k)
        # X_train, X_test = sfs(train_df, test_df, k)
        # X_train, X_test = pca(train_df, test_df, k)
        y_train = train_df['label']
        y_test = test_df['label']
        
        classifier = RandomForestClassifier(n_estimators=100, random_state=42)
        # classifier = LinearSVC()
        # classifier = GaussianNB()
        # classifier = LogisticRegression(random_state=42)
        # classifier = XGBClassifier(random_state = 42)
        # XGB_grid_model = GridSearchCV(XGBoost_model, param_grid, scoring="f1", n_jobs=-1, return_train_score=True).fit(X_train, y_train)
        # print(XGB_grid_model.best_score_)
        # print(XGB_grid_model.best_params_)
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        print(classification_report(y_test, y_pred, target_names=classifier.classes_))


# model  = CatBoostClassifier(custom_loss=[metrics.Accuracy()], random_seed=42, logging_level='Silent')
# model.fit(X_train, y_train, cat_features=['protocol_type', 'service', 'flag'], eval_set=(X_test, y_test),plot=True);

# cm = confusion_matrix(y_test, y_pred)
# classes = classifier.classes_
        
# plt.figure(figsize=(10, 7))
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=classes, yticklabels=classes)
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# plt.title('Confusion Matrix')
# plt.show()

# Calcolare le curve ROC
# y_score = rf.predict_proba(X_test)
# RocCurveDisplay.from_predictions(y_test, y_score[:, 1], pos_label='normal')
# plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('ROC Curve')
# plt.legend(loc="lower right")
# plt.show()
# # Plot della curva ROC
# plt.plot([0, 1], [0, 1], 'k--')  # Linea diagonale
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])


# Feature Importance

In [None]:
y_train = train_df['label']
y_test = test_df['label']
X_train = train_df.drop(columns=['label'])
X_test = test_df.drop(columns=['label'])

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=model.classes_))

importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(X_train.shape[1]):
    print(f"{f + 1}. feature {indices[f]} ({importances[indices[f]]})")

for k in range(1, X_train.shape[1]+1):
    new_X_train = X_train.iloc[:, indices[:k]]
    new_X_test = X_test.iloc[:, indices[:k]]
    # riaddestra il modello con le sole k colonne
    model.fit(new_X_train, y_train)
    y_pred = model.predict(new_X_test)
    print(f"Feature ranking: {k}")
    print(classification_report(y_test, y_pred, target_names=model.classes_))

# One-class classification

In [None]:
from sklearn import svm
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import label_binarize
from utility import Dataset, oh_encoder, t_encoder, l_encoder, scaler, get_best_features, cfs, rfe, sfs, pca, eval_metric
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

columns = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
           'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
           'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
           'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
           'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
           'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
           'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
           'dst_host_srv_rerror_rate', 'label', 'score']

nominal_features = ['protocol_type', 'service', 'flag']
binary_features = ['land', 'logged_in', 'root_shell', 'su_attempted', 'is_host_login', 'is_guest_login']
numeric_features = [feature for feature in columns if feature not in nominal_features + binary_features + ['label', 'score', 'num_outbound_cmds']]

train_df = pd.read_csv(f'kaggle/nsl-kdd/KDDTrain+.txt', header=None)
test_df = pd.read_csv(f'kaggle/nsl-kdd/KDDTest+.txt', header=None)

train_df = Dataset(train_df, columns).get_label5()
test_df = Dataset(test_df, columns).get_label5()

train_df, test_df = l_encoder(train_df, test_df, ['service', 'protocol_type', 'flag'])
train_df, test_df = scaler(train_df, test_df, numeric_features, StandardScaler())

# rimuovi le righe con label diverso da normal
X_train = train_df[train_df['label'] == 'normal'].drop(columns=['label'])
y_train = train_df[train_df['label'] == 'normal']['label']
X_test = test_df[test_df['label'] == 'normal'].drop(columns=['label'])
y_test = test_df[test_df['label'] == 'normal']['label']

X_train_outliers = train_df[train_df['label'] != 'normal'].drop(columns=['label'])
y_train_outliers = train_df[train_df['label'] != 'normal']['label']
X_test_outliers = test_df[test_df['label'] != 'normal'].drop(columns=['label'])
y_test_outliers = test_df[test_df['label'] != 'normal']['label']

# clf = svm.OneClassSVM(kernel='rbf', gamma=0.5, nu=0.01)

# param_grid = {
#     'kernel': ['rbf', 'sigmoid', 'linear', 'poly'],
#     'gamma': ['scale', 'auto', 0.1, 0.01, 0.5],
#     'nu': [0.1, 0.05, 0.01, 0.001]
# }

clf = svm.OneClassSVM(kernel='rbf', gamma=0.5, nu=0.01)

clf.fit(X_train)

y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_train_outliers)
y_pred_outliers_test = clf.predict(X_test_outliers)

y_pred_train = ['attack' if x == -1 else 'normal' for x in y_pred_train]
y_pred_test = ['attack' if x == -1 else 'normal' for x in y_pred_test]
y_pred_outliers = ['attack' if x == -1 else 'normal' for x in y_pred_outliers]
y_pred_outliers_test = ['attack' if x == -1 else 'normal' for x in y_pred_outliers_test]

y_true_train = ['normal'] * len(y_pred_train)
y_true_test = ['normal'] * len(y_pred_test)
y_true_outliers = ['attack'] * len(y_pred_outliers)
y_true_outliers_test = ['attack'] * len(y_pred_outliers_test)

# y_true = y_true_train + y_true_test + y_true_outliers + y_true_outliers_test
# y_pred = y_pred_train + y_pred_test + y_pred_outliers + y_pred_outliers_test
y_true = y_true_test + y_true_outliers + y_true_outliers_test
y_pred = y_pred_test + y_pred_outliers + y_pred_outliers_test

print(classification_report(y_true, y_pred))
