# Vari metodi di classificazione

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import label_binarize
from utility import Dataset, oh_encoder, t_encoder, l_encoder, scaler, get_best_features, cfs, rfe, sfs, pca, eval_metric
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif

from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier, Pool, metrics, cv
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_curve, auc

columns = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
           'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
           'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
           'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
           'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
           'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
           'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
           'dst_host_srv_rerror_rate', 'label', 'score']

nominal_features = ['protocol_type', 'service', 'flag']
binary_features = ['land', 'logged_in', 'root_shell', 'su_attempted', 'is_host_login', 'is_guest_login']
numeric_features = [feature for feature in columns if feature not in nominal_features + binary_features + ['label', 'score', 'num_outbound_cmds']]

train_df = pd.read_csv(f'kaggle/nsl-kdd/KDDTrain+.txt', header=None)
test_df = pd.read_csv(f'kaggle/nsl-kdd/KDDTest+.txt', header=None)

train_df = Dataset(train_df, columns).get_label5()
test_df = Dataset(test_df, columns).get_label5()

# train_df, test_df = t_encoder(train_df, test_df, ['service', ])
# train_df, test_df = l_encoder(train_df, test_df, ['service', 'protocol_type', 'flag'])
train_df, test_df = oh_encoder(train_df, test_df, ['protocol_type', 'flag'])
train_df, test_df = l_encoder(train_df, test_df, ['service',])
train_df, test_df = scaler(train_df, test_df, MinMaxScaler())

train_df, test_df = scaler(train_df, test_df, numeric_features, MinMaxScaler())

k = 30
# f_classif, chi2, mutual_info_classif
train_selected, test_selected = get_best_features(train_df, test_df, f_classif, k)
train_selected, test_selected = get_best_features(train_df, test_df, chi2, k)
train_selected, test_selected = get_best_features(train_df, test_df, mutual_info_classif, k)
train_selected, test_selected = cfs(train_df, test_df)
train_selected, test_selected = rfe(train_df, test_df, k)
train_selected, test_selected = sfs(train_df, test_df, k)
train_selected, test_selected = pca(train_df, test_df, k)

# dividi train set e test set in X e y
X_train = train_selected.drop(['label'], axis=1)
y_train = train_selected['label']
X_test = test_selected.drop(['label'], axis=1)
y_test = test_selected['label']

# stampa le dimensioni dei dataset
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

# random forest
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

eval_metric(classifier, X_train, y_train, X_test, y_test)

# linear svc
# rf = LinearSVC()
# rf.fit(X_train, y_train)
# y_pred = rf.predict(X_test)

# model  = CatBoostClassifier(
#     custom_loss=[metrics.Accuracy()],
#     random_seed=42,
#     logging_level='Silent'
# )

# model.fit(
#     X_train, y_train,
#     cat_features=['protocol_type', 'service', 'flag'],
#     eval_set=(X_test, y_test),
# #     logging_level='Verbose',  # you can uncomment this for text output
#     plot=True
# );
# classifier = GaussianNB()
# classifier.fit(X_train, y_train)
# y_pred = classifier.predict(X_test)

# valuta il modello
# accuracy = accuracy_score(y_test, y_pred)
# precision = precision_score(y_test, y_pred, average='weighted')  # 'weighted' gestisce classi sbilanciate
# recall = recall_score(y_test, y_pred, average='weighted')
# f1 = f1_score(y_test, y_pred, average='weighted')
# print(f"Accuracy: {accuracy:.2f}")
# print(f"Precision: {precision:.2f}")
# print(f"Recall: {recall:.2f}")
# print(f"F1-Score: {f1:.2f}")

# print(classification_report(y_test, y_pred, target_names=classifier.classes_))

# cm = confusion_matrix(y_test, y_pred)

# classes = classifier.classes_
# Visualizzazione con una heatmap
# plt.figure(figsize=(10, 7))
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=classes, yticklabels=classes)
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# plt.title('Confusion Matrix')
# plt.show()

# Calcolare le curve ROC
# y_score = rf.predict_proba(X_test)
# RocCurveDisplay.from_predictions(y_test, y_score[:, 1], pos_label='normal')
# plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('ROC Curve')
# plt.legend(loc="lower right")
# plt.show()
# # Plot della curva ROC
# plt.plot([0, 1], [0, 1], 'k--')  # Linea diagonale
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])

(125973, 20)
(125973,)
(22544, 20)
(22544,)
Accuracy: 0.74


In [None]:


param_grid = {
    "n_estimators": [50,64,100,128],
    "max_depth": [2, 3, 4,5,6],
    "learning_rate": [0.01,0,0.03, 0.05, 0.1],
    "subsample": [0.5, 0.8],
    "colsample_bytree": [0.5, 0.8]
}

XGBoost_model = XGBClassifier(random_state = 42)

XGB_grid_model = GridSearchCV(XGBoost_model,
                        param_grid,
                        scoring="f1",
                        n_jobs=-1,
                        return_train_score=True).fit(X_train, y_train)

print(XGB_grid_model.best_score_)
print(XGB_grid_model.best_params_)

Logistic_model = LogisticRegression(random_state=42)
RandomForest_model = RandomForestClassifier(random_state=42)
XGBoost = XGBoost_model.fit(X_train,y_train)
Logistic = Logistic_model.fit(X_train,y_train)
RandomForest = RandomForest_model.fit(X_train,y_train)

#it's a helper function in order to evaluate our model if it's overfit or underfit.


eval_metric(Logistic_model, X_train, y_train, X_test, y_test)
eval_metric(XGBoost_model, X_train, y_train, X_test, y_test)
eval_metric(RandomForest_model, X_train, y_train, X_test, y_test)

# Feature Importance

In [None]:
model = XGBoost_model
model.feature_importances_

feats = pd.DataFrame(index=train_new[columns].columns, data= model.feature_importances_, columns=['XGB_importance'])
print(feats.sort_values("XGB_importance", ascending = False))