In [60]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import sklearn.metrics as metrics
from sklearn.model_selection import RandomizedSearchCV
import joblib

In [3]:
train = pd.read_parquet(r"D:\Analytixlabs\Internship\Project 4\Cyber Security\output\train_multi_data.parquet")
test = pd.read_parquet(r"D:\Analytixlabs\Internship\Project 4\Cyber Security\output\test_multi_data.parquet")

In [4]:
y_train_type = train['attack_type']
y_test_type = test['attack_type']
Xtrain = train.drop(columns = ['attack_type', 'label'])
Xtest = test.drop(columns = ['attack_type', 'label'])

# Over Sampling

In [5]:
pd.Series(y_train_type).value_counts(normalize = True)

attack_type
dos_ddos        0.755910
portscan        0.213042
brute_force     0.024920
bot             0.004455
web_attack      0.001563
infiltration    0.000081
heartbleed      0.000030
Name: proportion, dtype: float64

In [6]:
pd.Series(y_train_type).value_counts()

attack_type
dos_ddos        225348
portscan         63511
brute_force       7429
bot               1328
web_attack         466
infiltration        24
heartbleed           9
Name: count, dtype: int64

In [7]:
y_test_type = np.where(test['attack_type'].isin(['infiltration', 'heartbleed']), 'others', test['attack_type'])
y_train_type = np.where(train['attack_type'].isin(['infiltration', 'heartbleed']), 'others', train['attack_type'])

In [8]:
pd.Series(y_train_type).value_counts(normalize = True)

dos_ddos       0.755910
portscan       0.213042
brute_force    0.024920
bot            0.004455
web_attack     0.001563
others         0.000111
Name: proportion, dtype: float64

In [9]:
pd.Series(y_test_type).value_counts()

dos_ddos       96416
portscan       27308
brute_force     3193
bot              625
web_attack       207
others            14
Name: count, dtype: int64

In [45]:
y_dict = {
    'brute_force': 20000,
    'bot': 5000,
    'web_attack': 3000,
    'others': 1000
}

In [46]:
smote = SMOTE(
    k_neighbors = 5,
    sampling_strategy = y_dict,
    n_jobs = -1,
    random_state = 43
)

Xtrain_os, ytrain_os = smote.fit_resample(Xtrain, y_train_type)



In [47]:
Xtrain_os.shape

(317859, 77)

In [48]:
pd.Series(ytrain_os).value_counts(normalize = True)

dos_ddos       0.708956
portscan       0.199809
brute_force    0.062921
bot            0.015730
web_attack     0.009438
others         0.003146
Name: proportion, dtype: float64

In [49]:
encoder = LabelEncoder()
ytrain_encoded = encoder.fit_transform(ytrain_os)
ytest_encoded = encoder.transform(y_test_type)

In [37]:
xgb_model = XGBClassifier(
    objective="multi:softprob",
    num_class=6,
    n_estimators=200,
    max_depth=4,
    learning_rate=0.02,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=5,
    gamma=0.1,
    reg_alpha=0.1,
    reg_lambda=1.0,
    n_jobs=-1,
    random_state=42
)


# Fitting to the Pipeline
xgb_model.fit(Xtrain_os, ytrain_encoded)

# Predicting the target
ytrain_pred = xgb_model.predict(Xtrain_os)
ytest_pred = xgb_model.predict(Xtest)

#Train Test Data Performance
print('Train Data Performance:')
print(metrics.classification_report(ytrain_encoded, ytrain_pred))
print('Test Data Performance:')
print(metrics.classification_report(ytest_encoded, ytest_pred))

Train Data Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10000
           1       0.99      0.90      0.94     20000
           2       1.00      1.00      1.00    225348
           3       1.00      1.00      1.00      5000
           4       1.00      1.00      1.00     63511
           5       0.80      0.97      0.88      8000

    accuracy                           0.99    331859
   macro avg       0.96      0.98      0.97    331859
weighted avg       0.99      0.99      0.99    331859

Test Data Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       625
           1       1.00      0.89      0.94      3193
           2       1.00      1.00      1.00     96416
           3       1.00      0.86      0.92        14
           4       1.00      1.00      1.00     27308
           5       0.35      0.93      0.51       207

    accuracy                  

In [39]:
param_dist = {
    "max_depth": [3, 4, 5, 6],
    "min_child_weight": [1, 3, 5, 7],
    "gamma": [0, 0.05, 0.1, 0.2],
    "subsample": [0.6, 0.7, 0.8, 0.9],
    "colsample_bytree": [0.6, 0.7, 0.8, 0.9],
    "learning_rate": [0.01, 0.02, 0.05, 0.1],
    "n_estimators": [50, 100, 200, 300]
}

In [40]:
xgb = XGBClassifier(
    objective="multi:softprob",
    num_class=6,
    tree_method="hist",
    n_jobs=-1,
    random_state=42
)

random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=50,
    scoring="f1_macro",   # important for multiclass imbalance
    cv=3,
    verbose=2,
    random_state=42
)

random_search.fit(Xtrain_os, ytrain_encoded)

print("Best Params:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END colsample_bytree=0.8, gamma=0.05, learning_rate=0.02, max_depth=6, min_child_weight=3, n_estimators=300, subsample=0.8; total time=  28.0s
[CV] END colsample_bytree=0.8, gamma=0.05, learning_rate=0.02, max_depth=6, min_child_weight=3, n_estimators=300, subsample=0.8; total time=  27.1s
[CV] END colsample_bytree=0.8, gamma=0.05, learning_rate=0.02, max_depth=6, min_child_weight=3, n_estimators=300, subsample=0.8; total time=  24.7s
[CV] END colsample_bytree=0.6, gamma=0.05, learning_rate=0.01, max_depth=4, min_child_weight=7, n_estimators=300, subsample=0.6; total time=  21.6s
[CV] END colsample_bytree=0.6, gamma=0.05, learning_rate=0.01, max_depth=4, min_child_weight=7, n_estimators=300, subsample=0.6; total time=  21.3s
[CV] END colsample_bytree=0.6, gamma=0.05, learning_rate=0.01, max_depth=4, min_child_weight=7, n_estimators=300, subsample=0.6; total time=  21.3s
[CV] END colsample_bytree=0.7, gamma=0.2, learning

In [24]:
xgb_model = XGBClassifier(
    objective="multi:softprob",
    num_class=6,
    n_estimators=200,
    max_depth=5,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=7,
    gamma=0.2,
    reg_alpha=0.1,
    reg_lambda=1.0,
    n_jobs=-1,
    random_state=42
)


# Fitting to the Pipeline
xgb_model.fit(Xtrain_os, ytrain_encoded)

# Predicting the target
ytrain_pred = xgb_model.predict(Xtrain_os)
ytest_pred = xgb_model.predict(Xtest)

#Train Test Data Performance
print('Train Data Performance:')
print(metrics.classification_report(ytrain_encoded, ytrain_pred))
print('Test Data Performance:')
print(metrics.classification_report(ytest_encoded, ytest_pred))

Train Data Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2000
           1       0.99      0.90      0.94     20000
           2       1.00      1.00      1.00    225348
           3       1.00      0.99      0.99      1000
           4       1.00      1.00      1.00     63511
           5       0.80      0.94      0.86      8000

    accuracy                           0.99    319859
   macro avg       0.96      0.97      0.97    319859
weighted avg       0.99      0.99      0.99    319859

Test Data Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       625
           1       0.99      0.89      0.94      3193
           2       1.00      1.00      1.00     96416
           3       0.92      0.86      0.89        14
           4       1.00      1.00      1.00     27308
           5       0.35      0.89      0.51       207

    accuracy                  

In [25]:
xgb_model = XGBClassifier(
    objective="multi:softprob",
    num_class=6,
    n_estimators=200,
    max_depth=4,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=7,
    gamma=0.2,
    reg_alpha=0.1,
    reg_lambda=1.0,
    n_jobs=-1,
    random_state=42
)


# Fitting to the Pipeline
xgb_model.fit(Xtrain_os, ytrain_encoded)

# Predicting the target
ytrain_pred = xgb_model.predict(Xtrain_os)
ytest_pred = xgb_model.predict(Xtest)

#Train Test Data Performance
print('Train Data Performance:')
print(metrics.classification_report(ytrain_encoded, ytrain_pred))
print('Test Data Performance:')
print(metrics.classification_report(ytest_encoded, ytest_pred))

Train Data Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2000
           1       1.00      0.88      0.94     20000
           2       1.00      1.00      1.00    225348
           3       1.00      0.98      0.99      1000
           4       1.00      1.00      1.00     63511
           5       0.77      0.95      0.85      8000

    accuracy                           0.99    319859
   macro avg       0.96      0.97      0.96    319859
weighted avg       0.99      0.99      0.99    319859

Test Data Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       625
           1       1.00      0.87      0.93      3193
           2       1.00      1.00      1.00     96416
           3       0.92      0.86      0.89        14
           4       1.00      1.00      1.00     27308
           5       0.32      0.93      0.48       207

    accuracy                  

In [26]:
xgb_model = XGBClassifier(
    objective="multi:softprob",
    num_class=6,
    n_estimators=200,
    max_depth=4,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=7,
    gamma=0.2,
    reg_alpha=0.5,
    reg_lambda=1.0,
    n_jobs=-1,
    random_state=42
)


# Fitting to the Pipeline
xgb_model.fit(Xtrain_os, ytrain_encoded)

# Predicting the target
ytrain_pred = xgb_model.predict(Xtrain_os)
ytest_pred = xgb_model.predict(Xtest)

#Train Test Data Performance
print('Train Data Performance:')
print(metrics.classification_report(ytrain_encoded, ytrain_pred))
print('Test Data Performance:')
print(metrics.classification_report(ytest_encoded, ytest_pred))

Train Data Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2000
           1       1.00      0.88      0.94     20000
           2       1.00      1.00      1.00    225348
           3       1.00      0.98      0.99      1000
           4       1.00      1.00      1.00     63511
           5       0.77      0.95      0.85      8000

    accuracy                           0.99    319859
   macro avg       0.96      0.97      0.96    319859
weighted avg       0.99      0.99      0.99    319859

Test Data Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       625
           1       1.00      0.87      0.93      3193
           2       1.00      1.00      1.00     96416
           3       0.92      0.86      0.89        14
           4       1.00      1.00      1.00     27308
           5       0.32      0.93      0.48       207

    accuracy                  

In [50]:
xgb_model = XGBClassifier(
    objective="multi:softprob",
    num_class=6,
    n_estimators=400,
    max_depth=4,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=7,
    gamma=0.2,
    reg_alpha=0.2,
    reg_lambda=1.0,
    n_jobs=-1,
    random_state=42
)


# Fitting to the Pipeline
xgb_model.fit(Xtrain_os, ytrain_encoded)

# Predicting the target
ytrain_pred = xgb_model.predict(Xtrain_os)
ytest_pred = xgb_model.predict(Xtest)

#Train Test Data Performance
print('Train Data Performance:')
print(metrics.classification_report(ytrain_encoded, ytrain_pred))
print('Test Data Performance:')
print(metrics.classification_report(ytest_encoded, ytest_pred))

Train Data Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5000
           1       0.98      0.93      0.95     20000
           2       1.00      1.00      1.00    225348
           3       1.00      1.00      1.00      1000
           4       1.00      1.00      1.00     63511
           5       0.65      0.84      0.73      3000

    accuracy                           0.99    317859
   macro avg       0.94      0.96      0.95    317859
weighted avg       0.99      0.99      0.99    317859

Test Data Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       625
           1       0.99      0.91      0.95      3193
           2       1.00      1.00      1.00     96416
           3       0.92      0.86      0.89        14
           4       1.00      1.00      1.00     27308
           5       0.40      0.80      0.53       207

    accuracy                  

In [52]:
xgb_model = XGBClassifier(
    objective="multi:softprob",
    num_class=6,
    n_estimators=400,
    max_depth=3,
    learning_rate=0.01,
    subsample=0.7,
    colsample_bytree=0.7,
    min_child_weight=7,
    gamma=0.5,
    reg_alpha=0.2,
    reg_lambda=1.0,
    n_jobs=-1,
    random_state=42
)


# Fitting to the Pipeline
xgb_model.fit(Xtrain_os, ytrain_encoded)

# Predicting the target
ytrain_pred = xgb_model.predict(Xtrain_os)
ytest_pred = xgb_model.predict(Xtest)

#Train Test Data Performance
print('Train Data Performance:')
print(metrics.classification_report(ytrain_encoded, ytrain_pred))
print('Test Data Performance:')
print(metrics.classification_report(ytest_encoded, ytest_pred))

Train Data Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5000
           1       0.98      0.92      0.95     20000
           2       1.00      1.00      1.00    225348
           3       1.00      0.98      0.99      1000
           4       1.00      1.00      1.00     63511
           5       0.62      0.79      0.70      3000

    accuracy                           0.99    317859
   macro avg       0.93      0.95      0.94    317859
weighted avg       0.99      0.99      0.99    317859

Test Data Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       625
           1       0.98      0.91      0.94      3193
           2       1.00      1.00      1.00     96416
           3       1.00      0.86      0.92        14
           4       1.00      1.00      1.00     27308
           5       0.38      0.75      0.50       207

    accuracy                  

In [56]:
xgb_model = XGBClassifier(
    objective="multi:softprob",
    num_class=6,
    n_estimators=300,
    max_depth=2,
    learning_rate=0.01,
    subsample=0.6,
    colsample_bytree=0.5,
    colsample_bylevel = 0.5,
    min_child_weight=10,
    gamma=1.0,
    reg_alpha=0.5,
    reg_lambda=2.0,
    n_jobs=-1,
    random_state=42
)


# Fitting to the Pipeline
xgb_model.fit(Xtrain_os, ytrain_encoded)

# Predicting the target
ytrain_pred = xgb_model.predict(Xtrain_os)
ytest_pred = xgb_model.predict(Xtest)

#Train Test Data Performance
print('Train Data Performance:')
print(metrics.classification_report(ytrain_encoded, ytrain_pred))
print('Test Data Performance:')
print(metrics.classification_report(ytest_encoded, ytest_pred))

Train Data Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5000
           1       0.98      0.87      0.92     20000
           2       0.99      1.00      1.00    225348
           3       1.00      0.91      0.95      1000
           4       1.00      0.99      0.99     63511
           5       0.54      0.89      0.67      3000

    accuracy                           0.99    317859
   macro avg       0.92      0.94      0.92    317859
weighted avg       0.99      0.99      0.99    317859

Test Data Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       625
           1       0.95      0.85      0.90      3193
           2       1.00      1.00      1.00     96416
           3       1.00      0.79      0.88        14
           4       1.00      0.99      0.99     27308
           5       0.32      0.89      0.48       207

    accuracy                  

In [59]:
xgb_model = XGBClassifier(
    objective="multi:softprob",
    num_class=6,
    n_estimators=250,
    max_depth=3,
    learning_rate=0.01,
    subsample=0.6,
    colsample_bytree=0.6,
    min_child_weight=20,
    gamma=2.0,
    reg_alpha=1.0,
    reg_lambda=2.0,
    n_jobs=-1,
    random_state=42
)


# Fitting to the Pipeline
xgb_model.fit(Xtrain_os, ytrain_encoded)

# Predicting the target
ytrain_pred = xgb_model.predict(Xtrain_os)
ytest_pred = xgb_model.predict(Xtest)

#Train Test Data Performance
print('Train Data Performance:')
print(metrics.classification_report(ytrain_encoded, ytrain_pred))
print('Test Data Performance:')
print(metrics.classification_report(ytest_encoded, ytest_pred))

Train Data Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5000
           1       0.98      0.92      0.95     20000
           2       1.00      1.00      1.00    225348
           3       1.00      0.92      0.96      1000
           4       1.00      1.00      1.00     63511
           5       0.62      0.80      0.70      3000

    accuracy                           0.99    317859
   macro avg       0.93      0.94      0.93    317859
weighted avg       0.99      0.99      0.99    317859

Test Data Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       625
           1       0.98      0.90      0.94      3193
           2       1.00      1.00      1.00     96416
           3       1.00      0.86      0.92        14
           4       1.00      1.00      1.00     27308
           5       0.37      0.76      0.50       207

    accuracy                  

In [61]:
joblib.dump(xgb_model, r"D:\Analytixlabs\Internship\Project 4\Cyber Security\output\multiclass_model.pkl")

['D:\\Analytixlabs\\Internship\\Project 4\\Cyber Security\\output\\multiclass_model.pkl']

In [62]:
joblib.dump(encoder, r"D:\Analytixlabs\Internship\Project 4\Cyber Security\output\label_encoder.pkl")

['D:\\Analytixlabs\\Internship\\Project 4\\Cyber Security\\output\\label_encoder.pkl']

In [63]:
xgb_model.feature_names_in_

array(['destination_port', 'flow_duration', 'total_fwd_packets',
       'total_backward_packets', 'total_length_of_fwd_packets',
       'total_length_of_bwd_packets', 'fwd_packet_length_max',
       'fwd_packet_length_min', 'fwd_packet_length_mean',
       'fwd_packet_length_std', 'bwd_packet_length_max',
       'bwd_packet_length_min', 'bwd_packet_length_mean',
       'bwd_packet_length_std', 'flow_bytes_per_sec',
       'flow_packets_per_sec', 'flow_iat_mean', 'flow_iat_std',
       'flow_iat_max', 'flow_iat_min', 'fwd_iat_total', 'fwd_iat_mean',
       'fwd_iat_std', 'fwd_iat_max', 'fwd_iat_min', 'bwd_iat_total',
       'bwd_iat_mean', 'bwd_iat_std', 'bwd_iat_max', 'bwd_iat_min',
       'fwd_psh_flags', 'bwd_psh_flags', 'fwd_urg_flags', 'bwd_urg_flags',
       'bwd_header_length', 'fwd_packets_per_sec', 'bwd_packets_per_sec',
       'min_packet_length', 'max_packet_length', 'packet_length_mean',
       'packet_length_std', 'packet_length_variance', 'fin_flag_count',
       'syn_flag