# ALL LIBRARIES

In [1]:
import glob
import os
import time

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# GLOBAL FUNCTIONS

## get dataframe function

In [3]:
def get_dataframe():
    """Load every parquet file under ``dataset/`` into a single DataFrame.

    Files are read in sorted path order so that the row order of the result
    does not depend on the order in which the filesystem lists them.

    Returns:
        pandas.DataFrame: all files concatenated row-wise, re-indexed 0..n-1
        (the per-file indexes would otherwise repeat).

    Raises:
        ValueError: if no file matches ``dataset/*.parquet``.
    """
    pattern = 'dataset/*.parquet'

    t1 = time.perf_counter()
    print("==============================")

    print("Reading dataset...")
    # glob makes no ordering promise; sort to keep the row order reproducible.
    files = sorted(glob.glob(pattern))
    if not files:
        # Fail with an actionable message instead of pandas'
        # "No objects to concatenate".
        raise ValueError(
            f"No parquet files matched '{pattern}' "
            "(is the working directory the project root?)"
        )
    for file in files:
        print(f"\t{file}")

    print("Get dataframe from parquet dataset...")
    dfs = [pd.read_parquet(file) for file in files]
    df = pd.concat(dfs, ignore_index=True)

    t2 = time.perf_counter()
    print(f"Finish... Total time: {t2 - t1} seconds")
    print("==============================")

    return df
    

## splitting data function

In [5]:
def save_splitting_data(df, path='commons/data_splits.joblib'):
    """Split ``df`` 80/20 into train/test sets and persist them with joblib.

    Args:
        df: DataFrame containing a string ``'Label'`` column; every other
            column is treated as a feature.
        path: Destination file for the ``(X_train, X_test, y_train, y_test)``
            tuple. Its parent directory is created when missing.

    Returns:
        None. The split is written to ``path``.
    """
    t1 = time.perf_counter()
    print("==============================")

    print("Separate features and labels into X and y...")
    X = df.drop(columns=['Label'])
    # Strip the 'DrDoS_' prefix as a literal (not a regex) from the labels.
    y = df['Label'].str.replace('DrDoS_', '', regex=False)

    print("Split origin data into 'training data' and 'testing data' (8:2)...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Report the path actually written; the message used to say 'common/...'
    # while the file went to 'commons/...'.
    print(f"Saving 'training data' and 'testing data' to file: {path}")
    parent = os.path.dirname(path)
    if parent:
        # joblib.dump does not create missing directories.
        os.makedirs(parent, exist_ok=True)
    joblib.dump((X_train, X_test, y_train, y_test), path)

    t2 = time.perf_counter()
    print(f"Finish... Total time: {t2 - t1} seconds")
    print("==============================")

In [5]:
def get_splitting_data(path='commons/data_splits.joblib'):
    """Load a train/test split previously persisted with joblib.

    Args:
        path: File holding a 4-tuple ``(X_train, X_test, y_train, y_test)``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: if the stored object does not unpack into four items.
    """
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load split files produced by this project.
    X_train, X_test, y_train, y_test = joblib.load(path)
    return X_train, X_test, y_train, y_test

## training function

In [7]:
def training(df, n_estimators0, max_depth0, splits=None, n_jobs=None):
    """Fit a random forest, save it under ``modelset/`` and print its scores.

    Args:
        df: Unused. Kept so existing ``training(df, ...)`` calls keep working.
        n_estimators0: Number of trees (``n_estimators``).
        max_depth0: Maximum tree depth (``max_depth``); ``None`` for no limit.
        splits: Optional ``(X_train, X_test, y_train, y_test)`` tuple. When
            omitted, the module-level variables of those names are used, so
            they must already exist in the notebook namespace.
        n_jobs: Forwarded to ``RandomForestClassifier``; ``None`` keeps the
            library default.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    if splits is None:
        # Legacy path: read the split from the notebook's global namespace.
        splits = (X_train, X_test, y_train, y_test)
    train_X, test_X, train_y, test_y = splits

    # training...
    t1 = time.perf_counter()
    print("==============================")
    print(f"Start training with n_estimators0 = {n_estimators0} and max_depth0 = {max_depth0}...")
    model = RandomForestClassifier(
        n_estimators=n_estimators0,
        max_depth=max_depth0,
        random_state=42,
        n_jobs=n_jobs,
    )
    model.fit(train_X, train_y)
    t2 = time.perf_counter()
    print(f"Finish Training... Total time: {t2 - t1} seconds")

    # save model...
    t1 = time.perf_counter()
    model_path = f'modelset/model-{n_estimators0}-{max_depth0}.joblib'
    print("==============================")
    print(f"Saving model to '{model_path}'...")
    # joblib.dump does not create missing directories.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    joblib.dump(model, model_path)
    t2 = time.perf_counter()
    print(f"Finish Saving model... Total time: {t2 - t1} seconds")

    # predicting...
    t1 = time.perf_counter()
    print("==============================")
    print("Predicting with 'y_train' and 'X_train'...")
    y_pred_train = model.predict(train_X)
    print(f"Training Accuracy (using 'y_train' and 'X_train'): {accuracy_score(train_y, y_pred_train)}")

    print("Predicting with 'y_test' and 'X_test'...")
    y_pred_test = model.predict(test_X)
    # This score is computed on the held-out split, so label it as such
    # (it was previously printed as "Training Accuracy").
    print(f"Testing Accuracy (using 'y_test' and 'X_test'): {accuracy_score(test_y, y_pred_test)}")

    print("Classification Report (evaluated by 'y_test'):")
    print(classification_report(test_y, y_pred_test))
    t2 = time.perf_counter()
    print(f"Finish Predicting... Total time: {t2 - t1} seconds")

    return model

# PROCESSING...

In [9]:
# Read every dataset/*.parquet file into one combined DataFrame.
df = get_dataframe()

Reading dataset...
	dataset\DrDoS_DNS.parquet
	dataset\DrDoS_LDAP.parquet
	dataset\DrDoS_MSSQL.parquet
	dataset\DrDoS_NetBIOS.parquet
	dataset\DrDoS_NTP.parquet
	dataset\DrDoS_SNMP.parquet
	dataset\DrDoS_SSDP.parquet
	dataset\DrDoS_UDP.parquet
	dataset\Syn.parquet
	dataset\UDPLag.parquet
Get dataframe from parquet dataset...
Finish... Total time: 0.5350253582000732 seconds


In [34]:
# One-off step: writes the train/test split file that get_splitting_data() reads.
# Left commented out, presumably so re-running the notebook does not redo the
# split -- uncomment to regenerate it.
#save_splitting_data(df)

Separate features and labels into X and y...
Split origin data into 'training data' and 'testing data' (8:2)...
Saving 'training data' and 'testing data' to file: common/data_splits.joblib
Finish... Total time: 5.694502353668213 seconds


In [11]:
# The training(...) cells below read X_train, X_test, y_train and y_test from
# the notebook's global namespace, so this cell has to run before any of them.
X_train, X_test, y_train, y_test = get_splitting_data()

## n_estimators = 50 and max_depth = None

In [30]:
training(df, 50, None)

Start training with n_estimators0 = 50 and max_depth0 = None...
Finish Training... Total time: 955.8171739578247 seconds
Saving model to 'modelset/model-50-None.joblib'...
Finish Saving model... Total time: 5.78500771522522 seconds
Predicting with 'y_train' and 'X_train'...
Training Accuracy (using 'y_train' and 'X_train'): 0.9512207113743876
Predicting with 'y_test' and 'X_test'...
Training Accuracy (using 'y_test' and 'X_test'): 0.6879316079574253
Classification Report (evaluated by 'y_test'):
              precision    recall  f1-score   support

      BENIGN       0.99      1.00      1.00      5877
         DNS       0.63      0.62      0.62     22383
        LDAP       0.37      0.36      0.36      6194
       MSSQL       0.87      0.88      0.88     41448
         NTP       1.00      1.00      1.00    218828
     NetBIOS       0.45      0.42      0.43      3898
        SNMP       0.77      0.79      0.78     22434
        SSDP       0.41      0.39      0.40    175792
         Syn

## n_estimators = 100 and max_depth = None

In [15]:
training(df, 100, None)

Start training with n_estimators0 = 100 and max_depth0 = None...
Finish Training... Total time: 1897.1554951667786 seconds
Saving model to 'modelset/model-100-None.joblib'...
Finish Saving model... Total time: 71.42744398117065 seconds
Predicting with 'y_train' and 'X_train'...
Training Accuracy (using 'y_train' and 'X_train'): 0.9515374878569015
Predicting with 'y_test' and 'X_test'...
Training Accuracy (using 'y_test' and 'X_test'): 0.6885361230782227
Classification Report (evaluated by 'y_test'):
              precision    recall  f1-score   support

      BENIGN       0.99      1.00      1.00      5877
         DNS       0.64      0.62      0.63     22383
        LDAP       0.37      0.36      0.37      6194
       MSSQL       0.87      0.88      0.88     41448
         NTP       1.00      1.00      1.00    218828
     NetBIOS       0.45      0.42      0.43      3898
        SNMP       0.77      0.79      0.78     22434
        SSDP       0.41      0.39      0.40    175792
        

## n_estimators = 120 and max_depth = None

In [15]:
training(df, 120, None)

Start training with n_estimators0 = 120 and max_depth0 = None...
Finish Training... Total time: 2288.4315977096558 seconds
Saving model to 'modelset/model-120-None.joblib'...
Finish Saving model... Total time: 91.57305598258972 seconds
Predicting with 'y_train' and 'X_train'...
Training Accuracy (using 'y_train' and 'X_train'): 0.9515444173424565
Predicting with 'y_test' and 'X_test'...
Training Accuracy (using 'y_test' and 'X_test'): 0.6888357408346004
Classification Report (evaluated by 'y_test'):
              precision    recall  f1-score   support

      BENIGN       0.99      1.00      1.00      5877
         DNS       0.63      0.62      0.63     22383
        LDAP       0.37      0.36      0.37      6194
       MSSQL       0.87      0.88      0.88     41448
         NTP       1.00      1.00      1.00    218828
     NetBIOS       0.45      0.42      0.43      3898
        SNMP       0.77      0.79      0.78     22434
        SSDP       0.41      0.39      0.40    175792
        

## n_estimators = 50 and max_depth = 5

In [20]:
training(df, 50, 5)

Start training with n_estimators0 = 50 and max_depth0 = 5...
Finish Training... Total time: 260.6212365627289 seconds
Saving model to 'modelset/model-50-5.joblib'...
Finish Saving model... Total time: 0.023298978805541992 seconds
Predicting with 'y_train' and 'X_train'...
Training Accuracy (using 'y_train' and 'X_train'): 0.7230179031508701
Predicting with 'y_test' and 'X_test'...
Training Accuracy (using 'y_test' and 'X_test'): 0.7233432589964521
Classification Report (evaluated by 'y_test'):
              precision    recall  f1-score   support

      BENIGN       0.79      0.92      0.85      5877
         DNS       0.54      0.43      0.48     22383
        LDAP       0.68      0.06      0.11      6194
       MSSQL       0.79      0.94      0.86     41448
         NTP       0.99      1.00      0.99    218828
     NetBIOS       0.94      0.20      0.33      3898
        SNMP       0.66      0.90      0.76     22434
        SSDP       0.56      0.06      0.10    175792
         Syn  

## n_estimators = 100 and max_depth = 5

In [22]:
training(df, 100, 5)

Start training with n_estimators0 = 100 and max_depth0 = 5...
Finish Training... Total time: 497.07209873199463 seconds
Saving model to 'modelset/model-100-5.joblib'...
Finish Saving model... Total time: 0.03954291343688965 seconds
Predicting with 'y_train' and 'X_train'...
Training Accuracy (using 'y_train' and 'X_train'): 0.7227031065213718
Predicting with 'y_test' and 'X_test'...
Training Accuracy (using 'y_test' and 'X_test'): 0.7230291223179591
Classification Report (evaluated by 'y_test'):
              precision    recall  f1-score   support

      BENIGN       0.79      0.92      0.85      5877
         DNS       0.53      0.41      0.47     22383
        LDAP       0.69      0.05      0.09      6194
       MSSQL       0.79      0.94      0.86     41448
         NTP       0.99      1.00      0.99    218828
     NetBIOS       0.94      0.22      0.35      3898
        SNMP       0.66      0.90      0.76     22434
        SSDP       0.56      0.06      0.10    175792
         Syn

## n_estimators = 120 and max_depth = 5

In [13]:
training(df, 120, 5)

Start training with n_estimators0 = 120 and max_depth0 = 5...
Finish Training... Total time: 593.924623966217 seconds
Saving model to 'modelset/model-120-5.joblib'...
Finish Saving model... Total time: 0.04880809783935547 seconds
Predicting with 'y_train' and 'X_train'...
Training Accuracy (using 'y_train' and 'X_train'): 0.7227080561539111
Predicting with 'y_test' and 'X_test'...
Training Accuracy (using 'y_test' and 'X_test'): 0.7230317621219801
Classification Report (evaluated by 'y_test'):
              precision    recall  f1-score   support

      BENIGN       0.79      0.92      0.85      5877
         DNS       0.53      0.42      0.47     22383
        LDAP       0.74      0.03      0.06      6194
       MSSQL       0.79      0.94      0.86     41448
         NTP       0.99      1.00      0.99    218828
     NetBIOS       0.94      0.22      0.35      3898
        SNMP       0.66      0.90      0.76     22434
        SSDP       0.56      0.06      0.10    175792
         Syn  

## n_estimators = 50 and max_depth = 10

In [46]:
training(df, 50, 10)

Start training with n_estimators0 = 50 and max_depth0 = 10...
Finish Training... Total time: 423.1658065319061 seconds
Saving model to 'modelset/model-50-10.joblib'...
Finish Saving model... Total time: 0.028000354766845703 seconds
Predicting with 'y_train' and 'X_train'...
Training Accuracy (using 'y_train' and 'X_train'): 0.7398281751562764
Predicting with 'y_test' and 'X_test'...
Training Accuracy (using 'y_test' and 'X_test'): 0.73912928704173
Classification Report (evaluated by 'y_test'):
              precision    recall  f1-score   support

      BENIGN       0.98      1.00      0.99      5877
         DNS       0.71      0.54      0.61     22383
        LDAP       0.57      0.31      0.40      6194
       MSSQL       0.83      0.95      0.88     41448
         NTP       0.99      1.00      1.00    218828
     NetBIOS       0.88      0.37      0.52      3898
        SNMP       0.73      0.90      0.81     22434
        SSDP       0.55      0.15      0.24    175792
         Syn  

## n_estimators = 100 and max_depth = 10

In [20]:
training(df, 100, 10)

Start training with n_estimators0 = 100 and max_depth0 = 10...
Finish Training... Total time: 841.1239614486694 seconds
Saving model to 'modelset/model-100-10.joblib'...
Finish Saving model... Total time: 0.05100417137145996 seconds
Predicting with 'y_train' and 'X_train'...
Training Accuracy (using 'y_train' and 'X_train'): 0.7399100090809259
Predicting with 'y_test' and 'X_test'...
Training Accuracy (using 'y_test' and 'X_test'): 0.7391873627301909
Classification Report (evaluated by 'y_test'):
              precision    recall  f1-score   support

      BENIGN       0.97      1.00      0.99      5877
         DNS       0.71      0.54      0.61     22383
        LDAP       0.56      0.31      0.40      6194
       MSSQL       0.83      0.95      0.88     41448
         NTP       0.99      1.00      1.00    218828
     NetBIOS       0.88      0.37      0.52      3898
        SNMP       0.73      0.90      0.81     22434
        SSDP       0.55      0.16      0.24    175792
         Sy

## n_estimators = 120 and max_depth = 10

In [22]:
training(df, 120, 10)

Start training with n_estimators0 = 120 and max_depth0 = 10...
Finish Training... Total time: 1004.7814826965332 seconds
Saving model to 'modelset/model-120-10.joblib'...
Finish Saving model... Total time: 0.06002521514892578 seconds
Predicting with 'y_train' and 'X_train'...
Training Accuracy (using 'y_train' and 'X_train'): 0.7399053894238892
Predicting with 'y_test' and 'X_test'...
Training Accuracy (using 'y_test' and 'X_test'): 0.7392480782226728
Classification Report (evaluated by 'y_test'):
              precision    recall  f1-score   support

      BENIGN       0.97      1.00      0.99      5877
         DNS       0.71      0.54      0.62     22383
        LDAP       0.56      0.31      0.40      6194
       MSSQL       0.83      0.95      0.88     41448
         NTP       0.99      1.00      1.00    218828
     NetBIOS       0.88      0.37      0.52      3898
        SNMP       0.73      0.90      0.81     22434
        SSDP       0.55      0.16      0.24    175792
         S

## n_estimators = 50 and max_depth = 20

In [24]:
training(df, 50, 20)

Start training with n_estimators0 = 50 and max_depth0 = 20...
Finish Training... Total time: 657.6202626228333 seconds
Saving model to 'modelset/model-50-20.joblib'...
Finish Saving model... Total time: 0.3361048698425293 seconds
Predicting with 'y_train' and 'X_train'...
Training Accuracy (using 'y_train' and 'X_train'): 0.7672748775130934
Predicting with 'y_test' and 'X_test'...
Training Accuracy (using 'y_test' and 'X_test'): 0.7455228923804696
Classification Report (evaluated by 'y_test'):
              precision    recall  f1-score   support

      BENIGN       0.99      1.00      1.00      5877
         DNS       0.74      0.64      0.69     22383
        LDAP       0.49      0.44      0.46      6194
       MSSQL       0.86      0.95      0.90     41448
         NTP       0.99      1.00      1.00    218828
     NetBIOS       0.78      0.44      0.56      3898
        SNMP       0.79      0.87      0.83     22434
        SSDP       0.54      0.28      0.37    175792
         Syn  

## n_estimators = 100 and max_depth = 20

In [26]:
training(df, 100, 20)

Start training with n_estimators0 = 100 and max_depth0 = 20...
Finish Training... Total time: 1313.1412506103516 seconds
Saving model to 'modelset/model-100-20.joblib'...
Finish Saving model... Total time: 0.643972635269165 seconds
Predicting with 'y_train' and 'X_train'...
Training Accuracy (using 'y_train' and 'X_train'): 0.7673972984245649
Predicting with 'y_test' and 'X_test'...
Training Accuracy (using 'y_test' and 'X_test'): 0.7453526250211184
Classification Report (evaluated by 'y_test'):
              precision    recall  f1-score   support

      BENIGN       0.99      1.00      1.00      5877
         DNS       0.74      0.64      0.69     22383
        LDAP       0.48      0.44      0.46      6194
       MSSQL       0.86      0.95      0.90     41448
         NTP       0.99      1.00      1.00    218828
     NetBIOS       0.78      0.44      0.56      3898
        SNMP       0.79      0.87      0.83     22434
        SSDP       0.54      0.28      0.36    175792
         Syn

## n_estimators = 120 and max_depth = 20

In [28]:
training(df, 120, 20)

Start training with n_estimators0 = 120 and max_depth0 = 20...
Finish Training... Total time: 1568.722737312317 seconds
Saving model to 'modelset/model-120-20.joblib'...
Finish Saving model... Total time: 0.8541219234466553 seconds
Predicting with 'y_train' and 'X_train'...
Training Accuracy (using 'y_train' and 'X_train'): 0.7674992608548742
Predicting with 'y_test' and 'X_test'...
Training Accuracy (using 'y_test' and 'X_test'): 0.7454489778678831
Classification Report (evaluated by 'y_test'):
              precision    recall  f1-score   support

      BENIGN       0.99      1.00      1.00      5877
         DNS       0.74      0.64      0.69     22383
        LDAP       0.48      0.44      0.46      6194
       MSSQL       0.86      0.95      0.90     41448
         NTP       0.99      1.00      1.00    218828
     NetBIOS       0.79      0.43      0.56      3898
        SNMP       0.79      0.87      0.83     22434
        SSDP       0.54      0.28      0.37    175792
         Syn