# ALL LIBRARIES

In [14]:
import glob
import time
import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

# GLOBAL FUNCTION

## GET DATAFRAME FUNCTION

In [17]:
def get_dataframe(files):
    t1 = time.time()
    print(f"==============================")

    print(f"Get dataframe from parquet dataset...")
    dfs = [pd.read_parquet(file) for file in files]
    df = pd.concat(dfs)

    t2 = time.time()
    print(f"Finish... Total time: {t2 - t1} seconds")
    print(f"==============================")

    return df
    

## TRAINING FUNCTION

In [19]:
def training(df):

    t1 = time.time()
    print(f"==============================")

    print(f"Separate features and labels into X and y...")
    X = df.drop(columns=['Label'])
    y = df['Label'].str.replace('DrDoS_', '')

    print(f"Split origin data into 'training data' and 'testing data' (8:2)...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print(f"Start training...")
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    t2 = time.time()
    print(f"Finish... Total time: {t2 - t1} seconds")
    print(f"==============================")

    return X, y, X_train, y_train, X_test, y_test, model
    

## PREDICTING FUNCTION

In [35]:
def predicting(model, X_test, y_test):
    t1 = time.time()
    print(f"==============================")

    print(f"Start predicting...")
    y_pred = model.predict(X_test)
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"Classification Report:")
    print(classification_report(y_test, y_pred))

    t2 = time.time()
    print(f"Finish... Total time: {t2 - t1} seconds")
    print(f"==============================")

    return y_pred

## SAVING MODEL FUNCTION

In [23]:
def saving(model, file):
    t1 = time.time()
    print(f"==============================")

    print(f"Saving model to {file}...")
    joblib.dump(model, file)

    t2 = time.time()
    print(f"Finish... Total time: {t2 - t1} seconds")
    print(f"==============================")

# MAIN PROCESSING...

## READING DATASET...

In [25]:
t1 = time.time()
print(f"==============================")

print(f"Reading dataset...")
files = glob.glob('dataset/*.parquet')
for file in files:
    print(f"\t{file}")

t2 = time.time()
print(f"Finish... Total time: {t2 - t1} seconds")
print(f"==============================")

Reading dataset...
	dataset\DrDoS_DNS.parquet
	dataset\DrDoS_LDAP.parquet
	dataset\DrDoS_MSSQL.parquet
	dataset\DrDoS_NetBIOS.parquet
	dataset\DrDoS_NTP.parquet
	dataset\DrDoS_SNMP.parquet
	dataset\DrDoS_SSDP.parquet
	dataset\DrDoS_UDP.parquet
	dataset\Syn.parquet
	dataset\UDPLag.parquet
Finish... Total time: 0.0020084381103515625 seconds


## GET DATAFRAME...

In [29]:
df = get_dataframe(files)

Get dataframe from parquet dataset...
Finish... Total time: 0.985748291015625 seconds


## TRAINING...

In [31]:
X, y, X_train, y_train, X_test, y_test, model = training(df)

Separate features and labels into X and y...
Split origin data into 'training data' and 'testing data' (8:2)...
Start training...
Finish... Total time: 1959.0261125564575 seconds


## PREDICTING...

In [37]:
y_pred = predicting(model, X_test, y_test)

Start predicting...
Accuracy: 0.6885361230782227
Classification Report:
              precision    recall  f1-score   support

      BENIGN       0.99      1.00      1.00      5877
         DNS       0.64      0.62      0.63     22383
        LDAP       0.37      0.36      0.37      6194
       MSSQL       0.87      0.88      0.88     41448
         NTP       1.00      1.00      1.00    218828
     NetBIOS       0.45      0.42      0.43      3898
        SNMP       0.77      0.79      0.78     22434
        SSDP       0.41      0.39      0.40    175792
         Syn       0.97      0.96      0.97     31068
         UDP       0.52      0.54      0.53    212053
     UDP-lag       0.79      0.70      0.74     17657

    accuracy                           0.69    757632
   macro avg       0.71      0.70      0.70    757632
weighted avg       0.69      0.69      0.69    757632

Finish... Total time: 165.21587014198303 seconds


## SAVING MODEL...

In [41]:
saving(model, 'modelset/model.joblib')

Saving model to modelset/model.joblib...
Finish... Total time: 76.11339521408081 seconds
