# ALL LIBRARIES

In [2]:
import glob
import json
import time
from pathlib import Path

import numpy as np
import pandas as pd
from fastai.tabular.all import df_shrink

# GLOBAL FUNCTIONS

## LOAD CONSTANT FUNCTION

In [2]:
def load_constant(file, base_dir="commons"):
    """Load a JSON constant file and return its parsed content.

    Parameters
    ----------
    file : str
        File name of the JSON document, relative to ``base_dir``.
    base_dir : str, optional
        Directory that contains the file. Defaults to ``"commons"``, which
        is the location this function has always read from.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the document (dict, list, ...).

    Raises
    ------
    FileNotFoundError
        If ``base_dir/file`` does not exist.
    json.JSONDecodeError
        If the file is not valid JSON.
    """
    t1 = time.time()
    print("==============================")

    print(f"Loading file: {file}...")
    # Decode explicitly as UTF-8: without it ``open`` uses the platform's
    # locale encoding (e.g. cp1252 on Windows), which corrupts or rejects
    # any non-ASCII text in the JSON file.
    with open(f"{base_dir}/{file}", "r", encoding="utf-8") as f:
        features = json.load(f)

    t2 = time.time()
    print(f"Finish... Total time: {t2 - t1} seconds")
    print("==============================")

    return features

## PRE PROCESS FUNCTION

In [5]:
def pre_process(file, df, mapper=None, drop=None):
    """Clean one raw dataset frame and return it after ``df_shrink``.

    Steps, in order: strip whitespace from the column names, rename
    columns, drop unwanted columns, replace +/-inf with NaN, drop rows
    containing NaN, drop duplicate rows, reset the index, and finally pass
    the frame through ``df_shrink``.

    Every step before ``df_shrink`` modifies ``df`` in place, so the
    caller's frame is changed too. Pass ``df.copy()`` if the original must
    be preserved.

    Parameters
    ----------
    file : str
        Label shown in the progress log; the file itself is not opened here.
    df : pandas.DataFrame
        Frame to clean. Mutated in place.
    mapper : optional
        Passed unchanged to ``DataFrame.rename(columns=...)``. When omitted,
        the module-level ``mapper_features`` is used.
    drop : optional
        Passed unchanged to ``DataFrame.drop(columns=...)``; labels refer to
        the column names *after* renaming. When omitted, the module-level
        ``drop_features`` is used.

    Returns
    -------
    The value returned by ``df_shrink`` for the cleaned frame.

    Raises
    ------
    KeyError
        From ``DataFrame.drop`` if a label in ``drop`` is not a column.
    """
    t1 = time.time()

    # Look the globals up only when the caller did not supply the values, so
    # the function can be used (and tested) without notebook-level state.
    if mapper is None:
        mapper = mapper_features
    if drop is None:
        drop = drop_features

    print("==============================")
    print(f"Processing file: {file}...")
    print(f"\tDimensions before process: {df.shape}")

    print("\tStrip columns name...")
    df.columns = df.columns.str.strip()

    print("\tRename columns...")
    df.rename(columns=mapper, inplace=True)

    print("\tDrop columns...")
    df.drop(columns=drop, inplace=True)

    print("\tReplace 'infinity value' by 'nan'...")
    df.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

    # The number of removed rows is taken from the row-count difference
    # instead of a separate isna()/duplicated() pass over the whole frame;
    # the reported figure is the same, with one scan per step instead of two.
    print("\tDrop rows having 'nan' value...")
    rows_before = len(df)
    df.dropna(inplace=True)
    print(f"\t...has been droping {rows_before - len(df)} rows")

    print("\tDrop duplicate rows...")
    rows_before = len(df)
    df.drop_duplicates(inplace=True)
    print(f"\t...has been droping {rows_before - len(df)} rows...")

    print("\tReset index...")
    df.reset_index(inplace=True, drop=True)

    print("\tShrink data-frame type...")
    df = df_shrink(df)
    print(f"\tDimensions after process: {df.shape}")

    t2 = time.time()
    print(f"Finish... Total time: {t2 - t1} seconds")
    print("==============================")

    return df

# PRE-PROCESSING...

## LOADING CONSTANT...

In [8]:
# Column-rename mapping and list of columns to discard. Both are kept as
# module-level names because pre_process() uses them.
mapper_features = load_constant(file="mapper_for_training.json")
drop_features = load_constant(file="drop_for_training.json")

Loading file: mapper_for_training.json...
Finish... Total time: 0.0020837783813476562 seconds
Loading file: drop_for_training.json...
Finish... Total time: 0.0010089874267578125 seconds


## READING DATASET...

In [9]:
t1 = time.time()
print("==============================")

print("Reading dataset...")
# glob returns matches in a filesystem-dependent order; sort them so the
# files are listed and processed in the same order on every run/machine.
files = sorted(glob.glob('dataset/*.csv'))
for file in files:
    print(f"\t{file}")

t2 = time.time()
print(f"Finish... Total time: {t2 - t1} seconds")
print("==============================")

Reading dataset...
	dataset\DrDoS_DNS.csv
	dataset\DrDoS_LDAP.csv
	dataset\DrDoS_MSSQL.csv
	dataset\DrDoS_NetBIOS.csv
	dataset\DrDoS_NTP.csv
	dataset\DrDoS_SNMP.csv
	dataset\DrDoS_SSDP.csv
	dataset\DrDoS_UDP.csv
	dataset\Syn.csv
	dataset\UDPLag.csv
Finish... Total time: 0.0014524459838867188 seconds


## PROCESSING DATASET...

In [11]:
t1 = time.time()
print("==============================")

print("Processing dataset...")
for file in files:
    # NOTE(review): the saved output shows a warning raised at this read_csv
    # call, presumably a mixed-dtype DtypeWarning. Consider passing explicit
    # dtype= for the affected columns (to be confirmed).
    df = pre_process(file, pd.read_csv(file))

    # Build "dataset/<name>.parquet" with pathlib instead of splitting on a
    # literal backslash: the old expression only worked with Windows path
    # separators (on POSIX it produced "dataset/dataset/<name>.parquet") and
    # a backslash inside an f-string expression is a SyntaxError before
    # Python 3.12.
    parquet_path = Path("dataset") / Path(file).with_suffix(".parquet").name
    df.to_parquet(parquet_path, engine="pyarrow")

t2 = time.time()
print(f"Finish... Total time: {t2 - t1} seconds")
print("==============================")

Processing dataset...


  df = pre_process(file, pd.read_csv(file))


Processing file: dataset\DrDoS_DNS.csv...
	Dimensions before process: (5074413, 88)
	Strip columns name...
	Rename columns...
	Drop columns...
	Replace 'infinity value' by 'nan'...
	Drop rows having 'nan' value...
	...has been droping 0 rows
	Drop duplicate rows...
	...has been droping 4958216 rows...
	Reset index...
	Shrink data-frame type...
	Dimensions after process: (116197, 22)
Finish... Total time: 6.970324277877808 seconds


  df = pre_process(file, pd.read_csv(file))


Processing file: dataset\DrDoS_LDAP.csv...
	Dimensions before process: (2181542, 88)
	Strip columns name...
	Rename columns...
	Drop columns...
	Replace 'infinity value' by 'nan'...
	Drop rows having 'nan' value...
	...has been droping 0 rows
	Drop duplicate rows...
	...has been droping 2150105 rows...
	Reset index...
	Shrink data-frame type...
	Dimensions after process: (31437, 22)
Finish... Total time: 2.9922029972076416 seconds


  df = pre_process(file, pd.read_csv(file))


Processing file: dataset\DrDoS_MSSQL.csv...
	Dimensions before process: (4524498, 88)
	Strip columns name...
	Rename columns...
	Drop columns...
	Replace 'infinity value' by 'nan'...
	Drop rows having 'nan' value...
	...has been droping 0 rows
	Drop duplicate rows...
	...has been droping 4315964 rows...
	Reset index...
	Shrink data-frame type...
	Dimensions after process: (208534, 22)
Finish... Total time: 6.4344642162323 seconds


  df = pre_process(file, pd.read_csv(file))


Processing file: dataset\DrDoS_NetBIOS.csv...
	Dimensions before process: (4094986, 88)
	Strip columns name...
	Rename columns...
	Drop columns...
	Replace 'infinity value' by 'nan'...
	Drop rows having 'nan' value...
	...has been droping 0 rows
	Drop duplicate rows...
	...has been droping 4073998 rows...
	Reset index...
	Shrink data-frame type...
	Dimensions after process: (20988, 22)
Finish... Total time: 5.282648324966431 seconds


  df = pre_process(file, pd.read_csv(file))


Processing file: dataset\DrDoS_NTP.csv...
	Dimensions before process: (1217007, 88)
	Strip columns name...
	Rename columns...
	Drop columns...
	Replace 'infinity value' by 'nan'...
	Drop rows having 'nan' value...
	...has been droping 0 rows
	Drop duplicate rows...
	...has been droping 111793 rows...
	Reset index...
	Shrink data-frame type...
	Dimensions after process: (1105214, 22)
Finish... Total time: 3.7970619201660156 seconds


  df = pre_process(file, pd.read_csv(file))


Processing file: dataset\DrDoS_SNMP.csv...
	Dimensions before process: (5161377, 88)
	Strip columns name...
	Rename columns...
	Drop columns...
	Replace 'infinity value' by 'nan'...
	Drop rows having 'nan' value...
	...has been droping 0 rows
	Drop duplicate rows...
	...has been droping 5047260 rows...
	Reset index...
	Shrink data-frame type...
	Dimensions after process: (114117, 22)
Finish... Total time: 6.923769235610962 seconds


  df = pre_process(file, pd.read_csv(file))


Processing file: dataset\DrDoS_SSDP.csv...
	Dimensions before process: (2611374, 88)
	Strip columns name...
	Rename columns...
	Drop columns...
	Replace 'infinity value' by 'nan'...
	Drop rows having 'nan' value...
	...has been droping 0 rows
	Drop duplicate rows...
	...has been droping 1730687 rows...
	Reset index...
	Shrink data-frame type...
	Dimensions after process: (880687, 22)
Finish... Total time: 5.861386299133301 seconds


  df = pre_process(file, pd.read_csv(file))


Processing file: dataset\DrDoS_UDP.csv...
	Dimensions before process: (3136802, 88)
	Strip columns name...
	Rename columns...
	Drop columns...
	Replace 'infinity value' by 'nan'...
	Drop rows having 'nan' value...
	...has been droping 0 rows
	Drop duplicate rows...
	...has been droping 2073713 rows...
	Reset index...
	Shrink data-frame type...
	Dimensions after process: (1063089, 22)
Finish... Total time: 6.993658542633057 seconds


  df = pre_process(file, pd.read_csv(file))


Processing file: dataset\Syn.csv...
	Dimensions before process: (1582681, 88)
	Strip columns name...
	Rename columns...
	Drop columns...
	Replace 'infinity value' by 'nan'...
	Drop rows having 'nan' value...
	...has been droping 0 rows
	Drop duplicate rows...
	...has been droping 1427189 rows...
	Reset index...
	Shrink data-frame type...
	Dimensions after process: (155492, 22)
Finish... Total time: 2.6489596366882324 seconds


  df = pre_process(file, pd.read_csv(file))


Processing file: dataset\UDPLag.csv...
	Dimensions before process: (370166, 88)
	Strip columns name...
	Rename columns...
	Drop columns...
	Replace 'infinity value' by 'nan'...
	Drop rows having 'nan' value...
	...has been droping 0 rows
	Drop duplicate rows...
	...has been droping 277761 rows...
	Reset index...
	Shrink data-frame type...
	Dimensions after process: (92405, 22)
Finish... Total time: 0.7242546081542969 seconds
Finish... Total time: 275.7635884284973 seconds
