# ALL LIBRARIES

In [6]:
import glob
import json
import os
import time

import joblib
import numpy as np
import pandas as pd
from fastai.tabular.all import df_shrink
from sklearn.metrics import classification_report, accuracy_score

# GLOBAL FUNCTIONS

## LOAD CONSTANT FUNCTION

In [10]:
def load_constant(file):
    """Load a JSON constant file from the ``commons/`` directory.

    Parameters
    ----------
    file : str
        File name relative to ``commons/`` (for example
        ``"mapper_for_predicting.json"``).

    Returns
    -------
    object
        Whatever ``json.load`` decodes from the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file content is not valid JSON.
    """
    t1 = time.time()
    print(f"==============================")

    print(f"Loading file: {file}...")
    # JSON is UTF-8 by specification; pin the encoding so decoding does not
    # depend on the platform's locale default (e.g. cp1252 on Windows).
    with open(f"commons/{file}", "r", encoding="utf-8") as f:
        features = json.load(f)

    t2 = time.time()
    print(f"Finish... Total time: {t2 - t1} seconds")
    print(f"==============================")
    
    return features

## GET DATAFRAME FUNCTION

In [12]:
def get_dataframe(files):
    """Read each CSV path in ``files`` into its own DataFrame.

    Parameters
    ----------
    files : iterable of str
        Paths handed to ``pd.read_csv`` one by one.

    Returns
    -------
    list of pandas.DataFrame
        One frame per path, in the same order as ``files``.
    """
    started = time.time()
    print("==============================")

    print("Get dataframe from real data...")
    frames = []
    for path in files:
        frames.append(pd.read_csv(path))

    elapsed = time.time() - started
    print(f"Finish... Total time: {elapsed} seconds")
    print("==============================")

    return frames
    

## PRE PROCESS FUNCTION

In [56]:
def pre_process(file, df):
    """Prepare one raw flow DataFrame for prediction.

    Steps, in order: strip whitespace from column names, rename columns with
    the module-level ``mapper_features`` mapping, drop the columns listed in
    the module-level ``drop_features``, replace +/-infinity with NaN, and pass
    the result through ``df_shrink``.

    The input ``df`` is left untouched; all work happens on a copy.

    Parameters
    ----------
    file : str
        Source path of the data; used only in log messages.
    df : pandas.DataFrame
        Raw data as read from the CSV file.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(processed_df, origin_df)`` where ``origin_df`` is an unmodified
        copy of the input.

    Raises
    ------
    KeyError
        If a column listed in ``drop_features`` is not present after renaming.
    """
    t1 = time.time()
    print(f"==============================")
    print(f"Processing file: {file}...")
    print(f"\tDimensions before process: {df.shape}")
    origin_df = df.copy()

    # Work on a separate copy so the caller's frame is never mutated; the
    # previous in-place version left the argument half-processed.
    processed = df.copy()

    print(f"\tStrip columns name...")
    processed.columns = processed.columns.str.strip()
    
    print(f"\tRename columns...")
    processed = processed.rename(columns=mapper_features)
    
    print(f"\tDrop columns...")
    processed = processed.drop(columns=drop_features)
    
    print(f"\tReplace 'infinity value' by 'nan'...")
    processed = processed.replace(to_replace=[np.inf, -np.inf], value=np.nan)

    # Row-dropping steps (dropna / drop_duplicates / reset_index) are not
    # applied here: removing rows would break the row-for-row alignment with
    # origin_df that predict_process relies on when it writes the labels back.

    print(f"\tShrink data-frame type...")
    processed = df_shrink(processed)
    print(f"\tDimensions after process: {processed.shape}")

    t2 = time.time()
    print(f"Finish... Total time: {t2 - t1} seconds")
    print(f"==============================")
    
    return processed, origin_df

## PREDICTING FUNCTION

In [16]:
def predict_process(origin_df, df, file, model):
    """Predict labels for one pre-processed frame and save them to CSV.

    The predictions are written into ``origin_df['Label']`` (the caller's
    frame is modified) and the frame is saved to
    ``predict_result/<base name of file>`` without the index.

    Parameters
    ----------
    origin_df : pandas.DataFrame
        Raw data with the same number of rows as ``df``; receives the
        predicted labels.
    df : pandas.DataFrame
        Pre-processed features; a ``Label`` column, if present, is removed
        before prediction.
    file : str
        Source path of the data. Only its base name is used for the output
        file; both ``/`` and ``\\`` separators are understood.
    model : object
        Any object exposing ``predict(X)``.

    Returns
    -------
    array-like
        The value returned by ``model.predict``.
    """
    t1 = time.time()
    print(f"==============================")

    print(f"Dataframe belong to file: {file}")
    print(f"Start predicting...")
    # Unlabelled captures may not carry a 'Label' column at all; dropping it
    # must not fail in that case.
    X = df.drop(columns='Label', errors='ignore')
    y_pred = model.predict(X)

    # Print a per-class count of the predictions.
    unique_values, counts = np.unique(np.asarray(y_pred), return_counts=True)
    for value, count in zip(unique_values, counts):
        print(f'{value}: {count}')

    print(f"Saving result...")
    origin_df['Label'] = y_pred
    # Normalise Windows separators first so the base name is extracted the
    # same way on every OS. Computing it outside the f-string also avoids a
    # backslash inside an f-string expression, a SyntaxError before Python 3.12.
    file_name = os.path.basename(file.replace('\\', '/'))
    os.makedirs("predict_result", exist_ok=True)
    origin_df.to_csv(f"predict_result/{file_name}", index=False)
    
    t2 = time.time()
    print(f"Finish... Total time: {t2 - t1} seconds")
    print(f"==============================")

    return y_pred

## LOAD MODEL FUNCTION

In [18]:
def load_model(file):
    """Load a serialized model from ``file`` with joblib.

    Parameters
    ----------
    file : str
        Path to the ``.joblib`` file to load.

    Returns
    -------
    object
        The object returned by ``joblib.load``.
    """
    t1 = time.time()
    print(f"==============================")

    print(f"Loading model from {file}...")
    # SECURITY: joblib.load is pickle-based and can execute arbitrary code
    # while loading; only pass files from a trusted source.
    # Load the path that was actually requested (and logged above) instead of
    # a hard-coded location.
    model = joblib.load(file, mmap_mode='r')
    
    t2 = time.time()
    print(f"Finish... Total time: {t2 - t1} seconds")
    print(f"==============================")

    return model

# MAIN PROCESS

## LOADING CONSTANT...

In [20]:
# Column rename mapping and list of columns to drop; both are read as
# globals by pre_process.
mapper_features, drop_features = (
    load_constant("mapper_for_predicting.json"),
    load_constant("drop_for_predicting.json"),
)

Loading file: mapper_for_predicting.json...
Finish... Total time: 0.0 seconds
Loading file: drop_for_predicting.json...
Finish... Total time: 0.012997865676879883 seconds


## READING DATA...

In [28]:
t1 = time.time()
print(f"==============================")

print(f"Reading dataset...")
# glob returns matches in a file-system-dependent order; sort them so every
# run processes and reports the files in the same sequence.
files = sorted(glob.glob('share/*.csv'))
for file in files:
    print(f"\t{file}")

t2 = time.time()
print(f"Finish... Total time: {t2 - t1} seconds")
print(f"==============================")

Reading dataset...
	share\slowhttptest.pcap_Flow.csv
	share\syn-hping.pcap_Flow.csv
	share\syn-metasploit.pcap_Flow.csv
	share\syn.pcap_Flow.csv
	share\udp-hping.pcap_Flow.csv
	share\udp.pcap_Flow.csv
Finish... Total time: 0.0010111331939697266 seconds


In [24]:
# Read every discovered CSV and print each frame's (rows, columns) as a quick
# sanity check. Note that the pre-processing cell further down rebinds `dfs`.
dfs = get_dataframe(files)
for df in dfs:
    print(df.shape)

Get dataframe from real data...
Finish... Total time: 0.26117491722106934 seconds
(1087, 22)
(12410, 22)
(10227, 22)
(14284, 84)
(23611, 22)
(108, 22)


## PRE PROCESSING...

In [58]:
t1 = time.time()
print(f"==============================")

print(f"Processing dataset...")
# Build two parallel lists, index-aligned with `files`: the processed frames
# (`dfs`) and untouched copies of the raw data (`origin_dfs`).
dfs = []
origin_dfs = []
for file in files:
    # Each file is read again from disk here, so this cell does not depend on
    # frames loaded by earlier cells.
    df, origin_df = pre_process(file, pd.read_csv(file))
    dfs.append(df)
    origin_dfs.append(origin_df)

t2 = time.time()
print(f"Finish... Total time: {t2 - t1} seconds")
print(f"==============================")

Processing dataset...
Processing file: share\slowhttptest.pcap_Flow.csv...
	Dimensions before process: (1089, 84)
	Strip columns name...
	Rename columns...
	Drop columns...
	Replace 'infinity value' by 'nan'...
	Shrink data-frame type...
	Dimensions after process: (1089, 22)
Finish... Total time: 0.007988452911376953 seconds
Processing file: share\syn-hping.pcap_Flow.csv...
	Dimensions before process: (22323, 84)
	Strip columns name...
	Rename columns...
	Drop columns...
	Replace 'infinity value' by 'nan'...
	Shrink data-frame type...
	Dimensions after process: (22323, 22)
Finish... Total time: 0.021010398864746094 seconds
Processing file: share\syn-metasploit.pcap_Flow.csv...
	Dimensions before process: (10231, 84)
	Strip columns name...
	Rename columns...
	Drop columns...
	Replace 'infinity value' by 'nan'...
	Shrink data-frame type...
	Dimensions after process: (10231, 22)
Finish... Total time: 0.011000394821166992 seconds
Processing file: share\syn.pcap_Flow.csv...
	Dimensions befo

In [72]:
# Number of CSV files discovered by the glob in the "READING DATA" section.
len(files)

6

## LOADING MODEL...

In [32]:
# Load the serialized model once; the recorded run below took about 42 s, so
# avoid re-running this cell unnecessarily.
model = load_model("modelset/model.joblib")

Loading model from modelset/model.joblib...
Finish... Total time: 42.04454827308655 seconds


## PREDICTING...

In [60]:
# origin_dfs, dfs and files are parallel lists built by the pre-processing
# cell, so zip pairs each processed frame with its raw copy and source path.
for origin_df, df, file in zip(origin_dfs, dfs, files):
    predict_process(origin_df, df, file, model)

Dataframe belong to file: share\slowhttptest.pcap_Flow.csv
Start predicting...
BENIGN: 981
DNS: 81
LDAP: 5
NTP: 19
SNMP: 1
UDP: 2
Saving result...
Finish... Total time: 0.0800163745880127 seconds
Dataframe belong to file: share\syn-hping.pcap_Flow.csv
Start predicting...
BENIGN: 10790
DNS: 1003
NTP: 2042
Syn: 7262
UDP: 871
UDP-lag: 355
Saving result...
Finish... Total time: 1.008678913116455 seconds
Dataframe belong to file: share\syn-metasploit.pcap_Flow.csv
Start predicting...
BENIGN: 756
Syn: 9475
Saving result...
Finish... Total time: 0.38400864601135254 seconds
Dataframe belong to file: share\syn.pcap_Flow.csv
Start predicting...
BENIGN: 877
NTP: 4407
Syn: 6286
UDP: 1340
UDP-lag: 1374
Saving result...
Finish... Total time: 0.6448721885681152 seconds
Dataframe belong to file: share\udp-hping.pcap_Flow.csv
Start predicting...
BENIGN: 59469
NTP: 2751
Saving result...
Finish... Total time: 2.544891595840454 seconds
Dataframe belong to file: share\udp.pcap_Flow.csv
Start predicting...


In [10]:
# Spot-check one saved prediction file. The name is taken from `file_path`
# so that changing it actually changes which result is inspected.
file_path = 'syn.pcap_Flow.csv'
df = pd.read_csv(f"predict_result/{file_path}")
df.head(5)

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Total Fwd Packet,Total Bwd packets,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,192.168.100.154-192.168.100.186-36664-80-6,192.168.100.154,36664,192.168.100.186,80,6,23/11/2024 05:43:49 PM,91454,2,1,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NTP
1,192.168.100.154-192.168.100.186-36665-80-6,192.168.100.154,36665,192.168.100.186,80,6,23/11/2024 05:43:49 PM,91455,2,1,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NTP
2,192.168.100.154-192.168.100.186-36666-80-6,192.168.100.154,36666,192.168.100.186,80,6,23/11/2024 05:43:49 PM,91102,2,1,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NTP
3,192.168.100.154-192.168.100.186-36667-80-6,192.168.100.154,36667,192.168.100.186,80,6,23/11/2024 05:43:49 PM,91102,2,1,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NTP
4,192.168.100.154-192.168.100.186-36668-80-6,192.168.100.154,36668,192.168.100.186,80,6,23/11/2024 05:43:49 PM,91102,2,1,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NTP


In [6]:
import sys

# Record the interpreter version used for this run: the three leading fields
# of sys.version_info, one per line, followed by the full version string.
print(
    *(
        f"{label}: {number}"
        for label, number in zip(("Major", "Minor", "Micro"), sys.version_info[:3])
    ),
    sep="\n",
)
print(sys.version)

Major: 3
Minor: 12
Micro: 4
3.12.4 | packaged by Anaconda, Inc. | (main, Jun 18 2024, 15:03:56) [MSC v.1929 64 bit (AMD64)]
