In [18]:
import pandas as pd
import glob
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [21]:
# Load a single benign capture for inspection.
# The path uses forward slashes only: the earlier "Benign_test\Benign_..." spelling
# contained the invalid escape sequence "\B" and was only treated as a directory
# separator on Windows.
data = pd.read_csv("Raw_data/benign/Benign_test/Benign_00068_20230914231014.csv")

In [22]:
# Materialise the column labels of `data` as a plain Python list.
column_list = list(data.columns)

# Show every column name next to its zero-based position.
for position in range(len(column_list)):
    print(f"{position}: {column_list[position]}")

0: Flow ID
1: Src IP
2: Src Port
3: Dst IP
4: Dst Port
5: Protocol
6: timestamp
7: Flow Duration
8: Tot Fwd Pkts
9: Tot Bwd Pkts
10: TotLen Fwd Pkts
11: TotLen Bwd Pkts
12: Fwd Pkt Len Max
13: Fwd Pkt Len Min
14: Fwd Pkt Len Mean
15: Fwd Pkt Len Std
16: Bwd Pkt Len Max
17: Bwd Pkt Len Min
18: Bwd Pkt Len Mean
19: Bwd Pkt Len Std
20: Flow Byts/s
21: Flow Pkts/s
22: Flow IAT Mean
23: Flow IAT Std
24: Flow IAT Max
25: Flow IAT Min
26: Fwd IAT Tot
27: Fwd IAT Mean
28: Fwd IAT Std
29: Fwd IAT Max
30: Fwd IAT Min
31: Bwd IAT Tot
32: Bwd IAT Mean
33: Bwd IAT Std
34: Bwd IAT Max
35: Bwd IAT Min
36: Fwd PSH Flags
37: Bwd PSH Flags
38: Fwd URG Flags
39: Bwd URG Flags
40: Fwd Header Len
41: Bwd Header Len
42: Fwd Pkts/s
43: Bwd Pkts/s
44: Pkt Len Min
45: Pkt Len Max
46: Pkt Len Mean
47: Pkt Len Std
48: Pkt Len Var
49: FIN Flag Cnt
50: SYN Flag Cnt
51: RST Flag Cnt
52: PSH Flag Cnt
53: ACK Flag Cnt
54: URG Flag Cnt
55: CWE Flag Count
56: ECE Flag Cnt
57: Down/Up Ratio
58: Pkt Size Avg
59: Fwd Seg 

In [2]:
import numpy as np
import pandas as pd
import glob


def _summary_stats(values):
    """Return [mean, std, skew, kurtosis, median] of the finite numeric entries.

    Entries that cannot be parsed as numbers, NaN and +/-infinity are ignored,
    so a single infinite value no longer turns every statistic of the column
    into inf/NaN. A statistic that is undefined for the number of remaining
    entries (none at all, fewer than 3 for skew, fewer than 4 for kurtosis)
    is reported as 0.0.
    """
    numeric = pd.to_numeric(values, errors="coerce").astype("float64")
    finite = numeric[np.isfinite(numeric)]
    count = len(finite)
    if count == 0:
        return [0.0, 0.0, 0.0, 0.0, 0.0]

    array = finite.to_numpy()
    return [
        np.mean(array),
        np.std(array),  # ddof=0, the same convention np.nanstd uses
        finite.skew() if count >= 3 else 0.0,
        finite.kurtosis() if count >= 4 else 0.0,
        np.median(array),
    ]


def aggregate_csv_features(data, chunk_size=1000, verbose=True):
    """Aggregate rows of ``data`` into one feature row per ``chunk_size`` rows.

    For every chunk the returned row contains, in this order:

    * mean, std, skew, kurtosis and median of each column of ``data`` that is
      not in the exclusion list below (five values per column, column order
      preserved);
    * SYN-per-TCP ratio, FIN-per-TCP ratio, backward/forward packet ratio,
      ARP reply/request ratio and the number of distinct reply sender IPs per
      distinct reply sender MAC.

    Every ratio falls back to 0 when its denominator is 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "SYN Flag Cnt", "FIN Flag Cnt", "Protocol",
        "Tot Bwd Pkts", "Tot Fwd Pkts", "arp_operation", "sender_mac" and
        "sender_ip".
    chunk_size : int, default 1000
        Number of consecutive rows summarised into one output row.
    verbose : bool, default True
        When True, print the intermediate counts of every chunk (the original
        behaviour). Pass False to keep the notebook output short.

    Returns
    -------
    pandas.DataFrame
        One row per chunk with integer column labels; empty if ``data`` has
        no rows.
    """
    # Columns for which no summary statistics are computed.
    excluded = {
        "Source IP",
        "Destination IP",
        "Label",
        "Protocol",
        "arp_operation",
        "protocol_type",
        "sender_mac",
        "sender_ip",
        "target_mac",
        "target_ip",
    }
    stat_columns = [name for name in data.columns if name not in excluded]

    aggregated_data = []
    for start in range(0, len(data), chunk_size):
        chunk = data.iloc[start : start + chunk_size]

        # Rows whose SYN / FIN flag count equals 1, and rows with protocol number 6 (TCP).
        syn_count = int((chunk["SYN Flag Cnt"] == 1).sum())
        fin_count = int((chunk["FIN Flag Cnt"] == 1).sum())
        tcp_count = int((chunk["Protocol"] == 6).sum())

        SYN_per_TCP = syn_count / tcp_count if tcp_count > 0 else 0
        FIN_per_TCP = fin_count / tcp_count if tcp_count > 0 else 0

        # Backward packets per forward packet over the whole chunk.
        totbwd = chunk["Tot Bwd Pkts"].sum()
        totfwd = chunk["Tot Fwd Pkts"].sum()
        bwk_per_fwk = totbwd / totfwd if totfwd > 0 else 0

        # ARP operation 1 rows versus operation 2 rows; the operation-2 rows are
        # selected once and reused for the address statistics.
        arp_1 = int((chunk["arp_operation"] == 1).sum())
        arp_replies = chunk[chunk["arp_operation"] == 2]
        arp_2 = arp_replies.shape[0]
        rate_arp = arp_2 / arp_1 if arp_1 > 0 else 0

        # Distinct sender IPs per distinct sender MAC among the operation-2 rows.
        mac_addresses = arp_replies["sender_mac"].nunique()
        ip_addresses = arp_replies["sender_ip"].nunique()
        avg_ip_per_mac = ip_addresses / mac_addresses if mac_addresses > 0 else 0

        if verbose:
            print(f"Số gói tin có cờ SYN: {syn_count}")
            print(f"Số gói tin có cờ FIN: {fin_count}")
            print(f"Số gói tin TCP: {tcp_count}")
            print(f"Backward per forward packet ratio: {bwk_per_fwk}")
            print(f"Số gói tin gửi đi: {arp_1}")
            print(f"Số gói tin phản hồi: {arp_2}")
            print(rate_arp)
            print(f"Unique MAC addresses: {mac_addresses}")
            print(f"Unique IP addresses: {ip_addresses}")
            print(f"Average IPs per MAC: {avg_ip_per_mac}")

        aggregated_features = []
        for feature in stat_columns:
            aggregated_features.extend(_summary_stats(chunk[feature]))

        aggregated_features.extend(
            [SYN_per_TCP, FIN_per_TCP, bwk_per_fwk, rate_arp, avg_ip_per_mac]
        )
        aggregated_data.append(aggregated_features)

    return pd.DataFrame(aggregated_data)


In [15]:
# Build chunk-level feature rows for every VulScan capture and store them in one CSV.
files = glob.glob("Raw_data/recon/VulScan/*/*.csv")

# Columns removed from each capture before it is aggregated.
columns_to_remove = [
    "Label",
    "Flow ID",
    "Src IP",
    "Src Port",
    "Dst IP",
    "Dst Port",
    "Timestamp",
    "hardware_type",
    "hardware_address_length",
    "protocol_address_length",
    "arp_header_length",
    "eth_type",
]

aggregated_results = []
for file in files:
    data = pd.read_csv(file, low_memory=False).drop(columns=columns_to_remove)

    aggregated_df = aggregate_csv_features(data)
    # Add Label column with value 1 (attack category) before the frame is collected.
    aggregated_df["Label"] = 1
    aggregated_results.append(aggregated_df)

# Concatenate all the results
final_df = pd.concat(aggregated_results, ignore_index=True)

# Save the final dataframe to a CSV file
final_df.to_csv("New_Data/RECON_VULSCAN.csv", index=False)

Số gói tin có cờ SYN: 974
Số gói tin có cờ FIN: 7
Số gói tin TCP: 987
Backward per forward packet ratio: 0.9297752808988764
Số gói tin gửi đi: 0
Số gói tin phản hồi: 0
0
Unique MAC addresses: 0
Unique IP addresses: 0
Average IPs per MAC: 0
Số gói tin có cờ SYN: 980
Số gói tin có cờ FIN: 3
Số gói tin TCP: 990
Backward per forward packet ratio: 0.954136690647482
Số gói tin gửi đi: 0
Số gói tin phản hồi: 0
0
Unique MAC addresses: 0
Unique IP addresses: 0
Average IPs per MAC: 0
Số gói tin có cờ SYN: 60
Số gói tin có cờ FIN: 0
Số gói tin TCP: 60
Backward per forward packet ratio: 1.0
Số gói tin gửi đi: 0
Số gói tin phản hồi: 0
0
Unique MAC addresses: 0
Unique IP addresses: 0
Average IPs per MAC: 0
Số gói tin có cờ SYN: 978
Số gói tin có cờ FIN: 9
Số gói tin TCP: 991
Backward per forward packet ratio: 0.8862068965517241
Số gói tin gửi đi: 0
Số gói tin phản hồi: 0
0
Unique MAC addresses: 0
Unique IP addresses: 0
Average IPs per MAC: 0
Số gói tin có cờ SYN: 976
Số gói tin có cờ FIN: 13
Số gói 

  np.subtract(arr, avg, out=arr, casting='unsafe', where=where)
  adjusted = values - mean
  adjusted = values - mean


In [None]:
# Comma-separated string of every column name in `data`.
columns_str = ", ".join(data.columns)

# Print each column name preceded by its one-based position.
for offset, col in enumerate(data.columns):
    print(f"{offset + 1}: {col}")

In [None]:
import numpy as np
import pandas as pd
import glob


# Aggregation function
def _summary_stats(values):
    """Return [mean, std, skew, kurtosis, median] of the finite numeric entries.

    Entries that cannot be parsed as numbers, NaN and +/-infinity are ignored,
    so a single infinite value no longer turns every statistic of the column
    into inf/NaN. A statistic that is undefined for the number of remaining
    entries (none at all, fewer than 3 for skew, fewer than 4 for kurtosis)
    is reported as 0.0.
    """
    numeric = pd.to_numeric(values, errors="coerce").astype("float64")
    finite = numeric[np.isfinite(numeric)]
    count = len(finite)
    if count == 0:
        return [0.0, 0.0, 0.0, 0.0, 0.0]

    array = finite.to_numpy()
    return [
        np.mean(array),
        np.std(array),  # ddof=0, the same convention np.nanstd uses
        finite.skew() if count >= 3 else 0.0,
        finite.kurtosis() if count >= 4 else 0.0,
        np.median(array),
    ]


# Aggregation function
def aggregate_csv_features(data, chunk_size=1000):
    """Summarise the whole of ``data`` into a single feature row.

    The returned one-row DataFrame (integer column labels) contains, in order:

    * mean, std, skew, kurtosis and median of each column of ``data`` that is
      not in the exclusion list below (five values per column, column order
      preserved);
    * SYN-per-TCP ratio, FIN-per-TCP ratio, backward/forward packet ratio,
      ARP reply/request ratio and the number of distinct reply sender IPs per
      distinct reply sender MAC.

    Every ratio falls back to 0 when its denominator is 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "SYN Flag Cnt", "FIN Flag Cnt", "Protocol",
        "Tot Bwd Pkts", "Tot Fwd Pkts", "arp_operation", "sender_mac" and
        "sender_ip".
    chunk_size : int, default 1000
        Accepted for call compatibility only; this variant never splits
        ``data`` and ignores the value.
    """
    # Rows whose SYN / FIN flag count equals 1, and rows with protocol number 6 (TCP).
    syn_count = int((data["SYN Flag Cnt"] == 1).sum())
    fin_count = int((data["FIN Flag Cnt"] == 1).sum())
    tcp_count = int((data["Protocol"] == 6).sum())

    SYN_per_TCP = syn_count / tcp_count if tcp_count > 0 else 0
    FIN_per_TCP = fin_count / tcp_count if tcp_count > 0 else 0

    # Backward packets per forward packet over all rows.
    totbwd = data["Tot Bwd Pkts"].sum()
    totfwd = data["Tot Fwd Pkts"].sum()
    bwk_per_fwk = totbwd / totfwd if totfwd > 0 else 0

    # ARP operation 1 rows versus operation 2 rows; the operation-2 rows are
    # selected once and reused for the address statistics.
    arp_1 = int((data["arp_operation"] == 1).sum())
    arp_replies = data[data["arp_operation"] == 2]
    arp_2 = arp_replies.shape[0]
    rate_arp = arp_2 / arp_1 if arp_1 > 0 else 0

    mac_addresses = arp_replies["sender_mac"].nunique()
    ip_addresses = arp_replies["sender_ip"].nunique()
    avg_ip_per_mac = ip_addresses / mac_addresses if mac_addresses > 0 else 0

    # Columns for which no summary statistics are computed.
    excluded = {
        "Source IP",
        "Destination IP",
        "Label",
        "Protocol",
        "arp_operation",
        "protocol_type",
        "sender_mac",
        "sender_ip",
        "target_mac",
        "target_ip",
    }

    aggregated_features = []
    for feature in data.columns:
        if feature not in excluded:
            aggregated_features.extend(_summary_stats(data[feature]))

    aggregated_features.extend(
        [SYN_per_TCP, FIN_per_TCP, bwk_per_fwk, rate_arp, avg_ip_per_mac]
    )

    return pd.DataFrame([aggregated_features])

# Build one feature row per DoS/UDP capture and store them all in a single CSV.
files = glob.glob("ATTACK/Train/Dos/UDP/*/*.csv")

# Columns removed from each capture before it is aggregated.
columns_to_remove = [
    "Label",
    "Flow ID",
    "Src IP",
    "Src Port",
    "Dst IP",
    "Dst Port",
    "Timestamp",
    "hardware_type",
    "hardware_address_length",
    "protocol_address_length",
    "arp_header_length",
    "eth_type",
]

aggregated_results = []
for file in files:
    data = pd.read_csv(file, low_memory=False).drop(columns=columns_to_remove)

    aggregated_df = aggregate_csv_features(data)
    # Every row produced from these captures is labelled 1.
    aggregated_df["Label"] = 1
    aggregated_results.append(aggregated_df)

final_df = pd.concat(aggregated_results, ignore_index=True)
final_df.to_csv("New_Data/hehe/dos_UDP.csv", index=False)

In [None]:
import numpy as np
import pandas as pd
import glob


def aggregate_csv_features(data):
    """Return distinct sender IPs per distinct sender MAC among ARP operation-2 rows.

    Only rows whose "arp_operation" equals 2 are considered. The result is the
    number of unique "sender_ip" values divided by the number of unique
    "sender_mac" values, or 0 when no sender MAC is present in those rows.
    """
    reply_rows = data.loc[data["arp_operation"] == 2]
    unique_macs = reply_rows["sender_mac"].nunique()
    unique_ips = reply_rows["sender_ip"].nunique()

    # Guard against division by zero when there are no operation-2 rows.
    if unique_macs > 0:
        return unique_ips / unique_macs
    return 0


# One average-IPs-per-MAC value is collected per CSV file, in glob order.
avg_ip_per_mac_array = []

# All CSV files one directory level below ATTACK/Train.
files = glob.glob("ATTACK/Train/*/*.csv")

# Columns removed from each capture before the ratio is computed.
unused_columns = [
    "Label",
    "Flow ID",
    "Src IP",
    "Src Port",
    "Dst IP",
    "Dst Port",
    "Timestamp",
    "hardware_type",
    "hardware_address_length",
    "protocol_address_length",
    "arp_header_length",
    "eth_type",
]

for file in files:
    data = pd.read_csv(file, low_memory=False)
    data = data.drop(columns=unused_columns)

    # aggregate_csv_features returns a single number for the whole file here.
    avg_ip_per_mac = aggregate_csv_features(data)
    avg_ip_per_mac_array.append(avg_ip_per_mac)

# Load New_Data/test1.csv and discard its existing "Label1" column.
benign_data = pd.read_csv("New_Data/test1.csv").drop(columns=["Label1"])

# Store the per-file values in a new column named "399".
# NOTE(review): this assumes test1.csv has exactly one row per globbed file, in the
# same order as `files` — TODO confirm, glob order is not sorted here.
benign_data["399"] = avg_ip_per_mac_array

# Re-create "Label1" as the last column with the constant value 1.
# NOTE(review): the earlier comment said 0 and the variable is called `benign_data`,
# while the inputs come from ATTACK/Train — confirm which label is intended.
benign_data["Label1"] = 1

benign_data.to_csv("New_Data/test1_updated.csv", index=False)

#### Calculate features

In [None]:
import glob
import pandas as pd

# Read every CSV directly inside New_Data/ and stack them into one table.
files = glob.glob("New_Data/*.csv")

dataframes = [pd.read_csv(file) for file in files]

# Rows are renumbered from 0 in the combined frame.
benign_data = pd.concat(dataframes, ignore_index=True)
print(benign_data)


In [None]:
# Remove the existing "Label1" column so it can be re-added as the last column.
benign_data = benign_data.drop("Label1", axis=1)

# Store the previously computed per-file values in a column named "399".
# NOTE(review): `avg_ip_per_mac_array` comes from an earlier cell and must have
# exactly one entry per row of `benign_data` — TODO confirm the two are aligned.
benign_data["399"] = avg_ip_per_mac_array

# Re-create "Label1" with the constant value 1.
# NOTE(review): the earlier comment said 0 — confirm which label is intended.
benign_data["Label1"] = 1

benign_data.to_csv("New_Data/train_updated.csv", index=False)

In [None]:
# Write the combined table to New_Data/benign.csv without the row index.
# NOTE(review): this file lands inside New_Data/, which other cells read with
# glob("New_Data/*.csv") — confirm it is meant to be picked up by them.
benign_data.to_csv("New_Data/benign.csv", index=False)

In [None]:
import glob
import pandas as pd

# Read every CSV in New_Data/6/6/ and stack them into one table.
files = glob.glob("New_Data/6/6/*.csv")

dataframes = [pd.read_csv(file) for file in files]

# Rows are renumbered from 0 in the combined frame.
test1_data = pd.concat(dataframes, ignore_index=True)
print(test1_data)

# Persist the combined table without the row index.
test1_data.to_csv("New_Data/6.csv", index=False)

In [None]:
import glob
import pandas as pd

# Read every CSV directly inside New_Data/ and stack them into `data`.
files = glob.glob("New_Data/*.csv")

dataframes = [pd.read_csv(file) for file in files]

# Rows are renumbered from 0 in the combined frame.
data = pd.concat(dataframes, ignore_index=True)

In [None]:
# Treat +/-infinity as missing, then replace every missing value with 0.
# Each step runs once; the result is rebound to `data` instead of mutating in place.
data = data.replace([np.inf, -np.inf], np.nan).fillna(0)

In [None]:
# Preview the first rows of `data`.
data.head()

In [None]:
# Separate the "Label1" target from the remaining feature columns.
y = data["Label1"]
X = data.drop(columns=["Label1"])

# Fit a MinMaxScaler on all features and rebuild a DataFrame with the original column names.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# Copy the scaled features and re-attach the target under the name "Label".
new_data = X_scaled_df.assign(Label=y.values)

In [None]:
# Write the scaled feature table (including its "Label" column) to data2.csv without the row index.
new_data.to_csv("data2.csv", index=False)

#### Trích xuất label = 0

In [None]:
import pandas as pd
# Load the cleaned table from disk.
# NOTE(review): "New_binary_label_data_cleaned.csv" is written by the last cell of this
# notebook, so that cell must have been run before this one on a fresh checkout.
data = pd.read_csv("New_binary_label_data_cleaned.csv")

In [None]:
# Drop the columns labelled "395" through "399".
# Re-running this cell on the already-reduced frame raises KeyError because the columns are gone.
data = data.drop(columns=[str(number) for number in range(395, 400)])

In [None]:
# Dimensions of `data` as (rows, columns).
data.shape

In [None]:
# Write the reduced table without the row index, like every other export in this
# notebook; otherwise the index is stored as an extra unnamed first column.
data.to_csv("359_features.csv", index=False)

In [None]:
import matplotlib.pyplot as plt

# Number of rows per distinct value in the "Label" column of `data`.
label_counts = data["Label"].value_counts()

# Draw the counts as a bar chart on an explicit 10x6 figure, with x tick labels rotated by 45 degrees.
fig, ax = plt.subplots(figsize=(10, 6))
label_counts.plot(kind="bar", ax=ax, rot=45)

# Title, axis labels and a horizontal grid.
ax.set_title("Counts of Unique Labels in 'Label' Column")
ax.set_xlabel("Labels")
ax.set_ylabel("Counts")
ax.grid(axis="y")

# Display the plot
plt.show()

In [None]:
# Number of columns in `data`.
print(data.shape[1])

In [None]:
# Columns holding exactly one distinct non-missing value.
unique_counts = data.nunique()
columns_to_drop = unique_counts[unique_counts == 1].index.tolist()

# Remove them and report which columns were dropped.
data_cleaned = data.drop(columns=columns_to_drop)
print(f"Các cột đã bị loại bỏ: {columns_to_drop}")

# Persist the reduced table without the row index.
data_cleaned.to_csv("New_binary_label_data_cleaned.csv", index=False)