# Imports

In [2]:
import pandas as pd
import os

# Data preprocessing

Dataset used: [CIC IoT Dataset 2023](https://unb.ca/cic/datasets/iotdataset-2023.html). The notebook expects the dataset's CSV files to be available locally under `dataset/CICIoT2023/`.

In [3]:
def combine_csv_files(folder_path):
    """
    Combine every CSV file found in a folder into a single DataFrame.

    Files are read in sorted filename order so the row order of the result
    is reproducible: ``os.listdir`` returns entries in arbitrary order.

    Args:
    folder_path (str): Path of the folder that contains the CSV files.

    Returns:
    pandas.DataFrame: The rows of all CSV files stacked together, with a
    fresh 0..n-1 index.

    Raises:
    ValueError: If the folder contains no ``.csv`` files.
    """

    # Collect the CSV paths first, in a deterministic (sorted) order.
    # Only regular files are kept, so a directory that merely happens to be
    # named "something.csv" is not handed to read_csv.
    csv_paths = []
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        if filename.endswith('.csv') and os.path.isfile(file_path):
            csv_paths.append(file_path)

    # pd.concat on an empty list raises an opaque "No objects to concatenate";
    # fail with the same exception type but a message that names the folder.
    if not csv_paths:
        raise ValueError(f"No CSV files found in folder: {folder_path!r}")

    # Read each file and stack them; ignore_index discards the per-file
    # indices so the combined frame gets one continuous index.
    all_dataframes = [pd.read_csv(file_path) for file_path in csv_paths]
    combined_dataframe = pd.concat(all_dataframes, ignore_index=True)

    return combined_dataframe

# Load every CSV file of the dataset folder into one DataFrame.
df = combine_csv_files(folder_path='dataset/CICIoT2023')

# Exploratory Data Analysis

In [4]:
# (number of rows, number of columns) of the combined DataFrame.
df.shape

(46686579, 47)

In [7]:
# Column labels of the combined DataFrame.
df.columns

Index(['flow_duration', 'Header_Length', 'Protocol Type', 'Duration', 'Rate',
       'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
       'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
       'ece_flag_number', 'cwr_flag_number', 'ack_count', 'syn_count',
       'fin_count', 'urg_count', 'rst_count', 'HTTP', 'HTTPS', 'DNS', 'Telnet',
       'SMTP', 'SSH', 'IRC', 'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC',
       'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number',
       'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight', 'label'],
      dtype='object')

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,AVG,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight
count,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,...,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0
mean,5.765449,76705.96,9.06569,66.35072,9064.057,9064.057,5.463949e-06,0.08657207,0.2073353,0.09050473,...,124.6688,33.32481,124.6916,83182530.0,9.498489,13.12182,47.09498,30724.36,0.0964376,141.5124
std,285.0342,461331.7,8.945533,14.01919,99562.49,99562.49,0.007250766,0.281207,0.4053978,0.2869035,...,240.9915,160.3357,241.5493,17047350.0,0.8191532,8.628579,226.7696,323710.7,0.233001,21.06831
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,42.0,0.0,42.0,0.0,1.0,9.165151,0.0,0.0,0.0,1.0
25%,0.0,54.0,6.0,64.0,2.091856,2.091856,0.0,0.0,0.0,0.0,...,50.0,0.0,50.0,83071570.0,9.5,10.0,0.0,0.0,0.0,141.55
50%,0.0,54.0,6.0,64.0,15.75423,15.75423,0.0,0.0,0.0,0.0,...,54.0,0.0,54.0,83124520.0,9.5,10.3923,0.0,0.0,0.0,141.55
75%,0.1051381,280.555,14.33,64.0,117.3848,117.3848,0.0,0.0,0.0,0.0,...,54.04973,0.3719096,54.06,83343910.0,9.5,10.39671,0.5059213,1.344216,0.08,141.55
max,394357.2,9907148.0,47.0,255.0,8388608.0,8388608.0,29.71522,1.0,1.0,1.0,...,13583.0,12385.24,13583.0,167639400.0,15.0,164.8211,17551.27,154902200.0,1.0,244.6


In [10]:
# Preview the first rows of the combined DataFrame.
df.head()

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,0.037456,15099.0,17.0,64.0,10001.102371,10001.102371,0.0,0.0,0.0,0.0,...,0.0,50.0,83102150.0,9.5,10.0,0.0,0.0,0.0,141.55,DDoS-UDP_Flood
1,0.0,54.0,6.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,54.0,83331770.0,9.5,10.392305,0.0,0.0,0.0,141.55,DDoS-PSHACK_Flood
2,0.010346,9662.5,17.0,64.0,21380.056228,21380.056228,0.0,0.0,0.0,0.0,...,0.0,50.0,83098790.0,9.5,10.0,0.0,0.0,0.0,141.55,DDoS-UDP_Flood
3,0.0,54.0,6.0,64.0,241.333973,241.333973,0.0,0.0,0.0,0.0,...,0.0,54.0,82951120.0,9.5,10.392305,0.0,0.0,0.0,141.55,DoS-TCP_Flood
4,0.195109,95.58,6.0,64.0,6.762174,6.762174,0.0,0.0,1.0,0.0,...,0.0,54.0,83365400.0,9.5,10.392305,0.0,0.0,0.0,141.55,DDoS-SynonymousIP_Flood
