In [2]:
import glob
import pandas as pd
import numpy as np

# Only the first MAX_FILES CSVs (in sorted filename order) are loaded, to keep size small.
DATA_GLOB = "../../data/*.csv"
MAX_FILES = 3

csv_files = sorted(glob.glob(DATA_GLOB))[:MAX_FILES]
print("Using files:", csv_files)

# Fail early with an actionable message: pd.concat() on an empty list would
# otherwise raise a generic "No objects to concatenate" ValueError.
if not csv_files:
    raise FileNotFoundError(f"No CSV files matched {DATA_GLOB!r}")

# Load every selected file into its own DataFrame (one list entry per file).
# NOTE(review): the recorded df.info() output lists every column as 'object'
# dtype, so numeric conversion presumably has to happen in a later step — confirm.
dfs = [pd.read_csv(f) for f in csv_files]

# Stack the per-file frames vertically; ignore_index=True renumbers the rows
# sequentially from 0 instead of keeping each file's own row index.
df = pd.concat(dfs, ignore_index=True)

# Report the (rows, columns) shape of the combined frame.
print("Rows, Columns:", df.shape)

print("\n\nCheckpoint 1")

# Summary of the combined frame: column dtypes, non-null counts (when pandas
# chooses to show them) and memory usage.
df.info()

print("\n\nCheckpoint 2")

# List every column name as a plain Python list, before any columns are dropped.
print("\nAll Columns Before Dropping the irrelevent columns:\n", df.columns.tolist())



Using files: ['../../data\\Friday-02-03-2018_TrafficForML_CICFlowMeter.csv', '../../data\\Friday-16-02-2018_TrafficForML_CICFlowMeter.csv', '../../data\\Friday-23-02-2018_TrafficForML_CICFlowMeter.csv']


  dfs = [pd.read_csv(f) for f in csv_files]


Rows, Columns: (3145725, 80)


Checkpoint 1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3145725 entries, 0 to 3145724
Data columns (total 80 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   Dst Port           object
 1   Protocol           object
 2   Timestamp          object
 3   Flow Duration      object
 4   Tot Fwd Pkts       object
 5   Tot Bwd Pkts       object
 6   TotLen Fwd Pkts    object
 7   TotLen Bwd Pkts    object
 8   Fwd Pkt Len Max    object
 9   Fwd Pkt Len Min    object
 10  Fwd Pkt Len Mean   object
 11  Fwd Pkt Len Std    object
 12  Bwd Pkt Len Max    object
 13  Bwd Pkt Len Min    object
 14  Bwd Pkt Len Mean   object
 15  Bwd Pkt Len Std    object
 16  Flow Byts/s        object
 17  Flow Pkts/s        object
 18  Flow IAT Mean      object
 19  Flow IAT Std       object
 20  Flow IAT Max       object
 21  Flow IAT Min       object
 22  Fwd IAT Tot        object
 23  Fwd IAT Mean       object
 24  Fwd IAT Std        object
 25  F

In [None]:
######### Remove irrelevant columns #########

# Columns to discard, grouped by kind. Order matters only for the printed
# "Dropped Columns" list, which follows this order.
to_drop = [
    # identifiers / timing
    'Timestamp', 'Dst Port', 'Protocol', 'Flow Duration',
    # header lengths
    'Fwd Header Len', 'Bwd Header Len',
    # per-direction PSH / URG flags
    'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags',
    # CWE / ECE flag counters
    'CWE Flag Count', 'ECE Flag Cnt',
    # bulk-transfer averages
    'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg',
    'Bwd Byts/b Avg', 'Bwd Pkts/b Avg', 'Bwd Blk Rate Avg',
]

# Keep only the candidates that actually exist in the frame, so re-running the
# cell (or loading data without some of these columns) does not fail.
drop_list = [name for name in to_drop if name in df.columns]

# Remove them from df in place.
df.drop(labels=drop_list, axis="columns", inplace=True, errors="ignore")
print("\n\nDropped Columns:", drop_list)

print("\n\nCheckpoint 1")

# Show the surviving column names as a plain Python list.
print("\nAll Columns After Dropping the irrelevant columns:\n", df.columns.tolist())




Dropped Columns: ['Timestamp', 'Dst Port', 'Protocol', 'Flow Duration', 'Fwd Header Len', 'Bwd Header Len', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'CWE Flag Count', 'ECE Flag Cnt', 'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Bwd Byts/b Avg', 'Bwd Pkts/b Avg', 'Bwd Blk Rate Avg']


Checkpoint 1

All Columns After Dropping the irrelevant columns:
 ['Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt', 

In [4]:
######### Handle missing values #########

# 1 Drop rows with too many missing values (>30%)
threshold = 0.30  # maximum fraction of missing cells tolerated per row
missing_fraction_per_row = df.isnull().mean(axis=1)

# Count how many rows exceed the threshold before removing them.
rows_to_drop = (missing_fraction_per_row > threshold).sum()
print(f"Dropping {rows_to_drop} rows with more than {threshold*100}% missing values...")

# Keep rows whose missing fraction is <= threshold and renumber the index.
df = df.loc[missing_fraction_per_row <= threshold].reset_index(drop=True)


# 2 Replace NaNs in numeric-dtype columns with that column's median.
# Columns that are not numeric dtype are left untouched; any NaNs they hold
# show up in the "Remaining NaNs" report below.
num_cols = df.select_dtypes(include=[np.number]).columns

for col in num_cols:
    if df[col].isnull().any():  # Only fill if NaNs exist
        median_value = df[col].median()  # You can change to df[col].mean() if desired
        # Assign the result back instead of calling fillna(inplace=True) on
        # df[col]: in-place methods on a selected column are chained assignment,
        # which pandas deprecates and which does not update df under Copy-on-Write.
        df[col] = df[col].fillna(median_value)

print("Missing values handled successfully!")
print("Remaining NaNs per column:")
# Compute the per-column NaN counts once and show only the non-zero entries.
remaining_nans = df.isnull().sum()
print(remaining_nans[remaining_nans > 0])


Dropping 0 rows with more than 30.0% missing values...
Missing values handled successfully!
Remaining NaNs per column:
Flow Byts/s    6312
dtype: int64


In [5]:
# Per-column count of missing (NaN/None) cells across the whole frame.
print(df.isna().sum())

Tot Fwd Pkts       0
Tot Bwd Pkts       0
TotLen Fwd Pkts    0
TotLen Bwd Pkts    0
Fwd Pkt Len Max    0
                  ..
Idle Mean          0
Idle Std           0
Idle Max           0
Idle Min           0
Label              0
Length: 62, dtype: int64


In [None]:
##### Handle infinite values #####

# 1 Identify numeric columns only
num_cols = df.select_dtypes(include=[np.number]).columns

# 2 Replace +inf with the column's largest finite value and -inf with its
#   smallest finite value. Columns with no finite values at all are skipped.
for col in num_cols:
    column = df[col]
    finite_vals = column[np.isfinite(column)]  # excludes NaN as well as +/-inf
    if not finite_vals.empty:
        # Assign the result back instead of calling replace(inplace=True) on
        # df[col]: in-place methods on a selected column are chained assignment,
        # which pandas deprecates and which does not update df under Copy-on-Write.
        df[col] = (
            column
            .replace(np.inf, finite_vals.max())
            .replace(-np.inf, finite_vals.min())
        )

# 3 Fill NaNs (in numeric columns only) with each column's median
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# 4 Verify remaining NaNs (non-numeric columns are not filled by this cell)
print("Remaining NaNs per column (should be zero):")
print(df.isnull().sum())


Remaining NaNs per column (should be zero):
Tot Fwd Pkts       0
Tot Bwd Pkts       0
TotLen Fwd Pkts    0
TotLen Bwd Pkts    0
Fwd Pkt Len Max    0
                  ..
Idle Mean          0
Idle Std           0
Idle Max           0
Idle Min           0
Label              0
Length: 62, dtype: int64
