In [1]:
import glob
import pandas as pd

csv_files = sorted(glob.glob("../../data/*.csv"))[:3]  # use first 3 CSVs to keep size small
print("Using files:", csv_files)


# Reads each CSV file into a pandas DataFrame using list comprehension.
# For each file 'f' in csv_files, 'pd.read_csv(f)' loads it into memory as a DataFrame.
# 'dfs' will be a list containing 3 DataFrames.
dfs = [pd.read_csv(f) for f in csv_files]

# Concatenates all DataFrames in 'dfs' into one large DataFrame 'df'.
# 'ignore_index=True' resets the row index so it runs sequentially from 0 to total_rows-1.
df = pd.concat(dfs, ignore_index=True)

# Prints the number of rows and columns in the combined DataFrame.
# Example output: "Rows, Columns: (600000, 85)"
print("Rows, Columns:", df.shape)

# Displays concise information about the DataFrame.
# Output includes: 
#   - Number of non-null values per column
#   - Data types (int64, float64, object, etc.)
#   - Memory usage
df.info()            # shows dtypes and non-null counts

# Prints the first 50 column names to quickly inspect the dataset structure.
# Example: Index(['Destination Port', 'Flow Duration', 'Total Fwd Packets', ...], dtype='object')
print(df.columns[:50])  # show first 50 column names

# Checks if a column named 'Label' exists (since it's common in IDS datasets).
# If it exists, prints all unique class labels (e.g., ['BENIGN', 'DoS attack', 'PortScan', ...]).
# If not found, prints "No Label column".
print("Label values:", df['Label'].unique() if 'Label' in df.columns else 'No Label column')


Using files: ['../../data\\Friday-02-03-2018_TrafficForML_CICFlowMeter.csv', '../../data\\Friday-16-02-2018_TrafficForML_CICFlowMeter.csv', '../../data\\Friday-23-02-2018_TrafficForML_CICFlowMeter.csv']


  dfs = [pd.read_csv(f) for f in csv_files]


Rows, Columns: (3145725, 80)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3145725 entries, 0 to 3145724
Data columns (total 80 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   Dst Port           object
 1   Protocol           object
 2   Timestamp          object
 3   Flow Duration      object
 4   Tot Fwd Pkts       object
 5   Tot Bwd Pkts       object
 6   TotLen Fwd Pkts    object
 7   TotLen Bwd Pkts    object
 8   Fwd Pkt Len Max    object
 9   Fwd Pkt Len Min    object
 10  Fwd Pkt Len Mean   object
 11  Fwd Pkt Len Std    object
 12  Bwd Pkt Len Max    object
 13  Bwd Pkt Len Min    object
 14  Bwd Pkt Len Mean   object
 15  Bwd Pkt Len Std    object
 16  Flow Byts/s        object
 17  Flow Pkts/s        object
 18  Flow IAT Mean      object
 19  Flow IAT Std       object
 20  Flow IAT Max       object
 21  Flow IAT Min       object
 22  Fwd IAT Tot        object
 23  Fwd IAT Mean       object
 24  Fwd IAT Std        object
 25  Fwd IAT Max     