 ## installing libraries and importing

In [None]:
!pip install pandas numpy scikit-learn tensorflow matplotlib seaborn



In [None]:
!sudo dpkg --configure -a
!sudo apt-get install -y tshark

Setting up wireshark-common (3.6.2-2) ...
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
Setting up tshark (3.6.2-2) ...
Processing triggers for hicolor-icon-theme (0.17-2) ...
Processing triggers for libc-bin (2.35-0ubuntu3.8) ...
/sbin/ldconfig.real: /usr/local/lib/libtcm.so.1 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libur_adapter_level_zero.so.0 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc_proxy.so.2 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc.so.2 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_0.so.3 is not a symbolic link

/sbin/ldconfig.rea

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau


## Downloading dataset and unzipping it

In [None]:

!wget -c -O scada_dataset.tar "https://g-83fdd0.1beed.03c0.data.globus.org/cartuids/14271_2329f583703f14b8991e1b9c259b8a4b.tar"

# check the downloaded file size
!ls -lh scada_dataset.tar

#  Extract the TAR archive


!mkdir -p /content/scada_raw
!tar -xvf scada_dataset.tar -C /content/scada_raw || true

# list what's inside
!find /content/scada_raw -maxdepth 3 -type f


--2025-11-09 14:51:37--  https://g-83fdd0.1beed.03c0.data.globus.org/cartuids/14271_2329f583703f14b8991e1b9c259b8a4b.tar
Resolving g-83fdd0.1beed.03c0.data.globus.org (g-83fdd0.1beed.03c0.data.globus.org)... 192.101.102.34
Connecting to g-83fdd0.1beed.03c0.data.globus.org (g-83fdd0.1beed.03c0.data.globus.org)|192.101.102.34|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/x-tar]
Saving to: ‚Äòscada_dataset.tar‚Äô

scada_dataset.tar       [        <=>         ]   7.41G  67.8MB/s    in 1m 41s  

2025-11-09 14:53:19 (75.4 MB/s) - ‚Äòscada_dataset.tar‚Äô saved [7957268480]

-rw-r--r-- 1 root root 7.5G Nov  9 14:53 scada_dataset.tar
14271_2329f583703f14b8991e1b9c259b8a4b.tar/
14271_2329f583703f14b8991e1b9c259b8a4b.tar/Data/
14271_2329f583703f14b8991e1b9c259b8a4b.tar/Data/Physical Data/
14271_2329f583703f14b8991e1b9c259b8a4b.tar/Data/Physical Data/Baseline.zip
14271_2329f583703f14b8991e1b9c259b8a4b.tar/Data/Physical Data/Test3.zip
14271_2329f

In [None]:
# Unzip Data.zip
import os
import zipfile

# Archive located under the tar extraction directory, and the folder it is unpacked into.
data_zip_path = "/content/scada_raw/14271_2329f583703f14b8991e1b9c259b8a4b.tar/Data.zip"
extract_dir = "/content/scada_data/"

# Make sure the target folder exists, then unpack every archive member into it.
os.makedirs(extract_dir, exist_ok=True)
with zipfile.ZipFile(data_zip_path, mode='r') as archive:
    archive.extractall(path=extract_dir)

print(" Data.zip extracted to:", extract_dir)


 Data.zip extracted to: /content/scada_data/


## Converting the pcaps (native wireshark format) to csvs

In [None]:
import os, subprocess

# Folder holding the long-duration captures, and the folder the per-capture CSVs are written to.
pcap_dir = "/content/scada_data/Data/PCAPs/Long_duration_PCAPs"
csv_dir = "/content/scada_csv"
os.makedirs(csv_dir, exist_ok=True)

# Tshark fields based on the paper.
# Each entry is passed to tshark as `-e <field>`; the order here is the column order of the CSV.
fields = [
    "frame.number",
    "frame.time_relative",
    "ip.src",
    "ip.dst",
    "_ws.col.Protocol",
    "frame.len",
    "_ws.col.Info"
]

# Capture files (relative to pcap_dir) that get converted; files missing on disk are skipped by the loop below.
selected_pcaps = [
    "day1_02_10_20.pcap",
    "day3_03_16_20.pcap",
    "day4_03_17_20.pcap",
    "day5_03_18_20.pcap",
    "day6_03_19_20.pcap"
]

# helper function run a shell command and capture output
def run_cmd(cmd):
    """Execute ``cmd`` through the shell and return ``(stdout, stderr)``.

    Both streams are captured as text and have leading/trailing whitespace
    removed. The exit status is not inspected.
    """
    proc = subprocess.run(
        cmd,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    captured_streams = (proc.stdout, proc.stderr)
    return tuple(stream.strip() for stream in captured_streams)

# --- One-off sanity check of the tshark field registry ------------------------
# `tshark -G fields` does not depend on the capture, so it is queried once here
# instead of once per file. Field rows look like
#   F<TAB>name<TAB>abbreviation<TAB>type<TAB>...
# and the abbreviation (3rd column) is compared as a plain string. Piping the
# dump through `grep 'ip.'` treated '.' as a regex wildcard and therefore listed
# unrelated entries such as "ieee17221.descriptor_type".
registry_out, _ = run_cmd("tshark -G fields")
known_fields = set()
for registry_row in registry_out.splitlines():
    registry_cols = registry_row.split("\t")
    if len(registry_cols) > 2 and registry_cols[0] == "F":
        known_fields.add(registry_cols[2])

print("\nüîé Checking field existence:")
print("   Example IP-related fields:",
      sorted(name for name in known_fields if name.startswith("ip."))[:3])
print("   Example Frame-related fields:",
      sorted(name for name in known_fields if name.startswith("frame."))[:3])
# NOTE: the recorded run reported False here although `-e _ws.col.Protocol`
# extracted data fine, so absence from the registry dump does not mean the
# field is unusable. The 10-packet preview below is the authoritative check:
# tshark reports invalid `-e` fields on stderr.
print("   _ws.col.Protocol found:", "_ws.col.Protocol" in known_fields)

# `-e <field>` arguments shared by the preview and the full export.
field_args = [arg for field in fields for arg in ("-e", field)]

for fname in selected_pcaps:
    input_path = os.path.join(pcap_dir, fname)
    output_path = os.path.join(csv_dir, fname.replace(".pcap", ".csv"))

    print(f"\n Inspecting {fname} ...")

    # 1) Skip captures that are not on disk
    if not os.path.exists(input_path):
        print(f"File not found: {input_path}")
        continue

    # 2) Protocol-hierarchy summary. The report prints lowercase "frames:<n>",
    #    so the filter must not look for a capitalised "Frames". Arguments are
    #    passed as a list (no shell) so paths never need quoting.
    phs = subprocess.run(
        ["tshark", "-r", input_path, "-q", "-z", "io,phs"],
        capture_output=True, text=True,
    )
    summary_lines = [l for l in phs.stdout.splitlines() if "frames:" in l.lower()]
    if summary_lines:
        print(" Summary of I/O Phases:")
        for l in summary_lines[:10]:
            print("   ", l)
    else:
        print("Could not summarize PCAP structure.")

    # Export command. `-E quote=d` wraps every value in double quotes so that
    # commas inside the Info column (e.g. "[FIN, ACK]") no longer split a row
    # into extra fields; pandas.read_csv understands this quoting by default.
    cmd = [
        "tshark",
        "-r", input_path,
        "-T", "fields",
        "-E", "header=y",
        "-E", "separator=,",
        "-E", "quote=d",
        *field_args,
    ]

    # 3) Preview the first 10 packets to validate the field list
    preview = subprocess.run(cmd + ["-c", "10"], capture_output=True, text=True)
    preview_out, preview_err = preview.stdout.strip(), preview.stderr.strip()
    print("\nüßæ Preview of first 10 packets:\n", preview_out[:500])
    if preview_err:
        print(" Tshark warnings/errors:\n", preview_err)

    # 4) Convert the full file; only report success when tshark exited cleanly
    print(f"\nüöÄ Converting {fname} ‚Üí {os.path.basename(output_path)} ...")
    with open(output_path, "w") as f:
        result = subprocess.run(cmd, stdout=f, stderr=subprocess.PIPE, text=True)
    if result.returncode != 0:
        # The (possibly partial) CSV is left on disk for inspection.
        print(f" tshark failed for {fname} (exit code {result.returncode}):\n",
              result.stderr.strip())
        continue
    print(f" Done: {output_path}\n")



 Inspecting day1_02_10_20.pcap ...
Could not summarize PCAP structure.

üîé Checking field existence:
   Example IP-related fields: ['F\tDescriptor Type\tieee17221.descriptor_type\tFT_UINT16\tieee17221\tBASE_HEX\t0x0\t', 'F\tDescriptor Index\tieee17221.descriptor_index\tFT_UINT16\tieee17221\tBASE_HEX\t0x0\t', 'F\tLocalized Description\tieee17221.configuration_name_string\tFT_UINT16\tieee17221\tBASE_DEC\t0x0\t']
   Example Frame-related fields: ['F\tFrames TX\tieee17221.frames_tx\tFT_UINT32\tieee17221\tBASE_DEC\t0x0\t', 'F\tFrames RX\tieee17221.frames_rx\tFT_UINT32\tieee17221\tBASE_DEC\t0x0\t', 'F\tIncorrect frame dependent field value, shall be 0\tiec61883.6_incorrect_cip_fdf\tFT_NONE\tiec61883\t\t0x0\t']
   _ws.col.Protocol found: False

üßæ Preview of first 10 packets:
 frame.number,frame.time_relative,ip.src,ip.dst,_ws.col.Protocol,frame.len,_ws.col.Info
1,0.000000000,172.17.0.60,172.17.0.22,TCP,66,1031 ‚Üí 20000 [ACK] Seq=1 Ack=1 Win=16367 Len=0
2,0.000320000,172.17.0.60,172.17.

In [None]:
import os, subprocess

# Input captures and the folder that receives one "<capture>_protocols.csv" per capture.
pcap_dir = "/content/scada_data/Data/PCAPs/Long_duration_PCAPs"
proto_dir = "/content/protocol_csvs"
os.makedirs(proto_dir, exist_ok=True)

selected_pcaps = [
    "day1_02_10_20.pcap",
    "day3_03_16_20.pcap",
    "day4_03_17_20.pcap",
    "day5_03_18_20.pcap",
    "day6_03_19_20.pcap"
]

for fname in selected_pcaps:
    input_path = os.path.join(pcap_dir, fname)
    output_path = os.path.join(proto_dir, fname.replace(".pcap", "_protocols.csv"))

    # Same guard as the packet-CSV conversion: do not create an empty CSV for a missing capture.
    if not os.path.exists(input_path):
        print(f"File not found: {input_path}")
        continue

    print(f"Extracting protocols from {fname} ...")
    # frame.protocols is the colon-separated dissector stack of each frame (e.g. "eth:ip:tcp").
    cmd = [
        "tshark",
        "-r", input_path,
        "-T", "fields",
        "-E", "header=y",
        "-E", "separator=,",
        "-e", "frame.number",
        "-e", "frame.protocols"
    ]
    with open(output_path, "w") as f:
        result = subprocess.run(cmd, stdout=f, stderr=subprocess.PIPE, text=True)
    # Only claim success when tshark exited cleanly; otherwise show why it failed.
    if result.returncode != 0:
        print(f" tshark failed for {fname} (exit code {result.returncode}):\n",
              result.stderr.strip())
        continue
    print(f" Saved protocol CSV: {output_path}\n")


Extracting protocols from day1_02_10_20.pcap ...
 Saved protocol CSV: /content/protocol_csvs/day1_02_10_20_protocols.csv

Extracting protocols from day3_03_16_20.pcap ...
 Saved protocol CSV: /content/protocol_csvs/day3_03_16_20_protocols.csv

Extracting protocols from day4_03_17_20.pcap ...
 Saved protocol CSV: /content/protocol_csvs/day4_03_17_20_protocols.csv

Extracting protocols from day5_03_18_20.pcap ...
 Saved protocol CSV: /content/protocol_csvs/day5_03_18_20_protocols.csv

Extracting protocols from day6_03_19_20.pcap ...
 Saved protocol CSV: /content/protocol_csvs/day6_03_19_20_protocols.csv



In [None]:
import pandas as pd

def robust_read_csv(path):
    """
    Read a slightly malformed CSV (e.g. unquoted commas inside the Info column).

    The file is first parsed with pandas' python engine, skipping rows that
    have more fields than the header. If pandas cannot parse the file at all,
    a manual comma split is used instead: surplus commas are kept inside the
    last column and short rows are padded with None.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        Typed columns on the pandas path, string columns on the fallback path,
        and an empty DataFrame for an empty file.
    """
    try:
        # Only `on_bad_lines` is passed. Combining it with the legacy
        # `error_bad_lines` / `warn_bad_lines` switches makes read_csv raise
        # (they conflict in pandas 1.3-1.5 and no longer exist in pandas 2),
        # which forced every call into the lossy fallback below.
        return pd.read_csv(
            path,
            engine="python",        # slower but tolerant of ragged lines
            on_bad_lines="skip",    # drop rows with too many fields
            quotechar='"',          # honour quoted fields
            sep=",",
        )
    except Exception as e:
        print(f"‚ö†Ô∏è Error reading {path}: {e}")

    # Fallback: manual re-split.
    with open(path, "r", errors="ignore") as f:
        lines = f.readlines()
    if not lines:
        return pd.DataFrame()

    header = lines[0].strip().split(",")
    width = len(header)
    rows = []
    for line in lines[1:]:
        line = line.strip()
        if not line:
            continue
        # maxsplit keeps any extra commas inside the final column instead of
        # silently cutting its text off at the first one.
        parts = line.split(",", width - 1)
        rows.append(parts + [None] * (width - len(parts)))
    return pd.DataFrame(rows, columns=header)


In [None]:

# Python packages
!pip install -q scikit-learn tensorflow pandas matplotlib seaborn networkx python-louvain tqdm ujson


[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/57.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m57.4/57.4 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h

## Reload the converted CSVs from Google Drive (they were backed up there after conversion)

In [None]:
import os
import glob
import pandas as pd
from zipfile import ZipFile

# ----- Unzip folder -----
# Backup archive on Google Drive and the local folder it is unpacked into.
zip_path = '/content/drive/MyDrive/scada_csv_backup.zip'
extract_dir = '/content/scada_csv_extracted'
os.makedirs(extract_dir, exist_ok=True)
with ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_dir)

# ----- Output dir -----
OUT_DIR = '/content/scada_analysis'
os.makedirs(OUT_DIR, exist_ok=True)

# ----- List CSVs -----
csv_pattern = os.path.join(extract_dir, '*.csv')
csvs = sorted(glob.glob(csv_pattern))
print("Found CSVs:", csvs)

# ----- Read CSVs (only first 7 columns) -----
for file in csvs:
    try:
        # Columns are selected by position (0..6), not by name
        df = pd.read_csv(file, engine='python', usecols=range(7))
        n_rows, n_cols = df.shape

        print(f"\nFile: {file}")
        print(f"Rows: {n_rows}, Columns: {n_cols}")
        print(df.head(3))

        # Write the 7-column copy under OUT_DIR with the same file name
        out_file = os.path.join(OUT_DIR, os.path.basename(file))
        df.to_csv(out_file, index=False)

    except Exception as e:
        print(f"Error reading {file}: {e}")


Found CSVs: ['/content/scada_csv_extracted/day1_02_10_20.csv', '/content/scada_csv_extracted/day3_03_16_20.csv', '/content/scada_csv_extracted/day4_03_17_20.csv', '/content/scada_csv_extracted/day5_03_18_20.csv', '/content/scada_csv_extracted/day6_03_19_20.csv']

File: /content/scada_csv_extracted/day1_02_10_20.csv
Rows: 5522384, Columns: 7
   frame.number  frame.time_relative       ip.src       ip.dst  \
0             1              0.00000  172.17.0.60  172.17.0.22   
1             2              0.00032  172.17.0.60  172.17.0.21   
2             3              0.00086  172.17.0.38  172.17.0.33   

  _ws.col.Protocol frame.len                                    _ws.col.Info  
0              TCP        66  1031 ‚Üí 20000 [ACK] Seq=1 Ack=1 Win=16367 Len=0  
1              TCP        66  1030 ‚Üí 20000 [ACK] Seq=1 Ack=1 Win=16384 Len=0  
2              TCP        66                                502 ‚Üí 58980 [FIN  

File: /content/scada_csv_extracted/day3_03_16_20.csv
Rows: 2950364, C

## Standardize columns and extract a canonical protocol

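We create canonical columns: `frame.number` (nullable int), `time_rel` (float), `ip.src`, `ip.dst`, `tcp.srcport` / `tcp.dstport` (only when a port column is present in the input), `protocol` (last layer of the protocol string, e.g. modbus), `frame.len`, and `info`.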

In [None]:
import os
import glob
import pandas as pd
import numpy as np
from zipfile import ZipFile
from pathlib import Path

# Analysis root and the sub-folder that receives the "<name>_std.csv" files.
OUT_DIR = '/content/scada_analysis'
std_dir = os.path.join(OUT_DIR, "standardized")
os.makedirs(std_dir, exist_ok=True)

# ------------------- List CSV files -------------------
# NOTE: `extract_dir` is not defined in this cell; it must already exist from an earlier cell.
csvs = sorted(glob.glob(os.path.join(extract_dir, '*.csv')))
print("Found CSVs:", csvs)

# ------------------- Canonicalization function -------------------
def _rename_first(df, matches, target):
    """Rename the first column whose lower-cased, stripped name satisfies ``matches`` to ``target``.

    Nothing is renamed when ``target`` already exists, so two source columns
    can never end up sharing one name.
    """
    if target in df.columns:
        return df
    for col in df.columns:
        if matches(str(col).lower().strip()):
            return df.rename(columns={col: target})
    return df


def _to_nullable_int(values):
    """Convert ``values`` to the nullable ``Int64`` dtype.

    Anything that is not a finite whole number inside the int64 range
    (text, fractions, inf, NaN) becomes ``<NA>`` instead of making the cast
    raise. Values pass through float64, so integers above 2**53 may lose
    precision.
    """
    nums = pd.to_numeric(values, errors='coerce').astype('float64')
    is_whole = np.isfinite(nums) & (nums == np.floor(nums)) & (nums.abs() < 2.0 ** 63)
    return nums.where(is_whole).astype('Int64')


def canonicalize_df(df):
    """Bring a tshark-exported frame to the canonical column layout.

    Resulting columns (besides whatever else the input carried):

    * ``frame.number`` - nullable ``Int64``; all ``<NA>`` if no such column exists.
    * ``time_rel``     - numeric copy of the first column whose name contains
      "time"; ``pd.NA`` if there is none.
    * ``ip.src`` / ``ip.dst`` - renamed from ``ip.src(.addr)`` / ``src`` etc.
    * ``frame.len``    - numeric, unparsable values become NaN.
    * ``info``         - renamed from ``_ws.col.Info`` / ``Info``.
    * ``protocol``     - last ``:``-separated token of the protocol column;
      missing or empty values stay NaN.
    * ``tcp.srcport`` / ``tcp.dstport`` - nullable ``Int64`` when a port column
      is present. A non-TCP ``*srcport`` / ``*dstport`` column is used only if
      no TCP one exists.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. It may be modified in place, so always use the return value.

    Returns
    -------
    pandas.DataFrame
    """
    # frame.number: a plain `.astype('Int64')` raises as soon as one coerced
    # value is not an exact integer, which aborts the whole file.
    df = _rename_first(df, lambda name: 'frame.number' in name, 'frame.number')
    if 'frame.number' in df.columns:
        df['frame.number'] = _to_nullable_int(df['frame.number'])
    else:
        df['frame.number'] = pd.array([pd.NA] * len(df), dtype='Int64')

    # time column
    time_cols = [c for c in df.columns if 'time' in str(c).lower()]
    if time_cols:
        df['time_rel'] = pd.to_numeric(df[time_cols[0]], errors='coerce')
    else:
        df['time_rel'] = pd.NA

    # IP columns
    df = _rename_first(df, lambda name: name in ('ip.src', 'ip.src.addr', 'src'), 'ip.src')
    df = _rename_first(df, lambda name: name in ('ip.dst', 'ip.dst.addr', 'dst'), 'ip.dst')

    # frame.len
    df = _rename_first(df, lambda name: 'len' in name and 'frame' in name, 'frame.len')
    if 'frame.len' in df.columns:
        df['frame.len'] = pd.to_numeric(df['frame.len'], errors='coerce')
    else:
        df['frame.len'] = np.nan

    # info column
    df = _rename_first(df, lambda name: '_ws.col.info' in name or name == 'info', 'info')

    # protocol: keep the last layer. The NaN mask is taken from the raw column
    # because `astype(str)` would turn missing values into the text "nan".
    df = _rename_first(df, lambda name: 'protocol' in name, 'protocol')
    if 'protocol' in df.columns:
        raw_protocol = df['protocol']
        last_layer = raw_protocol.astype(str).str.split(':').str[-1]
        df['protocol'] = last_layer.where(raw_protocol.notna() & (last_layer != ''), np.nan)

    # TCP ports first, then any other *srcport / *dstport column as a stand-in
    df = _rename_first(df, lambda name: 'tcp.srcport' in name, 'tcp.srcport')
    df = _rename_first(df, lambda name: 'tcp.dstport' in name, 'tcp.dstport')
    df = _rename_first(df, lambda name: 'srcport' in name and 'tcp' not in name, 'tcp.srcport')
    df = _rename_first(df, lambda name: 'dstport' in name and 'tcp' not in name, 'tcp.dstport')

    # cast ports
    for p in ('tcp.srcport', 'tcp.dstport'):
        if p in df.columns:
            df[p] = _to_nullable_int(df[p])

    return df

# ------------------- 5. Read, canonicalize, save -------------------
# Files that could not be standardized are collected so that the closing
# message cannot claim success while some days are missing downstream.
failed_paths = []
for path in csvs:
    try:
        # Only read first 7 columns to avoid parser errors
        df = pd.read_csv(path, usecols=range(7), engine='python')
        df = canonicalize_df(df)

        out_file = os.path.join(std_dir, Path(path).stem + "_std.csv")
        df.to_csv(out_file, index=False)
        print(f"Standardized and saved: {out_file}")

    except Exception as e:
        failed_paths.append(path)
        print(f"Error reading {path}: {e}")

if failed_paths:
    print(f"\nWARNING: {len(failed_paths)} of {len(csvs)} CSVs were NOT standardized "
          f"and will be missing from later steps:")
    for path in failed_paths:
        print("   ", path)
    print("Standardized CSVs that did succeed are in:", std_dir)
else:
    print("\nAll standardized CSVs saved to:", std_dir)


Unzipped CSVs to: /content/scada_csv_extracted
Found CSVs: ['/content/scada_csv_extracted/day1_02_10_20.csv', '/content/scada_csv_extracted/day3_03_16_20.csv', '/content/scada_csv_extracted/day4_03_17_20.csv', '/content/scada_csv_extracted/day5_03_18_20.csv', '/content/scada_csv_extracted/day6_03_19_20.csv']
Standardized and saved: /content/scada_analysis/standardized/day1_02_10_20_std.csv
Standardized and saved: /content/scada_analysis/standardized/day3_03_16_20_std.csv
Error reading /content/scada_csv_extracted/day4_03_17_20.csv: cannot safely cast non-equivalent object to int64
Error reading /content/scada_csv_extracted/day5_03_18_20.csv: cannot safely cast non-equivalent object to int64
Standardized and saved: /content/scada_analysis/standardized/day6_03_19_20_std.csv

All standardized CSVs saved to: /content/scada_analysis/standardized


## FEATURE ENGINEERING:  Build protocol when missing (port mapping + info keyword)

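We synthesize `protocol_final` using these rules, in priority order:

1. existing `protocol` column → keep the last layer (lower-cased)

2. port mapping on the destination port: 502 → modbus; 20000 → dnp3; 2404 → iec104; 44818 → ethernetip (only applied when a `tcp.dstport` column exists)

3. keyword match in the `info` column (`_ws.col.Info`): modbus, dnp3, iec104, ethernetip

4. last layer of `frame.protocols`, if that column is present

5. fallback: `tcp` (the code does not distinguish `udp` or `unknown`)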

In [None]:
import glob, pandas as pd

# Well-known destination ports of industrial protocols.
port_map = {502:'modbus', 20000:'dnp3', 2404:'iec104', 44818:'ethernetip'}  # add as needed
std_files = sorted(glob.glob(std_dir + "/*_std.csv"))


def _last_layer(series):
    """Lower-cased last ':'-separated token of each value; missing/empty values stay NaN.

    The mask comes from the original column because ``astype(str)`` turns NaN
    into the text "nan", which would otherwise count as a real protocol and
    block every later fallback.
    """
    tokens = series.astype(str).str.lower().str.split(':').str[-1]
    return tokens.where(series.notna() & (tokens != ''))


for f in std_files:
    df = pd.read_csv(f, low_memory=False)
    # All-missing placeholder aligned with df, used for any source that is unavailable.
    no_value = pd.Series(pd.NA, index=df.index, dtype='object')

    # derive from existing protocol if present
    proto = _last_layer(df['protocol']) if 'protocol' in df.columns else no_value

    # port-based (ports absent from port_map map to NaN)
    port_proto = df['tcp.dstport'].map(port_map) if 'tcp.dstport' in df.columns else no_value

    # info-based keyword match (NaN when no keyword occurs)
    if 'info' in df.columns:
        info_lower = df['info'].astype(str).str.lower()
        info_proto = info_lower.str.extract('(modbus|dnp3|iec104|ethernetip)', expand=False)
    else:
        info_proto = no_value

    # combine with priority: protocol > port_proto > info_proto > frame.protocols last token > fallback
    protocol_final = proto.fillna(port_proto).fillna(info_proto)
    # try falling back to frame.protocols if present
    if 'frame.protocols' in df.columns and protocol_final.isna().any():
        protocol_final = protocol_final.fillna(_last_layer(df['frame.protocols']))
    # Everything still unresolved is labelled 'tcp'. A further .fillna('unknown')
    # could never apply after this, so it was dropped.
    df['protocol_final'] = protocol_final.fillna('tcp')
    # save
    df.to_csv(f.replace("_std.csv","_std_proto.csv"), index=False)
print("protocol_final added to all standardized CSVs")


protocol_final added to all standardized CSVs


## Add class labels (Day 1 → Normal; Days 3–6 → Attack)

These days are categorised this way in the original paper.

In [None]:
# Capture-day stem -> class label.
label_map = {
    "day1_02_10_20": "Normal",
    "day3_03_16_20": "Attack",
    "day4_03_17_20": "Attack",
    "day5_03_18_20": "Attack",
    "day6_03_19_20": "Attack"
}

# Columns kept in the combined dataset, in output order.
keep_cols = ['frame.number','time_rel','ip.src','ip.dst','frame.len','info',
             'protocol_final','day_label','source_file']

proto_files = sorted(glob.glob(std_dir + "/*_std_proto.csv"))
dfs = []
loaded_days = set()
for f in proto_files:
    stem = Path(f).stem.replace("_std_proto", "")
    if stem not in label_map:
        print("Skipping", stem)
        continue
    df = pd.read_csv(f, low_memory=False)
    df['day_label'] = label_map[stem]
    df['source_file'] = stem
    dfs.append(df[keep_cols])
    loaded_days.add(stem)

# A day whose standardization failed earlier has no *_std_proto.csv and would
# otherwise vanish from the dataset without any message.
missing_days = sorted(set(label_map) - loaded_days)
if missing_days:
    print("WARNING: no standardized file found for:", missing_days)

if not dfs:
    raise ValueError(f"No *_std_proto.csv files for the labelled days were found in {std_dir}")

full = pd.concat(dfs, ignore_index=True)
full.to_csv(os.path.join(OUT_DIR,"scada_full_raw.csv"), index=False)
print("Combined dataset shape:", full.shape)


Combined dataset shape: (11024243, 9)


In [None]:
full.head()

Unnamed: 0,frame.number,time_rel,ip.src,ip.dst,frame.len,info,protocol_final,day_label,source_file
0,1,0.0,172.17.0.60,172.17.0.22,66.0,1031 ‚Üí 20000 [ACK] Seq=1 Ack=1 Win=16367 Len=0,tcp,Normal,day1_02_10_20
1,2,0.00032,172.17.0.60,172.17.0.21,66.0,1030 ‚Üí 20000 [ACK] Seq=1 Ack=1 Win=16384 Len=0,tcp,Normal,day1_02_10_20
2,3,0.00086,172.17.0.38,172.17.0.33,66.0,502 ‚Üí 58980 [FIN,tcp,Normal,day1_02_10_20
3,4,0.001152,172.17.0.33,172.17.0.38,66.0,58980 ‚Üí 502 [ACK] Seq=1 Ack=2 Win=8192 Len=0,tcp,Normal,day1_02_10_20
4,5,0.074918,172.17.0.32,172.17.0.38,66.0,65001 ‚Üí 502 [SYN] Seq=0 Win=4096 Len=0 MSS=1460,tcp,Normal,day1_02_10_20


The combined dataset is now saved as `scada_full_raw.csv` for further processing.