In [2]:
import pickle

In [1]:
import pandas as pd
import subprocess

# Vocabularies for the categorical capture fields. A value's integer code is
# its 1-based position in the matching list, so the order of the entries below
# determines the codes.
protocol_list = [
    "udp", "arp", "tcp", "igmp", "ospf", "sctp", "gre", "ggp", "ip",
    "ipnip", "st2", "argus", "chaos", "egp", "emcon", "nvp", "pup",
    "xnet", "mux", "dcn", "hmp", "prm", "trunk-1", "trunk-2",
    "xns-idp", "leaf-1", "leaf-2", "irtp", "rdp", "netblt", "mfe-nsp",
    "merit-inp", "3pc", "idpr", "ddp", "idpr-cmtp", "tp++", "ipv6",
    "sdrp", "ipv6-frag", "ipv6-route", "idrp", "mhrp", "i-nlsp", "rvd",
    "mobile", "narp", "skip", "tlsp", "ipv6-no", "any", "ipv6-opts",
    "cftp", "sat-expak", "ippc", "kryptolan", "sat-mon", "cpnx", "wsn",
    "pvp", "br-sat-mon", "sun-nd", "wb-mon", "vmtp", "ttp", "vines",
    "nsfnet-igp", "dgp", "eigrp", "tcf", "sprite-rpc", "larp", "mtp",
    "ax.25", "ipip", "aes-sp3-d", "micp", "encap", "pri-enc", "gmtp",
    "ifmp", "pnni", "qnx", "scps", "cbt", "bbn-rcc", "igp", "bna",
    "swipe", "visa", "ipcv", "cphb", "iso-tp4", "wb-expak", "sep",
    "secure-vmtp", "xtp", "il", "rsvp", "unas", "fc", "iso-ip",
    "etherip", "pim", "aris", "a/n", "ipcomp", "snp", "compaq-peer",
    "ipx-n-ip", "pgm", "vrrp", "l2tp", "zero", "ddx", "iatp", "stp",
    "srp", "uti", "sm", "smp", "isis", "ptp", "fire", "crtp", "crudp",
    "sccopmce", "iplt", "pipe", "sps", "ib",
]

service_list = [
    "-", "http", "ftp", "ftp-data", "smtp", "pop3", "dns", "snmp",
    "ssl", "dhcp", "irc", "radius", "ssh",
]

state_list = ["INT", "FIN", "REQ", "ACC", "CON", "RST", "CLO"]


def _codes_from_one(names):
    """Return a dict mapping each entry of ``names`` to its 1-based position."""
    return {name: position for position, name in enumerate(names, start=1)}


# Name -> integer code lookups used when encoding captured rows.
protocol_mapping = _codes_from_one(protocol_list)
service_mapping = _codes_from_one(service_list)
state_mapping = _codes_from_one(state_list)

# Function to process each line of output from Tshark
def process_line(line):
    """Convert one comma-separated Tshark output line into a feature row.

    Parameters
    ----------
    line : str
        One decoded line of Tshark ``-T fields`` output.

    Returns
    -------
    dict or None
        ``None`` when the line has fewer than 10 comma-separated fields.
        Otherwise a dict holding every model feature: ``proto_code``,
        ``service_code`` and ``state_code`` are looked up from fields 10, 11
        and 12 (``-1`` when the field is absent or its value is unknown) and
        every other feature is a ``0`` placeholder.
    """
    fields = line.strip().split(",")

    if len(fields) < 10:  # Adjust based on expected number of fields
        return None

    def code_at(mapping, position):
        # The length guard above only guarantees 10 fields, while the
        # categorical lookups read positions 10-12. Indexing directly raised
        # IndexError on every 10-field line, so treat a missing field like an
        # unknown value instead.
        if position >= len(fields):
            return -1
        return mapping.get(fields[position], -1)

    # Extract fields; ensure the correct indexes based on your Tshark output
    row = {
        'dur': 0,  # Placeholder for duration, calculate if needed
        'proto_code': code_at(protocol_mapping, 10),  # Adjust index for actual protocol
        'service_code': code_at(service_mapping, 11),  # Adjust index for actual service
        'state_code': code_at(state_mapping, 12),  # Adjust index for actual state
        'spkts': 0,  # Placeholder for source packets, calculate if needed
        'dpkts': 0,  # Placeholder for destination packets, calculate if needed
        'sbytes': 0,  # Placeholder for source bytes
        'dbytes': 0,  # Placeholder for destination bytes
        'rate': 0,  # Placeholder for rate
        'sttl': 0,  # Placeholder for source TTL
        'dttl': 0,  # Placeholder for destination TTL
        'sload': 0,  # Placeholder for source load
        'dload': 0,  # Placeholder for destination load
        'sloss': 0,  # Placeholder for source loss
        'dloss': 0,  # Placeholder for destination loss
        'sinpkt': 0,  # Placeholder for source packets in interval
        'dinpkt': 0,  # Placeholder for destination packets in interval
        'sjit': 0,  # Placeholder for source jitter
        'djit': 0,  # Placeholder for destination jitter
        'swin': 0,  # Placeholder for source window
        'stcpb': 0,  # Placeholder for source TCP bytes
        'dtcpb': 0,  # Placeholder for destination TCP bytes
        'dwin': 0,  # Placeholder for destination window
        'tcprtt': 0,  # Placeholder for TCP round trip time
        'synack': 0,  # Placeholder for SYN-ACK
        'ackdat': 0,  # Placeholder for ACK data
        'smean': 0,  # Placeholder for source mean
        'dmean': 0,  # Placeholder for destination mean
        'trans_depth': 0,  # Placeholder for transaction depth
        'response_body_len': 0,  # Placeholder for response body length
        'ct_srv_src': 0,  # Placeholder for count service source
        'ct_state_ttl': 0,  # Placeholder for count state TTL
        'ct_dst_ltm': 0,  # Placeholder for count destination last time
        'ct_src_dport_ltm': 0,  # Placeholder for count source destination port last time
        'ct_dst_sport_ltm': 0,  # Placeholder for count destination source port last time
        'ct_dst_src_ltm': 0,  # Placeholder for count destination source last time
        'is_ftp_login': 0,  # Placeholder for FTP login flag
        'ct_ftp_cmd': 0,  # Placeholder for count FTP command
        'ct_flw_http_mthd': 0,  # Placeholder for count flow HTTP method
        'ct_src_ltm': 0,  # Placeholder for count source last time
        'ct_srv_dst': 0,  # Placeholder for count service destination
        'is_sm_ips_ports': 0,  # Placeholder for SM IPs/Ports flag
    }

    return row

# Start the Tshark process
tshark_command = [
    "tshark", "-i", "any",  # Listen on all interfaces; change if needed
    "-l",  # Flush stdout after each packet so rows arrive while capturing
    "-T", "fields",
    "-e", "frame.time_epoch",
    "-e", "frame.len",
    "-e", "eth.src",
    "-e", "eth.dst",
    "-e", "ip.src",
    "-e", "ip.dst",
    "-e", "ip.len",
    "-e", "ip.ttl",
    "-e", "tcp.srcport",
    "-e", "tcp.dstport",
    # Add additional fields as necessary
    "-E", "separator=,",  # Use a comma as the separator
]

try:
    process = subprocess.Popen(tshark_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
except FileNotFoundError as exc:
    # Popen reports a missing executable as a bare "file not found"; say which
    # file and what to do about it.
    raise FileNotFoundError(
        "Could not launch 'tshark'. Install Wireshark/Tshark and make sure the "
        "executable is on PATH, or put its full path first in tshark_command."
    ) from exc

# Feature columns in the order the models expect
columns = ['dur', 'proto_code', 'service_code', 'state_code', 'spkts', 'dpkts',
           'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss',
           'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb',
           'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
           'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
           'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
           'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
           'ct_srv_dst', 'is_sm_ips_ports']

# Collect rows in a list and build the frame once: DataFrame.append was removed
# in pandas 2.0 and re-copied the whole frame for every captured packet.
rows = []

# Continuously read output from Tshark until it exits or the user interrupts
try:
    for line in iter(process.stdout.readline, b''):
        row = process_line(line.decode('utf-8', errors='replace'))

        if row is not None:
            rows.append(row)
except KeyboardInterrupt:
    # Only an interrupt is an expected way to stop; any other error propagates
    # instead of being silently swallowed.
    print("Stopping Tshark...")
finally:
    exited_on_its_own = process.poll() is not None
    process.terminate()
    # Reap the child and drain its pipes so no zombie process or open handle
    # is left behind.
    try:
        _, tshark_stderr = process.communicate(timeout=5)
    except subprocess.TimeoutExpired:
        process.kill()
        _, tshark_stderr = process.communicate()
    if exited_on_its_own and process.returncode != 0:
        # Tshark stopped by itself with an error (e.g. bad interface or
        # missing capture permission); show what it reported.
        print(tshark_stderr.decode('utf-8', errors='replace'))

df = pd.DataFrame(rows, columns=columns)

# Optional: save the DataFrame to a CSV file
df.to_csv("network_data.csv", index=False)

# Do something with the DataFrame df, e.g., save or process further

FileNotFoundError: [WinError 2] The system cannot find the file specified

In [28]:
# Display the column index of the capture DataFrame `df`.
df.columns

Index(['dur', 'proto_code', 'service_code', 'state_code', 'spkts', 'dpkts',
       'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss',
       'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb',
       'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
       'ct_srv_dst', 'is_sm_ips_ports'],
      dtype='object')

In [29]:
def _load_pickle(path):
    """Open ``path`` in binary mode and return the object unpickled from it."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Restore the fitted artefacts from the working directory.
label_model = _load_pickle('label_model.pkl')
scaler = _load_pickle('scaler.pkl')
rf_model = _load_pickle('rf_model.pkl')


In [30]:
# Scale the captured features and attach the label model's predictions.
if df.empty:
    # Nothing was captured, so there is nothing to scale.
    raise ValueError("No captured rows to classify; run the Tshark capture cell first.")

# Pass only the feature columns. Handing the scaler `df` itself made this cell
# non-idempotent: after the `label` assignment below, a re-run would push the
# previous predictions through the scaler as if they were a feature.
# NOTE(review): the error recorded under this cell lists `label` as a feature
# seen at fit time, so scaler.pkl was presumably fitted on a frame that still
# held the target column. Confirm, and re-fit the scaler on features only.
X_scaled = scaler.transform(df[columns])

predictions = label_model.predict(X_scaled)

df['label'] = predictions


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- label


In [None]:


# Score the captured features with the random-forest model.
# The names this cell used before (loaded_scaler, loaded_rf_model, X1_test) are
# never defined in this notebook; the objects unpickled earlier are `scaler`
# and `rf_model`.
# NOTE(review): X1_test is assumed to mean the captured feature columns. Confirm.
X_test_scaled = scaler.transform(df[columns])
predictions = rf_model.predict(X_test_scaled)