In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt

In [2]:
def convert_to_int(x):
    """Best-effort conversion of a single cell value to ``int``.

    Parameters
    ----------
    x : object
        A raw cell value. Only ``str`` and ``float`` values are converted;
        every other type is passed through untouched.

    Returns
    -------
    object
        ``int(x)`` when ``x`` is a ``str`` or ``float`` that ``int`` accepts,
        otherwise ``x`` itself, unchanged.
    """
    if not isinstance(x, (str, float)):
        return x
    try:
        return int(x)
    except (ValueError, OverflowError):
        # ValueError: NaN or non-numeric text (e.g. a stray header string).
        # OverflowError: +/-inf, which int() cannot represent; without this
        # a single infinite value would abort a whole Series.apply call.
        return x

In [3]:
# I would like to add more features. I will be focusing on tcp.seq:
# 1. Large, unexplained jumps in TCP sequence numbers can indicate packet loss or reordering. This might happen due to network congestion, faulty hardware, or malicious activities disrupting normal traffic flow.
# 2. If we observe packets with the same sequence number but different payload data, it could be indicative of a session hijacking or man-in-the-middle attack, where an attacker is trying to inject malicious packets into a legitimate TCP stream.
# 3. Patterns like regular gaps or strange distributions in sequence numbers might suggest an attacker is trying to infer the state of a TCP session or manipulate TCP traffic.
# 4. Anomalous sequence number patterns can also be associated with port scanning activities, where attackers send TCP packets to various ports to discover services they can exploit.
# 5. In a SYN flooding attack, attackers exploit the TCP handshake mechanism by sending a large number of SYN packets with spoofed IP addresses. An unusual pattern in sequence numbers, combined with a high volume of SYN packets, could indicate such an attack.
# 6. RST packets with sequence numbers that don't align with the expected flow of the ongoing TCP session might be an attempt to prematurely close a TCP connection, potentially indicating a denial-of-service attack or other malicious disruptions.
#
# Load the packet capture sample from a CSV file one directory above the notebook.
# NOTE(review): the cell output echoes this line, which looks like the tail of a truncated
# pandas warning (possibly a mixed-dtype DtypeWarning) - confirm against the live notebook.
df = pd.read_csv('../attack-sample-3m.csv')

  df = pd.read_csv('../attack-sample-3m.csv')


In [6]:
df.columns

Index(['frame.number', 'frame.len', 'frame.time', 'frame.time_epoch',
       'frame.protocols', 'eth.src', 'eth.dst', 'eth.type', 'ip.src', 'ip.dst',
       'ip.len', 'ip.ttl', 'ip.flags', 'ip.frag_offset', 'ip.proto',
       'ip.version', 'ip.dsfield', 'ip.checksum', 'tcp.srcport', 'tcp.dstport',
       'tcp.len', 'tcp.seq', 'tcp.ack', 'tcp.flags', 'tcp.flags.syn',
       'tcp.flags.ack', 'tcp.flags.fin', 'tcp.flags.reset', 'tcp.window_size',
       'tcp.checksum', 'tcp.stream', 'udp.srcport', 'udp.dstport',
       'udp.length', 'udp.checksum', 'icmp.type', 'icmp.code', 'icmp.checksum',
       'http.request.method', 'http.request.uri', 'http.request.version',
       'http.request.full_uri', 'http.response.code', 'http.user_agent',
       'http.content_length_header', 'http.content_type', 'http.cookie',
       'http.host', 'http.referer', 'http.location', 'http.authorization',
       'http.connection', 'dns.qry.name', 'dns.qry.type', 'dns.qry.class',
       'dns.flags.response', 'dns.f

In [4]:
# Columns retained for the tcp.seq analysis (identifiers, timing, ports, flags, endpoints, label).
columns_to_keep = [
    'frame.number',
    'frame.time',
    'frame.protocols',
    'tcp.seq',
    'tcp.stream',
    'tcp.srcport',
    'tcp.dstport',
    'udp.srcport',
    'udp.dstport',
    'tcp.flags.syn',
    'tcp.flags.reset',
    'ip.src',
    'ip.dst',
    'alert',
]

In [5]:
# Restrict the frame to the columns listed in columns_to_keep, then display the
# resulting (rows, columns) shape as a sanity check.
df = df[columns_to_keep]
df.shape

(3000000, 14)

In [9]:
df['tcp.seq'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 3000000 entries, 0 to 2999999
Series name: tcp.seq
Non-Null Count    Dtype 
--------------    ----- 
2996269 non-null  object
dtypes: object(1)
memory usage: 22.9+ MB


In [12]:
# 'tcp.seq' was read as an object column, so inspect which rows hold a value that is
# not a Python float.
df.loc[df['tcp.seq'].apply(lambda x: not isinstance(x, float))]

Unnamed: 0,frame.number,frame.time,frame.protocols,tcp.seq,tcp.stream,tcp.srcport,tcp.dstport,udp.srcport,udp.dstport,tcp.flags.syn,tcp.flags.reset,ip.src,ip.dst
2039808,5025081,"Jun 4, 2023 11:54:48.903692000 EDT",eth:ethertype:ip:tcp:http,9681,89649,59544,80,,,0,0,10.20.30.103,10.20.30.101
2039809,3374184,"Jun 4, 2023 11:48:50.351842000 EDT",eth:ethertype:ip:tcp:http,14443,82366,40352,80,,,0,0,10.20.30.103,10.20.30.101
2039810,3137260,"Jun 4, 2023 11:48:00.518732000 EDT",eth:ethertype:ip:tcp,6283,81239,56466,80,,,0,0,10.20.30.103,10.20.30.101
2039811,1295432,"Jun 4, 2023 11:35:55.144273000 EDT",eth:ethertype:ip:tcp:http,9331,73352,57504,80,,,0,0,10.20.30.103,10.20.30.101
2039812,4616984,"Jun 4, 2023 11:52:59.232568000 EDT",eth:ethertype:ip:tcp:http,25165,87610,40900,80,,,0,0,10.20.30.103,10.20.30.101
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2047995,1127451,"Jun 4, 2023 11:34:42.752891000 EDT",eth:ethertype:ip:tcp:http,11587,72529,45090,80,,,0,0,10.20.30.103,10.20.30.101
2047996,5347165,"Jun 4, 2023 12:08:21.131228000 EDT",eth:ethertype:ip:tcp,86062,91192,80,58622,,,0,0,10.20.30.101,10.20.30.103
2047997,900431,"Jun 4, 2023 11:33:08.888339000 EDT",eth:ethertype:ip:tcp:http:data-text-lines,28668,71419,80,57996,,,0,0,10.20.30.101,10.20.30.103
2047998,4676276,"Jun 4, 2023 11:53:13.465266000 EDT",eth:ethertype:ip:tcp:http:data-text-lines,95067,87980,80,42164,,,0,0,10.20.30.101,10.20.30.103


In [6]:
# Normalise 'tcp.seq' value by value: convert_to_int turns str/float values into Python
# ints where int() accepts them and leaves everything else (e.g. NaN) unchanged.
df['tcp.seq'] = df['tcp.seq'].apply(convert_to_int)

In [14]:
# After the conversion, list the rows whose 'tcp.seq' is still not a Python int.
df.loc[df['tcp.seq'].apply(lambda x: not isinstance(x, int))]

Unnamed: 0,frame.number,frame.time,frame.protocols,tcp.seq,tcp.stream,tcp.srcport,tcp.dstport,udp.srcport,udp.dstport,tcp.flags.syn,tcp.flags.reset,ip.src,ip.dst
2,6220814,"Jun 4, 2023 12:49:03.684361000 EDT",eth:ethertype:ip:udp:dns,,,,,43998.0,53.0,,,10.20.30.103,10.20.30.1
821,5660535,"Jun 4, 2023 12:31:19.537102000 EDT",eth:ethertype:ip:udp:dns,,,,,37982.0,53.0,,,10.20.30.103,10.20.30.1
2943,5302458,"Jun 4, 2023 12:03:54.649791000 EDT",eth:ethertype:ip:udp:dns,,,,,44776.0,53.0,,,10.20.30.103,10.20.30.1
3480,5613249,"Jun 4, 2023 12:27:47.429178000 EDT",eth:ethertype:ip:udp:dns,,,,,39117.0,53.0,,,10.20.30.103,10.20.30.1
3614,6053987,"Jun 4, 2023 12:43:18.556629000 EDT",eth:ethertype:ip:icmp:data,,,,,,,,,10.20.30.103,10.20.30.101
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2992803,5310352,"Jun 4, 2023 12:04:43.218403000 EDT",eth:ethertype:ip:udp:dns,,,,,58316.0,53.0,,,10.20.30.103,10.20.30.1
2993072,5814079,"Jun 4, 2023 12:32:52.456929000 EDT",eth:ethertype:ip:icmp:data,,,,,,,,,10.20.30.101,10.20.30.103
2996716,5958091,"Jun 4, 2023 12:33:44.468790000 EDT",eth:ethertype:ip:icmp:data,,,,,,,,,10.20.30.103,10.20.30.101
2998023,5568365,"Jun 4, 2023 12:24:22.385555000 EDT",eth:ethertype:ip:icmp:data,,,,,,,,,10.20.30.103,10.20.30.101


In [7]:
# Drop the rows with a missing 'tcp.seq'; in the sample displayed by the previous check
# these are non-TCP frames (UDP/DNS and ICMP).
df = df.dropna(subset = ['tcp.seq'])

In [16]:
# Repeat the non-int check now that rows with a missing 'tcp.seq' have been dropped.
df.loc[df['tcp.seq'].apply(lambda x: not isinstance(x, int))]

Unnamed: 0,frame.number,frame.time,frame.protocols,tcp.seq,tcp.stream,tcp.srcport,tcp.dstport,udp.srcport,udp.dstport,tcp.flags.syn,tcp.flags.reset,ip.src,ip.dst
2042160,frame.number,frame.time,frame.protocols,tcp.seq,tcp.stream,tcp.srcport,tcp.dstport,udp.srcport,udp.dstport,tcp.flags.syn,tcp.flags.reset,ip.src,ip.dst


In [8]:
# Remove the row whose 'tcp.seq' value is the literal column name, i.e. a header line
# that appears inside the CSV data (the single row returned by the previous check).
df = df[df['tcp.seq'] != 'tcp.seq']

In [18]:
# # I would like to see the distribution of this feature:
# fig = px.histogram(df, x='tcp.seq',title='Distribution of TCP Sequence Numbers' )

# fig.update_layout(
#     xaxis_title='TCP Sequence Number',
#     yaxis_title='Frequency',
#     bargap=0.2  # Adjust the gap between bars
# )

# # Show the plot
# fig.show()

In [9]:
df.columns

Index(['frame.number', 'frame.time', 'frame.protocols', 'tcp.seq',
       'tcp.stream', 'tcp.srcport', 'tcp.dstport', 'udp.srcport',
       'udp.dstport', 'tcp.flags.syn', 'tcp.flags.reset', 'ip.src', 'ip.dst',
       'alert'],
      dtype='object')

In [12]:
# I would like to see if there is a jump in a specific tcp.seq number at a specific time.
# Start by looking at the raw 'frame.time' values (still plain strings at this point).
df['frame.time'].head(10)

0     Jun  4, 2023 12:23:33.466544000 EDT
1     Jun  4, 2023 11:29:51.272605000 EDT
3     Jun  4, 2023 11:33:42.040858000 EDT
4     Jun  4, 2023 11:41:44.312690000 EDT
5     Jun  4, 2023 11:30:50.703997000 EDT
6     Jun  4, 2023 12:05:56.948452000 EDT
7     Jun  4, 2023 11:48:46.111849000 EDT
8     Jun  4, 2023 11:40:20.679234000 EDT
9     Jun  4, 2023 12:18:56.912841000 EDT
10    Jun  4, 2023 11:46:23.237869000 EDT
Name: frame.time, dtype: object

In [13]:
# I need to convert the frame.time feature to a datetime format so that it can be used
# for time-based indexing and resampling.
# NOTE(review): the raw strings end in "EDT", but the result displayed in the next cell is
# tz-naive datetime64[ns] - confirm that dropping the timezone is intended.
df['frame.time'] = pd.to_datetime(df['frame.time'])




In [14]:
df['frame.time'].head(10)

0    2023-06-04 12:23:33.466544
1    2023-06-04 11:29:51.272605
3    2023-06-04 11:33:42.040858
4    2023-06-04 11:41:44.312690
5    2023-06-04 11:30:50.703997
6    2023-06-04 12:05:56.948452
7    2023-06-04 11:48:46.111849
8    2023-06-04 11:40:20.679234
9    2023-06-04 12:18:56.912841
10   2023-06-04 11:46:23.237869
Name: frame.time, dtype: datetime64[ns]

In [15]:
# Use 'frame.time' as the index so that time-based operations (such as resample) can
# work on it. The frame is rebound rather than mutated with inplace=True, so the cell
# does not modify an object that earlier cells have already displayed.
df = df.set_index('frame.time')

In [16]:
df['tcp.seq'].value_counts()

tcp.seq
1          147321
0          104339
1449        15321
2897         1658
3312          805
            ...  
1166799         1
139417          1
268495          1
367740          1
348866          1
Name: count, Length: 286187, dtype: int64

In [18]:
# Pick a single tcp.seq value (1) to investigate in isolation.
specific_sequence_number = 1

# Keep only the rows that carry that sequence number.
df_tcp_sq_1 = df.loc[df['tcp.seq'] == specific_sequence_number]

In [24]:
df_tcp_sq_1

Unnamed: 0_level_0,frame.number,frame.protocols,tcp.seq,tcp.stream,tcp.srcport,tcp.dstport,udp.srcport,udp.dstport,tcp.flags.syn,tcp.flags.reset,ip.src,ip.dst,alert
frame.time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2023-06-04 11:25:33.953265,45067,eth:ethertype:ip:tcp,1,22340.0,54437.0,48942.0,,,0.0,1.0,10.20.30.101,10.20.30.103,suspicious
2023-06-04 11:55:19.390367,5099834,eth:ethertype:ip:tcp,1,90077.0,58740.0,80.0,,,0.0,0.0,10.20.30.103,10.20.30.101,suspicious
2023-06-04 11:35:30.646321,1237944,eth:ethertype:ip:tcp,1,73071.0,47764.0,80.0,,,0.0,0.0,10.20.30.103,10.20.30.101,suspicious
2023-06-04 12:33:18.716646,5855649,eth:ethertype:ip:tcp,1,119269.0,24849.0,40030.0,,,0.0,1.0,10.20.30.101,10.20.30.103,suspicious
2023-06-04 12:48:33.204212,6153495,eth:ethertype:ip:tcp,1,170657.0,80.0,55662.0,,,0.0,0.0,10.20.30.101,10.20.30.103,suspicious
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-06-04 12:33:19.451894,5878300,eth:ethertype:ip:tcp,1,130583.0,13549.0,60014.0,,,0.0,1.0,10.20.30.101,10.20.30.103,suspicious
2023-06-04 11:54:10.527097,4885730,eth:ethertype:ip:tcp,1,88934.0,49684.0,80.0,,,0.0,0.0,10.20.30.103,10.20.30.101,suspicious
2023-06-04 11:45:42.814420,2501150,eth:ethertype:ip:tcp,1,78820.0,80.0,40212.0,,,0.0,0.0,10.20.30.101,10.20.30.103,suspicious
2023-06-04 12:33:11.947976,5819327,eth:ethertype:ip:tcp:http,1,101828.0,80.0,51806.0,,,0.0,0.0,10.20.30.101,10.20.30.103,suspicious


In [26]:
# Count how often the selected tcp.seq value occurs in each 1-minute bucket and carry
# along the 'alert' and 'frame.protocols' values of each bucket.

counts_over_time = (
    df_tcp_sq_1
    .resample('min')
    .agg({'tcp.seq': 'size', 'alert': 'last', 'frame.protocols': 'last'})
    .reset_index()
    .rename(columns={'tcp.seq': 'count'})
)


# .resample('min') --> resamples the data on its datetime index into 1-minute intervals.
#   'min' is the minute offset alias; the older 'T' spelling is deprecated in recent pandas releases.
# 'tcp.seq': 'size' --> the number of rows in each 1-minute interval.
# 'alert': 'last' / 'frame.protocols': 'last' --> the last value in each 1-minute interval. This could be used to get the most recent alert state.
# .reset_index() --> after resampling, the time intervals are the index of the DataFrame. Resetting the index moves them back into a regular 'frame.time' column and creates a standard integer index.
# .rename(columns={'tcp.seq': 'count'}) --> renames 'tcp.seq' to 'count' to make clear that the column now holds the number of occurrences per interval rather than the sequence numbers themselves.

In [27]:
counts_over_time

Unnamed: 0,frame.time,count,alert,frame.protocols
0,2023-06-04 11:23:00,20,benign,eth:ethertype:ip:tcp
1,2023-06-04 11:24:00,36,benign,eth:ethertype:ip:tcp:http:data-text-lines
2,2023-06-04 11:25:00,31918,benign,eth:ethertype:ip:tcp:http:data-text-lines
3,2023-06-04 11:26:00,582,benign,eth:ethertype:ip:tcp:http:data-text-lines
4,2023-06-04 11:27:00,801,suspicious,eth:ethertype:ip:tcp:http:data-text-lines
...,...,...,...,...
85,2023-06-04 12:48:00,11198,suspicious,eth:ethertype:ip:tcp
86,2023-06-04 12:49:00,70,suspicious,eth:ethertype:ip:tcp
87,2023-06-04 12:50:00,1428,suspicious,eth:ethertype:ip:tcp
88,2023-06-04 12:51:00,7,suspicious,eth:ethertype:ip:tcp:http:data-text-lines


In [28]:
# Line chart of the per-minute occurrence counts for the selected sequence number.
# The extra hover columns are passed via custom_data so that their positions in
# `customdata` are fixed by this list: index 0 is 'alert', index 1 is 'frame.protocols'.
# (With hover_data, the y column 'count' is not placed in customdata, so indices 1 and 2
# did not line up with 'alert' and 'frame.protocols'.)
fig = px.line(
    counts_over_time,
    x='frame.time',
    y='count',
    custom_data=['alert', 'frame.protocols'],
    title=f'TCP Sequence Number {specific_sequence_number} Occurrences Over Time',
)

# Custom hover text; x and y come from the trace itself, the rest from customdata.
fig.update_traces(
    hovertemplate=(
        "Time: %{x}<br>Count: %{y}<br>TCP Seq: " + str(specific_sequence_number) +
        "<br>Alert: %{customdata[0]}<br>Frame Protocols: %{customdata[1]}"
    )
)

fig.show()