In [11]:
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import plotly.express as px

In [None]:
df = joblib.load('anomalous_df.csv')

Based on the results and the comparison of the charts in the WANDB I will set the model parameters to get the best result: precision: %99 and recall: %40

In [12]:
df['frame.time'].head(10)

1     Jun  4, 2023 11:46:30.692656000 EDT
4     Jun  4, 2023 11:45:50.803925000 EDT
5     Jun  4, 2023 11:46:22.023191000 EDT
6     Jun  4, 2023 12:31:40.526404000 EDT
7     Jun  4, 2023 11:49:37.150543000 EDT
13    Jun  4, 2023 11:47:34.587035000 EDT
14    Jun  4, 2023 12:07:39.975212000 EDT
15    Jun  4, 2023 11:42:02.270370000 EDT
16    Jun  4, 2023 11:52:58.659032000 EDT
21    Jun  4, 2023 11:48:24.611942000 EDT
Name: frame.time, dtype: object

In [13]:
# I need to convert the 'frame.time' column to datetime
df['frame.time'] = pd.to_datetime(df['frame.time'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anomalous_df['frame.time'] = pd.to_datetime(anomalous_df['frame.time'])


In [14]:
df['frame.time'].head(10)

1    2023-06-04 11:46:30.692656
4    2023-06-04 11:45:50.803925
5    2023-06-04 11:46:22.023191
6    2023-06-04 12:31:40.526404
7    2023-06-04 11:49:37.150543
13   2023-06-04 11:47:34.587035
14   2023-06-04 12:07:39.975212
15   2023-06-04 11:42:02.270370
16   2023-06-04 11:52:58.659032
21   2023-06-04 11:48:24.611942
Name: frame.time, dtype: datetime64[ns]

In [15]:
# Split the 'frame.protocols' entries into individual protocols and stack them
protocols = df['frame.protocols'].str.split(':', expand=True).stack()

# Count the occurrences of each protocol
protocol_counts = protocols.value_counts()

In [17]:
print(f' anomalous_df.shape: {df.shape}')
protocol_counts

 anomalous_df.shape: (400626, 74)


eth                400626
ethertype          400626
ip                 400626
tcp                400626
http               260037
data-text-lines    144098
xml                 79697
urlencoded-form     13645
png                   545
image-gif             405
media                  73
tls                    51
x509sat                22
data                   17
image-jfif             12
ssh                     6
nbss                    1
smb                     1
Name: count, dtype: int64

In [18]:
df['frame.protocols']

1         eth:ethertype:ip:tcp:http:data-text-lines:xml
4         eth:ethertype:ip:tcp:http:data-text-lines:xml
5                                  eth:ethertype:ip:tcp
6             eth:ethertype:ip:tcp:http:data-text-lines
7         eth:ethertype:ip:tcp:http:data-text-lines:xml
                              ...                      
999992                             eth:ethertype:ip:tcp
999993                             eth:ethertype:ip:tcp
999994        eth:ethertype:ip:tcp:http:urlencoded-form
999996                             eth:ethertype:ip:tcp
999998                             eth:ethertype:ip:tcp
Name: frame.protocols, Length: 400626, dtype: object

In [19]:
# I will take the last protocol for the packet
df['primary_protocol'] = df['frame.protocols'].apply(lambda x:x.split(':')[-1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anomalous_df['primary_protocol'] = anomalous_df['frame.protocols'].apply(lambda x:x.split(':')[-1])


In [20]:
df['primary_protocol']


1                     xml
4                     xml
5                     tcp
6         data-text-lines
7                     xml
               ...       
999992                tcp
999993                tcp
999994    urlencoded-form
999996                tcp
999998                tcp
Name: primary_protocol, Length: 400626, dtype: object

In [None]:
fig = px.scatter(df,
                 x='frame.time',
                 y='tcp.len',
                 color='primary_protocol',
                 hover_data=['frame.number', 'eth.src', 'eth.dst',
                            'ip.src','ip.dst', 'tcp.srcport','tcp.dstport', 'http.host'],
                 title='Anomalous Packets: TCP Length Over Time Colored by Protocol',
                 labels={'frame.time': 'Time of Packet Capture', 'tcp.len': 'TCP Length'})

fig.update_layout(legend_title_text='Protocol')
fig.show()