In [None]:
from scapy.all import *
import numpy as np
import pandas as pd

In [None]:
def pcap2csv(packets:PacketList, file_name:str):
    """Export the IPv4/UDP packets of a capture to a CSV file.

    A header row is written first, followed by one row per packet that carries
    both an IP and a UDP layer. Every other packet is skipped.

    Args:
        packets: packets to export (for example the result of ``rdpcap``).
        file_name: path of the CSV file to create; an existing file is overwritten.
    """
    # The 7th column is deliberately kept as "IPlen" (no dot): it holds IP.len and
    # later cells of this notebook select the column by that exact name.
    header = "src,dst,type,IP.version,IP.ihl,IP.tos,IPlen,IP.id,IP.flags,IP.frag,IP.ttl,IP.proto,IP.chksum,IP.src,IP.dst,UDP.sport,UDP.dport,UDP.len,UDP.chksum\n"
    # "%r,<field>%" asks sprintf for the raw field value. It is used for the UDP
    # ports so they are always written as numbers rather than as service names,
    # because the ports are later used as numeric features.
    row_format = "%src%,%dst%,%type%,%IP.version%,%IP.ihl%,%IP.tos%,%IP.len%,%IP.id%,%IP.flags%,%IP.frag%,%IP.ttl%,%IP.proto%,%IP.chksum%,%IP.src%,%IP.dst%,%r,UDP.sport%,%r,UDP.dport%,%UDP.len%,%UDP.chksum%\n"
    with open(file_name, 'w') as writer:
        writer.write(header)
        for p in packets:
            # Require the IP layer too: for a missing layer sprintf substitutes
            # "??", which would break numeric/hex parsing of the CSV afterwards.
            if p.haslayer('IP') and p.haslayer('UDP'):
                writer.write(p.sprintf(row_format))


In [None]:
# Read the capture file into `packets` and print how many packets it holds.
# The commented-out line below is an alternative input file.
# packets = rdpcap('data/demo.pcapng')
packets = rdpcap('data/leshan-6000.pcap')
print("# packets:", len(packets))

In [None]:
# Write the loaded packets to the CSV file that the following cells read back.
pcap2csv(packets=packets, file_name='data/demo.csv')

In [None]:
# Load the exported CSV, print its row count, then display the frame.
df = pd.read_csv('data/demo.csv')
print("# UDP packets", df.shape[0])
df

### handle hexadecimal values
To plot distributions and compute distances, we want properties such as chksum to be converted to integers.
There are several possibilities: `read_csv` converters or DataFrame `apply`. The conversion can be written as a lambda or as a named function.

In [None]:

def hex_int(x: str) -> int:
    """Parse a hexadecimal string into an integer.

    Args:
        x: hexadecimal text; a leading ``0x`` prefix is accepted.

    Returns:
        The integer value of ``x`` interpreted in base 16.

    Raises:
        ValueError: if ``x`` is not valid hexadecimal text.
    """
    base = 16
    return int(x, base)
# Re-read the CSV, this time passing both checksum columns through hex_int.
checksum_converters = {"IP.chksum": hex_int, "UDP.chksum": hex_int}
df = pd.read_csv('data/demo.csv', converters=checksum_converters)
df

In [None]:
# Show the dtype of every column of df (useful to check how the checksum columns were parsed).
df.dtypes

## Histograms

In [None]:
# Histogram of the UDP length column, 50 bins.
df['UDP.len'].plot(kind='hist', bins=50)

In [None]:
# Histogram of the IP checksum column, 100 bins.
df['IP.chksum'].plot(kind='hist', bins=100)

In [None]:
# Print the observation about the checksum distributions, then plot the UDP checksum histogram (100 bins).
print("As expected checksum has more or less a uniform distribution... Seems to be less true as for UDP checksum.")
df['UDP.chksum'].plot(kind='hist', bins=100)

In [None]:
# Histogram of the IP identification column, 10 bins.
df['IP.id'].plot(kind='hist', bins=10)

In [None]:
# Display summary statistics for df.
df.describe()

In [None]:
# Save the frame (with checksums already converted to integers) as a new CSV file.
# NOTE(review): this file is written to the working directory, whereas the other
# files used by this notebook live under data/ — confirm this is intended.
df.to_csv('demo-int.csv')

In [None]:
import seaborn as sns
# Distribution of the UDP length: histogram with a kernel density estimate.
# distplot is deprecated in recent seaborn releases in favour of histplot, so
# use histplot when the installed version provides it and fall back otherwise.
if hasattr(sns, "histplot"):
    sns.histplot(df['UDP.len'], kde=True, stat="density")
else:
    sns.distplot(df['UDP.len'])

## Principal Component Analysis
Simple usage of sklearn PCA support

In [None]:
from sklearn.preprocessing import StandardScaler
# Create the scaler; it is fitted on the feature matrix in a later cell.
scaler = StandardScaler()

In [None]:
# Keep only the columns used as features, print the resulting shape and display the frame.
df_numeric = df.loc[:, [
    'IP.version',
    'IP.ihl',
    'IPlen',
    'IP.id',
    'IP.frag',
    'IP.ttl',
    'IP.chksum',
    'UDP.sport',
    'UDP.dport',
    'UDP.len',
    'UDP.chksum',
]]
print(df_numeric.shape)
df_numeric

In [None]:
# Take the underlying array of df_numeric as the feature matrix X.
X=df_numeric.values

In [None]:
# Fit the scaler on the feature matrix X.
scaler.fit(X)

In [None]:
# Apply the fitted scaler to X; the PCA cells below work on X_scaled.
X_scaled=scaler.transform(X)

In [None]:
from sklearn.decomposition import PCA
# Fit a 4-component PCA on the scaled features and print the total explained variance ratio times 100.
pca_4 = PCA(n_components=4).fit(X_scaled)
print("variance explained by 4 principal components:", sum(pca_4.explained_variance_ratio_ * 100))

In [None]:
# Same as the previous cell with 5 components, for comparison.
pca_5 = PCA(n_components=5).fit(X_scaled)
print("variance explained by 5 principal components:", sum(pca_5.explained_variance_ratio_ * 100))

In [None]:
# Pairwise plot of the IP length against the UDP length.
sns.pairplot(data=df, vars=['IPlen', 'UDP.len'])

In [None]:
# Pairwise plot of the IP length against the IP identification field.
sns.pairplot(data=df, vars=['IPlen', 'IP.id'])