In [4]:
import pandas as pd
import matplotlib.pyplot as plt

from ntto_parser import NetworkTrackerTrafficParser

class NetworkResource:
    def __init__(self, path) -> None:
        self.parser = NetworkTrackerTrafficParser(path)
        self.resources = pd.DataFrame(
            [
                res
                for domain in self.parser.map(lambda resources: resources)
                for res in domain
            ]
        )
        self.resources["sizes"] = self.parser.map_index(
            lambda res: int(self.get_layer(res, "tcp")["tcp.len"]), self.resources
        )
        self.resources["ip_src"] = self.parser.map_index(
            lambda res: self.get_layer(res, "ip")["ip.src"], self.resources
        )
        self.resources["ip_dst"] = self.parser.map_index(
            lambda res: self.get_layer(res, "ip")["ip.dst"], self.resources
        )
        self.resources["rel_time"] = self.parser.map_index(
            lambda res: float(self.get_layer(res, "frame")["frame.time_delta"]),
            self.resources,
        )
        self.resources["requests"] = self.resources.apply(
            lambda row: [
                row["packets"][i]
                for i, ip in enumerate(row["ip_src"])
                if ip.startswith("172.17")
            ],
            axis=1,
        )
        self.resources["request_sizes"] = self.resources.apply(
            lambda row: [
                row["sizes"][i]
                for i, ip in enumerate(row["ip_src"])
                if ip.startswith("172.17")
            ],
            axis=1,
        )

    def get_layer(self, obj, key):
        if (
            "_source" in obj
            and "layers" in obj["_source"]
            and key in obj["_source"]["layers"]
        ):
            return obj["_source"]["layers"][key]
        else:
            return None

In [5]:
resources = NetworkResource("data")
y = resources.resources['tracker'].tolist()
X = resources.resources[['sizes', 'request_sizes']]
X['sizes'] = X['sizes'].apply(sum)
X['request_sizes'] = X['request_sizes'].apply(lambda x: sum(x) / len(x))
X['start'] = resources.resources['rel_time'].apply(lambda x: x[0])
X['end'] = resources.resources['rel_time'].apply(lambda x: x[-1])

TypeError: 'NoneType' object is not subscriptable

In [None]:
df = pd.concat([X, pd.Series(y, name="is_Tracker")], axis=1)
df

Unnamed: 0,sizes,request_sizes,start,end,is_Tracker
0,1303,425.0,0.000558,0.036515,False
1,79597,329.0,0.002036,0.000037,False
2,68442,163.0,0.003692,0.000010,False
3,59056,88.0,0.001578,0.000009,False
4,62911,91.0,0.001144,0.000362,False
...,...,...,...,...,...
457,46606,277.0,0.000279,0.000018,False
458,593,92.0,0.057035,0.016499,True
459,29725,77.0,0.136246,0.000008,False
460,1133,88.0,0.109135,0.019879,True


In [None]:
from sklearn.preprocessing import StandardScaler
X = pd.DataFrame(StandardScaler().fit_transform(X), columns=['sizes', 'request_sizes', 'start', 'end'])

In [None]:
X

Unnamed: 0,sizes,request_sizes,start,end
0,-0.586000,0.672944,-0.195513,0.752336
1,0.498553,0.333603,-0.191530,-0.188996
2,0.344031,-0.253175,-0.187067,-0.189693
3,0.214013,-0.518285,-0.192764,-0.189719
4,0.267413,-0.507680,-0.193933,-0.180609
...,...,...,...,...
457,0.041551,0.149793,-0.196265,-0.189486
458,-0.595835,-0.504146,-0.043310,0.235814
459,-0.192290,-0.557168,0.170159,-0.189745
460,-0.588355,-0.518285,0.097096,0.323036


In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2'])

In [None]:
finalDf = pd.concat([principalDf, df[['is_Tracker']]], axis = 1)
finalDf

Unnamed: 0,principal component 1,principal component 2,is_Tracker
0,1.109348,-0.361127,False
1,-0.217983,-0.375151,False
2,-0.480197,-0.121724,False
3,-0.565793,-0.005113,False
4,-0.584875,-0.018522,False
...,...,...,...
457,-0.070441,-0.239331,False
458,0.157220,0.241012,True
459,-0.298319,0.401850,False
460,0.215726,0.371606,True


In [6]:
groups = finalDf.groupby('is_Tracker')

fig, ax = plt.subplots()

ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
for name, group in groups:
    plt.plot(group["principal component 1"], group["principal component 2"], marker="o", linestyle="", label=name)
plt.legend()


NameError: name 'finalDf' is not defined

In [None]:
pca.explained_variance_ratio_

array([0.33031371, 0.25741199])

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report

kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
predict = kmeans.labels_

In [None]:
print(classification_report(y, predict))

              precision    recall  f1-score   support

       False       0.87      1.00      0.93       402
        True       0.50      0.02      0.03        60

    accuracy                           0.87       462
   macro avg       0.69      0.51      0.48       462
weighted avg       0.82      0.87      0.81       462

