# Cleaning 60min dataset (Netflix, Prime, YouTube and Twitch)

10.42.0.1 pc-router, 10.42.0.130 pc-analyzed

In [1]:
import pandas as pd 

df = pd.read_csv('test1_60min.csv')
df.drop(["Unnamed: 0", "src_oui", "dst_oui", "id", "expiration_id","client_fingerprint", "server_fingerprint", "src_mac", "dst_mac", "vlan_id", "tunnel_id", "expiration_id","client_fingerprint", "server_fingerprint", "user_agent", "bidirectional_first_seen_ms", "bidirectional_last_seen_ms", "bidirectional_duration_ms", "dst2src_packets", "dst2src_packets", "dst2src_first_seen_ms", "dst2src_last_seen_ms", "dst2src_duration_ms", "src2dst_first_seen_ms", "src2dst_last_seen_ms", "src2dst_duration_ms", "src2dst_packets", "src2dst_bytes", "dst2src_bytes", "bidirectional_packets"], axis=1, inplace=True)
df.sort_values(["src_ip"], inplace=True)
df.reset_index(drop=True, inplace=True)

## Create the data structure with the infos

### Remove IPv6 addresses

In [13]:
# For now testing only IPv4 addresses
# IPv6 used for MDNS
df.src_ip.unique()

array(['10.42.0.1', '10.42.0.130', '34.252.28.38', '54.213.37.69'],
      dtype=object)

In [14]:
df.dst_ip.unique()

array(['224.0.0.251', '10.42.0.1', '99.80.110.85', '52.46.155.15',
       '108.128.36.17', '216.58.209.42', '52.18.92.59', '95.101.180.169',
       '93.184.220.29', '52.84.221.236', '52.222.131.113', '52.84.223.77',
       '3.224.160.91', '95.140.230.191', '8.238.124.252', '52.85.15.101',
       '216.58.206.78', '142.250.184.110', '216.58.209.46',
       '172.217.21.77', '142.250.180.66', '216.58.206.70',
       '74.125.153.24', '142.250.180.129', '142.250.180.67',
       '142.250.180.99', '216.58.205.67', '216.58.208.174',
       '104.18.21.226', '52.222.128.123', '35.165.11.166',
       '52.11.130.196', '142.250.180.142', '44.226.254.74',
       '151.101.14.167', '54.192.125.217', '216.58.208.130',
       '216.58.208.161', '74.125.99.56', '142.250.180.161',
       '142.250.180.98', '142.250.184.102', '142.250.180.100',
       '216.58.208.142', '216.58.198.22', '74.125.99.59', '54.239.21.68',
       '52.222.132.163', '54.213.37.69', '142.250.180.106',
       '35.244.247.133', '52.222.

In [6]:
df = df[(df['src_ip'] != "ipv6_1") & (df['src_ip'] != "ipv6_2")]
# df = df[(df['dst_ip'] != "ipv6_3")], not necessary

### Create data structure

In [7]:
def check_address(addr):
    parts = addr.split(".")
    # Convert string to int
    for i in range(0, len(parts)):
        parts[i] = int(parts[i])
    
    # local address 10.0.0.0 - 10.255.255.255
    if parts[0] == 10:
        return addr
    # local address 172.16.0.0 - 172.31.255.255
    elif (parts[0] == 172) and (parts[1] >= 16) and (parts[1] <= 31):
        return addr
    # local address 192.168.0.0 - 192.168.255.55
    elif (parts[0] == 192) and (parts[1] == 168):
        return addr
    else:
        return "remote"

In [8]:
# Template: {"src1":{"dst1":[n_bytes, app1, app2], "dst2":n_bytes, ...}, "src2":...... }
# All the remote addresses end up in "remote" address (source and destination)

sources = {}
src_ips = df.src_ip.unique()

for i in src_ips:
    temp = df[df['src_ip'] == i]
    i = check_address(i)
    aux_dict = {}

    for index, row in temp.iterrows():
        dst_ip = check_address(row['dst_ip'])
        b_bytes = row['bidirectional_bytes']
        app_name = row['application_name']
        
        try:
            pres = aux_dict[dst_ip]
            pres[0] += int(b_bytes)
            if not app_name in pres:
                pres.append(app_name) 
        except KeyError:
            aux_dict[dst_ip] = [int(b_bytes), app_name]
            
    sources[i] = aux_dict

sources.keys()

dict_keys(['10.42.0.1', '10.42.0.130', 'remote'])

In [9]:
for i in sources.keys():
    for j in sources[i].keys():
        print(i, " talk to ", j, ", ", sources[i][j][0], " bytes | ", sources[i][j][1:])

10.42.0.1  talk to  remote ,  1206  bytes |  ['MDNS']
10.42.0.130  talk to  10.42.0.1 ,  119296  bytes |  ['DNS', 'DNS.Amazon', 'DNS.AmazonVideo', 'DNS.Google', 'DNS.YouTube', 'DNS.NetFlix', 'DNS.GoogleServices', 'DNS.Twitch', 'DNS.Microsoft', 'DHCP']
10.42.0.130  talk to  remote ,  488956708  bytes |  ['TLS.AmazonVideo', 'TLS.Amazon', 'TLS.GoogleServices', 'TLS', 'HTTP', 'HTTP.Amazon', 'TLS.Google', 'QUIC.Google', 'TLS.YouTube', 'HTTP.Google', 'HTTP.Cloudflare', 'TLS.Twitch', 'QUIC.GoogleServices', 'QUIC.YouTube', 'MDNS', 'ICMP.Google', 'TLS.NetFlix']
remote  talk to  10.42.0.130 ,  858  bytes |  ['TLS.Amazon']


Create and save the graph

In [2]:
from pyvis import network as net

g=net.Network(height='500px', width='800px',heading='')
g.add_nodes(sources.keys())

# Add edges
# First try without weights
for i in sources.keys():
    elem = sources[i]
    g.add_nodes(elem.keys())
    for j in elem.keys(): 
        g.add_edge(i, j)

g.save_graph('example.html')

NameError: name 'sources' is not defined

Export map services

In [11]:
import json

with open('export.json', 'w') as fd:
    json.dump(sources, fd)

# with open('export.json', 'r') as fd:
#     x = json.load(fd)
# print(x)