In [1]:
from IPython.display import display, HTML

display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import os
import pandas as pd
from pathlib import Path
from pyngrok import ngrok
import re
import subprocess
from uuid import uuid4

from netunicorn.base import Pipeline
# from netunicorn.library.tasks.capture.tcpdump import StartCapture, StopNamedCapture
# from netunicorn.library.tasks.measurements.ookla_speedtest import SpeedTest, SpeedTestLinuxImplementation

from execute_pipeline import execute_pipeline, healthcheck
from get_proxies import get_proxies
# from background_traffic import StartBackgroundTraffic, StopNamedBackgroundTraffic
from youtube_watcher import WatchYouTubeVideo
from vimeo_watcher import WatchVimeoVideo
from twitch_watcher import WatchTwitchStream
from webdav import UploadToWebDav

In [3]:
healthcheck()

Health Check: True
[[snl-server-5], <Uncountable node pool with next node template: [aws-fargate-A-cs190n4-, aws-fargate-B-cs190n4-, aws-fargate-ARM64-cs190n4-]>]


Manually `exec` each module so they get pickled by value.

In [4]:
# __name_bak__ = __name__
# for __name__ in [
#     "background_traffic",
#     "youtube_watcher",
#     "vimeo_watcher",
#     "twitch_watcher",
#     "webdav"
# ]:
#     with open(f"{__name__}.py", "r") as file:
#         exec(file.read())
# __name__ = __name_bak__

from cloudpickle import cloudpickle
import youtube_watcher
import vimeo_watcher
import twitch_watcher
import webdav
cloudpickle.register_pickle_by_value(youtube_watcher)
cloudpickle.register_pickle_by_value(vimeo_watcher)
cloudpickle.register_pickle_by_value(twitch_watcher)
cloudpickle.register_pickle_by_value(webdav)

These are the list of videos that we are testing. 

In [5]:
youtube_ids = [
    "dQw4w9WgXcQ",
    "r5JYHXtt_rw",
    "pxEV1A5mTYM",
    "Ct6BUPvE2sM",
    "KjtYZpqvt50"
]

youtube_videos = [f"https://www.youtube.com/watch?v={v}" for v in youtube_ids]

vimeo_ids = [
    375468729,
    347119375,
    297124334,
    476306167,
    515893651
]

vimeo_videos = [f"https://vimeo.com/{v}?autoplay=1" for v in vimeo_ids]

twitch_ids = [
    2322690366,
    2298318732,
    2316652767,
    1867242354,
    2320975412
]

twitch_videos = [f"https://twitch.tv/video/{v}" for v in twitch_ids]

Load a list of proxies from https://free-proxy-list.net/ and create a Chrome argument for the proxy configuration.

In [6]:
proxies = [proxy for proxy in get_proxies() if proxy.https]

proxy_arg = f"""--proxy-server={','.join([
    f'https://{proxy.ip}:{proxy.port}'
    for proxy in proxies
] + ['direct://'])}"""

Use `ngrok` to expose port `18080`. We'll use this as a https proxy.

In [7]:
tcp_tunnel = ngrok.connect("18080", "tcp")
self_proxy = f"https://{tcp_tunnel.public_url[6:]}"
self_proxy_arg = f"--proxy-server={self_proxy},direct://"
tcp_tunnel
# self_proxy

<NgrokTunnel: "tcp://6.tcp.us-cal-1.ngrok.io:10789" -> "localhost:18080">

Create 10 pipelines, each doing these steps for YouTube, Vimeo, and Twitch:
- watch video without proxies or background traffic
- watch video using a proxy from https://free-proxy-list.net/ but no background traffic
- watch video using the host machine as a proxy but no background traffic
- watch video without proxies but with background traffic

Each pipeline then uploads the result to http://snl-server-5.cs.ucsb.edu/cs190n/cs190n4/capture

In [8]:
from netunicorn.base import Task
from collections.abc import Callable
from typing import Type

watch_tasks: dict[
    str,
    tuple[Type[WatchYouTubeVideo | WatchVimeoVideo | WatchTwitchStream], list[str]]
] = {
    'youtube': (WatchYouTubeVideo, youtube_videos),
    'vimeo': (WatchVimeoVideo, vimeo_videos),
    'twitch': (WatchTwitchStream, twitch_videos),
}

watch_types: dict[str, dict[str, any]] = {
    '_': {
        'get_processes': lambda name: [
            ["sudo", "tcpdump", "-i", "any", "-U", "-w", f"{name}.pcap"]
        ],
        'requirements': [
            "sudo apt-get update",
            "sudo apt-get install -y tcpdump"
        ]
    },
    '_proxy_': {
        'get_processes': lambda name: [
            ["sudo", "tcpdump", "-i", "any", "-U", "-w", f"{name}.pcap"]
        ],
        'webdriver_arguments': [proxy_arg],
        'requirements': [
            "sudo apt-get update",
            "sudo apt-get install -y tcpdump"
        ]
    },
    '_proxy1_': {
        'get_processes': lambda name: [
            ["sudo", "tcpdump", "-i", "any", "-U", "-w", f"{name}.pcap"]
        ],
        'webdriver_arguments': [self_proxy_arg],
        'requirements': [
            "sudo apt-get update",
            "sudo apt-get install -y tcpdump"
        ]
    },
    '_speedtest_': {
        'get_processes': lambda name: [
            ["sudo", "tcpdump", "-i", "any", "-U", "-w", f"{name}.pcap"],
            ["sh", "-c", "while true; do speedtest-cli --simple --secure; done"]
        ],
        'requirements': [
            "sudo apt-get update",
            "sudo apt-get install -y tcpdump",
            "pip install speedtest-cli"
        ]
    }
}

def generate_pipeline(i: int, subdirectory: str) -> Pipeline:
    pipeline = Pipeline(early_stopping=False)
    
    for watch_type, kwargs in watch_types.items():
        for site, (TaskImpl, videos) in watch_tasks.items():
            name = f"{site}{watch_type}{i}"

            pipeline.then(TaskImpl(
                video_url=videos,
                duration=30,
                name=f"{site}{watch_type}{i}",
                **kwargs
            ))
    
            pipeline.then(UploadToWebDav(
                filepaths={f"{name}.pcap"},
                endpoint="http://snl-server-5.cs.ucsb.edu/cs190n/cs190n4/capture",
                username="uploader",
                password="uploader",
                subdirectory=subdirectory,
                name=f"upload_{i}"
            ))
    
    return pipeline

In [9]:
working_node = "aws-fargate-B"
experiment_label = "team-4-experiment-12"

In [None]:
# with open("./proxy_out.log", "w") as stdout, open("./proxy_err.log", "w") as stderr:
#     proxy_env = os.environ.copy()
#     proxy_env["PORT"] = "18080"
#     proxy_process = subprocess.Popen(["node", "proxy.js"], env=proxy_env, stdout=stdout, stderr=stderr)
#     result = execute_pipeline(pipelines, working_node, experiment_label)
#     proxy_process.terminate()

subdirectory = str(uuid4())
pipelines = [generate_pipeline(i, subdirectory) for i in range(10)]
result = execute_pipeline(pipelines, working_node, experiment_label)
result

deployment executor_id: 
commands:
apt install -y python3-pip wget xvfb procps
pip3 install pytest-playwright
playwright install-deps
playwright install chromium
sudo apt-get update
sudo apt-get install -y tcpdump
sudo apt-get install -y curl
pip install speedtest-cli

deployment executor_id: 
commands:
apt install -y python3-pip wget xvfb procps
pip3 install pytest-playwright
playwright install-deps
playwright install chromium
sudo apt-get update
sudo apt-get install -y tcpdump
sudo apt-get install -y curl
pip install speedtest-cli

deployment executor_id: 
commands:
apt install -y python3-pip wget xvfb procps
pip3 install pytest-playwright
playwright install-deps
playwright install chromium
sudo apt-get update
sudo apt-get install -y tcpdump
sudo apt-get install -y curl
pip install speedtest-cli

deployment executor_id: 
commands:
apt install -y python3-pip wget xvfb procps
pip3 install pytest-playwright
playwright install-deps
playwright install chromium
sudo apt-get update
sudo apt

In [11]:
directory = f"/mnt/md0/cs190n/cs190n4/capture/{subdirectory}"
    
for file in os.listdir(directory):
    if file.endswith(".pcap"): 
        print(os.path.join(directory, file))
        !docker run -v {directory}:/tmp/input -v ./data/unprocessed:/tmp/output mielverkerken/cicflowmeter /tmp/input/{file} /tmp/output > /dev/null 2>&1

FileNotFoundError: [Errno 2] No such file or directory: '/mnt/md0/cs190n/cs190n4/capture/5dcb8695-45ff-4c44-a3c8-70efe1ae8b69'

In [12]:
ips = []

# Youtube
ips += """	
2404:6800:4003:c04::5d
2404:6800:4004:809::200e
2404:6800:4006:809::200e
2607:f8b0:4004:814::200e
2607:f8b0:4005:809::200e
2607:f8b0:400a:809::200e
2800:3f0:4001:80a::200e
2a00:1450:4009:81d::200e
2a00:1450:400b:804::200e
74.125.90.110
142.250.179.238
172.217.3.206
172.217.6.46
172.217.26.14
172.217.30.78
172.217.164.142
172.217.194.91
172.217.194.93
172.217.194.136
172.217.194.190
216.58.203.110
""".strip().split("\n")

# Vimeo
ips += """
151.101.0.217
151.101.64.217
151.101.128.217
151.101.192.217
""".strip().split("\n")

# Twitch
ips += """
151.101.2.167
151.101.66.167
151.101.130.167
151.101.194.167
""".strip().split("\n")

# proxies
ips += [proxy.ip for proxy in proxies]

ips.append(re.search(r"(?<=https:\/\/).*?(?=:)", self_proxy).group())

In [13]:
twitch_captures = []
twitch_capture_proxies = []
twitch_capture_proxies1 = []
twitch_capture_speedtests = []
vimeo_captures = []
vimeo_capture_proxies = []
vimeo_capture_proxies1 = []
vimeo_capture_speedtests = []
youtube_captures = []
youtube_capture_proxies = []
youtube_capture_proxies1 = []
youtube_capture_speedtests = []

twitch_ips = []
vimeo_ips = []
youtube_ips = []

for glob, captures, ips in [
    ("twitch_*.pcap_Flow.csv", twitch_captures, twitch_ips),
    ("vimeo_*.pcap_Flow.csv", vimeo_captures, vimeo_ips),
    ("youtube_*.pcap_Flow.csv", youtube_captures, youtube_ips),
]:
    for path in Path("data/unprocessed").glob(glob):
        df = pd.read_csv(path, sep=",")
        df = df[((df['Total Fwd Packet'] > 30) | (df['Total Bwd packets'] > 30)) & (df["Protocol"] == 6)]
        df = df.drop(['Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol', 'Timestamp'], axis=1)
        df["Label"] = glob.split("_", 1)[0]
        ips.extend(df["Dst IP"].unique())
        captures.append(df)

for glob, captures, ips in [
    ("twitch_proxy_*.pcap_Flow.csv", twitch_capture_proxies, twitch_ips),
    ("twitch_proxy1_*.pcap_Flow.csv", twitch_capture_proxies1, twitch_ips),
    ("vimeo_proxy_*.pcap_Flow.csv", vimeo_capture_proxies, vimeo_ips),
    ("vimeo_proxy1_*.pcap_Flow.csv", vimeo_capture_proxies1, vimeo_ips),
    ("youtube_proxy_*.pcap_Flow.csv", youtube_capture_proxies, youtube_ips),
    ("youtube_proxy1_*.pcap_Flow.csv", youtube_capture_proxies1, youtube_ips)
]:
    for path in Path("data/unprocessed").glob(glob):
        df = pd.read_csv(path, sep=",")
        df = df[((df['Total Fwd Packet'] > 30) | (df['Total Bwd packets'] > 30)) & (df["Protocol"] == 6)]
        ips.extend(df["Dst IP"].unique())
        df = df.drop(['Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol', 'Timestamp'], axis=1)
        df["Label"] = glob.split("_", 1)[0]

for glob, captures, ips in [
    ("twitch_speedtest_*.pcap_Flow.csv", twitch_capture_speedtests, twitch_ips),
    ("vimeo_speedtest_*.pcap_Flow.csv", vimeo_capture_speedtests, vimeo_ips),
    ("youtube_speedtest_*.pcap_Flow.csv", youtube_capture_speedtests, youtube_ips),
]:
    for path in Path("data/unprocessed").glob(glob):
        df = pd.read_csv(path, sep=",")
        df = df[(df["Dst IP"].isin(ips)) & (df["Protocol"] == 6)]
        df = df.drop(['Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol', 'Timestamp'], axis=1)
        df["Label"] = glob.split("_", 1)[0]
        captures.append(df)

KeyError: 'Dst IP'

In [16]:
pd.concat(twitch_captures).to_csv("data/twitch_captures.csv")
pd.concat(twitch_capture_proxies).to_csv("data/twitch_capture_proxies.csv")
pd.concat(twitch_capture_proxies1).to_csv("data/twitch_capture_proxies1.csv")
pd.concat(twitch_capture_speedtests).to_csv("data/twitch_capture_speedtests.csv")
pd.concat(vimeo_captures).to_csv("data/vimeo_captures.csv")
pd.concat(vimeo_capture_proxies).to_csv("data/vimeo_capture_proxies.csv")
pd.concat(vimeo_capture_proxies1).to_csv("data/vimeo_capture_proxies1.csv")
pd.concat(vimeo_capture_speedtests).to_csv("data/vimeo_capture_speedtests.csv")
pd.concat(youtube_captures).to_csv("data/youtube_captures.csv")
pd.concat(youtube_capture_proxies).to_csv("data/youtube_capture_proxies.csv")
pd.concat(youtube_capture_proxies1).to_csv("data/youtube_capture_proxies1.csv")
pd.concat(youtube_capture_speedtests).to_csv("data/youtube_capture_speedtests.csv")

In [20]:
# from scapy.all import *
# 
# # rdpcap("/mnt/md0/cs190n/cs190n4/capture/f96448c3-e829-4752-bd3e-752097078e98/tmp/youtube_capture_0.pcap")
# # rdpcap("/mnt/md0/cs190n/cs190n4/capture/edae735b-ace9-45ef-b0ff-0b4ebade43b2/tmp/twitch_capture.pcap")
# foo = rdpcap("/mnt/md0/cs190n/cs190n4/capture/ac772645-dffb-4a44-957e-0b67ee4be6ab/tmp/youtube_capture_0.pcap")
# foo

<youtube_capture_0.pcap: TCP:3334 UDP:80068 ICMP:0 Other:29>

In [21]:
# foo[0]

<CookedLinuxV2  proto=IPv4 reserved=0 ifindex=4 lladdrtype=0x1 pkttype=sent-by-us lladdrlen=6 src=b'\x02\xb1\xdc\x80T\x8b' |<IP  version=4 ihl=5 tos=0x0 len=58 id=63199 flags=DF frag=0 ttl=127 proto=udp chksum=0xea18 src=10.113.5.215 dst=10.113.0.2 |<UDP  sport=57380 dport=domain len=38 chksum=0x1af2 |<DNS  id=7563 qr=0 opcode=QUERY aa=0 tc=0 rd=1 ra=0 z=0 ad=0 cd=0 rcode=ok qdcount=1 ancount=0 nscount=0 arcount=0 qd=[<DNSQR  qname=b'plausible.io.' qtype=A unicastresponse=0 qclass=IN |>] |>>>>

In [None]:
# result