In [4]:
# --- Core imports ---
import html
import ipaddress
import math
import re
from urllib.parse import urlsplit

import ipywidgets as widgets
import pandas as pd
import plotly.express as px
from IPython.display import display, HTML

# --- Optional: robust registrable-domain extraction via Public Suffix List ---
# If tldextract isn't installed, run: !pip -q install tldextract
try:
    import tldextract
    _TLDEXTRACT = tldextract.TLDExtract(cache_dir=False)  # no filesystem cache required
except Exception as e:
    _TLDEXTRACT = None
    print("WARNING: tldextract not available. Install with: !pip -q install tldextract")
    print("         Falling back to simple TLD parsing (less accurate for co.uk, etc).")


# -----------------------------
# URL parsing helpers
# -----------------------------
_SCHEME_RE = re.compile(r"^[a-zA-Z][a-zA-Z0-9+.-]*://")

def _normalize_url_for_parsing(u: str) -> str:
    """
    ZIA URLs often come without a scheme (e.g. incoming.telemetry.mozilla.org/submit/...)
    urlsplit needs a scheme to reliably separate netloc.
    """
    if u is None or (isinstance(u, float) and math.isnan(u)):
        return ""
    u = str(u).strip()
    if not u:
        return ""
    # Remove surrounding quotes if present
    u = u.strip('"').strip("'")

    # If it's scheme-less but looks like a host/path, prepend https://
    if not _SCHEME_RE.match(u):
        u = "https://" + u
    return u

def extract_host_domain_tld(raw_url: str) -> tuple[str, str, str]:
    """
    Split a raw URL value into host, registrable domain and public suffix.

    Returns:
      url_host: full host (incl. subdomain) from URL
      domain: registrable domain (e.g., github.com, cloudflare-dns.com)
      tld: public suffix (e.g., com, net, co.uk)

    All three are lower-case strings. ("", "", "") is returned when the value
    is blank, cannot be parsed, or has no host. IP-literal hosts have no
    registrable domain or suffix, so they come back as (ip, ip, "").
    """
    u = _normalize_url_for_parsing(raw_url)
    if not u:
        return ("", "", "")

    try:
        # .hostname drops userinfo, port and IPv6 brackets.
        host = (urlsplit(u).hostname or "").lower()
    except Exception:
        # Best effort: a malformed URL yields an empty result, not a crash.
        return ("", "", "")

    # Strip trailing dot if any
    host = host.rstrip(".")

    if not host:
        return ("", "", "")

    # IP literals must not be split on dots: the naive fallback below would
    # otherwise report e.g. 192.168.1.10 as domain "1.10" with tld "10".
    try:
        ipaddress.ip_address(host)
    except ValueError:
        pass  # not an IP literal -> regular hostname handling
    else:
        return (host, host, "")

    # Use PSL-aware extraction if available
    if _TLDEXTRACT is not None:
        ext = _TLDEXTRACT(host)
        tld = (ext.suffix or "").lower()
        # ext.domain + ext.suffix = registrable domain
        if ext.domain and ext.suffix:
            domain = f"{ext.domain}.{ext.suffix}".lower()
        elif ext.domain:
            domain = ext.domain.lower()
        else:
            domain = host  # fallback
        return (host, domain, tld)

    # Fallback: naive split (less accurate for multi-part suffixes)
    labels = host.split(".")
    if len(labels) >= 2:
        tld = labels[-1]
        domain = ".".join(labels[-2:])
    else:
        # Single-label host (e.g. "localhost"): no suffix to report.
        tld = ""
        domain = host

    return (host, domain, tld)


# -----------------------------
# Load datasets
# -----------------------------
LAB_PATH = "../data/lab_dprk.csv"
MAJ_PATH = "../data/majestic_million.csv"

df = pd.read_csv(LAB_PATH, low_memory=False)
df.columns = df.columns.str.strip()
if "URL" not in df.columns:
    # Fail with a readable message instead of a bare KeyError further down.
    raise KeyError(f"Expected a 'URL' column in {LAB_PATH}; found: {list(df.columns)}")

maj = pd.read_csv(MAJ_PATH)
maj.columns = maj.columns.str.strip()  # same header clean-up as the lab data
# Majestic has a "Domain" column in your header sample, and also "TLD".
# dropna() first: astype(str) would otherwise put the literal "nan" into the sets.
maj_domains = set(maj["Domain"].dropna().astype(str).str.lower().str.strip())
maj_tlds = set(maj["TLD"].dropna().astype(str).str.lower().str.strip())

# -----------------------------
# Parse URL column -> host/domain/tld
# -----------------------------
# NOTE: column name in your headers is exactly "URL"
_PARSED_COLS = ["url_host", "domain", "tld"]
parsed = df["URL"].apply(extract_host_domain_tld)
# Explicit column names keep the frame three columns wide even when the log
# has zero rows (an empty list alone would produce a frame with no columns
# and make the assignment below fail).
df[_PARSED_COLS] = pd.DataFrame(parsed.tolist(), index=df.index, columns=_PARSED_COLS)

# Enrichment flags: membership of the parsed domain / suffix in the Majestic lists
df["in_majestic"] = df["domain"].isin(maj_domains)
df["tld_in_majestic"] = df["tld"].isin(maj_tlds)

# Convenience metrics for hunting (plain negations of the flags above)
df["is_uncommon_domain"] = ~df["in_majestic"]
df["is_uncommon_tld"] = ~df["tld_in_majestic"]

# Quick sanity check with your examples:
_examples = [
    "github.com/anthropics/skills",
    "mozilla.cloudflare-dns.com/dns-query",
]
print("Sanity checks:")
for x in _examples:
    h, d, t = extract_host_domain_tld(x)
    print(f"  {x} -> host={h} | domain={d} | tld={t}")

display(df[["URL","url_host","domain","tld","in_majestic","is_uncommon_tld"]].head(5))

Sanity checks:
  github.com/anthropics/skills -> host=github.com | domain=github.com | tld=com
  mozilla.cloudflare-dns.com/dns-query -> host=mozilla.cloudflare-dns.com | domain=cloudflare-dns.com | tld=com


Unnamed: 0,URL,url_host,domain,tld,in_majestic,is_uncommon_tld
0,ssl.gstatic.com,ssl.gstatic.com,gstatic.com,com,True,False
1,self.events.data.microsoft.com/onecollector/1.0/,self.events.data.microsoft.com,microsoft.com,com,True,False
2,ssl.gstatic.com,ssl.gstatic.com,gstatic.com,com,True,False
3,detectportal.firefox.com/canonical.html,detectportal.firefox.com,firefox.com,com,True,False
4,detectportal.firefox.com/success.txt?ipv4,detectportal.firefox.com,firefox.com,com,True,False


In [5]:
# -----------------------------
# UI controls
# -----------------------------
def _flag_checkbox(label: str, checked: bool) -> "widgets.Checkbox":
    """Build an un-indented checkbox with the given label and initial state."""
    return widgets.Checkbox(value=checked, description=label, indent=False)


def _with_all_option(values) -> list:
    """Return dropdown options for `values`, preceded by the catch-all ("All", "") entry."""
    return [("All", "")] + [(v, v) for v in values]


# --- Toggles -----------------------------------------------------------
show_only_not_in_maj = _flag_checkbox("Show ONLY domains NOT in Majestic Million", False)
highlight_uncommon_tld = _flag_checkbox("Highlight uncommon TLDs", True)
highlight_uncommon_domain = _flag_checkbox("Highlight domains not in Majestic", True)

# --- View selection ----------------------------------------------------
_VIEW_OPTIONS = [
    ("Top Domains (Stack Count)", "top_domains"),
    ("Top TLDs (Stack Count)", "top_tlds"),
    ("Least Frequency of Occurrence (Domains)", "lfo_domains"),
    ("Least Frequency of Occurrence (TLDs)", "lfo_tlds"),
    ("Raw Events (Filtered)", "raw"),
]
view_mode = widgets.Dropdown(options=_VIEW_OPTIONS, value="top_domains", description="View:")

# continuous_update=False: the value only changes once the slider is released.
top_n = widgets.IntSlider(
    value=20,
    min=5,
    max=100,
    step=5,
    description="Top N:",
    continuous_update=False,
)

# --- Row filters -------------------------------------------------------
search_text = widgets.Text(
    value="",
    placeholder="Filter: domain contains (e.g. discord, github, telemetry)",
    description="Search:",
)

# Blank suffixes are left out of the TLD choices; "" is reserved for "All".
tld_filter = widgets.Dropdown(
    options=_with_all_option(t for t in sorted(df["tld"].dropna().unique()) if t),
    value="",
    description="TLD:",
)

policy_filter = widgets.Dropdown(
    options=_with_all_option(sorted(df["Policy Action"].dropna().unique())),
    value="",
    description="Policy:",
)

method_filter = widgets.Dropdown(
    options=_with_all_option(sorted(df["Request Method"].dropna().unique())),
    value="",
    description="Method:",
)

# Everything the dashboard draws goes into this output area.
out = widgets.Output()

# --- Layout: one HBox per row of controls --------------------------------
_control_rows = [
    [view_mode, top_n],
    [search_text, tld_filter],
    [policy_filter, method_filter],
    [show_only_not_in_maj],
    [highlight_uncommon_domain, highlight_uncommon_tld],
]
controls = widgets.VBox([widgets.HBox(row) for row in _control_rows])

display(controls, out)


# -----------------------------
# Rendering helpers
# -----------------------------
def _apply_filters(df0: pd.DataFrame) -> pd.DataFrame:
    d = df0

    q = (search_text.value or "").strip().lower()
    if q:
        d = d[d["domain"].fillna("").str.contains(q, case=False, na=False)]

    if tld_filter.value:
        d = d[d["tld"] == tld_filter.value]

    if policy_filter.value:
        d = d[d["Policy Action"] == policy_filter.value]

    if method_filter.value:
        d = d[d["Request Method"] == method_filter.value]

    if show_only_not_in_maj.value:
        d = d[~d["in_majestic"]]

    return d


def _style_table(d: pd.DataFrame) -> "pd.io.formats.style.Styler":
    """
    Build a styled view of the event rows in `d`.

    Only the columns from the fixed display list that exist in `d` are shown.
    Depending on the highlight checkboxes, rows whose `in_majestic` flag is
    false get an amber background and TLD cells whose value is not in
    `maj_tlds` get a red left border. Cell text is HTML-escaped before display.

    Returns:
      a pandas Styler over a copy of the selected columns.
    """
    show_cols = [
        "Logged Time", "User", "URL", "domain", "tld",
        "Request Method", "Response Code", "Policy Action",
        "Total Bytes", "in_majestic"
    ]
    show_cols = [c for c in show_cols if c in d.columns]
    view = d[show_cols].copy()

    n_cols = len(view.columns)
    # The TLD cell sits at the same position in every row: resolve it once
    # here instead of once per row inside row_style.
    tld_idx = list(view.columns).index("tld") if "tld" in view.columns else None

    def row_style(row):
        # Must return EXACTLY one style string per column
        styles = [""] * n_cols

        # highlight uncommon domain (entire row amber)
        if highlight_uncommon_domain.value and (not bool(row.get("in_majestic", True))):
            styles = ["background-color: #fff3cd;"] * n_cols

        # highlight uncommon TLD with a red stripe on the TLD cell
        if highlight_uncommon_tld.value and tld_idx is not None:
            tld_val = str(row.get("tld", "")).lower()
            if tld_val and (tld_val not in maj_tlds):
                styles[tld_idx] += "border-left: 6px solid #dc3545;"

        return styles

    sty = (
        view.style
        # URL / User values come straight from proxy logs; Styler does not
        # escape cell text unless asked, so markup in a logged URL would
        # otherwise be rendered as live HTML in the notebook.
        .format(escape="html")
        .apply(row_style, axis=1)
        .set_properties(**{"font-size": "12px"})
    )
    return sty


# Wording that differs between the domain and the TLD frequency views.
_FREQ_VIEW_LABELS = {
    "domain": {"plural": "Domains", "sample_prefix": "Sample events for: ", "value_prefix": ""},
    "tld": {"plural": "TLDs", "sample_prefix": "Sample events for TLD: ", "value_prefix": "."},
}


def _render_frequency_view(d: pd.DataFrame, column: str, least_frequent: bool, n: int) -> None:
    """Draw the top-N / bottom-N bar chart for `column` plus sample events for its first entry."""
    labels = _FREQ_VIEW_LABELS[column]

    # Occurrences per non-blank value, most frequent first.
    counts = (
        d[column]
        .fillna("")
        .replace("", pd.NA)
        .dropna()
        .value_counts()
        .rename_axis(column)
        .reset_index(name="count")
    )

    if least_frequent:
        # Least frequent = long tail (count ascending)
        counts = counts.sort_values("count", ascending=True).head(n)
        title = f"LFO {labels['plural']} (least frequent) — bottom {n}"
    else:
        counts = counts.head(n)
        title = f"Top {labels['plural']} (stack count) — top {n}"

    fig = px.bar(counts, x="count", y=column, orientation="h", title=title)
    fig.update_layout(height=450, margin=dict(l=10, r=10, t=50, b=10))
    display(fig)

    # Table: show a few events for the most interesting value (first row)
    if len(counts) > 0:
        focus = counts.iloc[0][column]
        sample = d[d[column] == focus].head(25)
        # The value is derived from logged URLs, i.e. untrusted text: escape it
        # before embedding it in markup.
        shown = html.escape(f"{labels['value_prefix']}{focus}")
        display(HTML(
            f"<h4 style='margin-top:8px;'>{labels['sample_prefix']}<code>{shown}</code></h4>"
        ))
        display(_style_table(sample))


def _render():
    """
    Redraw the dashboard inside the `out` widget from the current control values.

    Clears the previous output, applies the filters to the global `df`, shows
    four summary cards and then the view selected in `view_mode`: a frequency
    chart with sample events (domains or TLDs), or the first 200 filtered rows.
    """
    with out:
        out.clear_output()

        d = _apply_filters(df)

        # Header stats
        total = len(d)
        not_in_maj = int((~d["in_majestic"]).sum())
        uniq_domains = d["domain"].nunique(dropna=True)
        uniq_tlds = d["tld"].nunique(dropna=True)

        display(HTML(
            f"""
            <div style="display:flex; gap:16px; flex-wrap:wrap; margin:6px 0 10px 0;">
              <div style="padding:10px 12px; border:1px solid #ddd; border-radius:12px;">
                <div style="font-size:12px; color:#666;">Filtered Events</div>
                <div style="font-size:22px; font-weight:700;">{total:,}</div>
              </div>
              <div style="padding:10px 12px; border:1px solid #ddd; border-radius:12px;">
                <div style="font-size:12px; color:#666;">Unique Domains</div>
                <div style="font-size:22px; font-weight:700;">{uniq_domains:,}</div>
              </div>
              <div style="padding:10px 12px; border:1px solid #ddd; border-radius:12px;">
                <div style="font-size:12px; color:#666;">Unique TLDs</div>
                <div style="font-size:22px; font-weight:700;">{uniq_tlds:,}</div>
              </div>
              <div style="padding:10px 12px; border:1px solid #ddd; border-radius:12px;">
                <div style="font-size:12px; color:#666;">NOT in Majestic</div>
                <div style="font-size:22px; font-weight:700;">{not_in_maj:,}</div>
              </div>
            </div>
            """
        ))

        mode = view_mode.value
        n = int(top_n.value)

        if mode in ("top_domains", "lfo_domains"):
            _render_frequency_view(d, "domain", mode == "lfo_domains", n)

        elif mode in ("top_tlds", "lfo_tlds"):
            _render_frequency_view(d, "tld", mode == "lfo_tlds", n)

        else:
            # Raw table view
            display(HTML("<h4 style='margin-top:8px;'>Filtered events</h4>"))
            display(_style_table(d.head(200)))


# wire up reactive updates
def _on_control_change(change):
    """Observer callback: redraw the dashboard; the change payload itself is not used."""
    _render()


# Every control below triggers a redraw when its `value` trait changes.
_reactive_controls = (
    show_only_not_in_maj,
    highlight_uncommon_tld,
    highlight_uncommon_domain,
    view_mode,
    top_n,
    search_text,
    tld_filter,
    policy_filter,
    method_filter,
)
for w in _reactive_controls:
    w.observe(_on_control_change, names="value")

# Initial draw, so the output area is populated before any interaction.
_render()

VBox(children=(HBox(children=(Dropdown(description='View:', options=(('Top Domains (Stack Count)', 'top_domain…

Output()