In [86]:
# pyright: basic, reportUnknownVariableType=false, reportUnknownMemberType=false

import dask.bag as db
import json
from dask.distributed import Client
import dask.dataframe as dd
import pandas as pd
import xarray as xr

import pandas as pd
pd.set_option('display.max_columns', None)



In [None]:
# Create a Dask distributed client with default settings; the Dask
# collections computed later in this notebook run through it.
client = Client()
# Last expression of the cell: shows the link to the diagnostics dashboard.
client.dashboard_link

In [None]:
import dask.dataframe as dd

# Lazily open the ping measurement export as a Dask DataFrame.
# The path is relative to the notebook's working directory.
df = dd.read_parquet("data/ping-2025-06-10T0000.parquet-00000-of-00001.parquet")

# Preview the first rows to sanity-check the schema.
df.head(10)

In [None]:
# Non-null value count per column; .compute() triggers the actual read.
df.count().compute()

Version 5000 Ping / v6 Ping

View changes since previous version.

A measurement result is a JSON object with the following fields:
 - "fw" -- firmware
   - Counts:
      ```
      fw
      5082         830
      4900        1648
      4780        1806
      4910        1848
      4740        1923
      5000        6896
      5010       16548
      5060       21087
      5030       38281
      5020      238964
      5040      270900
      5100      702890
      4790      736576
      5110     1214654
      5080    12698733
      5090    23973322
      Name: count, dtype: int64
      ```
 - "af" -- address family, 4 or 6 (integer)
 - "avg" -- average round-trip time (float)
 - "bundle" -- [optional] instance ID for a collection of related measurement results (int)
 - "dst_addr" -- IP address of the destination (string)
 - "dst_name" -- name of the destination (string)
 - "dup" -- number of duplicate packets (int)
 - "from" -- IP address of the probe as known by the controller (string)
 - "group_id" -- [optional] If the measurement belongs to a group of measurements, the identifier of the group (int)
 - "lts" -- last time synchronised. How long ago (in seconds) the clock of the probe was found to be in sync with that of a controller. The value -1 is used to indicate that the probe does not know whether it is in sync (int)
 - "max" -- maximum round-trip time (float)
 - "min" -- minimum round-trip time (float)
 - "msm_id" -- measurement identifier (int)
   - Counts:
      ```
      msm_id
      108583549         1
      108568856         1
      108594069         1
      67653938          1
      108586930         1
                    ...  
      1015         194668
      1013         194668
      1012         194670
      1006         194672
      1011         194678
      Name: count, Length: 28675, dtype: int64
      ```
 - "msm_name" -- measurement type "Ping" (string)
 - "prb_id" -- source probe ID (int)
   - Counts:
      ```
      prb_id
      1002948        1
      1494           1
      1011307        1
      1011314        1
      1006442        1
      ...  
      7185       33676
      6379       33785
      6746       33872
      6875       34553
      6418       34555
      Name: count, Length: 13082, dtype: int64
      ```
 - "proto" -- "ICMP" (string)
 - "rcvd" -- number of packets received (int)
 - "result" -- variable content, depending on type of response (array of objects). objects have the following fields:
   - Case: Timeout
     - "x" -- "*" (string)
   - Case: Error
     - "error" -- description of error (string)
   - Case: Reply
     - "rtt" -- round-trip-time in milliseconds (float)
     - "src_Addr" -- [optional] source address if different from the source address in first reply (string)
     - "ttl" -- [optional] time-to-live reply if different from ttl in first reply (int)
     - "dup" -- [optional] signals that the reply is a duplicate (int)
 - "sent" -- number of packets sent (int)
 - "size" -- packet size (data part, not including IP and ICMP header) (int)
 - "src_addr" -- source address used by probe (string)
 - "timestamp" -- Unix timestamp (int)
 - "ttl" -- time-to-live field in the first reply (missing due to a bug) (int)
 - "ttr" -- time to resolve dst_name in milliseconds (float)
 - "type" -- "ping" (string)


In [None]:
# Show one example row, then the column dtypes as the cell's output.
display(df.head(1))
df.dtypes

In [20]:
# Dataset cleanup.
# NOTE(review): the author states that all src_addr / dst_addr values were
# verified to be valid addresses; that check is not part of this notebook.
# msm_name, type and proto are dropped because each holds a single constant
# value in this dataset ("Ping", "ping" and "ICMP" per the format notes above).
df = df.drop(['msm_name', 'type', 'proto'], axis=1)
# "af" is the address family (4 or 6); give it a more descriptive name.
# NOTE: this cell rebinds `df`, so it is not re-runnable without reloading the
# data first (the dropped columns no longer exist on a second run).
df = df.rename(columns={"af": "ip_typ"})


In [None]:
# Confirm the cleanup: one example row with the remaining columns.
display(df.head(1))

In [49]:
# Switch to short graph-style column names: the probe ID becomes the source
# node, the destination address the target node, and the Unix timestamp "ts".
# Both `df_rename` and `df_old` refer to the same renamed frame.
df_old = df_rename = df.rename(
    columns={
        "prb_id": "src",
        "dst_addr": "dst",
        "timestamp": "ts",
    }
)

# Keep only the columns needed for the latency analysis.
dfl = df_old[
    ["src", "dst", "ts", "avg", "result", "sent", "rcvd"]
]

In [None]:
# Preview the slimmed-down measurement frame.
dfl.head(10)

In [9]:
# Probe lookup table. The CSV is expected to provide the columns "ip" and
# "prb_id"; "prb_id" is renamed right away so that, once joined onto the
# measurements, it reads as the probe ID of the destination.
ip_map_pd = pd.read_csv("probe_ip_map.csv").rename(
    columns={"prb_id": "dst_prb_id"}
)

# Wrap the lookup in a Dask DataFrame so it can be merged with the Dask
# measurement frame (the table is small, so a single partition is fine).
ip_map = dd.from_pandas(ip_map_pd, npartitions=1)

In [None]:
# Preview the IP -> destination probe ID lookup.
ip_map.head(10)

In [None]:
# Resolve each destination address to a probe ID via the lookup table.
# A left join keeps measurements whose destination is not a known probe;
# those rows get the sentinel value -1 so the column can be cast to int.
dfl2 = dfl.merge(
    ip_map[["ip", "dst_prb_id"]],
    left_on="dst", right_on="ip",
    how="left"
)
dfl2["dst_prb_id"] = dfl2["dst_prb_id"].fillna(-1).astype(int)

# Keep only probe-to-probe measurements (destination resolved to a probe).
dfl3 = dfl2[dfl2["dst_prb_id"] != -1]

# Replace the destination address with the destination probe ID so that
# "src" and "dst" are both probe IDs.
dfl5 = dfl3[["src", "dst_prb_id", "ts", "avg", "result", "sent", "rcvd"]]
dfl6 = dfl5.rename(columns={"dst_prb_id": "dst"})

# Only the final frame is previewed: a bare `.head()` in the middle of a cell
# is never displayed, yet it still makes Dask execute the merge graph.
display(dfl6.head(10))
print(dfl6.count().compute())

In [None]:
# Distinct directed (src, dst) probe pairs, i.e. the edge list of the
# measurement graph.
edges = dfl6[["src", "dst"]].drop_duplicates()
edges.head(10)

In [None]:
# Number of distinct edges (non-null count per column).
edges.count().compute()

In [None]:
# Number of destinations per source probe. `edges` is already de-duplicated,
# so this is the count of distinct destinations each source measures.
conn_counts = edges.groupby("src").dst.count().compute()
# Best-connected sources first.
conn_counts_sorted = conn_counts.sort_values(ascending=False)
# Cell output: up to the 1000 best-connected sources.
conn_counts_sorted.head(1000)

In [64]:
# Probe IDs of the 100 sources with the most distinct destinations.
top_src = conn_counts.nlargest(100).index.tolist()

In [None]:
# Restrict to measurements whose source AND destination are both in
# `top_src`, and persist the result so later cells reuse it instead of
# recomputing the merge and filter.
ddf = dfl6[dfl6.src.isin(top_src) & dfl6.dst.isin(top_src)].persist()
display(ddf.head(10))
# Non-null count per column of the persisted subset.
print(ddf.count().compute())

In [None]:
# Measurements that received 1 or 2 replies.
# NOTE(review): presumably these are partial-loss cases out of 3 packets sent;
# confirm against the "sent" column.
ddf[(ddf["rcvd"] < 3) & (ddf["rcvd"] > 0)].head(10)

In [None]:

# Investigate the temporal structure and completeness of the persisted
# subset `ddf` (columns used here: src, dst, ts, avg).
print("=== Temporal Investigation ===")

# Check the timestamp range and distribution
ts_stats = ddf['ts'].describe().compute()
print("Timestamp statistics:")
print(ts_stats)

# Distinct timestamps: computed once here and reused for the completeness
# check further down.
ts_values = ddf['ts'].unique().compute()
ts_values_sorted = sorted(ts_values)
print(f"\nNumber of unique timestamps: {len(ts_values_sorted)}")

# Calculate time differences between consecutive timestamps
if len(ts_values_sorted) > 1:
    ts_diffs = [
        later - earlier
        for earlier, later in zip(ts_values_sorted, ts_values_sorted[1:])
    ]
    print(f"Time step differences (first 10): {ts_diffs[:10]}")
    print(f"Min time step: {min(ts_diffs)}")
    print(f"Max time step: {max(ts_diffs)}")
    # value_counts tallies every step in one pass; calling list.count once
    # per distinct step would rescan the whole list each time.
    most_common_step = pd.Series(ts_diffs).value_counts().idxmax()
    print(f"Most common time step: {most_common_step}")

# Check for missing combinations
print("\n=== Data Completeness Investigation ===")

# Get unique values for each dimension
unique_src = ddf['src'].unique().compute()
unique_dst = ddf['dst'].unique().compute()
unique_ts = ts_values

print(f"Unique sources: {len(unique_src)}")
print(f"Unique destinations: {len(unique_dst)}")
print(f"Unique timestamps: {len(unique_ts)}")

# Theoretical size of the full src x dst x ts grid (this includes src == dst
# combinations) versus the number of rows actually present.
theoretical_points = len(unique_src) * len(unique_dst) * len(unique_ts)
# len() on a Dask DataFrame sums the partition lengths; the full frame does
# not have to be collected into a single pandas object just to count rows.
actual_points = len(ddf)
print(f"Theoretical data points: {theoretical_points:,}")
print(f"Actual data points: {actual_points:,}")
if theoretical_points:
    print(f"Data completeness: {actual_points/theoretical_points*100:.2f}%")
else:
    # An empty subset has no grid to compare against.
    print("Data completeness: n/a (no data)")

# Check for self-loops (src == dst)
self_loops = ddf[ddf['src'] == ddf['dst']].compute()
print(f"Self-loops (src == dst): {len(self_loops)}")

# Sample some actual data to understand the structure
print("\n=== Sample Data Structure ===")
sample_data = ddf.head(20)
print("Sample data:")
print(sample_data[['src', 'dst', 'ts', 'avg']].to_string())

# Check for multiple measurements per src-dst-ts combination
duplicates = ddf.groupby(['src', 'dst', 'ts']).size().compute()
print("\nMultiple measurements per src-dst-ts combination:")
print(f"Max measurements per combination: {duplicates.max()}")
print(f"Mean measurements per combination: {duplicates.mean():.2f}")

In [None]:
# Number of rows for every (src, dst, ts) combination, shown as the cell output.
ddf.groupby(['src', 'dst', 'ts']).size().compute()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import xarray as xr

def calculate_temporal_variation_basic(df):
    """
    Print basic temporal statistics for the entire dataset and return the
    time-annotated data together with per-hour and per-weekday row counts.

    The caller's frame is left untouched: the derived columns are added to a
    copy, so running this repeatedly on the same input is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'ts' column holding Unix timestamps in seconds.

    Returns
    -------
    tuple
        ``(annotated, hourly_counts, daily_counts)`` where ``annotated`` is a
        copy of ``df`` with the extra columns 'datetime', 'hour' and
        'day_of_week', ``hourly_counts`` is the number of rows per hour of
        day, and ``daily_counts`` is the number of rows per day of week.
    """
    print("=== Basic Temporal Variation Analysis ===")

    # Convert timestamp to datetime for easier analysis. `assign` returns a
    # new frame, so the input is not modified.
    timestamps = pd.to_datetime(df['ts'], unit='s')
    df = df.assign(
        datetime=timestamps,
        hour=timestamps.dt.hour,
        day_of_week=timestamps.dt.dayofweek,
    )

    # Overall temporal statistics
    first_seen = df['datetime'].min()
    last_seen = df['datetime'].max()
    print(f"Time span: {first_seen} to {last_seen}")
    print(f"Total duration: {last_seen - first_seen}")
    print(f"Number of unique timestamps: {df['ts'].nunique()}")

    # Temporal distribution of measurements
    hourly_counts = df.groupby('hour').size()
    daily_counts = df.groupby('day_of_week').size()

    print(f"\nMeasurements per hour (mean): {hourly_counts.mean():.2f}")
    print(f"Measurements per hour (std): {hourly_counts.std():.2f}")
    print(f"Measurements per day (mean): {daily_counts.mean():.2f}")

    return df, hourly_counts, daily_counts

# Usage
# Collect the persisted Dask subset into a single in-memory pandas frame.
df_with_time = ddf.compute()  # Convert dask dataframe to pandas
# df_processed carries the extra 'datetime', 'hour' and 'day_of_week' columns;
# the two count series hold rows per hour of day and per day of week.
df_processed, hourly_counts, daily_counts = calculate_temporal_variation_basic(df_with_time)

def analyze_node_pair_temporal_variation(df, top_n_pairs=10):
    """
    Analyze temporal variation of latency for the most active node pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'src', 'dst', 'ts' (Unix seconds) and 'avg' columns.
    top_n_pairs : int, default 10
        Number of (src, dst) pairs, ranked by row count, to analyze.

    Returns
    -------
    pandas.DataFrame
        One row per analyzed pair, indexed by a ('src', 'dst') MultiIndex and
        ordered by descending row count. Columns: 'count',
        'hourly_variation', 'daily_variation', 'coefficient_of_variation',
        'autocorrelation', 'mean_latency', 'std_latency'. When the input has
        no pairs, an empty frame with the same columns and index names is
        returned.

    Notes
    -----
    All rows of a pair are used as-is; 'avg' is not filtered here.
    NOTE(review): if failed measurements are encoded with a non-positive
    'avg', they will skew these statistics -- confirm with the data owner.
    """
    print("=== Node Pair Temporal Variation Analysis ===")

    stat_columns = [
        'count',
        'hourly_variation',
        'daily_variation',
        'coefficient_of_variation',
        'autocorrelation',
        'mean_latency',
        'std_latency',
    ]

    # Find most active node pairs
    pair_counts = df.groupby(['src', 'dst']).size().sort_values(ascending=False)
    top_pairs = pair_counts.head(top_n_pairs)

    pair_keys = []
    pair_stats = []

    for (src, dst), count in top_pairs.items():
        # Sort once by time: lag-1 autocorrelation needs chronological order.
        pair_data = df[(df['src'] == src) & (df['dst'] == dst)].sort_values('ts')
        latency = pair_data['avg']
        measured_at = pd.to_datetime(pair_data['ts'], unit='s')

        # Mean of the within-bucket standard deviations.
        hourly_var = latency.groupby(measured_at.dt.hour).std().mean()
        daily_var = latency.groupby(measured_at.dt.dayofweek).std().mean()

        # Compute once; both feed the coefficient of variation and the output.
        mean_latency = latency.mean()
        std_latency = latency.std()
        cv = std_latency / mean_latency

        # Autocorrelation is undefined for fewer than two observations.
        autocorr = latency.autocorr() if len(latency) > 1 else np.nan

        pair_keys.append((src, dst))
        pair_stats.append({
            'count': count,
            'hourly_variation': hourly_var,
            'daily_variation': daily_var,
            'coefficient_of_variation': cv,
            'autocorrelation': autocorr,
            'mean_latency': mean_latency,
            'std_latency': std_latency,
        })

    # Build the summary with an explicit two-level index so that an empty
    # result still has the ('src', 'dst') index names instead of raising.
    variation_df = pd.DataFrame(pair_stats, columns=stat_columns)
    variation_df.index = pd.MultiIndex.from_tuples(pair_keys, names=['src', 'dst'])

    print("Temporal variation for top node pairs:")
    print(variation_df.round(3))

    return variation_df


In [None]:
def pairwise_time_variance_heatmaps(df):
    """
    Draw two side-by-side heatmaps of per-pair latency behaviour over time.

    Rows are sources and columns are destinations, both in sorted order.
    The left panel shows the standard deviation of 'avg' for each
    (src, dst) pair (``np.std`` with its default settings); the right panel
    shows the lag-1 autocorrelation of 'avg' ordered by 'ts'. Pairs with a
    single row stay NaN in both panels; autocorrelation additionally needs
    more than two rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'src', 'dst', 'ts' and 'avg' columns.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``.
    """
    # Sorted labels fix the row (source) and column (destination) positions.
    src_labels = sorted(df['src'].unique())
    dst_labels = sorted(df['dst'].unique())
    row_of = {label: pos for pos, label in enumerate(src_labels)}
    col_of = {label: pos for pos, label in enumerate(dst_labels)}

    # Start from all-NaN grids so pairs without enough data stay blank.
    grid_shape = (len(src_labels), len(dst_labels))
    stddev_matrix = np.full(grid_shape, np.nan)
    autocorr_matrix = np.full(grid_shape, np.nan)

    for (src, dst), group in df.groupby(['src', 'dst']):
        if len(group) <= 1:
            continue
        latencies = group.sort_values('ts')['avg'].values
        cell = (row_of[src], col_of[dst])
        stddev_matrix[cell] = np.std(latencies)
        # Autocorrelation (lag-1)
        if len(latencies) > 2:
            autocorr_matrix[cell] = pd.Series(latencies).autocorr()

    fig, axes = plt.subplots(1, 2, figsize=(18, 7))
    panels = [
        (
            axes[0],
            stddev_matrix,
            'Per-Pair Latency Stddev Over Time',
            'Stddev (ms)',
            dict(cmap='magma'),
        ),
        (
            axes[1],
            autocorr_matrix,
            'Per-Pair Latency Autocorrelation (Lag-1)',
            'Autocorrelation',
            dict(cmap='coolwarm', vmin=-1, vmax=1),
        ),
    ]
    for ax, matrix, title, bar_label, image_kwargs in panels:
        image = ax.imshow(matrix, aspect='auto', **image_kwargs)
        ax.set_title(title)
        ax.set_xlabel('Destination Index')
        ax.set_ylabel('Source Index')
        plt.colorbar(image, ax=ax, label=bar_label)

    plt.tight_layout()
    plt.show()

# Usage
# Render the per-pair heatmaps for the in-memory (pandas) measurement frame.
pairwise_time_variance_heatmaps(df_processed)

In [None]:
# generate a visualization of avg latencies between two specific nodes to get a sense of the probability distribution i am working with

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Rank every (src, dst) pair by its number of measurements, most active first.
node_pairs = ddf.groupby(['src', 'dst']).size().compute().sort_values(ascending=False)
# Self-loops (src == dst) are excluded from the candidates.
valid_pairs = node_pairs[
    node_pairs.index.get_level_values('src') != node_pairs.index.get_level_values('dst')
]
print("Top 10 most active node pairs (excluding self-loops):")
print(valid_pairs.head(10))

# Pick the single most active pair that has at least 20 measurements and plot
# the distribution of its positive 'avg' latencies.
if len(valid_pairs) == 0:
    print("No valid node pairs found (all are self-loops)")
else:
    active_pairs = valid_pairs[valid_pairs >= 20]
    if len(active_pairs) == 0:
        print("No node pairs found with sufficient data (>= 20 measurements)")
    else:
        # valid_pairs is sorted by count, so the first entry is the busiest pair.
        selected_pair = active_pairs.index[0]
        src, dst = selected_pair
        print(f"\nSelected node pair: {src} → {dst} with {active_pairs.iloc[0]} measurements")

        # Rows of this pair with a positive average latency.
        pair_data = ddf[
            (ddf['src'] == src) & (ddf['dst'] == dst) & (ddf['avg'] > 0)
        ].compute()

        if len(pair_data) == 0:
            print("No successful measurements found for this pair")
        else:
            # Summary statistics, computed once and reused for the plot
            # annotation and the printed report.
            latency = pair_data['avg']
            mean_latency = latency.mean()
            median_latency = latency.median()
            std_latency = latency.std()
            min_latency = latency.min()
            max_latency = latency.max()

            stats_text = "\n".join([
                f'Mean: {mean_latency:.2f} ms',
                f'Median: {median_latency:.2f} ms',
                f'Std Dev: {std_latency:.2f} ms',
                f'Range: {min_latency:.2f} - {max_latency:.2f} ms',
            ])

            # Simple histogram with the statistics boxed in the top-right corner.
            plt.figure(figsize=(10, 6))
            plt.hist(latency, bins=30, alpha=0.7, edgecolor='black', color='skyblue')
            plt.xlabel('Average Latency (ms)')
            plt.ylabel('Frequency')
            plt.title(f'Latency Distribution: Node {src} → Node {dst}\n({len(pair_data)} measurements)')
            plt.grid(True, alpha=0.3)
            plt.text(
                0.95, 0.95, stats_text,
                transform=plt.gca().transAxes,
                verticalalignment='top',
                horizontalalignment='right',
                bbox=dict(boxstyle='round', facecolor='white', alpha=0.8),
            )
            plt.tight_layout()
            plt.show()

            print(f"\nStatistics for Node {src} → Node {dst}:")
            print(f"Number of measurements: {len(pair_data)}")
            print(f"Mean latency: {mean_latency:.2f} ms")
            print(f"Median latency: {median_latency:.2f} ms")
            print(f"Standard deviation: {std_latency:.2f} ms")
            print(f"Min latency: {min_latency:.2f} ms")
            print(f"Max latency: {max_latency:.2f} ms")