In [2]:
import os
import glob
import re
import os        
from scipy.optimize import curve_fit
from scipy.stats import norm
import numpy as np
import plotly.graph_objects as go
import pandas as pd
from plotly.subplots import make_subplots
from collections import defaultdict

In [None]:
# Parameters for plotting

decoder_colors = {
    'uf': [
        "#6baed6", "#4292c6","#3182bd", "#1f77b4", "#2171b5", 
        "#08519c", "#08306b", "#08519c", "#08306b", "#08306b",
    ],
    'clayg': [
        "#fdae6b", "#ffbb78", "#ff8c00", "#fd8d3c", "#ffa726",
        "#f16913", "#ff7f0e", "#d95f02", "#d94801", "#a63603",    
    ],
    'sl_clayg': [
        "#31a354", "#74c476", "#238b45", "#31a354", "#74c476",
        "#006d2c", "#00441b", "#006d2c", "#00441b", "#006d2c",
    ],
    'clayg_stop_early' : [
        "#525252", "#252525", "#737373", "#525252", "#252525",
        "#000000",
    ],
    'other': [
        "#e377c2", "#d62728", "#ff9896", "#c51b7d", "#8c564b",
        "#e377c2", "#d62728", "#ff9896", "#c51b7d", "#8c564b",
    ]
}

other_decoders = ['clayg_third_growth', 'clayg_faster_backwards_growth', 'sl_clayg_third_growth']
for decoder in other_decoders:
    decoder_colors[decoder] = decoder_colors['other']

decoder_names = {
    'uf': 'UF',
    'clayg': 'ClAYG',
    'sl_clayg': 'Single Layer ClAYG',
    'clayg_third_growth': 'ClAYG ⅓ Growth',
    'clayg_faster_backwards_growth': 'ClAYG w/ Faster Backwards Growth',
    'sl_clayg_third_growth': 'Single Layer ClAYG ⅓ Growth',
    'clayg_stop_early': 'ClAYG Stop Early'
}

In [4]:
class Plot:
    fig : go.Figure
    title : str
    def __init__(self, fig, title):
        self.fig = fig
        self.title = title
        self.fig.update_layout(
            title=self.title,
        )
    
    def show(self):
        # render into html
        html = self.fig.to_html(full_html=True, include_plotlyjs='cdn')
        html = html.replace('<head>', f'<head><title>{self.title}</title><meta name="viewport" content="width=device-width, initial-scale=1">')
        # open in browser
        file_name = self.title.replace(',', '').replace(' ', '_').lower()
        file_name = f'plots/{file_name}.html'
        with open(file_name, 'w') as f:
            f.write(html)
        os.system(f'xdg-open "{file_name}"')

In [5]:
def collect_data_old(base_dir, plot_ids) -> pd.DataFrame:
    data = pd.DataFrame(columns=["metric", "decoder", "distance", "p", "value"])

    for plot_id in plot_ids:
        plot_folders = [f for f in glob.glob(os.path.join(base_dir, f"{plot_id}-*")) if os.path.isdir(f)]

        if not plot_folders:
            print(f"No folders found for plot_id {plot_id}")
            continue
        
        folder = plot_folders[0]
        files = glob.glob(os.path.join(folder, "*.txt"))

        pattern = re.compile(r"(average_operations|results)_(\w+)_d=(\d+)\.txt")
        for file in files:
            match = pattern.match(os.path.basename(file))
            if not match:
                continue
            metric, decoder, distance = match.groups()
            distance = int(distance)
            with open(file, 'r') as f:
                lines = f.readlines()
                for line in lines:
                    if line.strip():
                        parts = line.strip().split()
                        if len(parts) == 2:
                            key, value = parts
                            # check if line is header or not
                            if key == "p":
                                continue
                            try:
                                p = float(key)
                                value = float(value)
                            except ValueError:
                                print(f"Skipping line with non-numeric key or value: {line.strip()}")
                                continue
                            # add data to dataframe
                            data.loc[len(data)] = {
                                "metric": metric,
                                "decoder": decoder,
                                "distance": distance,
                                "p": p,
                                "value": value
                            }
    return data

class Data:
    results : pd.DataFrame
    steps : pd.DataFrame

def collect_data(base_dirs, plot_ids) -> Data:
    data = Data()
    data.results = pd.DataFrame(columns=["decoder", "distance", "p", "l", "n"])
    data.steps = pd.DataFrame(columns=["decoder", "distance", "p", "value", "occurences"])

    plot_folders = []
    for base_dir in base_dirs:
        if not plot_ids:
            plot_folders.extend([f for f in glob.glob(os.path.join(base_dir, "*")) if os.path.isdir(f)])
        else:
            for plot_id in plot_ids:
                plot_folders.extend([f for f in glob.glob(os.path.join(base_dir, f"{plot_id}*")) if os.path.isdir(f)])
    
    for folder in plot_folders:
        results_files = glob.glob(os.path.join(folder, "results", "*.txt"))
        steps_files = glob.glob(os.path.join(folder, "steps", "*.txt"))  
        
        results_file_pattern = re.compile(r"(\w+)_d=(\d+)\.txt")
        for file in results_files:
            results_match = results_file_pattern.match(os.path.basename(file))
            if not results_match:
                continue
            decoder, distance = results_match.groups()
            distance = int(distance)
            with open(file, 'r') as f:
                lines = f.readlines()
                for line in lines:
                    if line.strip():
                        parts = line.strip().split()
                        try:
                            if len(parts) == 2:
                                key, value = parts
                                n = None
                            elif len(parts) == 3:
                                key, value, n = parts
                            else:
                                raise ValueError("Unexpected number of parts in line")
                            p = float(key)
                            l = float(value)
                            n = float(n) if not n == None else None
                        except ValueError:
                            print(f"Skipping line with NaN: {line.strip()}")
                            continue
                        # add data to dataframe
                        data.results.loc[len(data.results)] = {
                            "decoder": decoder,
                            "distance": distance,
                            "p": p,
                            "l": l,
                            "n": n
                        }
                                
        steps_file_pattern = re.compile(r"(\w+)_d=(\d+)_p=(\d+.\d+)\.txt")
        for file in steps_files: 
            steps_match = steps_file_pattern.match(os.path.basename(file))
            if not steps_match:
                continue
            decoder, distance, p = steps_match.groups()
            distance = int(distance)
            p = float(p)
            with open(file, 'r') as f:
                lines = f.readlines()
                for line in lines:
                    if line.strip():
                        parts = line.strip().split()
                        if len(parts) == 2:
                            key, value = parts
                            try:
                                steps = float(key)
                                occurences = int(value)
                            except ValueError:
                                print(f"Skipping line with NaN: {line.strip()}")
                                continue
                            # add data to dataframe
                            data.steps.loc[len(data.steps)] = {
                                "decoder": decoder,
                                "distance": distance,
                                "p": p,
                                "value": steps,
                                "occurences": occurences
                            }
                
    return data


In [6]:
def threshold_plot(results) -> Plot:
    fig = go.Figure()

    for (decoder, distance), values in results.items():
        colors = decoder_colors.get(decoder)
        decoder_name = decoder_names.get(decoder)
        ps = list(values.keys())
        ls, ns = zip(*(values.values()))
        ls = list(ls)
        ns = list(ns)

        sorted_indices = np.argsort(ps)
        ps = np.array(ps)[sorted_indices]
        ls = np.array(ls)[sorted_indices]
        
        # Compute Wilson score uncertainties
        # phat is the estimated proportion of failures
        z = norm.ppf(1 - 0.05 / 2)  # for 95% confidence
        ns = [n if not n == None and not np.isnan(n) else 200000 for n in ns]
        sigma = [1 / (1 + z**2 / n) * (l + z/(2*n)*(z + np.sqrt(4*n*l*(1-l))+z**2)) for l, n in zip(ls, ns)]
        sigma = np.array(sigma)
        
        fig.add_trace(go.Scatter(
            x=ps,
            y=ls,
            error_y=dict(
                type='data',
                array=sigma,
                visible=False,
                thickness=1.5,
                width=3
            ),
            mode='lines+markers',
            name=f"{decoder_name} d={distance}",
            line=dict(color=colors[distance % len(colors)], width=2),
            marker=dict(size=5),
            legendgroup=decoder_name,
            legendgrouptitle_text=decoder_name,
            hovertemplate=f"{decoder_name} d={distance}<br>p: %{{x:.2e}}<br>L: %{{y:.2e}}",
            showlegend=True,
        ))
                

        if len(ps) < 2:
            print(f"Not enough data points for fitting for {decoder_name} d={distance}")
            continue

        def power_law(x, a, b):
            return a * np.power(x, b)

        try:
            popt, pcov = curve_fit(
                power_law, ps, ls, sigma=sigma, absolute_sigma=True
            )
            a, b = popt
            print(f"{decoder_name} d={distance}, d/2={(distance)/2}, b={b:.4f}")
            fit_x = np.linspace(min(ps), max(ps), 100)
            fit_y = power_law(fit_x, *popt)
            fig.add_trace(go.Scatter(
                x=fit_x,
                y=fit_y,
                mode='lines',
                name=f"{decoder_name} fit d={distance}",
                line=dict(color=colors[distance % len(colors)], width=1, dash='dash'),
                legendgroup=f"{decoder_name} fit",
                legendgrouptitle_text=f"{decoder_name} fit",
                hovertemplate=f"{decoder_name} fit d={distance}: <br> parameters: c={b:.4f}<br> d/2={(distance)/2}",
                showlegend=True,
            ))
        except Exception as e:
            print(f"Error fitting data for {decoder_name} d={distance}: {e}")
        
    # Set legend groups ending with fit to not selected by default
    for trace in fig.data:
        if 'fit' in trace.name:
            trace.visible = 'legendonly'

    fig.update_layout(
        title=f"Results",
        legend_title="Decoder",
        template="plotly_white",
        xaxis=dict(type='log', title='p (log scale)'),
        yaxis=dict(type='log', title='L (log scale)'),
    )
    
    return Plot(fig, "Threshold Plot")

In [8]:
def plot_average_operations_against_p(average_operations) -> Plot:
    fig = go.Figure()

    for (decoder, distance), values in average_operations.items():
        colors = decoder_colors.get(decoder)
        decoder_name = decoder_names.get(decoder)
        x = list(values.keys())
        y = list(values.values())
        # sort by x
        sorted_indices = np.argsort(x)
        x = np.array(x)[sorted_indices]
        y = np.array(y)[sorted_indices]
        print(f"Plotting {decoder_name} d={distance}, data points: {len(x)}")
        fig.add_trace(go.Scatter(
            x=x,
            y=y,
            mode='lines+markers',
            name=f"{decoder_name} d={distance}",
            line=dict(color=colors[distance % len(colors)], width=2),
            marker=dict(size=5),
            legendgroup=decoder_name,
            legendgrouptitle_text=decoder_name,
            hovertemplate=f"{decoder_name} d={distance}<br>p: %{{x:.2e}}<br>avg. # of operations: %{{y:.3}}",
            showlegend=True,
        ))

    fig.update_layout(
        legend_title="Decoder",
        template="plotly_white",
        xaxis=dict(type='linear', title='p (linear scale)'),
        yaxis=dict(type='linear', title='Average Operations (linear scale)'),
    )
    
    return Plot(fig, "Average Operations against p, grouped by distance")

def plot_average_operations_against_d(average_operations) -> Plot:
    fig = go.Figure()

    for (decoder, p), values in average_operations.items():
        colors = decoder_colors.get(decoder)
        decoder = decoder_names.get(decoder)
        x = list(values.keys())
        y = list(values.values())
        # sort by x
        sorted_indices = np.argsort(x)
        x = np.array(x)[sorted_indices]
        y = np.array(y)[sorted_indices]
        print(f"Plotting {decoder} p={p}, data points: {len(x)}")
        fig.add_trace(go.Scatter(
            x=x,
            y=y,
            mode='lines+markers',
            name=f"{decoder} p={p}",
            line=dict(color=colors[int(p*1000) % len(colors)], width=2),
            marker=dict(size=5),
            legendgroup=decoder,
            legendgrouptitle_text=decoder,
            hovertemplate=f"{decoder} p={p}<br>d: %{{x}}<br>avg. # of operations: %{{y:.3}}",
            showlegend=True,
        ))

    fig.update_layout(
        legend_title="Decoder",
        template="plotly_white",
        xaxis=dict(type='linear', title='distance d (linear scale)'),
        yaxis=dict(type='linear', title='Average Operations (log scale)'),
    )
    
    return Plot(fig, "Average Operations against d, grouped by p")

def plot_step_distribution_by_p(steps) -> Plot:
    # Get all unique p values
    ps = set(p for (_, _, p) in steps.keys())
    if len(ps) > 1:
        # Multiple p: group by (decoder, distance), columns for each p
        group = defaultdict(list)
        for decoder, distance, p in steps.keys():
            group[(decoder, distance)].append(p)
        row_keys = list(group.keys())
        max_cols = max(len(ps) for ps in group.values())
        fig = make_subplots(
            rows=len(row_keys), cols=max_cols,
            subplot_titles=[
                f"{decoder_names.get(decoder, decoder)} d={distance}, p={p:.1e}" if p is not None else ""
                for (decoder, distance), ps in group.items() for p in (sorted(ps) if len(ps) == max_cols else sorted(ps) + [None] * (max_cols - len(ps)))
            ],
            horizontal_spacing=0.08, vertical_spacing=0.02
        )
        for row, (decoder, distance) in enumerate(row_keys, start=1):
            ps_sorted = sorted(group[(decoder, distance)])
            for col, p in enumerate(ps_sorted, start=1):
                values = steps[(decoder, distance, p)]
                x = []
                for step_val, count in sorted(values.items()):
                    x.extend([step_val] * count)
                colors = decoder_colors.get(decoder)
                fig.add_trace(
                    go.Histogram(
                        x=x,
                        marker_color=colors[distance % len(colors)],
                        showlegend=False
                    ),
                    row=row, col=col
                )
    else:
        # Only one p: group by decoder, columns for each distance
        group = defaultdict(list)
        for decoder, distance, p in steps.keys():
            group[decoder].append(distance)
        row_keys = list(group.keys())
        max_cols = max(len(ds) for ds in group.values())
        fig = make_subplots(
            rows=len(row_keys), cols=max_cols,
            subplot_titles=[
                f"{decoder_names.get(decoder, decoder)} d={distance}" if distance is not None else ""
                for decoder, ds in group.items() for distance in (sorted(ds) if len(ds) == max_cols else sorted(ds)+ [None]*(max_cols - len(ds)))
            ],
            horizontal_spacing=0.08, vertical_spacing=0.04
        )
        for row, decoder in enumerate(row_keys, start=1):
            ds_sorted = sorted(group[decoder])
            for col, distance in enumerate(ds_sorted, start=1):
                # Since only one p, get it
                p = next(p for (d, dep, p) in steps.keys() if d == decoder and dep == distance)
                values = steps[(decoder, distance, p)]
                x = []
                for step_val, count in sorted(values.items()):
                    x.extend([step_val] * count)
                colors = decoder_colors.get(decoder)
                fig.add_trace(
                    go.Histogram(
                        x=x,
                        marker_color=colors[distance % len(colors)],
                        showlegend=False
                    ),
                    row=row, col=col
                )
    fig.update_layout(
        template="plotly_white",
        height=300 * len(row_keys),
        width=500 * max_cols
    )
    for i in range(1, len(row_keys) + 1):
        fig.update_yaxes(title_text="Count", row=i, col=1)
    for j in range(1, max_cols + 1):
        fig.update_xaxes(title_text="Steps", row=1, col=j)
    
    return Plot(fig, "Step Distribution by Decoder, Distance, and p")

In [9]:
base_dirs = [
    "../data/ccluster/steps_clayg_stop_early",
]

data = collect_data(base_dirs, [])

steps = data.steps.groupby(['decoder', 'distance', 'p']).apply(lambda x: x.set_index('value')['occurences'].to_dict()).to_dict()

def weighted_median(df):
    expanded = []
    for value, count in zip(df['value'], df['occurences']):
        expanded.extend([value] * count)
    return np.median(expanded)

median_steps = (
    data.steps
    .groupby(['decoder', 'distance', 'p'])
    .apply(weighted_median)
    .reset_index(name='avg_steps')
)

median_steps_by_d = (
    median_steps
    .pivot_table(index=['decoder', 'p'], columns='distance', values='avg_steps')
    .apply(lambda row: row.dropna().to_dict(), axis=1)
    .to_dict()
)

median_steps_by_p = (
    median_steps
    .pivot_table(index=['decoder', 'distance'], columns='p', values='avg_steps')
    .apply(lambda row: row.dropna().to_dict(), axis=1)
    .to_dict()
)


avg_steps = (
    data.steps
    .groupby(['decoder', 'distance', 'p'])
    .apply(lambda x: np.average(x['value'], weights=x['occurences']))
    .reset_index(name='avg_steps')
)

average_steps_by_d = (
    avg_steps
    .pivot_table(index=['decoder', 'p'], columns='distance', values='avg_steps')
    .apply(lambda row: row.dropna().to_dict(), axis=1)
    .to_dict()
)

average_steps_by_p = (
    avg_steps
    .pivot_table(index=['decoder', 'distance'], columns='p', values='avg_steps')
    .apply(lambda row: row.dropna().to_dict(), axis=1)
    .to_dict()
)


plot_step_distribution_by_p(steps).show()
plot_average_operations_against_d(average_steps_by_d).show()
plot_average_operations_against_p(average_steps_by_p).show()


median_plot = plot_average_operations_against_d(median_steps_by_d)
median_plot.title = "Median Operations against d, grouped by p"
median_plot.show()

median_plot = plot_average_operations_against_p(median_steps_by_p)
median_plot.title = "Median Operations against p, grouped by d"
median_plot.show()

  steps = data.steps.groupby(['decoder', 'distance', 'p']).apply(lambda x: x.set_index('value')['occurences'].to_dict()).to_dict()
  .apply(weighted_median)
  .apply(lambda x: np.average(x['value'], weights=x['occurences']))


TypeError: object of type 'NoneType' has no len()

In [7]:
base_dirs = [
    "../data/ccluster/results",
    "../data/ccluster/results_new_policies",
]

data = collect_data(base_dirs, [])
results = data.results.groupby(['decoder', 'distance']).apply(
    lambda x: x.set_index('p')[['l', 'n']].apply(lambda row: (row['l'], row['n']), axis=1).to_dict()
).to_dict()
threshold_plot(results).show()

  data.results.loc[len(data.results)] = {
  data.results.loc[len(data.results)] = {
  data.results.loc[len(data.results)] = {
  data.results.loc[len(data.results)] = {
  data.results.loc[len(data.results)] = {
  data.results.loc[len(data.results)] = {
  data.results.loc[len(data.results)] = {
  data.results.loc[len(data.results)] = {
  data.results.loc[len(data.results)] = {
  data.results.loc[len(data.results)] = {
  data.results.loc[len(data.results)] = {
  data.results.loc[len(data.results)] = {
  data.results.loc[len(data.results)] = {
  data.results.loc[len(data.results)] = {
  data.results.loc[len(data.results)] = {
  data.results.loc[len(data.results)] = {
  data.results.loc[len(data.results)] = {
  data.results.loc[len(data.results)] = {
  data.results.loc[len(data.results)] = {
  data.results.loc[len(data.results)] = {
  data.results.loc[len(data.results)] = {
  data.results.loc[len(data.results)] = {
  data.results.loc[len(data.results)] = {
  data.results.loc[len(data.result

ClAYG d=4, d/2=2.0, b=1.8644
ClAYG d=6, d/2=3.0, b=2.5811
ClAYG d=8, d/2=4.0, b=3.0319
ClAYG d=10, d/2=5.0, b=3.8487
ClAYG d=12, d/2=6.0, b=4.0755
ClAYG d=14, d/2=7.0, b=5.0367
ClAYG d=16, d/2=8.0, b=5.4830
ClAYG d=18, d/2=9.0, b=1.0000
ClAYG d=20, d/2=10.0, b=1.0000
ClAYG w/ Faster Backwards Growth d=4, d/2=2.0, b=1.8945
ClAYG w/ Faster Backwards Growth d=6, d/2=3.0, b=2.4871
ClAYG w/ Faster Backwards Growth d=8, d/2=4.0, b=2.8468
ClAYG w/ Faster Backwards Growth d=10, d/2=5.0, b=3.7618
ClAYG w/ Faster Backwards Growth d=12, d/2=6.0, b=3.9215
ClAYG w/ Faster Backwards Growth d=14, d/2=7.0, b=4.4932
ClAYG w/ Faster Backwards Growth d=16, d/2=8.0, b=1.0000
ClAYG w/ Faster Backwards Growth d=18, d/2=9.0, b=1.0000
ClAYG w/ Faster Backwards Growth d=20, d/2=10.0, b=1.0000
ClAYG ⅓ Growth d=4, d/2=2.0, b=0.9892
ClAYG ⅓ Growth d=6, d/2=3.0, b=1.7276
ClAYG ⅓ Growth d=8, d/2=4.0, b=2.4259
ClAYG ⅓ Growth d=10, d/2=5.0, b=3.2787
ClAYG ⅓ Growth d=12, d/2=6.0, b=3.4307
ClAYG ⅓ Growth d=14, d/2=7.0,

Gtk-Message: 16:51:25.797: Not loading module "atk-bridge": The functionality is provided by GTK natively. Please try to not load it.


In [None]:
# Plot results for the tests with different growth rates
base_dir = "data/old/special_clayg_tests"
average_operations = collect_data_old(base_dir, [2])
results = average_operations.loc[(average_operations['metric'] == 'results') & (average_operations['p'] > 0) & (average_operations['value'] > 0)].groupby(['decoder', 'distance']).apply(
    lambda x: x.set_index('p')['value'].to_dict()
).to_dict()



fig = threshold_plot(results)

fig.show(renderer="browser")

In [None]:
# Plot results for the tests with clayg with faster backwards growth
base_dir = "data/old/special_clayg_grow_faster_backwards"
average_operations = collect_data_old(base_dir, [1])
print(len(average_operations))
average_operations = average_operations.loc[(average_operations['metric'] == 'results') & (average_operations['p'] > 0) & (average_operations['value'] > 0)]
results = average_operations.groupby(['decoder', 'distance'], group_keys=False).apply(
    lambda x: x.set_index('p')['value'].to_dict()
).to_dict()

fig = threshold_plot(results)

fig.show(renderer="browser")

In [None]:
# Plot results for the tests with single layer clayg
base_dir = "data/old/single_layer_clayg_tests"
average_operations = collect_data_old(base_dir, [1,2,3])
average_operations = average_operations.loc[(average_operations['metric'] == 'results') & (average_operations['p'] > 0) & (average_operations['value'] > 0)]
results = average_operations.groupby(['decoder', 'distance'], group_keys=False).apply(
    lambda x: x.set_index('p')['value'].to_dict()
).to_dict()

fig = threshold_plot(results)

fig.show(renderer="browser")

In [None]:
# Plot average operations against p
base_dir = "data/treshold_plots"
average_operations = collect_data_old(base_dir, [18,19,20,21, 22])
average_operations = average_operations.loc[average_operations['metric'] == 'average_operations'].groupby(['decoder', 'distance'], group_keys=False).apply(
    lambda x: x.set_index('p')['value'].to_dict()
).to_dict()

fig = plot_average_operations_against_p(average_operations)

fig.show(renderer="browser")

In [None]:
# Plot average operations against p
base_dir = "data/special_clayg_grow_faster_backwards"
average_operations = collect_data_old(base_dir, [1,2,3])
average_operations = average_operations.loc[average_operations['metric'] == 'average_operations'].groupby(['decoder', 'distance'], group_keys=False).apply(
    lambda x: x.set_index('p')['value'].to_dict()
).to_dict()

fig = plot_average_operations_against_p(average_operations)

fig.show(renderer="browser")

In [None]:
# Plot average operations against d
base_dir = "data/average_operations_initial"
data = collect_data_old(base_dir, [2,3])
average_operations = data.loc[data['metric'] == 'average_operations'].groupby(['decoder', 'p'], group_keys=False).apply(
    lambda x: x.set_index('distance')['value'].to_dict()
).to_dict()

fig = plot_average_operations_against_d(average_operations)

fig.show(renderer="browser")