In [1]:
import os

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import wandb

# Set the WANDB_PROJECT environment variable for this session; the same
# project name is also passed explicitly to wandb.init() in the next cell.
os.environ["WANDB_PROJECT"] = "wandbot-eval"

In [2]:
# Start the W&B run used by this notebook. The plotting helpers defined in
# later cells reference this module-level `run` directly when they call
# run.log(...), so this cell must be executed before any of them is called.
run = wandb.init(
    entity="wandbot",
    project="wandbot-eval",
    group="auto-eval",
    job_type="eval-analysis",
)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112689988739375, max=1.0…

In [3]:
# Fetch the `autoeval_dataset:latest` dataset artifact through the active run
# and download its contents.
# NOTE(review): `artifact_dir` is not referenced again in the visible cells;
# the tables are read through `artifact.get(...)` instead.
artifact = run.use_artifact(
    "wandbot/wandbot-eval/autoeval_dataset:latest", type="dataset"
)
artifact_dir = artifact.download()

In [4]:
def load_dfs_from_artifact(artifact, table_names):
    """Fetch each named table from ``artifact`` and convert it to a DataFrame.

    Args:
        artifact: Object exposing ``get(name)``; every object it returns must
            expose ``get_dataframe()``.
        table_names: Iterable of table names to look up in the artifact.

    Returns:
        dict mapping each table name to the result of that table's
        ``get_dataframe()`` call, in the order the names were supplied.
    """
    return {name: artifact.get(name).get_dataframe() for name in table_names}

In [5]:
# Names of the artifact tables to load and compare. The plotting helpers
# iterate over the tables in this order.
table_names = [
    "gpt4_0613_autoeval_data",
    "gpt35_autoeval_data",
    "gpt4_turbo_autoeval_data",
    "gpt4_1106_preview_v1_1_autoeval_data",
]

In [6]:
# Load every table listed in `table_names` into a dict of DataFrames keyed by
# table name; all plotting helpers take this dict as their first argument.
dfs = load_dfs_from_artifact(artifact, table_names)

In [7]:
def table_metrics(dfs, table_names, metric_cols, metric_prefix=""):
    """Stack metric columns from several tables into one long-format frame.

    For every table in ``table_names`` and every column in ``metric_cols`` the
    column's values become rows of the result, tagged with the table's model
    name and a display label for the metric.

    Args:
        dfs: Mapping of table name to DataFrame. Each frame must contain a
            ``"model"`` column and every column named in ``metric_cols``.
        table_names: Table names (keys of ``dfs``) to include, in order.
        metric_cols: Column names to extract from each table.
        metric_prefix: Text prepended to every metric label.

    Returns:
        DataFrame with columns ``score``, ``model`` and ``metric``. Missing
        values are replaced by 0. The per-table row index is kept as-is, so
        index values repeat across tables/columns. If ``table_names`` or
        ``metric_cols`` is empty, an empty frame with those three columns is
        returned instead of raising.

    Raises:
        KeyError: If a table name or column is missing.
        IndexError: If a table's ``"model"`` column has no values.
    """
    # Materialise once so a one-shot iterable can be reused for every table.
    metric_cols = list(metric_cols)
    empty_result = pd.DataFrame(columns=["score", "model", "metric"])
    if not metric_cols:
        return empty_result

    frames = []
    for table_name in table_names:
        table = dfs[table_name]
        # The most frequent value of the "model" column labels the whole
        # table; it does not depend on the metric, so look it up once.
        model_name = table["model"].value_counts().index[0]
        if "v1_1" in table_name:
            # Tables whose name contains "v1_1" get a "-v1.1" model suffix.
            model_name = model_name + "-v1.1"

        for col in metric_cols:
            frame = pd.DataFrame(table[col].values, columns=["score"])
            frame["model"] = model_name
            # Label = column name minus its last "_"-separated token,
            # title-cased (e.g. "answer_correctness_score" -> "Answer Correctness").
            frame["metric"] = (
                metric_prefix + " ".join(col.split("_")[:-1]).title()
            )
            frames.append(frame)

    if not frames:
        # pd.concat([]) raises ValueError; return a well-formed empty frame.
        return empty_result
    return pd.concat(frames).fillna(0)


def plot_categorical_metric_heatmaps(
    dfs, table_names, metric_col, metric_prefix=""
):
    """Plot and log a heatmap of score counts per model for one metric.

    Rows of the heatmap are ``"<metric label>=<score value>"`` pairs, columns
    are models, and each cell holds the number of rows with that score.

    Args:
        dfs: Mapping of table name to DataFrame, as consumed by
            ``table_metrics``.
        table_names: Tables to include.
        metric_col: Name of the score column to count.
        metric_prefix: Prefix forwarded to ``table_metrics`` for the row
            labels (it is not part of the logged key).

    Returns:
        The result of ``fig.show()``.

    Side effects:
        Logs the figure to the module-level ``run`` under
        ``"<Metric Col Title> Heatmap"``.
    """
    metrics_df = table_metrics(dfs, table_names, [metric_col], metric_prefix)

    # Count occurrences of every (metric, score) pair for each model.
    counts = pd.pivot_table(
        metrics_df,
        values="score",
        index=["metric", "score"],
        columns=["model"],
        aggfunc="size",
        fill_value=0,
    )

    # Collapse the (metric, score) multi-index into "metric=score" strings.
    counts.index = counts.index.map(lambda x: "=".join(map(str, x)))

    # Cell annotations. `astype(str)` replaces `DataFrame.applymap(str)`,
    # which is deprecated in recent pandas releases, and yields the same text.
    z_text = counts.astype(str).values

    fig = px.imshow(
        counts.values,
        x=counts.columns,
        y=counts.index,
        color_continuous_scale="Viridis",
        aspect="auto",
    )
    fig.update_traces(text=z_text, texttemplate="%{text}")

    # The logged key uses the full column name (including its last token).
    metric_name = " ".join(metric_col.split("_")).title()
    run.log({f"{metric_name} Heatmap": wandb.Plotly(fig)})
    return fig.show()

In [8]:
# Score columns handled by the categorical plotting helpers in this notebook
# (heatmaps, histograms, latency box plots, radar and bar plots).
categorical_metrics = [
    "answer_correctness_score",
    "answer_faithfulness_score",
    "answer_relevancy_score",
]

# Draw and log one count heatmap per categorical score column.
for metric_col in categorical_metrics:
    plot_categorical_metric_heatmaps(dfs, table_names, metric_col)

In [9]:
def plot_categorical_metrics(dfs, table_names, metric_cols, metric_prefix=""):
    """Plot and log a grouped histogram of scores per model for each metric.

    Args:
        dfs: Mapping of table name to DataFrame, as consumed by
            ``table_metrics``.
        table_names: Tables to include.
        metric_cols: Score columns; one histogram is produced per column.
        metric_prefix: Prefix for the metric label used in the logged key.
            Previously accepted but ignored; it is now forwarded to
            ``table_metrics``. The default ``""`` keeps the old keys.

    Side effects:
        Logs each figure to the module-level ``run`` under
        ``"<metric label> Histogram"`` and displays it.
    """
    for metric in metric_cols:
        metrics_df = table_metrics(dfs, table_names, [metric], metric_prefix)

        fig = px.histogram(
            metrics_df,
            x="score",
            color="model",
            nbins=3,
            barmode="group",
        )
        # Every row carries the same label, so the first one names the plot.
        metric_name = metrics_df.iloc[0]["metric"]
        run.log({f"{metric_name} Histogram": wandb.Plotly(fig)})
        fig.show()

In [10]:
# Log and display one grouped histogram per categorical score column.
plot_categorical_metrics(dfs, table_names, categorical_metrics)

In [11]:
def plot_ragas_violin_metrics(dfs, table_names, metric_cols):
    """Plot and log one horizontal violin plot per Ragas metric column.

    Args:
        dfs: Mapping of table name to DataFrame, as consumed by
            ``table_metrics``.
        table_names: Tables to include.
        metric_cols: Columns to plot; metric labels get a ``"Ragas "`` prefix.

    Side effects:
        Logs each figure to the module-level ``run`` under
        ``"<metric label> ViolinPlot"`` and displays it.
    """
    for ragas_col in metric_cols:
        scores = table_metrics(
            dfs, table_names, [ragas_col], metric_prefix="Ragas "
        )
        # All rows share one label; take it from the first row.
        label = scores.iloc[0]["metric"]

        violin = px.violin(
            scores,
            x="score",
            y="model",
            color="model",
            box=True,  # embed a box plot in each violin
            points="all",  # show every individual observation
            orientation="h",
        )
        violin.update_xaxes(tickangle=0)  # keep x tick labels horizontal

        run.log({f"{label} ViolinPlot": wandb.Plotly(violin)})
        violin.show()

In [12]:
# Ragas columns are discovered from the first table only: every column whose
# name contains "ragas" is plotted.
# NOTE(review): assumes all tables in `table_names` share these columns — confirm.
ragas_metrics = [col for col in dfs[table_names[0]].columns if "ragas" in col]
plot_ragas_violin_metrics(dfs, table_names, ragas_metrics)

In [13]:
def plot_token_violins(dfs, table_names, token_cols):
    """Plot and log one horizontal violin plot per token-count column.

    Args:
        dfs: Mapping of table name to DataFrame, as consumed by
            ``table_metrics``.
        table_names: Tables to include.
        token_cols: Token-count columns; one figure is produced per column.

    Side effects:
        Logs each figure to the module-level ``run`` under
        ``"<label> Tokens Violinplot"`` and displays it.
    """
    column_renames = {"score": "tokens", "metric": "token_type"}

    for token_col in token_cols:
        # Reuse the generic long-format builder, then relabel its columns.
        counts = table_metrics(dfs, table_names, [token_col]).rename(
            columns=column_renames
        )
        label = counts.iloc[0]["token_type"] + " Tokens"

        violin = px.violin(
            counts,
            x="tokens",
            y="model",
            color="model",
            box=True,  # embed a box plot in each violin
            points="all",  # show every individual observation
            orientation="h",
        )
        violin.update_xaxes(tickangle=0)  # keep x tick labels horizontal

        run.log({f"{label} Violinplot": wandb.Plotly(violin)})
        violin.show()


# Token-count columns to compare across models; one violin plot is logged
# and displayed per column.
token_columns = ["prompt_tokens", "completion_tokens", "total_tokens"]
plot_token_violins(dfs, table_names, token_columns)

In [14]:
def plot_context_metrics(dfs, table_names, metric_cols):
    """Plot and log one horizontal violin plot per context metric column.

    Args:
        dfs: Mapping of table name to DataFrame, as consumed by
            ``table_metrics``.
        table_names: Tables to include.
        metric_cols: Columns to plot; one figure is produced per column.

    Side effects:
        Logs each figure to the module-level ``run`` under
        ``"<metric label> Violinplot"`` and displays it.
    """
    for context_col in metric_cols:
        scores = table_metrics(dfs, table_names, [context_col])
        # All rows share one label; take it from the first row.
        label = scores.iloc[0]["metric"]

        violin = px.violin(
            scores,
            x="score",
            y="model",
            color="model",
            box=True,  # embed a box plot in each violin
            points="all",  # show every individual observation
            orientation="h",
        )
        violin.update_xaxes(tickangle=0)

        run.log({f"{label} Violinplot": wandb.Plotly(violin)})
        violin.show()

In [15]:
# Context score columns to compare across models; also reused later by the
# radar and bar plot helpers.
context_metrics = ["context_precision_score", "context_recall_score"]

plot_context_metrics(dfs, table_names, context_metrics)

In [16]:
def plot_latency_metrics(dfs, table_names, metric_cols):
    """Plot and log one horizontal violin plot per latency column.

    Args:
        dfs: Mapping of table name to DataFrame, as consumed by
            ``table_metrics``.
        table_names: Tables to include.
        metric_cols: Latency columns; one figure is produced per column.

    Side effects:
        Logs each figure to the module-level ``run`` under
        ``"<Column Name Title> Violinplot"`` and displays it.
    """
    for metric in metric_cols:
        latency_df = table_metrics(dfs, table_names, [metric]).rename(
            columns={"score": "latency(s)"}
        )
        fig = px.violin(
            latency_df,
            x="latency(s)",
            y="model",
            color="model",
            box=True,  # draw box plot inside the violin
            points="all",  # can be 'outliers', or False
            orientation="h",
        )
        # Derive the key from the column name rather than hard-coding
        # "Time Taken": "time_taken" still yields "Time Taken", while any
        # additional latency column gets its own key instead of overwriting
        # the previous figure.
        metric_name = " ".join(metric.split("_")).title()
        fig.update_xaxes(tickangle=0)
        run.log({f"{metric_name} Violinplot": wandb.Plotly(fig)})
        fig.show()

In [17]:
# Latency column(s) to compare across models.
latency_metrics = ["time_taken"]
plot_latency_metrics(dfs, table_names, latency_metrics)

In [18]:
def plot_latency_vs_metric(dfs, table_names, metric_cols):
    """Plot and log box plots of latency grouped by each metric's score.

    Args:
        dfs: Mapping of table name to DataFrame, as consumed by
            ``table_metrics``. Each table must contain ``"time_taken"``.
        table_names: Tables to include.
        metric_cols: Score columns to put on the x-axis. Each name must
            contain at least one underscore, because its second
            ``"_"``-separated token is used as a column suffix.

    Side effects:
        Logs each figure to the module-level ``run`` under
        ``"<metric label> vs Latency"`` and displays it.
    """
    # Latency rows are the same for every metric, so build and number them
    # once. `assign` returns a new frame instead of mutating in the loop.
    latency_df = table_metrics(dfs, table_names, ["time_taken"])
    latency_df = latency_df.assign(id=range(len(latency_df)))

    for metric in metric_cols:
        metric_df = table_metrics(dfs, table_names, [metric])
        # Row position is the join key: both frames are built by stacking the
        # same tables in the same order, so position N refers to the same
        # source row in each.
        metric_df = metric_df.assign(id=range(len(metric_df)))

        # Both frames have "score"/"metric" columns; the suffixes keep them
        # apart after the merge.
        metric_suffix = f'_{metric.split("_")[1]}'
        merged_df = pd.merge(
            latency_df,
            metric_df,
            on=["model", "id"],
            suffixes=("_latency", metric_suffix),
        )

        metric_name = " ".join(metric_df.iloc[0]["metric"].split("_")).title()
        # Box plot of latency (y) for each score value (x), split by model.
        fig = px.box(
            merged_df,
            y="score_latency",
            x=f"score{metric_suffix}",
            color="model",
            labels={
                "score_latency": "Latency",
                f"score{metric_suffix}": metric_name,
            },
            orientation="v",
        )
        run.log({f"{metric_name} vs Latency": wandb.Plotly(fig)})
        fig.show()

In [19]:
# Log and display latency box plots grouped by each categorical score.
plot_latency_vs_metric(dfs, table_names, categorical_metrics)

In [20]:
def plot_model_radars(
    dfs, table_names, categorical_metrics, ragas_metrics, context_metrics
):
    """Plot and log radar charts of mean metric scores per model.

    One combined chart overlays every model; afterwards one filled chart is
    produced per model.

    Args:
        dfs: Mapping of table name to DataFrame, as consumed by
            ``table_metrics``.
        table_names: Tables to include.
        categorical_metrics: Score columns whose means are rescaled with
            ``(x - 1) / (3 - 1)`` before plotting.
        ragas_metrics: Columns plotted with a ``"RAGAS "`` label prefix.
        context_metrics: Columns plotted as-is.

    Side effects:
        Logs the combined figure to the module-level ``run`` as
        ``"Model Comparison RadarPlot"`` and each per-model figure as
        ``"<model> RadarPlot"``; every figure is also displayed.
    """

    def mean_scores(columns, prefix=""):
        # Mean score for every (model, metric) pair as a flat frame.
        stacked = table_metrics(
            dfs, table_names, columns, metric_prefix=prefix
        )
        return stacked.groupby(["model", "metric"]).mean().reset_index()

    def polar_axes():
        # Fresh radial-axis settings for each layout update.
        return dict(
            radialaxis=dict(
                visible=True, range=[0, 1]  # adjust this to fit your data
            )
        )

    categorical_means = mean_scores(categorical_metrics)
    # Rescale the categorical means with the fixed (x - 1) / (3 - 1) mapping.
    categorical_means["score"] = (categorical_means["score"] - 1) / (3 - 1)
    ragas_means = mean_scores(ragas_metrics, prefix="RAGAS ")
    context_means = mean_scores(context_metrics)

    # One row per metric, one column per model.
    wide = (
        pd.concat([categorical_means, ragas_means, context_means])
        .pivot(index="metric", columns="model", values="score")
        .reset_index()
    )
    axis_labels = wide["metric"].values
    model_names = wide.columns[1:]  # first column is "metric"

    palette = [
        "rgba(255,0,0,0.5)",
        "rgba(0,255,0,0.5)",
        "rgba(0,0,255,0.5)",
        "rgba(255,255,0,0.5)",
    ]  # colours are reused cyclically when there are more models

    # Combined chart: one unfilled outline per model.
    combined = go.Figure()
    for position, model in enumerate(model_names):
        combined.add_trace(
            go.Scatterpolar(
                r=wide[model].values,
                theta=axis_labels,
                name=model,
                line_color=palette[position % len(palette)],
            )
        )
    combined.update_layout(polar=polar_axes(), showlegend=True)
    run.log({"Model Comparison RadarPlot": wandb.Plotly(combined)})
    combined.show()

    # Individual charts: one filled shape per model.
    for position, model in enumerate(model_names):
        shade = palette[position % len(palette)]
        single = go.Figure(
            data=go.Scatterpolar(
                r=wide[model].values,
                theta=axis_labels,
                fill="toself",
                name=model,
                line_color=shade,
                fillcolor=shade,
                hovertemplate="<i>Metric</i>: %{theta}<br><b>Score</b>: %{r}<extra></extra>",
            )
        )
        single.update_layout(polar=polar_axes(), showlegend=False)
        run.log({f"{model} RadarPlot": wandb.Plotly(single)})
        single.show()

In [21]:
# Log and display the combined radar chart followed by one chart per model.
plot_model_radars(
    dfs, table_names, categorical_metrics, ragas_metrics, context_metrics
)

In [22]:
def plot_metric_bars(
    dfs, table_names, categorical_metrics, ragas_metrics, context_metrics
):
    """Show grouped bar charts of mean score (with std error bars) per model.

    One figure is drawn per metric group (categorical, ragas, context), with
    all metrics of the group side by side. Figures are only displayed, not
    logged.

    NOTE(review): a function with this same name is defined again in a later
    cell of this notebook. When cells run in order, that later definition
    replaces this one before any call is made.

    Args:
        dfs: Mapping of table name to DataFrame, as consumed by
            ``table_metrics``.
        table_names: Tables to include.
        categorical_metrics: Columns of the first figure.
        ragas_metrics: Columns of the second figure.
        context_metrics: Columns of the third figure.
    """
    metric_groups = [
        table_metrics(dfs, table_names, categorical_metrics),
        table_metrics(dfs, table_names, ragas_metrics),
        table_metrics(dfs, table_names, context_metrics),
    ]

    for group_idx, long_df in enumerate(metric_groups):
        # Mean and standard deviation of the score per (model, metric) pair.
        summary = (
            long_df.groupby(["model", "metric"])["score"]
            .agg(mean_score="mean", std_score="std")
            .reset_index()
        )

        # One bar series per model, in order of first appearance.
        bars = []
        for model in summary["model"].unique():
            rows = summary[summary["model"] == model]
            bars.append(
                go.Bar(
                    x=rows["metric"],
                    y=rows["mean_score"],
                    name=model,
                    error_y=dict(
                        type="data", array=rows["std_score"], visible=True
                    ),
                )
            )

        fig = go.Figure(data=bars)
        # Only the second group (index 1) is labelled as Ragas metrics.
        x_title = "Ragas Metric" if group_idx == 1 else "Metric"
        fig.update_layout(
            barmode="group",
            xaxis_title=x_title,
            yaxis_title="Mean Score",
            legend_title="Model",
        )
        fig.show()

In [23]:
def plot_metric_bars(
    dfs, table_names, categorical_metrics, ragas_metrics, context_metrics
):
    """Plot and log one bar chart of mean score per model for every metric.

    Metrics are processed in three groups (categorical, ragas, context). For
    each metric a separate figure is produced, with one bar per model and the
    standard deviation as error bar.

    Args:
        dfs: Mapping of table name to DataFrame, as consumed by
            ``table_metrics``.
        table_names: Tables to include.
        categorical_metrics: Columns of the first group.
        ragas_metrics: Columns of the second group; their logged keys get a
            ``"Ragas "`` prefix.
        context_metrics: Columns of the third group.

    Side effects:
        Logs each figure to the module-level ``run`` under
        ``"<label> Barplot"`` and displays it.
    """
    metric_groups = [
        table_metrics(dfs, table_names, categorical_metrics),
        table_metrics(dfs, table_names, ragas_metrics),
        table_metrics(dfs, table_names, context_metrics),
    ]

    for group_idx, long_df in enumerate(metric_groups):
        is_ragas_group = group_idx == 1

        # Mean and standard deviation of the score per (model, metric) pair.
        summary = (
            long_df.groupby(["model", "metric"])["score"]
            .agg(mean_score="mean", std_score="std")
            .reset_index()
        )
        model_names = summary["model"].unique()

        for metric in summary["metric"].unique():
            of_metric = summary["metric"] == metric
            fig = go.Figure()

            # A trace is added for every model, even without matching rows.
            for model in model_names:
                rows = summary[(summary["model"] == model) & of_metric]
                fig.add_trace(
                    go.Bar(
                        x=rows["metric"],
                        y=rows["mean_score"],
                        name=model,
                        error_y=dict(
                            type="data", array=rows["std_score"], visible=True
                        ),
                    )
                )

            fig.update_layout(
                barmode="group",
                xaxis_title="Ragas Metric" if is_ragas_group else "Metric",
                yaxis_title="Mean Score",
                legend_title="Model",
            )

            label = "Ragas " + metric if is_ragas_group else metric
            run.log({f"{label} Barplot": wandb.Plotly(fig)})
            fig.show()

In [24]:
# Log and display the per-metric bar charts. This resolves to the most
# recently executed `plot_metric_bars` definition.
plot_metric_bars(
    dfs, table_names, categorical_metrics, ragas_metrics, context_metrics
)

In [25]:
# Close the W&B run opened at the top of the notebook; the plotting helpers
# must not be called after this point because they log to `run`.
run.finish()

