In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd 
import aquarel

# Read the responses artifact into one DataFrame. lines=True makes pandas parse
# the file as JSON Lines (one JSON document per line); the ".gz" suffix is
# handled by pandas' default compression inference.
RESPONSES_PATH = "../data/artifacts/responses.jsonl.gz"
df = pd.read_json(RESPONSES_PATH, lines=True)

In [None]:
# Keyword arguments shared by the seaborn box plots below: one box per
# "style" on the y-axis, split by "kind" via hue, drawn notched with caps,
# without outlier markers and without a legend.
boxplot_kwargs = dict(
    y="style",
    hue="kind",
    native_scale=True,
    notch=True,
    showcaps=True,
    showfliers=False,
    width=0.75,
    # Median line is drawn thicker than the rest of the box outline.
    medianprops=dict(linewidth=1.6),
    linewidth=0.8,
    legend=False,
)

In [None]:
# Descriptive-statistics figure: three box plots that share the "style" y-axis
# and are coloured by "kind" (see boxplot_kwargs).
#   panel 0: words per response      (only responses with more than 100 words)
#   panel 1: statements per response (no filter)
#   panel 2: distinct citations per response (only responses with more than 1)
# Keys are the metric column names; insertion order is the panel order.
panel_data = {
    # NOTE(review): words are counted by splitting on single spaces only, so
    # newlines/tabs do not separate words and repeated spaces add empty tokens.
    "num_words": (
        df
        .assign(num_words=df["raw_text"].apply(lambda text: len(text.split(" "))))
        .query("num_words > 100")
    ),
    "num_statements": df.assign(num_statements=df["statements"].apply(len)),
    "num_citations": (
        df
        .assign(
            num_citations=df["statements"].apply(
                # A citation repeated across statements is counted once.
                lambda stmts: len({cite for stmt in stmts for cite in stmt["citations"]})
            )
        )
        .query("num_citations > 1")
    ),
}

boxy_theme = aquarel.load_theme("boxy_light").set_axes(
    top=False, left=False, right=False, bottom=True
)
with boxy_theme:
    fig, ax = plt.subplots(1, 3, sharey=True, sharex=False, figsize=(5.5, 2))

    for axis, (metric, frame) in zip(ax, panel_data.items()):
        sns.boxplot(
            data=frame.loc[:, ["kind", "style", metric]],
            x=metric,
            ax=axis,
            **boxplot_kwargs,
        )

# Dashed vertical reference line at x=250 on the word-count panel.
ax[0].axvline(250, color=".3", dashes=(2, 2))
plt.tight_layout()
fig.savefig("figure-descriptive-statistics.pdf")
fig

In [None]:
from collections import Counter

# How often is a cited reference repeated within a single response?
# For every response, count the occurrences of each distinct citation across
# its statements, average those counts per (style, kind), then average the
# per-style means for each kind.
(
    df
    .assign(
        citation_counts=lambda df: df["statements"].apply(
            # Materialise the Counter's values as a plain list so explode()
            # receives an ordinary list-like instead of a dict view.
            lambda cell: list(Counter(y for x in cell for y in x["citations"]).values())
        )
    )
    .loc[:, ["style", "kind", "citation_counts"]]
    .explode("citation_counts")
    # Responses without any citation explode to a single NaN row; drop them
    # (this also drops rows whose style or kind is missing).
    .dropna()
    .astype({"citation_counts": int})
    .groupby(["style", "kind"])
    .mean()
    .reset_index()
    .pivot(index="style", columns="kind", values="citation_counts")
    # Round only the final figure: rounding the per-style means first would
    # feed rounding error into the average and leave the result unrounded.
    .mean(axis=0)
    .round(2)
)

In [None]:
# Median position (0-based index within references_ids) of the references a
# response cites versus the ones it does not cite, averaged per (style, kind).
(
    df
    .loc[:, ["kind", "style", "references_ids", "statements"]]
    .assign(
        # reference -> position in references_ids (a repeated reference keeps its last position).
        references_ranking=lambda df: df["references_ids"].apply(lambda x: {k: v for v, k in enumerate(x)}),
        # Distinct citations across all statements; kept as a set for O(1) membership tests below.
        references_used=lambda df: df["statements"].apply(lambda cell: {y for x in cell for y in x["citations"]}),
    )
    .assign(
        # Raises KeyError if a statement cites something that is not in references_ids.
        reference_ranking_used=lambda df: df.apply(lambda row: [row["references_ranking"][ref] for ref in row["references_used"]], axis=1),
        reference_ranking_unused=lambda df: df.apply(lambda row: [rank for ref, rank in row["references_ranking"].items() if ref not in row["references_used"]], axis=1),
    )
    .assign(
        # Explicit float dtype: an empty list (nothing cited / everything cited)
        # then yields NaN regardless of pandas' default dtype for empty Series,
        # and without the empty-Series dtype warning of older pandas versions.
        used_median=lambda df: df["reference_ranking_used"].apply(lambda x: pd.Series(x, dtype="float64").median()),
        unused_median=lambda df: df["reference_ranking_unused"].apply(lambda x: pd.Series(x, dtype="float64").median()),
    )
    .loc[:, ["style", "kind", "used_median", "unused_median"]]
    .groupby(["style", "kind"])
    .mean()  # NaN medians are skipped by the group mean
    .reset_index()
    .melt(id_vars=["style", "kind"])
    .pivot(index="style", columns=["kind", "variable"], values="value")
    .round(1)
)

In [None]:
import numpy as np

# Citation-probability figure: for every position in references_ids, the share
# of responses in a (style, kind) group that cite the reference at that
# position, drawn as one line per kind.
with (
    aquarel.load_theme("boxy_light")
    .set_axes(top=False, left=False, right=False, bottom=True)
):
    fig, ax = plt.subplots(1, 1, figsize=(2.75, 2))

    sns.lineplot(
        data=(
            df
            .loc[:, ["kind", "style", "references_ids", "statements"]]
            .assign(
                # reference -> 0-based position in references_ids
                references_ranking=lambda df: df["references_ids"].apply(lambda x: {k: v for v, k in enumerate(x)}),
                # distinct citations across all statements of the response
                references_used=lambda df: df["statements"].apply(lambda cell: list(set([y for x in cell for y in x["citations"]]))),
            )
            .assign(
                # One 0/1 flag per reference, in ranking order: was it cited?
                occurrence_counter=lambda df: df.apply(lambda row: [1 if ref in row["references_used"] else 0 for ref in row["references_ranking"].keys()], axis=1)
            )
            .groupby(["style", "kind"])
            # Stack the group's flag vectors, sum per rank and divide by the
            # actual number of responses in the group (previously a hard-coded
            # 301, which is only right when every group has exactly 301 rows).
            # NOTE(review): assumes every response in a group lists the same
            # number of references; otherwise the stacked array is ragged.
            .apply(lambda group: list(enumerate(np.array(group["occurrence_counter"].values.tolist()).sum(axis=0) / len(group))))
            # One row per (rank, probability) tuple. Series.explode takes no
            # column argument; the former positional 0 was read as ignore_index.
            .explode()
            .apply(pd.Series)
            .rename(columns={0: "rank", 1: "p_cited"})
            .reset_index()
        ),
        x="rank",
        y="p_cited",
        hue="kind",
        ax=ax,
        legend=False
    )


plt.tight_layout()
fig.savefig("figure-citation-probability.pdf")
fig

In [None]:
# Pairwise correlations between statement count, citation count and word count,
# computed separately inside every (kind, style) group.
length_features = df.assign(
    # Words = tokens obtained by splitting the raw text on single spaces.
    num_words=df["raw_text"].apply(lambda text: len(text.split(" "))),
    # Total citations over all statements; repeated citations count each time.
    num_citations=df["statements"].apply(
        lambda stmts: sum(len(stmt["citations"]) for stmt in stmts)
    ),
    num_statements=df["statements"].apply(len),
)

per_group_corr = (
    length_features[["kind", "style", "num_statements", "num_citations", "num_words"]]
    .groupby(["kind", "style"])
    .corr()
)

# corr() appends the row variable as an unnamed third index level, which
# reset_index() exposes as the column "level_2".
(
    per_group_corr
    .reset_index()
    .melt(id_vars=["kind", "style", "level_2"])
    .pivot(index=["kind", "level_2"], columns=["style", "variable"], values="value")
    .round(2)
)

In [None]:
# Mean number of citations attached to a single statement, per (style, kind).
(
    df
    # One row per statement.
    .explode("statements")
    # A response with an empty statements list explodes to one NaN row; drop
    # it, otherwise the ["citations"] lookup below fails on a float.
    .dropna(subset=["statements"])
    .assign(num_citations_per_statement=lambda df: df["statements"].apply(lambda x: len(x["citations"])))
    .loc[:, ["kind", "style", "num_citations_per_statement"]]
    .groupby(["kind", "style"])
    .mean()
    .reset_index()
    .pivot(index="style", columns="kind", values="num_citations_per_statement")
    .round(2)
)