In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

# Table

In [3]:
# Directory that holds the per-task JSON-lines files; used as the default
# `data_path` argument of get_sample_df / get_bootstrap_df below.
data_path = "data.05_21_16_53"


def read_json_lines_to_df(file_path):
    """Load a JSON-lines file (one JSON object per line) into a DataFrame.

    Args:
        file_path: Path of the file to read; it is opened as UTF-8 text.

    Returns:
        pandas.DataFrame parsed by ``pd.read_json(..., lines=True)``.
    """
    # Use a context manager so the handle is closed even when parsing fails;
    # a bare open() passed straight to read_json is never closed explicitly.
    with open(file_path, "r", encoding="utf8") as handle:
        return pd.read_json(handle, lines=True)


def get_sample_df(data_path=data_path, task="text_summarization"):
    """Load the four per-task JSON-lines files and join them into one frame.

    The base, result and ppl files are inner-joined on
    ``["id", "watermark_processor"]``; the score file is left-joined because
    it does not contain every row.

    Args:
        data_path: Directory containing the task files.
        task: File-name prefix of the task to load.

    Returns:
        The merged pandas.DataFrame.
    """
    join_keys = ["id", "watermark_processor"]
    paths = [
        f"{data_path}/{task}.txt",
        f"{data_path}/{task}_result.txt",
        f"{data_path}/{task}_ppl.txt",
        f"{data_path}/{task}_score.txt",
    ]
    base, result, ppl, score = [read_json_lines_to_df(p) for p in paths]

    merged = base.merge(result, on=join_keys)
    merged = merged.merge(ppl, on=join_keys)
    # The score file misses some rows, so keep everything from the left side.
    return merged.merge(score, on=join_keys, how="left")


def get_bootstrap_df(data_path=data_path, task="machine_translation"):
    """Load ``<data_path>/<task>_bleu.txt`` (JSON lines) as a DataFrame.

    Args:
        data_path: Directory containing the task files.
        task: File-name prefix of the task to load.

    Returns:
        The pandas.DataFrame read from the BLEU file.
    """
    bleu_path = f"{data_path}/{task}_bleu.txt"
    return read_json_lines_to_df(bleu_path)


def extract_watermark_info(df, return_wp_list=False):
    r"""Add a ``show_wp_name`` column with a display label per watermark.

    The label is derived from the text of ``df["watermark_processor"]``:

    * contains "Delta" / "Gamma": ``$\delta$-reweight`` / ``$\gamma$-reweight``,
      with a `` (woh)`` suffix when the text also contains ``", True)"``;
    * contains "John": ``Soft($\delta$=<delta>)`` where ``<delta>`` is the
      number following ``delta=`` in the text;
    * exactly "None": ``No Watermark``.

    Args:
        df: DataFrame with a string column ``watermark_processor``.
        return_wp_list: When true, also return the ordered list of labels.

    Returns:
        A new DataFrame with the extra ``show_wp_name`` column, or a tuple
        ``(DataFrame, labels)`` when ``return_wp_list`` is true. ``labels``
        holds the five fixed labels followed by the "Soft" labels that were
        encountered, ordered by their numeric delta.

    Raises:
        ValueError: If a ``watermark_processor`` value matches no known form.
    """
    import re

    # Raw literals: "\d" and "\g" are invalid escape sequences in ordinary
    # strings (SyntaxWarning on recent Python); the runtime text is unchanged.
    show_wp = [
        "No Watermark",
        r"$\delta$-reweight",
        r"$\gamma$-reweight",
        r"$\delta$-reweight (woh)",
        r"$\gamma$-reweight (woh)",
    ]
    # label -> numeric delta, so the labels can be ordered by value later.
    john_wp_deltas = {}

    def map_wp_str(wp_str):
        is_delta = "Delta" in wp_str
        if is_delta or "Gamma" in wp_str:
            woh = ", True)" in wp_str
            if is_delta:
                return show_wp[3] if woh else show_wp[1]
            return show_wp[4] if woh else show_wp[2]
        if "John" in wp_str:
            delta = re.findall(r"delta=(\d+\.?\d*)", wp_str)[0]
            label = "Soft" + rf"($\delta$={delta})"
            john_wp_deltas[label] = float(delta)
            return label
        if wp_str == "None":
            return show_wp[0]
        raise ValueError("Unknown watermark: {}".format(wp_str))

    df = df.assign(show_wp_name=df["watermark_processor"].apply(map_wp_str))
    # Order by numeric delta (label as tie-breaker); a plain string sort
    # would place delta=10.0 before delta=2.0.
    john_wps = sorted(john_wp_deltas, key=lambda name: (john_wp_deltas[name], name))
    show_wp = show_wp + john_wps
    if return_wp_list:
        return df, show_wp
    return df


def sample_df_2_stat(df, bootstrap=False, show_wp=None):
    """Summarise every float64 column as a ``"mean±spread"`` string per watermark.

    Args:
        df: DataFrame with a ``show_wp_name`` column plus float64 metric columns.
        bootstrap: When false the spread is ``std / sqrt(count)``; when true
            the group's ``std`` is used as is.
        show_wp: Optional list of ``show_wp_name`` values used to select and
            order the rows of the result via ``.loc``.

    Returns:
        DataFrame indexed by ``show_wp_name`` with one column per metric,
        holding the formatted strings.
    """
    metric_cols = [c for c in df.columns if df[c].dtype == np.float64]
    sdf = df.melt(
        id_vars=["show_wp_name"],
        value_vars=metric_cols,
        var_name="score",
        value_name="value",
    )
    sdf = sdf.groupby(["show_wp_name", "score"]).agg(["mean", "std", "count"])

    def format_fn(x):
        mean = x["mean"]
        std = x["std"] if bootstrap else x["std"] / np.sqrt(x["count"])
        # NaN/inf spread (e.g. a single sample) and a spread of exactly zero
        # have no meaningful digit count: log10(0) is -inf and int(-inf)
        # raises, so fall back to two decimals for both.
        if not np.isfinite(std) or std <= 0:
            return f"{mean:.2f}±{std:.2f}"
        # Built-in max clamps the precision at zero. np.max(value, 0) treats
        # the 0 as `axis`, so a large spread produced a negative precision and
        # an invalid format spec.
        useful_digits = max(-int(np.floor(np.log10(std / 3))), 0)
        fmt_str = f"{{:.{useful_digits}f}}±{{:.{useful_digits}f}}"
        return fmt_str.format(mean, std)

    sdf = sdf["value"].apply(format_fn, axis=1).unstack()
    if show_wp:
        sdf = sdf.loc[show_wp]
    return sdf


def merge_stat_df(df1, df2):
    """Join two frames on their indexes (default inner join).

    Args:
        df1: Left DataFrame.
        df2: Right DataFrame.

    Returns:
        DataFrame with the columns of both inputs for the shared index values.
    """
    return df1.merge(df2, left_index=True, right_index=True)

In [4]:
# Text-summarization samples (get_sample_df's default task) with display
# labels; also keep the ordered list of labels for row ordering in tables.
tsdf, show_wp = extract_watermark_info(get_sample_df(), return_wp_list=True)
# Machine-translation samples, labelled the same way.
mtdf = extract_watermark_info(get_sample_df(task="machine_translation"))
# Machine-translation BLEU file loaded by get_bootstrap_df, labelled the same way.
mtbsdf = extract_watermark_info(get_bootstrap_df())

In [44]:
def extract_len_info(df):
    """Add length columns computed from the ``display_output`` text.

    Args:
        df: DataFrame with a string column ``display_output``.

    Returns:
        A new DataFrame with two extra columns: ``output_words`` (number of
        pieces after splitting on a single space) and ``output_chars``
        (string length).
    """
    outputs = df["display_output"]

    def count_words(text):
        return len(text.split(" "))

    def count_chars(text):
        return len(text)

    return df.assign(
        output_words=outputs.apply(count_words),
        output_chars=outputs.apply(count_chars),
    )


def filter_wh_score(df):
    """Keep scored rows whose label does not contain "woh".

    Args:
        df: DataFrame with ``best_sum_score`` and ``show_wp_name`` columns.

    Returns:
        The rows where ``best_sum_score`` is not null and ``show_wp_name``
        does not contain the text "woh".
    """
    scored = df.loc[df["best_sum_score"].notna()]
    is_woh = scored["show_wp_name"].str.contains("woh")
    return scored.loc[~is_woh]

def filter_score(df):
    """Keep only the rows whose ``best_sum_score`` is not null.

    Args:
        df: DataFrame with a ``best_sum_score`` column.

    Returns:
        The filtered DataFrame.
    """
    has_score = df["best_sum_score"].notna()
    return df.loc[has_score]

def filter_noout(df):
    """Drop the rows whose ``display_output`` is the empty string.

    Args:
        df: DataFrame with a ``display_output`` column.

    Returns:
        The filtered DataFrame.
    """
    has_output = df["display_output"] != ""
    return df.loc[has_output]

def merge_tasks(dfs: dict):
    """Stack several task DataFrames into one, tagging each row with its task.

    Args:
        dfs: Mapping of task name -> DataFrame.

    Returns:
        The concatenation (in mapping order) of all frames, each with a
        ``task`` column set to its key.
    """
    # assign() works on a copy, so the caller's frames are left untouched;
    # writing the column in place would mutate the inputs (and warn when a
    # frame is a slice of another one).
    tagged = [frame.assign(task=name) for name, frame in dfs.items()]
    return pd.concat(tagged)

In [10]:
# One frame for both tasks (tagged by a "task" column): keep only rows that
# have a best_sum_score, then add the output length columns.
merged_df = merge_tasks(
    {
        "Text summarization": extract_len_info(filter_score(tsdf)),
        "Machine translation": extract_len_info(filter_score(mtdf)),
    }
)

Verify that the woh variants have a larger score.

In [11]:
# Per task and watermark label: mean ± standard error (std / sqrt(count)) of
# best_score and entropy, formatted to four decimals.
# Both columns are exploded first, so each cell is expected to hold a
# list-like of values — presumably one per generated token; TODO confirm.
merged_df[["task", "show_wp_name", "best_score", "entropy"]].explode(
    ["best_score", "entropy"]
).astype({"best_score": float, "entropy": float}).melt(
    id_vars=["task", "show_wp_name"], value_vars=["best_score", "entropy"]
).groupby(["task", "show_wp_name", "variable"]).agg(
    ["mean", "std", "count"]
)['value'].apply(
    lambda x: f"{x['mean']:.4f}±{x['std']/np.sqrt(x['count']):.4f}", axis=1
).unstack().loc[["Text summarization", "Machine translation"]]

Unnamed: 0_level_0,variable,best_score,entropy
task,show_wp_name,Unnamed: 2_level_1,Unnamed: 3_level_1
Text summarization,$\delta$-reweight,0.8784±0.0015,0.9694±0.0009
Text summarization,$\delta$-reweight (woh),0.9340±0.0015,0.9721±0.0009
Text summarization,$\gamma$-reweight,0.2207±0.0004,0.9695±0.0009
Text summarization,$\gamma$-reweight (woh),0.2408±0.0004,0.9677±0.0009
Machine translation,$\delta$-reweight,0.4192±0.0043,0.5260±0.0024
Machine translation,$\delta$-reweight (woh),0.4517±0.0043,0.5249±0.0024
Machine translation,$\gamma$-reweight,0.1056±0.0011,0.5271±0.0024
Machine translation,$\gamma$-reweight (woh),0.1192±0.0011,0.5281±0.0024


# Additional performance table

In [17]:
# LaTeX source of the text-summarization table: mean±standard error of the
# selected metrics per watermark label, rows ordered by show_wp.
print(sample_df_2_stat(tsdf[['show_wp_name','bertscore.precision','bertscore.recall','rouge2','rougeL']], show_wp=show_wp).to_latex())

\begin{tabular}{lllll}
\toprule
score & bertscore.precision & bertscore.recall &         rouge2 &         rougeL \\
show\_wp\_name            &                     &                  &                &                \\
\midrule
No Watermark            &       0.3180±0.0009 &    0.3361±0.0010 &  0.1388±0.0008 &  0.2445±0.0008 \\
\$\textbackslash delta\$-reweight       &       0.3180±0.0009 &    0.3365±0.0010 &  0.1392±0.0008 &  0.2451±0.0008 \\
\$\textbackslash gamma\$-reweight       &       0.3180±0.0009 &    0.3360±0.0010 &  0.1397±0.0008 &  0.2451±0.0008 \\
\$\textbackslash delta\$-reweight (woh) &       0.3185±0.0009 &    0.3370±0.0010 &  0.1398±0.0008 &  0.2455±0.0008 \\
\$\textbackslash gamma\$-reweight (woh) &       0.3178±0.0009 &    0.3361±0.0010 &  0.1393±0.0008 &  0.2447±0.0008 \\
Soft(\$\textbackslash delta\$=0.0)      &       0.3180±0.0009 &    0.3361±0.0010 &  0.1388±0.0008 &  0.2445±0.0008 \\
Soft(\$\textbackslash delta\$=1.0)      &       0.3092±0.0009 &    0.3382±0.000

  print(sample_df_2_stat(tsdf[['show_wp_name','bertscore.precision','bertscore.recall','rouge2','rougeL']], show_wp=show_wp).to_latex())


In [45]:
# LaTeX source of the machine-translation table: rows with an empty
# display_output are dropped first, then mean±standard error of the selected
# metrics is reported per watermark label, rows ordered by show_wp.
print(sample_df_2_stat(filter_noout(mtdf)[['show_wp_name','bertscore.precision','bertscore.recall','ppl']], show_wp=show_wp).to_latex())

\begin{tabular}{llll}
\toprule
score & bertscore.precision & bertscore.recall &          ppl \\
show\_wp\_name            &                     &                  &              \\
\midrule
No Watermark            &         0.546±0.003 &      0.575±0.003 &    2.31±0.07 \\
\$\textbackslash delta\$-reweight       &         0.550±0.003 &      0.579±0.003 &    2.20±0.05 \\
\$\textbackslash gamma\$-reweight       &         0.549±0.003 &      0.577±0.003 &    2.24±0.04 \\
\$\textbackslash delta\$-reweight (woh) &         0.555±0.003 &      0.583±0.003 &  2.114±0.020 \\
\$\textbackslash gamma\$-reweight (woh) &         0.549±0.003 &      0.577±0.003 &    2.24±0.04 \\
Soft(\$\textbackslash delta\$=0.0)      &         0.546±0.003 &      0.575±0.003 &    2.31±0.07 \\
Soft(\$\textbackslash delta\$=1.0)      &         0.537±0.003 &      0.568±0.003 &    2.43±0.07 \\
Soft(\$\textbackslash delta\$=2.0)      &         0.523±0.003 &      0.555±0.003 &    2.81±0.07 \\
\bottomrule
\end{tabular}



  print(sample_df_2_stat(filter_noout(mtdf)[['show_wp_name','bertscore.precision','bertscore.recall','ppl']], show_wp=show_wp).to_latex())
