```
// Copyright 2020 Twitter, Inc.
// SPDX-License-Identifier: Apache-2.0
```

# Make plots and tables for the paper

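This notebook loads the saved test evaluation reports (NER, sentiment and UD POS), builds the result tables that are printed as LaTeX, and draws the English–Arabic sentence-embedding figures used in the paper.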


## Setup libraries


In [None]:
%pip install transformers==3.5.1 datasets==1.1.2 torch==1.4.0 seqeval==1.2.2 gensim==3.8.1

In [None]:
import re
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display
from tensorflow.io.gfile import GFile
from tensorflow.io.gfile import glob as Glob


In [None]:
# Fine-tuned model directories are named
# ``mbert_model_ft_<seq>_en_<langs>_2t_bce``; the pattern captures the two
# variable parts as the named groups ``seq`` and ``langs``.
EXTRACT_TASK_CONFIG = re.compile(
    r"mbert_model_ft_(?P<seq>[a-z_]+?)_en_(?P<langs>[a-z_]+?)_2t_bce"
)
# Sanity check: fail loudly if the pattern stops parsing a known directory name
# (previously the parsed result was computed and silently discarded).
assert EXTRACT_TASK_CONFIG.match("mbert_model_ft_tt_en_hi_2t_bce").groupdict() == {
    "seq": "tt",
    "langs": "hi",
}

# No trailing slash: every consumer builds paths as f"{BASE_DIR}/...", so a
# trailing slash would produce "models//..." (harmless on most local file
# systems, but a different key on object stores).
BASE_DIR = "models"


In [None]:
# Load every "all checkpoints" NER test report into ``df_reports``, keyed by
# (lang, ft) where ``ft`` is "base" for the plain mBERT directory and "ft"
# for everything else.
df_table = {}
df_reports = {}
report_paths = (
    list(
        Glob(
            f"{BASE_DIR}/mbert_model_ft_*_2t_bce/en_ner_model/test_eval_report*_all_checkpoints.txt"
        )
    )
    + list(
        Glob(
            f"{BASE_DIR}/mbert_model/en_ner_model/test_eval_report_*_all_checkpoints.txt"
        )
    )
    + list(
        Glob(
            f"{BASE_DIR}/mbert_model_tt_*_2t_bce/en_ner_model/*test_eval_report_all_checkpoints.txt"
        )
    )
)
for path in report_paths:
    # .../<dirname>/en_ner_model/<filename>
    dirname = Path(path).parts[-3]
    filename = Path(path).name

    # The regex result takes precedence over the name-based fallbacks.
    match = EXTRACT_TASK_CONFIG.match(dirname)
    if match:
        config = match.groupdict()
    elif dirname == "mbert_model":
        config = {"seq": "mbert", "langs": "en"}
    elif "mbert_model_tt_" in dirname:
        config = {"seq": "all_equal" if "equal" in dirname else "all", "langs": "en"}
    else:
        # Previously this fell through with an empty config and surfaced later
        # as an opaque ``KeyError: 'seq'``.
        raise ValueError(
            f"Cannot infer the model configuration from directory {dirname!r} ({path})"
        )

    ft = "ft" if config["seq"] != "mbert" else "base"
    # Fourth "_"-separated token of the file name, e.g.
    # "test_eval_report_hi_all_checkpoints.txt" -> "hi".
    lang = filename.split("_")[3]

    print(path, config, lang)

    with GFile(path) as fp:
        df_all_reports = pd.read_csv(fp, sep="\t", index_col=[0, 1, 2])
    # NOTE(review): the key does not include ``seq``, so reports from different
    # fine-tuned models with the same ``lang`` overwrite each other and the last
    # path returned by the glob wins -- confirm this is intended.
    df_reports[(lang, ft)] = df_all_reports


In [None]:
# Collect the micro-averaged F1 per checkpoint for every loaded report.
# The series are gathered in a fresh local dict (instead of filling and then
# rebinding ``df_table``) so that re-running this cell gives the same result.
f1_by_run = {}
for (lang, ft), df_all_reports in df_reports.items():
    is_micro_avg = df_all_reports.index.isin(["micro avg"], level=2)
    f1_by_run[(lang, ft)] = (
        df_all_reports.loc[is_micro_avg, "f1-score"]
        .reset_index(drop=True)  # rows become 0..k-1 in file order
        .rename("F1")
    )


# One column per (lang, ft); scores are converted to percentages.
df_table = pd.concat(f1_by_run, axis=1).sort_index(axis=1)[["hi", "ja", "ar"]] * 100


In [None]:
# Show the per-checkpoint F1 table (already scaled by 100 in the previous cell).
df_table


In [None]:
# Spell out the full option key: the bare "precision" pattern is ambiguous on
# pandas versions that also define "styler.format.precision" and then raises
# an OptionError.
with pd.option_context("display.precision", 1):
    display(df_table)
    print(df_table.to_latex())


In [None]:
# Relative change (in %) of the micro-averaged F1 with respect to the first
# row of each report; top panel: "base" reports, bottom panel: all others.
fig, ax = plt.subplots(2, 1, sharex=True, sharey=False, figsize=(5, 4))
for (lang, ft), df_all_reports in df_reports.items():
    i = 0 if ft == "base" else 1
    print(lang, ft)
    # Filter once and reuse for both the display and the plot.
    df_micro = df_all_reports[df_all_reports.index.isin(["micro avg"], level=2)]
    display(df_micro)
    df_t = df_micro["f1-score"].reset_index(drop=True)
    ((df_t - df_t.iloc[0]) * 100 / df_t.iloc[0]).plot(marker="o", label=lang, ax=ax[i])

ax[0].set_title("mbert")
ax[1].set_title("finetuned")

ax[1].set_xlabel("iteration")

# Raw strings: "\%" and "\D" are invalid escape sequences in a normal string.
ax[0].set_ylabel(r"$\%\Delta F_1$")
ax[1].set_ylabel(r"$\%\Delta F_1$")

# The lines are labelled per language; without a legend they are
# indistinguishable.
for panel in ax:
    panel.legend()

fig.tight_layout()


## NER plots


In [None]:
# Collect the micro-averaged F1 of every NER test report, keyed by
# (langs, seq, report type).
df_table = {}
ner_report_paths = (
    list(Glob(f"{BASE_DIR}/mbert_model_ft_*_2t_bce/en_ner_model/*test_eval_report.txt"))
    + list(Glob(f"{BASE_DIR}/mbert_model/en_ner_model/*test_eval_report.txt"))
    + list(
        Glob(f"{BASE_DIR}/mbert_model_tt_*_2t_bce/en_ner_model/*test_eval_report.txt")
    )
)
for path in ner_report_paths:
    # .../<dirname>/en_ner_model/<filename>
    dirname = Path(path).parts[-3]
    filename = Path(path).name

    # The regex result takes precedence over the name-based fallbacks.
    match = EXTRACT_TASK_CONFIG.match(dirname)
    if match:
        config = match.groupdict()
    elif dirname == "mbert_model":
        config = {"seq": "mbert", "langs": "en"}
    elif "mbert_model_tt_" in dirname:
        config = {"seq": "all_equal" if "equal" in dirname else "all", "langs": "en"}
    else:
        # Previously this fell through with an empty config and surfaced later
        # as an opaque ``KeyError: 'langs'``.
        raise ValueError(
            f"Cannot infer the model configuration from directory {dirname!r} ({path})"
        )

    # Reports whose file name starts with "ssea" are tagged separately.
    reporttype = "SSEA" if filename.startswith("ssea") else "BASE"
    with GFile(path) as fp:
        df_report = pd.read_csv(fp, sep="\t", index_col=[0, 1])
    print(path, reporttype, config)
    lang = config["langs"]
    seq = config["seq"]

    # Keep only the "micro avg" rows and index them by the first index level.
    is_micro_avg = df_report.index.isin(["micro avg"], level=1)
    df_table[(lang, seq, reporttype)] = (
        df_report.loc[is_micro_avg, "f1-score"]
        .rename("F1")
        .reset_index(level=1, drop=True)
    )

    display(df_table[(lang, seq, reporttype)])


In [None]:
# Repeat the F1 extraction on ``df_report`` (the last report read by the
# loading loop) to inspect the intermediate result.
micro_avg_rows = df_report[df_report.index.isin(["micro avg"], level=1)]
scores_only = micro_avg_rows.drop(["support", "precision", "recall"], axis=1)
scores_only = scores_only.rename(
    columns={"precision": "P", "recall": "R", "f1-score": "F1"}
)
scores_only["F1"].reset_index(level=1, drop=True)


In [None]:
# ``df_table`` is rebound from the dict of series to a single frame with one
# column per (langs, seq, report type). The guard keeps the cell re-runnable:
# concatenating a second time would raise because it is no longer a dict.
if isinstance(df_table, dict):
    df_table = pd.concat(df_table, axis=1).sort_index(axis=1)
# Full option key: the bare "precision" pattern is ambiguous on pandas versions
# that also define "styler.format.precision".
with pd.option_context("display.precision", 3):
    display(df_table)


In [None]:
# Transposed view: one row per column key of ``df_table``.
df_table.T


In [None]:
# For each language, pick the rows keyed by that language and the column of
# the same name. Iterating a list (not a set literal) keeps the resulting
# column order identical across kernel restarts.
df_table_langs = {}
for lang in ["hi", "ja", "ar"]:
    df_table_langs[lang] = df_table.T.loc[lang, lang]


In [None]:
# Rows whose first key is "en", restricted to the ja/ar/hi columns.
df_table.T.loc["en", ["ja", "ar", "hi"]]


In [None]:
# Stack the per-language rows on top of the "en" rows and keep, for every value
# of the first index level, the best score over the remaining level(s).
# ``.max(level=0)`` is deprecated (removed in pandas 2.0); the equivalent
# group-by form is used, with ``sort=False`` to keep the same row order.
df_t = (
    pd.concat(
        [pd.concat(df_table_langs, axis=1), df_table.T.loc["en", ["ja", "ar", "hi"]]],
        axis=0,
    )
    .groupby(level=0, sort=False)
    .max()
)
df_t


In [None]:
# Best score per language over the four two-task rows, shown as a single row.
pair_rows = ["tt_wd", "tt_wm", "wm_tt", "wm_wd"]
best_pair = df_t.loc[pair_rows, ["hi", "ja", "ar"]].max()
best_pair.to_frame().T


In [None]:
# Assemble the NER results table: the mbert row, the single-task rows, the
# per-language best over single / pair / triple task rows, and the "all" rows.
lang_order = ["hi", "ja", "ar"]
df_tt = pd.concat(
    [
        df_t.loc[["mbert"], lang_order],
        df_t.loc[["tt", "wd", "wm"], lang_order],
        # ``.max()`` yields an unnamed series, so the transposed frame has the
        # index label 0, which is then renamed to the summary label.
        df_t.loc[["tt", "wd", "wm"], lang_order]
        .max()
        .to_frame()
        .T.rename(index={0: "one"}),
        df_t.loc[["tt_wd", "tt_wm", "wm_tt", "wm_wd"], lang_order]
        .max()
        .to_frame()
        .T.rename(index={0: "pair"}),
        df_t.loc[["tt_wd_wm", "tt_wm_wd", "wm_wd_tt"], lang_order]
        .max()
        .to_frame()
        .T.rename(index={0: "triple"}),
        df_t.loc[["all", "all_equal"], lang_order],
    ],
    axis=0,
)

# Scores as percentages.
df_tt = df_tt * 100

# Full option key: the bare "precision" pattern is ambiguous on pandas versions
# that also define "styler.format.precision".
with pd.option_context("display.precision", 1):
    display(df_tt)
    print(df_tt.to_latex())


In [None]:
# Put each score next to its relative improvement (in %) over the "mbert" row,
# grouped per language.
df_tt_with_percent = (
    pd.concat({"F1": df_tt, "imp %": ((df_tt / df_tt.loc["mbert"]) - 1) * 100}, axis=1)
    .reorder_levels([1, 0], axis=1)
    .sort_index(axis=1)[["hi", "ja", "ar"]]
)


# Full option key: the bare "precision" pattern is ambiguous on pandas versions
# that also define "styler.format.precision".
with pd.option_context("display.precision", 1):
    display(df_tt_with_percent)
    print(df_tt_with_percent.to_latex())
    # Column-wise best over the summary rows, with and without "triple".
    for summary_rows in (["one", "pair", "triple"], ["one", "pair"]):
        best = df_tt_with_percent.loc[summary_rows, :].max().to_frame().T
        display(best)
        print(best.to_latex())


## Sentiment plots


In [None]:
# Collect the macro-averaged F1 of every sentiment test report, keyed by
# (langs, seq, report type).
df_table = {}
sentiment_report_paths = (
    list(
        Glob(
            f"{BASE_DIR}/mbert_model_ft_*_2t_bce/en_sentiment_model/*test_eval_report.txt"
        )
    )
    + list(Glob(f"{BASE_DIR}/mbert_model/en_sentiment_model/*test_eval_report.txt"))
    + list(
        Glob(
            f"{BASE_DIR}/mbert_model_tt_*_2t_bce/en_sentiment_model/*test_eval_report.txt"
        )
    )
)
for path in sentiment_report_paths:
    # .../<dirname>/en_sentiment_model/<filename>
    dirname = Path(path).parts[-3]
    filename = Path(path).name

    # The regex result takes precedence over the name-based fallbacks.
    match = EXTRACT_TASK_CONFIG.match(dirname)
    if match:
        config = match.groupdict()
    elif dirname == "mbert_model":
        config = {"seq": "mbert", "langs": "en"}
    elif "en_hi_en_ja_en_ar" in dirname:
        config = {
            "seq": "all_equal" if "equal_2t_bce" in dirname else "all",
            "langs": "en",
        }
    else:
        # Previously this fell through with an empty config and surfaced later
        # as an opaque ``KeyError: 'langs'``.
        raise ValueError(
            f"Cannot infer the model configuration from directory {dirname!r} ({path})"
        )

    # Reports whose file name starts with "ssea" are tagged separately.
    reporttype = "SSEA" if filename.startswith("ssea") else "BASE"
    with GFile(path) as fp:
        df_report = pd.read_csv(fp, sep="\t", index_col=[0, 1, 2])
    print(path, reporttype, config)
    lang = config["langs"]
    seq = config["seq"]

    # Keep only the "macro avg" rows and index them by the first index level
    # (index levels 1 and 2 are dropped).
    is_macro_avg = df_report.index.isin(["macro avg"], level=2)
    df_table[(lang, seq, reporttype)] = (
        df_report.loc[is_macro_avg, "f1-score"]
        .rename("F1")
        .reset_index(level=[1, 2], drop=True)
    )

    display(df_table[(lang, seq, reporttype)])


In [None]:
# Repeat the F1 extraction on ``df_report`` (the last report read by the
# loading loop) to inspect the intermediate result.
macro_avg_rows = df_report[df_report.index.isin(["macro avg"], level=2)]
scores_only = macro_avg_rows.drop(["support", "precision", "recall"], axis=1)
scores_only = scores_only.rename(
    columns={"precision": "P", "recall": "R", "f1-score": "F1"}
)
f1_two_levels = scores_only["F1"].reset_index(level=1, drop=True)
f1_two_levels.droplevel(1)


In [None]:
# ``df_table`` is rebound from the dict of series to a single frame with one
# column per (langs, seq, report type). The guard keeps the cell re-runnable:
# concatenating a second time would raise because it is no longer a dict.
if isinstance(df_table, dict):
    df_table = pd.concat(df_table, axis=1).sort_index(axis=1)
# Full option key: the bare "precision" pattern is ambiguous on pandas versions
# that also define "styler.format.precision".
with pd.option_context("display.precision", 3):
    display(df_table)


In [None]:
# Rows whose first key is "en", column "hi".
df_table.T.loc["en", "hi"]


In [None]:
# For each language, pick the rows keyed by that language and the column of
# the same name. Iterating a list (not a set literal) keeps the resulting
# column order identical across kernel restarts.
df_table_langs = {}
for lang in ["hi", "ja", "ar"]:
    df_table_langs[lang] = df_table.T.loc[lang, lang]


In [None]:
# Stack the per-language rows on top of the "en" rows and keep, for every value
# of the first index level, the best score over the remaining level(s).
# ``.max(level=0)`` is deprecated (removed in pandas 2.0); the equivalent
# group-by form is used, with ``sort=False`` to keep the same row order.
df_t = (
    pd.concat(
        [pd.concat(df_table_langs, axis=1), df_table.T.loc["en", ["ja", "ar", "hi"]]],
        axis=0,
    )
    .groupby(level=0, sort=False)
    .max()
)
df_t


In [None]:
# Assemble the sentiment results table: the mbert row, the single-task rows,
# the per-language best over single / pair / triple task rows, and the "all"
# rows.
lang_order = ["hi", "ja", "ar"]
df_tt = pd.concat(
    [
        df_t.loc[["mbert"], lang_order],
        df_t.loc[["tt", "wd", "wm"], lang_order],
        # ``.max()`` yields an unnamed series, so the transposed frame has the
        # index label 0, which is then renamed to the summary label.
        df_t.loc[["tt", "wd", "wm"], lang_order]
        .max()
        .to_frame()
        .T.rename(index={0: "one"}),
        df_t.loc[["tt_wd", "wm_tt", "tt_wm", "wm_wd"], lang_order]
        .max()
        .to_frame()
        .T.rename(index={0: "pair"}),
        df_t.loc[["tt_wd_wm", "tt_wm_wd", "wm_wd_tt"], lang_order]
        .max()
        .to_frame()
        .T.rename(index={0: "triple"}),
        df_t.loc[["all"], lang_order],
        df_t.loc[["all_equal"], lang_order],
    ],
    axis=0,
)

# Scores as percentages.
df_tt = df_tt * 100

# Full option key: the bare "precision" pattern is ambiguous on pandas versions
# that also define "styler.format.precision".
with pd.option_context("display.precision", 1):
    display(df_tt)
    print(df_tt.to_latex())


In [None]:
# Put each score next to its relative improvement (in %) over the "mbert" row,
# grouped per language.
score_and_gain = {"F1": df_tt, "imp %": ((df_tt / df_tt.loc["mbert"]) - 1) * 100}
df_tt_with_percent = (
    pd.concat(score_and_gain, axis=1)
    .reorder_levels([1, 0], axis=1)
    .sort_index(axis=1)[["hi", "ja", "ar"]]
)


with pd.option_context("display.float_format", "{:.1f}".format):
    display(df_tt_with_percent)
    print(df_tt_with_percent.to_latex())
    # Column-wise best over the summary rows, with and without "triple".
    for summary_rows in (["one", "pair", "triple"], ["one", "pair"]):
        display(df_tt_with_percent.loc[summary_rows, :].max().to_frame().T)
        print(df_tt_with_percent.loc[summary_rows, :].max().to_frame().T.to_latex())


## UD POS plots


In [None]:
# Collect the "accuracy" row of every UD POS test report, keyed by
# (langs, seq, report type).
df_table = {}
udpos_report_paths = (
    list(
        Glob(f"{BASE_DIR}/mbert_model_ft_*_2t_bce/en_udpos_model/*test_eval_report.txt")
    )
    + list(Glob(f"{BASE_DIR}/mbert_model/en_udpos_model/*test_eval_report.txt"))
    + list(
        Glob(f"{BASE_DIR}/mbert_model_tt_*_2t_bce/en_udpos_model/*test_eval_report.txt")
    )
)
for path in udpos_report_paths:
    # .../<dirname>/en_udpos_model/<filename>
    dirname = Path(path).parts[-3]
    filename = Path(path).name

    # The regex result takes precedence over the name-based fallbacks.
    match = EXTRACT_TASK_CONFIG.match(dirname)
    if match:
        config = match.groupdict()
    elif dirname == "mbert_model":
        config = {"seq": "mbert", "langs": "en"}
    elif "en_hi_en_ja_en_ar" in dirname:
        config = {
            "seq": "all_equal" if "equal_2t_bce" in dirname else "all",
            "langs": "en",
        }
    else:
        # Previously this fell through with an empty config and surfaced later
        # as an opaque ``KeyError: 'langs'``.
        raise ValueError(
            f"Cannot infer the model configuration from directory {dirname!r} ({path})"
        )

    # Reports whose file name starts with "ssea" are tagged separately.
    reporttype = "SSEA" if filename.startswith("ssea") else "BASE"
    with GFile(path) as fp:
        df_report = pd.read_csv(fp, sep="\t", index_col=[0, 1, 2])
    print(path, reporttype, config)
    lang = config["langs"]
    seq = config["seq"]

    # Keep only the "accuracy" rows (read from the "f1-score" column) and drop
    # the third index level; the first two levels are kept.
    is_accuracy = df_report.index.isin(["accuracy"], level=2)
    df_table[(lang, seq, reporttype)] = (
        df_report.loc[is_accuracy, "f1-score"]
        .rename("F1")
        .reset_index(level=2, drop=True)
    )

    display(df_table[(lang, seq, reporttype)])


In [None]:
# Repeat the accuracy extraction on ``df_report`` (the last report read by the
# loading loop) to inspect the intermediate result.
accuracy_rows = df_report[df_report.index.isin(["accuracy"], level=2)]
scores_only = accuracy_rows.drop(["support", "precision", "recall"], axis=1)
scores_only = scores_only.rename(
    columns={"precision": "P", "recall": "R", "f1-score": "F1"}
)
scores_only["F1"].reset_index(level=2, drop=True)

In [None]:
# ``df_table`` is rebound from the dict of series to a single frame with one
# column per (langs, seq, report type). The guard keeps the cell re-runnable:
# concatenating a second time would raise because it is no longer a dict.
if isinstance(df_table, dict):
    df_table = pd.concat(df_table, axis=1).sort_index(axis=1)
# Full option key: the bare "precision" pattern is ambiguous on pandas versions
# that also define "styler.format.precision".
with pd.option_context("display.precision", 3):
    display(df_table)


In [None]:
# For each language, pick the rows keyed by that language and the column(s) of
# the same name. Iterating a list (not a set literal) keeps the resulting
# column order identical across kernel restarts.
df_table_langs = {}
for lang in ["hi", "ja", "ar"]:
    df_table_langs[lang] = df_table.T.loc[lang, lang]


In [None]:
# Stack the per-language rows on top of the "en" rows, keep the best score for
# every value of the first index level, then drop the second column level.
# ``.max(level=0)`` is deprecated (removed in pandas 2.0); the equivalent
# group-by form is used, with ``sort=False`` to keep the same row order.
df_t = (
    pd.concat(
        [pd.concat(df_table_langs, axis=1), df_table.T.loc["en", ["ja", "ar", "hi"]]],
        axis=0,
    )
    .groupby(level=0, sort=False)
    .max()
    .droplevel(1, axis=1)
)
df_t


In [None]:
# Assemble the UD POS results table: the mbert row, the single-task rows, the
# per-language best over single / pair / triple task rows, and the "all" rows.
lang_order = ["hi", "ja", "ar"]
df_tt = pd.concat(
    [
        df_t.loc[["mbert"], lang_order],
        df_t.loc[["tt", "wd", "wm"], lang_order],
        # ``.max()`` yields an unnamed series, so the transposed frame has the
        # index label 0, which is then renamed to the summary label.
        df_t.loc[["tt", "wd", "wm"], lang_order]
        .max()
        .to_frame()
        .T.rename(index={0: "one"}),
        df_t.loc[["tt_wd", "wm_tt", "tt_wm", "wm_wd"], lang_order]
        .max()
        .to_frame()
        .T.rename(index={0: "pair"}),
        df_t.loc[["tt_wd_wm", "tt_wm_wd", "wm_wd_tt"], lang_order]
        .max()
        .to_frame()
        .T.rename(index={0: "triple"}),
        df_t.loc[["all"], lang_order],
        df_t.loc[["all_equal"], lang_order],
    ],
    axis=0,
)

# Scores as percentages.
df_tt = df_tt * 100

# Full option key: the bare "precision" pattern is ambiguous on pandas versions
# that also define "styler.format.precision".
with pd.option_context("display.precision", 1):
    display(df_tt)
    print(df_tt.to_latex())


In [None]:
# Put each score next to its relative improvement (in %) over the "mbert" row,
# grouped per language.
score_and_gain = {"F1": df_tt, "imp %": ((df_tt / df_tt.loc["mbert"]) - 1) * 100}
df_tt_with_percent = (
    pd.concat(score_and_gain, axis=1)
    .reorder_levels([1, 0], axis=1)
    .sort_index(axis=1)[["hi", "ja", "ar"]]
)


with pd.option_context("display.float_format", "{:.1f}".format):
    display(df_tt_with_percent)
    print(df_tt_with_percent.to_latex())
    # Column-wise best over the summary rows, with and without "triple".
    for summary_rows in (["one", "pair", "triple"], ["one", "pair"]):
        display(df_tt_with_percent.loc[summary_rows, :].max().to_frame().T)
        print(df_tt_with_percent.loc[summary_rows, :].max().to_frame().T.to_latex())


## Embedding Plots


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE


In [None]:
# Location of the ".ft" embeddings archive (presumably the fine-tuned model's
# embeddings -- confirm against the script that wrote the file).
embeddings_path = Path("../data/en_ar_embeddings.ft.npz").expanduser()


In [None]:
# Open the archive at ``embeddings_path``; for .npz files ``np.load`` returns
# a lazy archive object whose arrays are read when a key is accessed.
embedding_data = np.load(embeddings_path)
embedding_data


In [None]:
# Same for the ".base" archive. NOTE: this rebinds ``embeddings_path``, so the
# name no longer points at the ".ft" file after this cell.
embeddings_path = Path("../data/en_ar_embeddings.base.npz").expanduser()
embedding_data_base = np.load(embeddings_path)
embedding_data_base


In [None]:
# Inspect the "labels" array stored alongside the base embeddings.
embedding_data_base["labels"]


In [None]:
# Stack the first ``n`` base embeddings on top of the first ``n`` ".ft"
# embeddings, so the base rows come first and the ".ft" rows second.
# Assumes both archives hold at least ``n`` rows -- TODO confirm.
n = 100
all_embedding = np.vstack(
    [embedding_data_base["embeddings"][:n], embedding_data["embeddings"][:n]]
)
all_embedding.shape


In [None]:
%%time
low_embed = TSNE().fit_transform(all_embedding)

In [None]:
# Split the rows into two equal halves along a new leading axis: index 0 holds
# the first half of the rows (the base embeddings, which were stacked first),
# index 1 the second half; the last axis holds the two t-SNE coordinates.
low_embed = low_embed.reshape(2, -1, 2)


In [None]:
# Scatter consecutive pairs of points (first in black, second in red) for
# both halves of the joint projection: top panel = half 0, bottom = half 1.
fig, ax = plt.subplots(2, 1, sharex=True, sharey=True, figsize=(6, 6))

marker = "o"
for i in range(0, low_embed.shape[1], 2):
    for panel in (0, 1):
        pair = low_embed[panel, i : i + 2]
        ax[panel].scatter(
            pair[:, 0],
            pair[:, 1],
            marker=marker,
            color=["k", "r"],
            s=100,
            alpha=0.2,
        )

for panel, title in enumerate(("mBERT", "mBERT fine-tuned on all languages")):
    ax[panel].set_title(title)
fig.tight_layout()
plt.savefig(Path("../figures/en_ar_embeddings.pdf").expanduser(), bbox_inches="tight")


In [None]:
fig, ax = plt.subplots(1, 2, sharex=False, sharey=False, figsize=(6, 3))

n = all_embedding.shape[0]//2 
print(n)
dist_mbert = []
low_embed = %time TSNE().fit_transform(all_embedding[:n])
for i in range(0, low_embed.shape[0], 2):
  marker = "o"
  ax[0].scatter(low_embed[i:i+2, 0], low_embed[i:i+2, 1], marker=marker, color=["k", "r"], s=100, alpha=0.2)
  d = ((low_embed[i] - low_embed[i+1])**2).sum()
  dist_mbert.append(d)


dist_ft = []
low_embed = %time TSNE().fit_transform(all_embedding[n:])
for i in range(0, low_embed.shape[0], 2):  
  ax[1].scatter(low_embed[i:i+2, 0], low_embed[i:i+2, 1], marker=marker, color=["k", "r"], s=100, alpha=0.2)
  d = ((low_embed[i] - low_embed[i+1])**2).sum()
  dist_ft.append(d)
  
ax[0].set_title("mBERT")
ax[1].set_title("mBERT + TPP")
ax[0].axis("off")
ax[1].axis("off")
fig.tight_layout()
plt.savefig(Path("../figures/en_ar_embeddings.pdf").expanduser(), bbox_inches="tight")

In [None]:
fig, ax = plt.subplots(1, 2, sharex=False, sharey=False, figsize=(6, 3))

n = all_embedding.shape[0]//2 
print(n)
dist_mbert = []
low_embed = %time TSNE().fit_transform(all_embedding[:n])
markers = ["o", "s"]
colors = ["k", "r"]
langs = ["EN", "AR"]
for i, lang in enumerate(langs):
  ax[0].scatter(low_embed[i::2, 0], low_embed[i::2, 1], marker=markers[i], color=colors[i], s=100, alpha=0.2, label=lang)
  
dist_mbert = np.linalg.norm(low_embed[0::2] - low_embed[1::2], axis=1)



dist_ft = []
low_embed = %time TSNE().fit_transform(all_embedding[n:])
for i, lang in enumerate(langs):
  ax[1].scatter(low_embed[i::2, 0], low_embed[i::2, 1], marker=markers[i], color=colors[i], s=100, alpha=0.2, label=lang)
  
dist_ft = np.linalg.norm(low_embed[0::2] - low_embed[1::2], axis=1)
  
ax[0].set_title("mBERT")
ax[1].set_title("mBERT + TPP")
ax[0].axis("off")
ax[1].axis("off")
ax[0].legend()
fig.tight_layout()
plt.savefig(Path("../figures/en_ar_embeddings.pdf").expanduser(), bbox_inches="tight")

In [None]:
# Rescale both distance vectors by their own maximum, so each peaks at 1.
max_dist_mbert, max_dist_ft = np.max(dist_mbert), np.max(dist_ft)
dist_mbert = dist_mbert / max_dist_mbert
dist_ft = dist_ft / max_dist_ft


In [None]:
# Pairwise distances before vs. after, with the y = x diagonal as reference.
ax = plt.gca()
ax.plot(dist_mbert, dist_ft, linestyle="none", marker="o", color="k")
ax.plot([0, 1], [0, 1], color="0.5", linestyle="--", lw=1)
for side in ("top", "right"):
    ax.spines[side].set_visible(False)
ax.set_xlabel("mBERT distance")
ax.set_ylabel("mBERT + TPP distance")
figure_path = Path("../figures/en_ar_embeddings_dist.pdf").expanduser()
plt.savefig(figure_path, bbox_inches="tight")


In [None]:
# Per-pair change in distance, followed by its mean and standard deviation.
total_delta_dist = dist_ft - dist_mbert
delta_mean = total_delta_dist.mean()
delta_std = total_delta_dist.std()
delta_mean, delta_std


In [None]:
fig, ax = plt.subplots(1, 2, sharex=False, sharey=False, figsize=(6, 3))

n = all_embedding.shape[0]//2 
print(n)
dist_mbert = []
plot_n = 20
low_embed = %time TSNE().fit_transform(all_embedding[:n])[:plot_n]
markers = ["o", "s"]
colors = ["k", "r"]
for i, lang in enumerate(["en", "ar"]):
  ax[0].scatter(low_embed[i::2, 0], low_embed[i::2, 1], marker=markers[i], color=colors[i], s=100, alpha=0.2, label=lang)
  
dist_mbert = np.linalg.norm(low_embed[0::2] - low_embed[1::2], axis=1)



dist_ft = []
low_embed = %time TSNE().fit_transform(all_embedding[n:])[:plot_n]
for i, lang in enumerate(["en", "ar"]):
  ax[1].scatter(low_embed[i::2, 0], low_embed[i::2, 1], marker=markers[i], color=colors[i], s=100, alpha=0.2, label=lang)
  
dist_ft = np.linalg.norm(low_embed[0::2] - low_embed[1::2], axis=1)
  
ax[0].set_title("mBERT")
ax[1].set_title("mBERT + TPP")
ax[0].axis("off")
ax[1].axis("off")
ax[0].legend(loc="upper left")
fig.tight_layout()
plt.savefig(Path("../figures/en_ar_embeddings.pdf").expanduser(), bbox_inches="tight")

In [None]:
# JSON-lines file read below (one JSON object per line).
data_file = Path("../data/en_ar_tatoeba.json").expanduser()


In [None]:
import json


In [None]:
# Read the sentences of the first lines of ``data_file``; every sentence is
# labelled with the (0-based) number of the line it came from.
sentences = []
labels = []
max_line_idx = 1000  # lines with a larger index are not read
# JSON text is UTF-8; do not rely on the platform's default encoding.
with data_file.open(encoding="utf-8") as fp:
    for i, line in enumerate(fp):
        line = line.strip()
        if not line:
            continue
        if i > max_line_idx:
            break
        record = json.loads(line)
        line_sents = record["unique_label_desc"]
        sentences.extend(line_sents)
        labels.extend([i] * len(line_sents))


In [None]:
# Convert to an array so it can be sliced with a step below; peek at the start.
sentences = np.asarray(sentences)
sentences[:10]


In [None]:
# One row per sentence pair: even positions of the first ``n`` sentences go to
# the "en" column, odd positions to "ar" (assumes the sentences alternate
# en/ar in the same order as the embeddings -- TODO confirm).
# The distance arrays must have exactly one value per row (n / 2 values).
df_lang_dists = pd.DataFrame(
    {lang: sentences[:n][i::2] for i, lang in enumerate(["en", "ar"])}
).assign(dist_mbert=dist_mbert, dist_ft=dist_ft, total_delta_dist=total_delta_dist)

df_lang_dists


In [None]:
# Rows picked as examples; shown as-is and printed as LaTeX with the "ar"
# text wrapped in ``RL{ ... }``.
paper_idx = [1, 2, 37, 39]
paper_rows = df_lang_dists.loc[paper_idx]
with pd.option_context("display.float_format", "{:.1f}".format):
    display(paper_rows)
    latex_rows = paper_rows.assign(
        ar=lambda x: x.ar.apply(lambda k: f"RL{{ {k} }}")
    )
    print(latex_rows.to_latex())
