In [4]:
import pandas as pd
from util import get_data
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, roc_curve
import matplotlib.pyplot as plt
from sentence_transformers import CrossEncoder, InputExample
from sentence_transformers.cross_encoder.evaluation import (
    CESoftmaxAccuracyEvaluator,
)
from sklearn.metrics import auc, roc_curve
from torch.utils.data import DataLoader


def make_data(train, test):
    """Wrap the rows of two DataFrames as ``InputExample`` sentence pairs.

    Args:
        train: DataFrame providing ``s1``, ``s2`` and ``label`` columns.
        test: DataFrame providing the same three columns.

    Returns:
        A ``(train_examples, test_examples)`` tuple of lists holding one
        ``InputExample`` per row, in row order.
    """

    def to_examples(frame):
        # One example per row: the two sentences form the text pair and the
        # row's ``label`` value is carried over unchanged.
        return [
            InputExample(texts=[record["s1"], record["s2"]], label=record["label"])
            for _, record in frame.iterrows()
        ]

    return to_examples(train), to_examples(test)


def separate_splits(
    lang, adversarial_train, adversarial_test, model_name="xlm-roberta-base"
):
    """Fine-tune a two-label cross-encoder and score it on the held-out split.

    Args:
        lang: Language tag; only used for the title of the ROC plot.
        adversarial_train: DataFrame with ``s1``, ``s2`` and ``label`` columns
            used to fit the model.
        adversarial_test: DataFrame with the same columns; passed to ``fit``
            as the evaluator's data and used for the final ROC curve.
        model_name: Model name handed to ``CrossEncoder``.

    Returns:
        The area under the ROC curve computed from hard 0/1 predictions on
        ``adversarial_test``. The curve is also plotted via ``plot_auc``.
    """
    fit_examples, eval_examples = make_data(adversarial_train, adversarial_test)

    loader = DataLoader(fit_examples, shuffle=True, batch_size=32)
    accuracy_evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(
        eval_examples, name="adversarial_test"
    )
    cross_encoder = CrossEncoder(model_name, num_labels=2)
    num_epochs = 4
    # Warm up over 10% of all steps (batches per epoch * number of epochs).
    cross_encoder.fit(
        train_dataloader=loader,
        evaluator=accuracy_evaluator,
        epochs=num_epochs,
        warmup_steps=len(loader) * 0.1 * num_epochs,
        show_progress_bar=False,
    )

    eval_texts = [example.texts for example in eval_examples]
    gold_labels = [example.label for example in eval_examples]

    # NOTE(review): a pair is labelled 1 when its FIRST score is the larger
    # one. If the two scores are ordered by class index this is the opposite
    # of argmax decoding and would push the AUC below 0.5 -- confirm before
    # changing, because downstream thresholds were tuned on these values.
    hard_predictions = []
    for scores in cross_encoder.predict(eval_texts):
        hard_predictions.append(1 if scores[0] > scores[1] else 0)

    fpr, tpr, _ = roc_curve(gold_labels, hard_predictions)
    roc_auc = auc(fpr, tpr)
    plot_auc(fpr, tpr, roc_auc, lang=lang)
    return roc_auc

def plot_auc(fpr, tpr, roc_auc, lang):
    """Draw a ROC curve next to the chance diagonal and show the figure.

    Args:
        fpr: False positive rates (x values of the curve).
        tpr: True positive rates (y values of the curve).
        roc_auc: Area under the curve, shown in the legend with two decimals.
        lang: Language tag shown in the plot title.
    """
    line_width = 2
    _, ax = plt.subplots()

    # The ROC curve itself, then the dashed diagonal as a visual baseline.
    ax.plot(
        fpr,
        tpr,
        color="darkorange",
        lw=line_width,
        label="ROC curve (area = %0.2f)" % roc_auc,
    )
    ax.plot([0, 1], [0, 1], color="navy", lw=line_width, linestyle="--")

    ax.set_xlim([0.0, 1.0])
    # A little headroom above 1.0 so a curve touching the top stays visible.
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title(f"ROC curve for {lang}")
    ax.legend(loc="lower right")
    plt.show()

In [None]:
from tqdm.notebook import tqdm

lang = "eng"
window_size = 200

train = get_data(lang=lang, train=True)
# Keep only the sentence pair of the test split and tag every row as class 1.
# ``assign`` returns a new frame, so the label is never written into a column
# slice (which is what triggers pandas' SettingWithCopyWarning).
test = get_data(lang=lang, test=True)[["s1", "s2"]].assign(label=1)

# Walk through the TRAINING data in consecutive windows and measure, for each
# window, how well a classifier can tell its rows (label 0) apart from the
# test rows (label 1).
# Stepping by ``window_size`` yields every full window plus one trailing
# partial window only when rows are left over. Iterating
# ``range(len(train) // window_size + 1)`` instead produces an EMPTY last
# window whenever the length is an exact multiple of the window size, which
# fails on ``ids[0]`` after a wasted training run.
window_starts = range(0, len(train), window_size)
num_windows = len(window_starts)
print(f"Number of windows: {num_windows}")

distribution_scores = {}
for window_start in tqdm(window_starts):
    window = train.iloc[window_start : window_start + window_size]
    ids = window["PairID"].values
    # Key each score by the first and last PairID covered by the window.
    window_range = f"{ids[0]}-{ids[-1]}"

    train_window = window[["s1", "s2"]].assign(label=0)
    df = pd.concat([train_window, test])
    train_window, test_window = train_test_split(df, test_size=0.3)
    roc_auc = separate_splits(
        lang, train_window, test_window, model_name="roberta-base"
    )
    distribution_scores[window_range] = roc_auc

In [2]:
# Show the mapping from window PairID range to its ROC AUC score.
distribution_scores

{'ENG-train-0000-ENG-train-0199': 0.26305990581535427,
 'ENG-train-0200-ENG-train-0399': 0.5,
 'ENG-train-0400-ENG-train-0599': 0.5,
 'ENG-train-0600-ENG-train-0799': 0.5,
 'ENG-train-0800-ENG-train-0999': 0.42578157392818644,
 'ENG-train-1000-ENG-train-1199': 0.4627487575685236,
 'ENG-train-1200-ENG-train-1399': 0.5,
 'ENG-train-1400-ENG-train-1599': 0.49122807017543857,
 'ENG-train-1600-ENG-train-1799': 0.41319261768976623,
 'ENG-train-1800-ENG-train-1999': 0.453024453024453,
 'ENG-train-2000-ENG-train-2199': 0.35012191405059434,
 'ENG-train-2200-ENG-train-2399': 0.3660545905707196,
 'ENG-train-2400-ENG-train-2599': 0.4396551724137931,
 'ENG-train-2600-ENG-train-2799': 0.33656330749354013,
 'ENG-train-2800-ENG-train-2999': 0.3663194444444445,
 'ENG-train-3000-ENG-train-3199': 0.35657240825538744,
 'ENG-train-3200-ENG-train-3399': 0.3728957625010366,
 'ENG-train-3400-ENG-train-3599': 0.3496919277888327,
 'ENG-train-3600-ENG-train-3799': 0.36065573770491804,
 'ENG-train-3800-ENG-train-

In [4]:
# Keep the windows whose ROC AUC is at least this value; those windows are
# treated as overlapping with the test distribution.
# NOTE(review): the recorded scores sit at or below 0.5, so this cut-off
# selects the windows closest to chance level -- confirm it still means that
# if the way the scores are computed ever changes.
min_roc_auc = 0.4

overlapping_dist = {
    window_range: score
    for window_range, score in distribution_scores.items()
    if score >= min_roc_auc
}
good_samples = overlapping_dist.keys()
for sample in good_samples:
    print(sample)

ENG-train-0200-ENG-train-0399
ENG-train-0400-ENG-train-0599
ENG-train-0600-ENG-train-0799
ENG-train-0800-ENG-train-0999
ENG-train-1000-ENG-train-1199
ENG-train-1200-ENG-train-1399
ENG-train-1400-ENG-train-1599
ENG-train-1600-ENG-train-1799
ENG-train-1800-ENG-train-1999
ENG-train-2400-ENG-train-2599
ENG-train-4000-ENG-train-4199


In [5]:
def _bounds(window_key):
    """Return the numeric (first id, last id) encoded in a window key.

    A key such as "ENG-train-0200-ENG-train-0399" splits on "-" into six
    fields; the two numbers are fields 2 and 5.
    """
    fields = window_key.split("-")
    return int(fields[2]), int(fields[5])


# Order the retained windows numerically by (first id, last id).
sorted_samples = sorted(overlapping_dist.keys(), key=_bounds)

# Fuse windows that directly follow each other (the next first id equals the
# previous last id + 1) into a single inclusive (start, end) pair.
good_pairs = []
for first_id, last_id in map(_bounds, sorted_samples):
    if good_pairs and first_id == good_pairs[-1][1] + 1:
        # Contiguous with the block being built: extend its end.
        good_pairs[-1] = (good_pairs[-1][0], last_id)
    else:
        # Gap (or very first window): start a new block.
        good_pairs.append((first_id, last_id))

for pair in good_pairs:
    print(pair)

(200, 1999)
(2400, 2599)
(4000, 4199)


In [6]:
# Expand every inclusive (start, end) pair into the individual ids it covers.
numbers = [
    simple_id for start, end in good_pairs for simple_id in range(start, end + 1)
]

In [9]:
from util import get_data
lang = "eng"
# Rebuild the training set so that it only holds the rows whose numeric id
# falls inside one of the retained windows.
train = get_data(lang=lang, train=True, clean=False)
# The numeric id is the last "-"-separated field of PairID, e.g. 200 for
# "ENG-train-0200".
train["simple_id"] = train["PairID"].map(
    lambda pair_id: int(pair_id.rsplit("-", 1)[-1])
)
in_distribution = train["simple_id"].isin(numbers)
train = train.loc[in_distribution]
train

Unnamed: 0,PairID,Score,s1,s2,simple_id
200,ENG-train-0200,0.91,"You try to forget everything good, and remembe...","you just try to forget everything good, and re...",200
201,ENG-train-0201,0.91,It is up to your boyfriend to tell her.,Your bf should be the on telling her.,201
202,ENG-train-0202,0.91,I'll talk to the son of a- Foyle!,I want to talk to this son-of-a-Foyle!,202
203,ENG-train-0203,0.91,The orchestra 's reputation increased most pro...,Serge Koussevitzky was their conductor for man...,203
204,ENG-train-0204,0.91,I definitely believe that there is a 'special ...,I think their is a 'special someone' for every...,204
...,...,...,...,...,...
4195,ENG-train-4195,0.31,"Because I'm against common core, common crooks...","#BENGHAZI WAIT DID I SAY ""THANK GOD?"" VOTERS? ...",4195
4196,ENG-train-4196,0.31,It varies from place to place.,That is a good place to start.,4196
4197,ENG-train-4197,0.31,I'm pitchingthis at Fox in half an hour.,"Which means what, half an hour?",4197
4198,ENG-train-4198,0.31,We live in a world where people care more abou...,So unfortunate #thebriefcase @cbs. Adoption is...,4198


In [10]:
# Remove the helper id column before exporting. ``errors="ignore"`` keeps the
# cell re-runnable: a second execution no longer raises KeyError because the
# column has already been dropped.
train = train.drop(columns=["simple_id"], errors="ignore")
train.to_csv(f"data/adversarial_validation-{lang}-train.csv", index=False)