<a href="https://colab.research.google.com/github/tatiana-iazykova/2020_HACK_RUSSIANSUPERGLUE/blob/main/BaseLine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import os
import random 
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.metrics import matthews_corrcoef


def seed_everything(seed: int):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.RandomState(seed)
    np.random.seed(seed)
    # torch.manual_seed(seed)
    # torch.cuda.manual_seed(seed)
    # torch.backends.cudnn.deterministic = True

seed_everything(42)
SEEDS = [42, 23, 1234567]

In [4]:
class Baseline:

    def __init__(self, path: str, path_valid=None, seed=42):
        
        self.path = path
        self.train = pd.read_json(path_or_buf=path, lines=True)
        self.seed = seed
        if path_valid:
            self.valid = pd.read_json(path_or_buf=path_valid, lines=True)
        else:
            self.valid = None
    
    def f1_accuracy_only(self, predict_function):
        """
        F1-macro and accuracy metrics only
        """
        test_size = len(self.valid)
        y_true = list(self.valid.label)
        y_pred = predict_function(test_size=test_size)
        accuracy = accuracy_score(y_true, y_pred)
        f1_macro = f1_score(y_true, y_pred, average='macro')
        return f1_macro, accuracy    

    def mc_only(self, predict_function):
        """
        Mathews Correlation only (for Lidirus)
        """
        test_size = len(self.train)
        y_true = self.train.label
        y_pred = predict_function(test_size=test_size)
        mc_metric = matthews_corrcoef(y_true, y_pred)
        return mc_metric
    
    def majority(self):
        """
        Majority prediction and classification report/mc metric
        """
        if self.valid is not None:
            test_size = len(self.valid)
            y_true = list(self.valid.label)
        else:
            print("No Valid dataset. Making Predictions for Train dataset")
            test_size = len(self.train)
            y_true = self.train.label
            
        print(f"\nMaking Prediction based on Majority Class")
        y_pred = self.majority_class(test_size=test_size)

        if 'lidirus' in self.path.lower():
            print(f" Matthews Correlation: {self.show_mc(y_true, y_pred)}")
        else:
            self.show_report(y_true, y_pred)

    def show_report(self, y_true, y_pred):
        print(classification_report(y_true, y_pred))

    def show_mc(self, y_true, y_pred):
        return matthews_corrcoef(y_true, y_pred)
       
    def majority_class(self, test_size):
        """
        Make prediction based on majority class of train dataset
        test_size: how many predictions should be made
        return: List of predictions
        """

        prediction = self.train.label.mode()[0]
        y_pred = [prediction] * test_size
        return y_pred

    def random_choice(self, test_size):
        """
        Make random predictions
        label: label column in df (str)
        test_size: how many predictions should be made
        return: List of predictions
        """
        options = sorted(self.train.label.unique())
        if test_size != 1:
            np.random.seed(self.seed)
        y_pred = np.random.choice(options, size=test_size)
        return y_pred

    def random_balanced_choice(self, test_size):
        """
        Make random predictions with calculated probabilities
        label: label column in df (str)
        test_size: how many predictions should be made
        return: List of predictions
        """
        frequences = dict(self.train.label.value_counts(normalize=True))

        labels = []
        probs = []
        for key, value in frequences.items():
            labels.append(key)
            probs.append(value)
        if test_size != 1:
            np.random.seed(self.seed)
        y_pred = np.random.choice(labels, size=test_size, p=probs)
        return y_pred

# LiDiRus

In [None]:
%%capture
%%bash
# change url if you want to work with a different RSG dataset
wget -q "https://russiansuperglue.com/ru/tasks/download/LiDiRus" -O temp.zip
unzip temp.zip -d data

# remove unnecessary directories and files
rm temp.zip
rm -r data/__MACOSX
rm -r sample_data/

In [None]:
lidirus = Baseline(path='data/LiDiRus/LiDiRus.jsonl')
lidirus.majority()

No Valid dataset. Making Predictions for Train dataset

Making Prediction based on Majority Class
 Matthews Correlation: 0.0


In [None]:
metrics = []
for seed in SEEDS:
    lidirus = Baseline(path='data/LiDiRus/LiDiRus.jsonl', seed=seed)
    metrics.append(lidirus.mc_only(predict_function=lidirus.random_choice))
print(f"Random Choice")
print(f"Average MC score over {len(SEEDS)} experiments: {np.array(metrics).mean()}")

Random Choice
Average MC score over 3 experiments: 0.019522405921979163


In [None]:
metrics = []
for seed in SEEDS:
    lidirus = Baseline(path='data/LiDiRus/LiDiRus.jsonl', seed=seed)
    metrics.append(lidirus.mc_only(predict_function=lidirus.random_balanced_choice))
print(f"Random Balanced Choice")
print(f"Average MC score over {len(SEEDS)} experiments: {np.array(metrics).mean()}")

Random Balanced Choice
Average MC score over 3 experiments: -0.012200643650637838


# RCB

In [None]:
%%capture
%%bash
# change url if you want to work with a different RSG dataset
wget -q "https://russiansuperglue.com/tasks/download/RCB" -O temp.zip
unzip temp.zip -d data

# remove unnecessary directories and files
rm temp.zip
rm -r data/__MACOSX
rm -r sample_data/

In [None]:
rcb = Baseline(path='data/RCB/train.jsonl',
               path_valid='data/RCB/val.jsonl')
rcb.majority()


Making Prediction based on Majority Class
               precision    recall  f1-score   support

contradiction       0.00      0.00      0.00        30
   entailment       0.00      0.00      0.00        74
      neutral       0.53      1.00      0.69       116

     accuracy                           0.53       220
    macro avg       0.18      0.33      0.23       220
 weighted avg       0.28      0.53      0.36       220



In [None]:
f1a_metrics = []
acc_metrics = []
for seed in SEEDS:
    rcb = Baseline(path='data/RCB/train.jsonl', 
                   path_valid='data/RCB/val.jsonl',
                   seed=seed)
    
    f1, acc = rcb.f1_accuracy_only(predict_function=rcb.random_choice)
    f1a_metrics.append(f1)
    acc_metrics.append(acc)
print(f"Random Choice")
print(f"Average Accuracy score over {len(SEEDS)} experiments: {np.array(acc_metrics).mean()}")
print(f"Average F1-macro score over {len(SEEDS)} experiments: {np.array(f1a_metrics).mean()}")

Random Choice
Average Accuracy score over 3 experiments: 0.35000000000000003
Average F1-macro score over 3 experiments: 0.3184430925012964


In [None]:
f1a_metrics = []
acc_metrics = []
for seed in SEEDS:
    rcb = Baseline(path='data/RCB/train.jsonl', 
                   path_valid='data/RCB/val.jsonl',
                   seed=seed)
    
    f1, acc = rcb.f1_accuracy_only(predict_function=rcb.random_balanced_choice)
    f1a_metrics.append(f1)
    acc_metrics.append(acc)
print(f"Random Balanced Choice")
print(f"Average Accuracy score over {len(SEEDS)} experiments: {np.array(acc_metrics).mean()}")
print(f"Average F1-macro score over {len(SEEDS)} experiments: {np.array(f1a_metrics).mean()}")

Random Balanced Choice
Average Accuracy score over 3 experiments: 0.37121212121212116
Average F1-macro score over 3 experiments: 0.3076621045848644


# PARus

In [None]:
%%capture
%%bash
# change url if you want to work with a different RSG dataset
wget -q  "https://russiansuperglue.com/tasks/download/PARus" -O temp.zip
unzip temp.zip -d data

# remove unnecessary directories and files
rm temp.zip
rm -r data/__MACOSX
rm -r sample_data/

In [None]:
parus = Baseline(path='data/PARus/train.jsonl',
               path_valid='data/PARus/val.jsonl')
parus.majority()


Making Prediction based on Majority Class
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        55
           1       0.45      1.00      0.62        45

    accuracy                           0.45       100
   macro avg       0.23      0.50      0.31       100
weighted avg       0.20      0.45      0.28       100



In [None]:
f1a_metrics = []
acc_metrics = []
for seed in SEEDS:
    parus = Baseline(path='data/PARus/train.jsonl',
                     path_valid='data/PARus/val.jsonl',
                     seed=seed)
    
    f1, acc = parus.f1_accuracy_only(predict_function=parus.random_choice)
    f1a_metrics.append(f1)
    acc_metrics.append(acc)
print(f"Random Choice")
print(f"Average Accuracy score over {len(SEEDS)} experiments: {np.array(acc_metrics).mean()}")
print(f"Average F1-macro score over {len(SEEDS)} experiments: {np.array(f1a_metrics).mean()}")

Random Choice
Average Accuracy score over 3 experiments: 0.5166666666666666
Average F1-macro score over 3 experiments: 0.5151684790713317


In [None]:
f1a_metrics = []
acc_metrics = []
for seed in SEEDS:
    parus = Baseline(path='data/PARus/train.jsonl',
                     path_valid='data/PARus/val.jsonl',
                     seed=seed)
    
    f1, acc = parus.f1_accuracy_only(predict_function=parus.random_balanced_choice)
    f1a_metrics.append(f1)
    acc_metrics.append(acc)
print(f"Random Balanced Choice")
print(f"Average Accuracy score over {len(SEEDS)} experiments: {np.array(acc_metrics).mean()}")
print(f"Average F1-macro score over {len(SEEDS)} experiments: {np.array(f1a_metrics).mean()}")

Random Balanced Choice
Average Accuracy score over 3 experiments: 0.52
Average F1-macro score over 3 experiments: 0.5195209881342803


#TERRa

In [None]:
%%capture
%%bash
# change url if you want to work with a different RSG dataset
wget -q  "https://russiansuperglue.com/tasks/download/TERRa" -O temp.zip
unzip temp.zip -d data

# remove unnecessary directories and files
rm temp.zip
rm -r data/__MACOSX
rm -r sample_data/

In [None]:
terra = Baseline(path='data/TERRa/train.jsonl',
               path_valid='data/TERRa/val.jsonl')
terra.majority()


Making Prediction based on Majority Class
                precision    recall  f1-score   support

    entailment       0.50      1.00      0.67       153
not_entailment       0.00      0.00      0.00       154

      accuracy                           0.50       307
     macro avg       0.25      0.50      0.33       307
  weighted avg       0.25      0.50      0.33       307



In [None]:
f1a_metrics = []
acc_metrics = []
for seed in SEEDS:
    terra = Baseline(path='data/TERRa/train.jsonl',
                     path_valid='data/TERRa/val.jsonl',
                     seed=seed)
    
    f1, acc = terra.f1_accuracy_only(predict_function=terra.random_choice)
    f1a_metrics.append(f1)
    acc_metrics.append(acc)
print(f"Random Choice")
print(f"Average Accuracy score over {len(SEEDS)} experiments: {np.array(acc_metrics).mean()}")
print(f"Average F1-macro score over {len(SEEDS)} experiments: {np.array(f1a_metrics).mean()}")

Random Choice
Average Accuracy score over 3 experiments: 0.498371335504886
Average F1-macro score over 3 experiments: 0.4981518746617879


In [None]:
f1a_metrics = []
acc_metrics = []
for seed in SEEDS:
    terra = Baseline(path='data/TERRa/train.jsonl',
                     path_valid='data/TERRa/val.jsonl',
                     seed=seed)
    
    f1, acc = terra.f1_accuracy_only(predict_function=terra.random_balanced_choice)
    f1a_metrics.append(f1)
    acc_metrics.append(acc)
print(f"Random Balanced Choice")
print(f"Average Accuracy score over {len(SEEDS)} experiments: {np.array(acc_metrics).mean()}")
print(f"Average F1-macro score over {len(SEEDS)} experiments: {np.array(f1a_metrics).mean()}")

Random Balanced Choice
Average Accuracy score over 3 experiments: 0.49294245385450597
Average F1-macro score over 3 experiments: 0.49242854242830375


# RUSSE

In [None]:
%%capture
%%bash
# change url if you want to work with a different RSG dataset
wget -q "https://russiansuperglue.com/tasks/download/RUSSE" -O temp.zip
unzip temp.zip -d data

# remove unnecessary directories and files
rm temp.zip
rm -r data/__MACOSX
rm -r sample_data/

In [None]:
russe = Baseline(path='data/RUSSE/train.jsonl',
               path_valid='data/RUSSE/val.jsonl')
russe.majority()


Making Prediction based on Majority Class
              precision    recall  f1-score   support

       False       0.63      1.00      0.77      5366
        True       0.00      0.00      0.00      3139

    accuracy                           0.63      8505
   macro avg       0.32      0.50      0.39      8505
weighted avg       0.40      0.63      0.49      8505



In [None]:
f1a_metrics = []
acc_metrics = []
for seed in SEEDS:
    russe = Baseline(path='data/RUSSE/train.jsonl',
               path_valid='data/RUSSE/val.jsonl',
               seed=seed)
    
    f1, acc = russe.f1_accuracy_only(predict_function=russe.random_choice)
    f1a_metrics.append(f1)
    acc_metrics.append(acc)
print(f"Random Choice")
print(f"Average Accuracy score over {len(SEEDS)} experiments: {np.array(acc_metrics).mean()}")
print(f"Average F1-macro score over {len(SEEDS)} experiments: {np.array(f1a_metrics).mean()}")

Random Choice
Average Accuracy score over 3 experiments: 0.49900058788947677
Average F1-macro score over 3 experiments: 0.49057702373389867


In [None]:
f1a_metrics = []
acc_metrics = []
for seed in SEEDS:
    russe = Baseline(path='data/RUSSE/train.jsonl',
               path_valid='data/RUSSE/val.jsonl',
               seed=seed)
    
    f1, acc = russe.f1_accuracy_only(predict_function=russe.random_balanced_choice)
    f1a_metrics.append(f1)
    acc_metrics.append(acc)
print(f"Random Balanced Choice")
print(f"Average Accuracy score over {len(SEEDS)} experiments: {np.array(acc_metrics).mean()}")
print(f"Average F1-macro score over {len(SEEDS)} experiments: {np.array(f1a_metrics).mean()}")

Random Balanced Choice
Average Accuracy score over 3 experiments: 0.5342347638643935
Average F1-macro score over 3 experiments: 0.4962201497224709


# RWSD

In [None]:
%%capture
%%bash
# change url if you want to work with a different RSG dataset
wget -q "https://russiansuperglue.com/tasks/download/RWSD" -O temp.zip
unzip temp.zip -d data

# remove unnecessary directories and files
rm temp.zip
rm -r data/__MACOSX
rm -r sample_data/

In [None]:
rwsd = Baseline(path='data/RWSD/train.jsonl',
               path_valid='data/RWSD/val.jsonl')
rwsd.majority()


Making Prediction based on Majority Class
              precision    recall  f1-score   support

       False       0.55      1.00      0.71       113
        True       0.00      0.00      0.00        91

    accuracy                           0.55       204
   macro avg       0.28      0.50      0.36       204
weighted avg       0.31      0.55      0.39       204



In [None]:
f1a_metrics = []
acc_metrics = []
for seed in SEEDS:
    rwsd = Baseline(path='data/RWSD/train.jsonl',
                    path_valid='data/RWSD/val.jsonl',
                    seed=seed)
    
    f1, acc = rwsd.f1_accuracy_only(predict_function=rwsd.random_choice)
    f1a_metrics.append(f1)
    acc_metrics.append(acc)
print(f"Random Choice")
print(f"Average Accuracy score over {len(SEEDS)} experiments: {np.array(acc_metrics).mean()}")
print(f"Average F1-macro score over {len(SEEDS)} experiments: {np.array(f1a_metrics).mean()}")

Random Choice
Average Accuracy score over 3 experiments: 0.5261437908496732
Average F1-macro score over 3 experiments: 0.5238139284486554


In [None]:
f1a_metrics = []
acc_metrics = []
for seed in SEEDS:
    rwsd = Baseline(path='data/RWSD/train.jsonl',
                    path_valid='data/RWSD/val.jsonl',
                    seed=seed)
    
    f1, acc = rwsd.f1_accuracy_only(predict_function=rwsd.random_balanced_choice)
    f1a_metrics.append(f1)
    acc_metrics.append(acc)
print(f"Random Balanced Choice")
print(f"Average Accuracy score over {len(SEEDS)} experiments: {np.array(acc_metrics).mean()}")
print(f"Average F1-macro score over {len(SEEDS)} experiments: {np.array(f1a_metrics).mean()}")

Random Balanced Choice
Average Accuracy score over 3 experiments: 0.4950980392156863
Average F1-macro score over 3 experiments: 0.4908715414536949


# DaNetQA

In [None]:
%%capture
%%bash
# change url if you want to work with a different RSG dataset
wget -q "https://russiansuperglue.com/tasks/download/DaNetQA" -O temp.zip
unzip temp.zip -d data

# remove unnecessary directories and files
rm temp.zip
rm -r data/__MACOSX
rm -r sample_data/

In [None]:
danetqa = Baseline(path='data/DaNetQA/train.jsonl',
               path_valid='data/DaNetQA/val.jsonl')
danetqa.majority()


Making Prediction based on Majority Class
              precision    recall  f1-score   support

       False       0.00      0.00      0.00       409
        True       0.50      1.00      0.67       412

    accuracy                           0.50       821
   macro avg       0.25      0.50      0.33       821
weighted avg       0.25      0.50      0.34       821



In [None]:
f1a_metrics = []
acc_metrics = []
for seed in SEEDS:
    danetqa = Baseline(path='data/DaNetQA/train.jsonl',
                       path_valid='data/DaNetQA/val.jsonl',
                       seed=seed)
    
    f1, acc = danetqa.f1_accuracy_only(predict_function=danetqa.random_choice)
    f1a_metrics.append(f1)
    acc_metrics.append(acc)
print(f"Random Choice")
print(f"Average Accuracy score over {len(SEEDS)} experiments: {np.array(acc_metrics).mean()}")
print(f"Average F1-macro score over {len(SEEDS)} experiments: {np.array(f1a_metrics).mean()}")

Random Choice
Average Accuracy score over 3 experiments: 0.5002030044660982
Average F1-macro score over 3 experiments: 0.5000465922796313


In [None]:
f1a_metrics = []
acc_metrics = []
for seed in SEEDS:
    danetqa = Baseline(path='data/DaNetQA/train.jsonl',
                       path_valid='data/DaNetQA/val.jsonl',
                       seed=seed)
    
    f1, acc = danetqa.f1_accuracy_only(predict_function=danetqa.random_balanced_choice)
    f1a_metrics.append(f1)
    acc_metrics.append(acc)
print(f"Random Balanced Choice")
print(f"Average Accuracy score over {len(SEEDS)} experiments: {np.array(acc_metrics).mean()}")
print(f"Average F1-macro score over {len(SEEDS)} experiments: {np.array(f1a_metrics).mean()}")

Random Balanced Choice
Average Accuracy score over 3 experiments: 0.4973609419407227
Average F1-macro score over 3 experiments: 0.48881400729269947


# MuSeRC

In [1]:
%%capture
%%bash
# change url if you want to work with a different RSG dataset
wget -q "https://russiansuperglue.com/tasks/download/MuSeRC" -O temp.zip
unzip temp.zip -d data

# remove unnecessary directories and files
rm temp.zip
rm -r data/__MACOSX
rm -r sample_data/

In [2]:
%%capture
%%bash
wget -q "https://github.com/RussianNLP/RussianSuperGLUE/raw/master/tfidf_baseline/MuSeRC.py" -O MuSeRC.py

In [40]:
import functools
import jsonlines
import numpy as np
from MuSeRC import MuSeRCMetrics, MuSeRC_metrics

def MuSeRC_metrics(pred, labels):
    metrics = MuSeRCMetrics()
    em = metrics.exact_match_simple(pred, labels)
    em0 = metrics.exact_match_metrics_origin(pred, labels, 0)
    f1 = metrics.per_dataset_metric(pred, labels)
    f1a = f1[-1]
    return em0, f1a

Measures = MuSeRCMetrics

def eval_MuSeRC(train_path, val_path, test_path, vect):
    test_score, test_pred = eval_part_MuSeRC(test_path, vect)
    return None, {
        "train": eval_part_MuSeRC(train_path, vect)[0],
        "val": eval_part_MuSeRC(val_path, vect)[0],
        "test": test_score,
        "test_pred": test_pred
    }

def get_stats_MuSeRC(path, vect):
    with jsonlines.open(path) as reader:
        lines = list(reader)
    labels = []
    for row in lines:
        _, _labels, __ = get_row_pred_MuSeRC(row, vect)
        labels.extend(_labels)
    labels = [item for sublist in labels for item in sublist]
    MAJOR_LABEL = max(labels, key=labels.count)
    labels = pd.Series(labels)
    frequences = dict(labels.value_counts(normalize=True))
    OPTIONS = []
    PROBS = []
    for key, value in frequences.items():
        OPTIONS.append(key)
        PROBS.append(value)
    return MAJOR_LABEL, OPTIONS, PROBS
    

def eval_part_MuSeRC(path, vect):
    with jsonlines.open(path) as reader:
        lines = list(reader)
    preds = []
    labels = []
    res = []
    for row in lines:
        pred, lbls, res_ids = get_row_pred_MuSeRC(row, vect)
        preds.extend(pred)
        labels.extend(lbls)
        res.append(res_ids)
    return MuSeRC_metrics(preds, labels), res


def get_row_pred_MuSeRC(row, vect):
    res = []
    labels = []
    res_ids = {"idx": row["idx"], "passage": {"questions": []}}
    for line in row["passage"]["questions"]:
        res_line = {"idx": line["idx"], "answers": []}
        line_answers = []
        line_labels = []
        for answ in line["answers"]:
            line_labels.append(answ.get("label", 0))
            answ = f"{line['question']} {answ['text']}"
            line_answers.append(answ)

        if MODE == 'MAJOR':
            pred  = [0] * len(line["answers"])
        elif MODE == 'RANDOM':
            pred = list(np.random.choice(OPTIONS, size=len(line['answers'])))
        elif MODE == 'RB':
            pred = list(np.random.choice(OPTIONS, size=len(line['answers']), p=PROBS))
        else:
            size = np.random.choice(np.arange(1, 5), size=1)[0]
            if size > len(line_answers):
                pred = [1] * len(line_answers)
            else:
                pred = np.random.choice(np.arange(len(line_answers)),
                                        size=size,
                                        replace=False)
            pred = [int(idx in pred) for idx in range(len(line["answers"]))]
        res.append(pred)
        labels.append(line_labels)
        for answ, p in zip(line["answers"], pred):
            res_line["answers"].append({"idx": answ["idx"], "label": p})
        res_ids["passage"]["questions"].append(res_line)
    return res, labels, res_ids

train_path = "data/MuSeRC/train.jsonl"
val_path = "data/MuSeRC/val.jsonl"
test_path = "data/MuSeRC/test.jsonl"

In [29]:
MODE = 'MAJOR'
MAJOR_LABEL, OPTIONS, PROBS = get_stats_MuSeRC(train_path, 'No vect')
MAJOR_LABEL, OPTIONS, PROBS

(0, [0, 1], [0.5496234309623431, 0.4503765690376569])

In [42]:
MODE = 'MAJOR'
em_metrics = []
f1a_metrics = []

for i in range(3):
    _, MuSeRC_scores = eval_MuSeRC(train_path, val_path, test_path, 'No vect')
    em = MuSeRC_scores["val"][0]
    f1a = MuSeRC_scores["val"][1]
    em_metrics.append(em)
    f1a_metrics.append(f1a)

print(f"Major Class")
print(f"Average EM score over 3 experiments: {np.array(em_metrics).mean()}")
print(f"Average F1-a score over 3 experiments: {np.array(f1a_metrics).mean()}")

Major Class
Average EM score over 3 experiments: 0.0
Average F1-a score over 3 experiments: 0.0


In [43]:
MODE = 'RANDOM'
em_metrics = []
f1a_metrics = []

for i in range(3):
    _, MuSeRC_scores = eval_MuSeRC(train_path, val_path, test_path, 'No vect')
    em = MuSeRC_scores["val"][0]
    f1a = MuSeRC_scores["val"][1]
    em_metrics.append(em)
    f1a_metrics.append(f1a)

print(f"Random Choice")
print(f"Average EM score over 3 experiments: {np.array(em_metrics).mean()}")
print(f"Average F1-a score over 3 experiments: {np.array(f1a_metrics).mean()}")

Random Choice
Average EM score over 3 experiments: 0.06238185255198488
Average F1-a score over 3 experiments: 0.4690999401921761


In [44]:
MODE = 'RB'
em_metrics = []
f1a_metrics = []

for i in range(3):
    _, MuSeRC_scores = eval_MuSeRC(train_path, val_path, test_path, 'No vect')
    em = MuSeRC_scores["val"][0]
    f1a = MuSeRC_scores["val"][1]
    em_metrics.append(em)
    f1a_metrics.append(f1a)

print(f"Random Balanced Choice")
print(f"Average EM score over 3 experiments: {np.array(em_metrics).mean()}")
print(f"Average F1-a score over 3 experiments: {np.array(f1a_metrics).mean()}")

Random Balanced Choice
Average EM score over 3 experiments: 0.074354127284184
Average F1-a score over 3 experiments: 0.4376949078560884


In [45]:
MODE = 'RANDOM_OLD'
em_metrics = []
f1a_metrics = []

for i in range(3):
    _, MuSeRC_scores = eval_MuSeRC(train_path, val_path, test_path, 'No vect')
    em = MuSeRC_scores["val"][0]
    f1a = MuSeRC_scores["val"][1]
    em_metrics.append(em)
    f1a_metrics.append(f1a)

print(f"Random Choice")
print(f"Average EM score over 3 experiments: {np.array(em_metrics).mean()}")
print(f"Average F1-a score over 3 experiments: {np.array(f1a_metrics).mean()}")

Random Choice
Average EM score over 3 experiments: 0.07939508506616257
Average F1-a score over 3 experiments: 0.4998549551984078


# RuCoS

In [None]:
%%capture
%%bash
# change url if you want to work with a different RSG dataset
wget -q "https://russiansuperglue.com/tasks/download/RuCoS" -O temp.zip
unzip temp.zip -d data

# remove unnecessary directories and files
rm temp.zip
rm -r data/__MACOSX
rm -r sample_data/

In [None]:
%%capture
%%bash
wget -q "https://github.com/RussianNLP/RussianSuperGLUE/raw/master/tfidf_baseline/RuCoS.py" -O RuCoS.py

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import jsonlines
import numpy as np
from collections import Counter
import string
import re
import sys
import numpy as np
from RuCoS import normalize_answer, f1_score, exact_match_score, metric_max_over_ground_truths


def evaluate(dataset, predictions):
    f1 = exact_match = total = 0
    correct_ids = []
    for prediction, passage in zip(predictions, dataset):
        prediction = prediction["label"]
        for qa in passage['qas']:
            total += 1
            ground_truths = list(map(lambda x: x['text'], qa.get("answers", "")))

            _exact_match = metric_max_over_ground_truths(exact_match_score, prediction, ground_truths)
            if int(_exact_match) == 1:
                correct_ids.append(qa['idx'])
            exact_match += _exact_match

            f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths)

    exact_match = exact_match / total
    f1 = f1 / total
    return exact_match, f1


def eval_RuCoS(train_path, val_path, test_path, vect):
    test_score, test_pred = eval_part(test_path, vect)
    return None, {
        "train": eval_part(train_path, vect)[0],
        "val": eval_part(val_path, vect)[0],
        "test": test_score,
        "test_pred": test_pred
    }


def eval_part(path, vect):
    with jsonlines.open(path) as reader:
        lines = list(reader)
    preds = []
    for row in lines:
        pred = get_row_pred(row, vect)
        preds.append({
            "idx": row["idx"],
            "label": pred
        })
    return evaluate(lines, preds), preds


def get_row_pred(row, vect):
    res = []
    words = [
        row["passage"]["text"][x["start"]: x["end"]]
        for x in row["passage"]["entities"]]
    for line in row["qas"]:
        line_candidates = []
        for word in words:
            line_candidates.append(line["query"].replace("@placeholder", word))
        
        
        pred_idx = np.random.choice(np.arange(1, len(line_candidates)),
                                    size=1)[0]
        pred = np.array(words)[pred_idx]      
        res.append(pred)
    return " ".join(res)

train_path = "data/RuCoS/train.jsonl"
val_path = "data/RuCoS/val.jsonl"
test_path = "data/RuCoS/test.jsonl"

In [None]:
em_metrics = []
f1_metrics = []

for i in range(3):
    _, RuCoS_scores = eval_RuCoS(train_path, val_path, test_path, 'No vect')
    em = RuCoS_scores['val'][0]
    f1 = RuCoS_scores['val'][1]
    em_metrics.append(em)
    f1_metrics.append(f1)

print(f"Random Choice")
print(f"Average EM score over 3 experiments: {np.array(em_metrics).mean()}")
print(f"Average F1 score over 3 experiments: {np.array(f1_metrics).mean()}")

Random Choice
Average EM score over 3 experiments: 0.23267784083410323
Average F1 score over 3 experiments: 0.23464871761031195
