<a href="https://colab.research.google.com/github/tatiana-iazykova/2020_HACK_RUSSIANSUPERGLUE/blob/main/generate_json.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
# Download the task archive from russiansuperglue.com and unpack it into the working directory.
!wget https://russiansuperglue.com/tasks/download
!unzip download
# Remove the archive itself plus two directories that are not used afterwards.
!rm download
!rm -r /content/__MACOSX
!rm -r sample_data/

In [None]:
from pathlib import Path
# Relative path of the "combined" data folder.
# NOTE(review): later cells spell the location out as '/content/combined/...' instead of using this variable.
data_dir = Path("combined/")

In [None]:
# Download base.py from the project repository; later cells import BaseSolverSubmit / BaseSolver from it.
!wget -q --show-progress "https://raw.githubusercontent.com/tatiana-iazykova/2020_HACK_RUSSIANSUPERGLUE/main/base.py" -O base.py



In [None]:
import pandas as pd

class JSONL_handler():
    """Thin wrapper around the location of a JSON Lines file."""

    def __init__(self, path):
        # Location of the .jsonl file that to_pandas() reads.
        self.path = path

    def to_pandas(self):
        """Parse the file at ``self.path`` as line-delimited JSON and return it as a DataFrame."""
        frame = pd.read_json(self.path, lines=True)
        return frame

# Baseline

In [None]:
# Folder that receives the random-baseline submission files.
output_dir = Path("random_submission")
# pathlib instead of `!mkdir` so that re-running the cell does not fail when the folder already exists.
output_dir.mkdir(exist_ok=True)

In [None]:
# Folder that receives the majority-class submission files.
output_dir_majority = Path("majority_submission")
# pathlib instead of `!mkdir` so that re-running the cell does not fail when the folder already exists.
output_dir_majority.mkdir(exist_ok=True)

In [None]:
# Folder that receives the weighted-random submission files.
output_dir_random_weighted = Path("random_weighted_submission")
# pathlib instead of `!mkdir` so that re-running the cell does not fail when the folder already exists.
output_dir_random_weighted.mkdir(exist_ok=True)

In [None]:
from base import BaseSolverSubmit
import json

class Random_submission():
  """Write baseline submission files for one task.

  Each ``get_scores_*`` method builds a ``BaseSolverSubmit``, asks it for
  predictions with one of its strategies (``random_choice``,
  ``majority_class``, ``random_balanced_choice``), stores them in
  ``self.scores`` and writes ``<dataset>.jsonl`` into the matching
  module-level output directory.

  Args:
    dataset: task folder name under ``/content/combined/``.
    path: train file; defaults to ``/content/combined/<dataset>/train.jsonl``.
    path_valid: validation file; defaults to ``/content/combined/<dataset>/val.jsonl``.
    path_test: test file; defaults to ``/content/combined/<dataset>/test.jsonl``.
  """

  def __init__(self, dataset, path = None, path_valid = None, path_test = None):
    self.dataset = dataset
    self.path = '/content/combined/' + dataset + '/train.jsonl' if path is None else path
    self.path_valid = '/content/combined/' + dataset + '/val.jsonl' if path_valid is None else path_valid
    self.path_test = '/content/combined/' + dataset + '/test.jsonl' if path_test is None else path_test

  def test_output(self):
    """Pair every test ``idx`` with the prediction at the same position in ``self.scores``.

    Returns:
      A list of ``{"idx": ..., "label": ...}`` dicts; labels are lower-cased strings.
      ``self.scores`` must have been set by one of the ``get_scores_*`` methods.
    """
    test = JSONL_handler(self.path_test).to_pandas()
    test_pred = [{"idx": idx, "label": str(label).lower()} for idx, label in zip(test.idx, self.scores)]
    return test_pred

  def _predict_and_save(self, strategy, target_dir):
    """Run the solver method named ``strategy`` and write the result into ``target_dir``."""
    solver = BaseSolverSubmit(path = self.path, path_valid = self.path_valid, path_test = self.path_test)
    # Size the prediction list from the test file itself: test_output() zips it with the
    # test idx column, so a shorter list would silently drop test rows from the submission.
    n_test = len(JSONL_handler(self.path_test).to_pandas())
    self.scores = getattr(solver, strategy)(n_test)
    filename = self.dataset + ".jsonl"
    self.save_output(self.test_output(), target_dir / filename)

  def get_scores_random(self):
    """Predict with ``random_choice`` and save into ``output_dir``."""
    self._predict_and_save("random_choice", output_dir)

  def get_scores_majority(self):
    """Predict with ``majority_class`` and save into ``output_dir_majority``."""
    self._predict_and_save("majority_class", output_dir_majority)

  def get_scores_random_weighted(self):
    """Predict with ``random_balanced_choice`` and save into ``output_dir_random_weighted``."""
    self._predict_and_save("random_balanced_choice", output_dir_random_weighted)

  def save_output(self, data, path):
    """Write ``data`` to ``path`` as JSON Lines, ordered by integer ``idx``.

    Each dict's ``idx`` is converted to ``int`` in place before it is written.
    """
    # Explicit UTF-8: ensure_ascii=False may emit non-ASCII characters.
    with open(path, mode="w", encoding="utf-8") as file:
        for line in sorted(data, key=lambda x: int(x.get("idx"))):
            line["idx"] = int(line["idx"])
            file.write(f"{json.dumps(line, ensure_ascii=False)}\n")

## Datasets

### DaNetQA

In [None]:
# One Random_submission object writes all three baselines. The original cell called
# `majority` and `random_w`, which are never defined and raise NameError on a fresh kernel.
random = Random_submission('DaNetQA')
random.get_scores_random()
random.get_scores_majority()
random.get_scores_random_weighted()

### RCB

In [None]:
# One Random_submission object writes all three baselines. The original cell called
# `majority_RCB` and `random_w_RCB`, which are never defined and raise NameError on a fresh kernel.
random_RCB = Random_submission('RCB')
random_RCB.get_scores_random()
random_RCB.get_scores_majority()
random_RCB.get_scores_random_weighted()

### PARus

In [None]:
# One Random_submission object writes all three baselines. The original cell called
# `majority_PARus` and `random_w_PARus`, which are never defined and raise NameError on a fresh kernel.
random_PARus = Random_submission('PARus')
random_PARus.get_scores_random()
random_PARus.get_scores_majority()
random_PARus.get_scores_random_weighted()

### TERRa

In [None]:
# One Random_submission object writes all three baselines. The original cell called
# `majority_TERRa` and `random_w_TERRa`, which are never defined and raise NameError on a fresh kernel.
random_TERRa = Random_submission('TERRa')
random_TERRa.get_scores_random()
random_TERRa.get_scores_majority()
random_TERRa.get_scores_random_weighted()

### RUSSE

In [None]:
# One Random_submission object writes all three baselines. The original cell called
# `majority_RUSSE` and `random_w_RUSSE`, which are never defined and raise NameError on a fresh kernel.
random_RUSSE = Random_submission('RUSSE')
random_RUSSE.get_scores_random()
random_RUSSE.get_scores_majority()
random_RUSSE.get_scores_random_weighted()

### RWSD

In [None]:
# One Random_submission object writes all three baselines. The original cell called
# `majority_RWSD` and `random_w_RWSD`, which are never defined and raise NameError on a fresh kernel.
random_RWSD = Random_submission('RWSD')
random_RWSD.get_scores_random()
random_RWSD.get_scores_majority()
random_RWSD.get_scores_random_weighted()

### LiDiRus

In [None]:
# LiDiRus is scored with TERRa train/val files and its own LiDiRus.jsonl as the test file.
# One object writes all three baselines. The original cell called `majority_LiDiRus` and
# `random_w_LiDiRus`, which are never defined and raise NameError on a fresh kernel.
random_LiDiRus = Random_submission('LiDiRus', path = '/content/combined/TERRa/train.jsonl', path_valid='/content/combined/TERRa/val.jsonl',
                                   path_test = '/content/combined/LiDiRus/LiDiRus.jsonl')
random_LiDiRus.get_scores_random()
random_LiDiRus.get_scores_majority()
random_LiDiRus.get_scores_random_weighted()

# Optimised Tfidf

In [None]:
# Folder that receives the TF-IDF submission files.
output_dir_tfidf = Path("tfidf_submission")
# pathlib instead of `!mkdir` so that re-running the cell does not fail when the folder already exists.
output_dir_tfidf.mkdir(exist_ok=True)

In [None]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

In [None]:
%%capture
# Install jsonlines and download the RussianSuperGLUE TF-IDF baseline modules for MuSeRC and RuCoS.
!pip3 install jsonlines
!wget -q --show-progress "https://github.com/RussianNLP/RussianSuperGLUE/raw/master/tfidf_baseline/MuSeRC.py" -O MuSeRC.py
!wget -q --show-progress "https://github.com/RussianNLP/RussianSuperGLUE/raw/master/tfidf_baseline/RuCoS.py" -O RuCoS.py
# Import the two downloaded modules.
import MuSeRC
import RuCoS

In [None]:
def unite(path1, path2):
  """Load two JSONL files and stack their rows, first file first.

  The original row indices of both frames are kept (no re-indexing).
  """
  frames = [JSONL_handler(source).to_pandas() for source in (path1, path2)]
  return pd.concat(frames)

In [None]:
import json

class Tfidf_Submisssion():
  """Write predictions of a TF-IDF model as a submission file.

  Args:
    test: object with an ``idx`` attribute (the test DataFrame); read only by ``test_output``.
    predictions: iterable of labels aligned with ``test.idx``.
    filename: output file name without the ``.jsonl`` extension.
  """

  def __init__(self, test, predictions, filename):
    self.test = test
    self.predictions = predictions
    self.filename = filename + '.jsonl'

  def test_output(self):
    """Pair test ``idx`` values with predictions and save them under ``output_dir_tfidf``."""
    test_pred = [{"idx": idx, "label": str(label).lower()} for idx, label in zip(self.test.idx, self.predictions)]
    self.save_output(test_pred, output_dir_tfidf / self.filename)

  def save_output(self, data, path):
    """Write ``data`` to ``path`` as JSON Lines, ordered by integer ``idx``.

    Each dict's ``idx`` is converted to ``int`` in place before it is written.
    """
    # Explicit UTF-8: ensure_ascii=False writes non-ASCII text verbatim, which fails
    # under a non-UTF-8 default locale encoding.
    with open(path, mode="w", encoding="utf-8") as file:
        for line in sorted(data, key=lambda x: int(x.get("idx"))):
            line["idx"] = int(line["idx"])
            file.write(f"{json.dumps(line, ensure_ascii=False)}\n")

## Datasets

### RCB

In [None]:
# Training data is the train and val files stacked together; the test file is loaded on its own.
RCB_train = unite('/content/combined/RCB/train.jsonl', '/content/combined/RCB/val.jsonl')
RCB_test = JSONL_handler('/content/combined/RCB/test.jsonl').to_pandas()

In [None]:
# Word-level TF-IDF (at most 10,000 features) -> dense array -> SGD classifier on the hypothesis text.
# NOTE(review): scikit-learn 1.3+ no longer accepts loss="log" (renamed "log_loss") — confirm the installed version.
steps_RCB = [('tfidf', TfidfVectorizer(analyzer='word', max_features=10000)),
          # toarray() yields a plain ndarray; todense() returns np.matrix, which recent scikit-learn rejects.
          ('func', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)),
         ('sgd', SGDClassifier(loss="log", n_jobs=-1, alpha=0.00001, class_weight='balanced', random_state=42))]

pipeline_RCB = Pipeline(steps_RCB)

pipeline_RCB.fit(RCB_train.hypothesis, RCB_train.label)
y_pred_RCB = pipeline_RCB.predict(RCB_test.hypothesis)

In [None]:
# Pair the RCB test idx values with the predictions and write them to RCB.jsonl.
tfidf_rcb = Tfidf_Submisssion(RCB_test, y_pred_RCB, 'RCB')
tfidf_rcb.test_output()

### RWSD


In [None]:
class JSONL_handler_2():
    """Reads a JSON Lines file with nested objects into a flat DataFrame without ``idx``."""

    def __init__(self, path):
        self.path = path # path to jsonl file

    def to_pandas(self):
        """Return the file content as a DataFrame.

        Nested objects are flattened by ``pd.json_normalize`` and the ``idx``
        column is dropped (a missing ``idx`` column raises ``KeyError``).
        """
        data = self.read_jsonlines()

        return pd.json_normalize(data).drop(columns=['idx'])

    def read_jsonlines(self):
        """Return a list with one parsed JSON object per line of the file."""
        # JSON text is read as UTF-8 explicitly instead of relying on the locale default.
        with open(self.path, encoding="utf-8") as f:
            return [json.loads(line) for line in f]

In [None]:
# JSONL_handler_2 flattens the nested fields into dotted column names and drops `idx`.
# Training data is train + val stacked together.
RWSD_train = pd.concat([JSONL_handler_2('/content/combined/RWSD/train.jsonl').to_pandas(), JSONL_handler_2('/content/combined/RWSD/val.jsonl').to_pandas()])
RWSD_test = JSONL_handler_2('/content/combined/RWSD/test.jsonl').to_pandas()

In [None]:
# Second copy of the test set read with the plain handler, which keeps the `idx` column.
RWSD_test_1 = JSONL_handler('/content/combined/RWSD/test.jsonl').to_pandas()

In [None]:
# Character n-gram (1-3, within word boundaries) TF-IDF followed by class-balanced logistic regression.
steps_RWSD  = [('tfidf', TfidfVectorizer(analyzer= 'char_wb',max_df= 0.8999999999999999, min_df=0.001, ngram_range=(1, 3))),
         ('logreg', LogisticRegression( C = 1.01, class_weight='balanced'))]

# Model input: the text and both span texts joined with a literal "<sep>" marker.
rwsd_tr = RWSD_train.assign(merged=lambda x: x['text'] + "<sep>" + x['target.span1_text'] + "<sep>" + x['target.span2_text'])
rwsd_df = RWSD_test.assign(merged=lambda x: x['text'] + "<sep>" + x['target.span1_text'] + "<sep>" + x['target.span2_text'])

pipeline_RWSD = Pipeline(steps_RWSD)

pipeline_RWSD.fit(rwsd_tr.merged, rwsd_tr.label)
y_pred_RWSD = pipeline_RWSD.predict(rwsd_df.merged)

In [None]:
# RWSD_test_1 supplies the idx column that the submission rows are keyed on.
tfidf_rwsd = Tfidf_Submisssion(RWSD_test_1, y_pred_RWSD, 'RWSD')
tfidf_rwsd.test_output()

### MuSeRC

In [None]:
# MuSeRC file locations, relative to the working directory.
train_path = "combined/MuSeRC/train.jsonl"
val_path = "combined/MuSeRC/val.jsonl"
test_path = "combined/MuSeRC/test.jsonl"

# Only the training split is loaded here; it is used to fit the vectorizer.
muserc = JSONL_handler(train_path).to_pandas()

def extract_passages(row):
    """Return the value stored under 'text' in a passage mapping (``None`` when the key is absent)."""
    passage_text = row.get('text')
    return passage_text

# Pull the raw passage text out of each `passage` object into its own column.
muserc['text'] = muserc['passage'].apply(extract_passages)

In [None]:
%%capture
# These parameters showed the highest result during tryouts.
vect = TfidfVectorizer(ngram_range=(1, 3), analyzer='char_wb', max_df = 0.8, max_features=5000)
# Fitted on the passages only; it is then used for cosine similarity with question+answer pairs.
vect.fit(muserc.text)

In [None]:
# Run the downloaded MuSeRC baseline with the fitted vectorizer; only the second return value is kept.
_, MuSeRC_scores = MuSeRC.eval_MuSeRC(train_path, val_path, test_path, vect)

In [None]:
# Presumably the predictions for the test split — confirm against MuSeRC.py.
scores_MuSeRC = MuSeRC_scores["test_pred"]

In [None]:
# Only save_output is used: the baseline's predictions are written as they are, so the first
# constructor argument (a path here, not a DataFrame) is never read.
tfidf_muserc = Tfidf_Submisssion(test_path, scores_MuSeRC, 'MuSeRC')
tfidf_muserc.save_output(scores_MuSeRC, output_dir_tfidf / 'MuSeRC.jsonl')

### RuCoS


In [None]:
class JSONL_handler_1():
    """Reads a JSON Lines file of passages with questions into two DataFrames."""

    def __init__(self, path):
        self.path = path # path to jsonl file

    def to_pandas(self):
        """Return ``(text_df, questions_df)`` built from the file.

        ``text_df`` has one row per line with columns ``text`` and ``entities``.
        ``questions_df`` has one row per question with columns ``text_id``
        (0-based line number of the passage), ``question`` and ``answers``.
        """
        # Rows are collected in lists and turned into frames once: DataFrame.append
        # copied the whole frame on every call and no longer exists in pandas 2.x.
        text_rows = []
        question_rows = []

        for passage_id, line in enumerate(self.yield_lines()):
            text, entities, questions = self.split_text_and_questions(line)
            text_rows.append({'text': text, 'entities': entities})
            for question in questions:
                question_rows.append({'text_id': passage_id,
                                      'question': question['query'],
                                      'answers': question['answers']})

        # Explicit column lists keep the layout stable even for an empty file.
        text_df = pd.DataFrame(text_rows, columns=['text', 'entities'])
        questions_df = pd.DataFrame(question_rows,
                                    columns=['text_id', 'question', 'answers'])
        return text_df, questions_df

    def yield_lines(self):
        """ yields json lines one by one """
        # JSON text is read as UTF-8 explicitly instead of relying on the locale default.
        with open(self.path, encoding="utf-8") as f:
            for line in f:
                yield json.loads(line)

    def split_text_and_questions(self, line):
        """Split one parsed line into ``(passage text, passage entities, list of questions)``."""
        text = line['passage']['text']
        entities = line['passage']['entities']
        questions = line['qas']

        return text, entities, questions

In [None]:
# Split the RuCoS train and val files into a passage table and a question table each.
train = JSONL_handler_1('/content/combined/RuCoS/train.jsonl')
texts_train, questions_train = train.to_pandas()
valid = JSONL_handler_1('/content/combined/RuCoS/val.jsonl')
texts_valid, questions_valid = valid.to_pandas()

In [None]:
# RuCoS file locations, passed to the downloaded RuCoS baseline further down.
train_path_RuCos = "/content/combined/RuCoS/train.jsonl"
val_path_RuCos = "/content/combined/RuCoS/val.jsonl"
test_path_RuCos = "/content/combined/RuCoS/test.jsonl"

In [None]:
def fill_entities(text, entities):
    """Add a 'text' key to every entity dict, holding ``text[start:end]`` for that entity.

    The dicts are modified in place; nothing is returned.
    """
    for span in entities:
        begin, stop = span['start'], span['end']
        span['text'] = text[begin:stop]

# Attach the covered substring to every entity of every training passage (entity dicts are edited in place).
for idx, row in texts_train.iterrows():
    fill_entities(row.text, row.entities)

# Same for the validation passages.
for idx, row in texts_valid.iterrows():
    fill_entities(row.text, row.entities)

In [None]:
# Character n-gram (1-2, within word boundaries) TF-IDF fitted on the RuCoS training passages.
vec1 = TfidfVectorizer(ngram_range=(1, 2), analyzer='char_wb', max_df = 0.95)
vec1.fit(texts_train.text)

TfidfVectorizer(analyzer='char_wb', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.95, max_features=None,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [None]:
# Run the downloaded RuCoS baseline with the fitted vectorizer; only the second return value is kept.
_, RuCoS_scores = RuCoS.eval_RuCoS(train_path_RuCos, val_path_RuCos, test_path_RuCos, vec1)

In [None]:
# Presumably the predictions for the test split — confirm against RuCoS.py.
scores_RuCoS = RuCoS_scores["test_pred"]

In [None]:
# Only save_output is used: the baseline's predictions are written as they are, so the first
# constructor argument (a path here, not a DataFrame) is never read.
tfidf_rucos = Tfidf_Submisssion(test_path_RuCos, scores_RuCoS, 'RuCoS')
tfidf_rucos.save_output(scores_RuCoS, output_dir_tfidf / 'RuCoS.jsonl')

### TERRa

In [None]:
# Training data is the train and val files stacked together; the test file is loaded on its own.
TERRa_train = unite('/content/combined/TERRa/train.jsonl', '/content/combined/TERRa/val.jsonl')
TERRa_test = JSONL_handler('/content/combined/TERRa/test.jsonl').to_pandas()

In [None]:
# Character n-gram (2-4, within word boundaries) counts followed by a class-balanced SGD classifier,
# trained on the hypothesis text only.
# NOTE(review): scikit-learn 1.3+ no longer accepts loss="log" (renamed "log_loss") — confirm the installed version.
steps_TERRa = [('countvect', CountVectorizer(min_df=15, max_df=0.4, lowercase=True, analyzer ='char_wb', decode_error = 'ignore', ngram_range = (2, 4))),
         ('sgd', SGDClassifier(alpha = 1e-08, loss="log", n_jobs=-1, class_weight='balanced', random_state=42))]

pipeline_TERRa = Pipeline(steps_TERRa)

pipeline_TERRa.fit(TERRa_train.hypothesis, TERRa_train.label)
y_pred_TERRa = pipeline_TERRa.predict(TERRa_test.hypothesis)

In [None]:
# Pair the TERRa test idx values with the predictions and write them to TERRa.jsonl.
tfidf_terra = Tfidf_Submisssion(TERRa_test, y_pred_TERRa, 'TERRa')
tfidf_terra.test_output()

### DaNetQA

In [None]:
# Training data is the train and val files stacked together; the test file is loaded on its own.
DaNetQA_train = unite('/content/combined/DaNetQA/train.jsonl', '/content/combined/DaNetQA/val.jsonl')
DaNetQA_test = JSONL_handler('/content/combined/DaNetQA/test.jsonl').to_pandas()

In [None]:
# TF-IDF with default settings followed by a class-balanced SGD classifier, trained on the question text only.
# NOTE(review): scikit-learn 1.3+ no longer accepts loss="log" (renamed "log_loss") — confirm the installed version.
steps_DaNetQA = [('vectorizer', TfidfVectorizer()),
              ('sgd', SGDClassifier(loss="log", n_jobs=-1, alpha=0.15, class_weight='balanced', random_state=42))]

pipeline_DaNetQA = Pipeline(steps_DaNetQA)

pipeline_DaNetQA.fit(DaNetQA_train.question, DaNetQA_train.label)
y_pred_DaNetQA = pipeline_DaNetQA.predict(DaNetQA_test.question)

In [None]:
# The file name now matches the task name used everywhere else in this notebook
# ("DaNetQA", previously misspelled "DaNetQa"), so the output is DaNetQA.jsonl.
tfidf_danetqa = Tfidf_Submisssion(DaNetQA_test, y_pred_DaNetQA, 'DaNetQA')
tfidf_danetqa.test_output()

### RUSSE

In [None]:
# Training data is the train and val files stacked together; the test file is loaded on its own.
RUSSE_train = unite('/content/combined/RUSSE/train.jsonl', '/content/combined/RUSSE/val.jsonl')
RUSSE_test = JSONL_handler('/content/combined/RUSSE/test.jsonl').to_pandas()

In [None]:
def build_feature_RUSSE(row):
    """Join the stripped ``sentence1``, ``sentence2`` and ``word`` fields of a row with single spaces."""
    pieces = [row[field].strip() for field in ("sentence1", "sentence2", "word")]
    return " ".join(pieces)

# Build the concatenated feature string for every training row.
train_concat = []
for i, row in RUSSE_train.iterrows():
    train_concat.append(build_feature_RUSSE(row))
RUSSE_train['concatenated'] = train_concat

# Same for the test rows (the list is called valid_concat but holds test features).
valid_concat = []
for i, row in RUSSE_test.iterrows():
    valid_concat.append(build_feature_RUSSE(row))
RUSSE_test['concatenated'] =  valid_concat

In [None]:
# Word uni-/bigram TF-IDF followed by class-balanced logistic regression on the concatenated feature.
steps_RUSSE = [('tfidf', TfidfVectorizer(analyzer = 'word', max_df = 0.6, min_df= 0.001, ngram_range =  (1,2))),
         ('logreg', LogisticRegression(C = 1.01, class_weight='balanced'))]

pipeline_RUSSE = Pipeline(steps_RUSSE)

pipeline_RUSSE.fit(RUSSE_train.concatenated, RUSSE_train.label)
y_pred_RUSSE = pipeline_RUSSE.predict(RUSSE_test.concatenated)

In [None]:
# Pair the RUSSE test idx values with the predictions and write them to RUSSE.jsonl.
tfidf_russe = Tfidf_Submisssion(RUSSE_test, y_pred_RUSSE, 'RUSSE')
tfidf_russe.test_output()

### PARus

In [None]:
# Training data is the train and val files stacked together; the test file is loaded on its own.
PARus_train = unite('/content/combined/PARus/train.jsonl', '/content/combined/PARus/val.jsonl')
PARus_test = JSONL_handler('/content/combined/PARus/test.jsonl').to_pandas()

In [None]:
def build_feature_PARus(row):
    """Build one text feature from a PARus row.

    The result is ``"<premise> <question> <choice1> <choice2>"``, where the
    premise is converted to ``str`` and stripped, and the question is a fixed
    Russian sentence chosen by ``row["question"]`` ("cause" selects the
    cause question, any other value the result question).

    The label is not part of the feature, so it is no longer looked up; the
    row therefore only needs to support ``row[key]``.
    """
    premise = str(row["premise"]).strip()
    question = "Что было ПРИЧИНОЙ этого?" if row["question"] == "cause" else "Что случилось в РЕЗУЛЬТАТЕ?"
    return f"{premise} {question} {row['choice1']} {row['choice2']}"


# Build the concatenated feature string for every training row.
train_concat = []
for i, row in PARus_train.iterrows():
    train_concat.append(build_feature_PARus(row))
PARus_train['concatenated'] = train_concat

# Same for the test rows (the list is called valid_concat but holds test features).
valid_concat = []
for i, row in PARus_test.iterrows():
    valid_concat.append(build_feature_PARus(row))
PARus_test['concatenated'] =  valid_concat

In [None]:
# Word uni-/bigram TF-IDF followed by class-balanced logistic regression on the concatenated feature.
steps_PARus = [('tfidf', TfidfVectorizer(analyzer= 'word', max_df= 0.6, min_df= 0.04, ngram_range= (1, 2))),
         ('logreg', LogisticRegression(C = 1e-10, class_weight='balanced'))]

pipeline_PARus = Pipeline(steps_PARus)

pipeline_PARus.fit(PARus_train.concatenated, PARus_train.label)
y_pred_PARus = pipeline_PARus.predict(PARus_test.concatenated)

In [None]:
# Pair the PARus test idx values with the predictions and write them to PARus.jsonl.
tfidf_parus = Tfidf_Submisssion(PARus_test, y_pred_PARus, 'PARus')
tfidf_parus.test_output()

### LiDiRus

In [None]:
# The LiDiRus model is trained on TERRa train + val (premise and hypothesis joined by a newline)
# and applied to the LiDiRus file (sentence1 and sentence2 joined the same way).
LiDiRus_train = unite('/content/combined/TERRa/train.jsonl', '/content/combined/TERRa/val.jsonl').assign(merged=lambda x: x.premise + "\n" + x.hypothesis)
LiDiRus_test = JSONL_handler('/content/combined/LiDiRus/LiDiRus.jsonl').to_pandas().assign(merged=lambda x: x.sentence1 + "\n" + x.sentence2)

In [None]:
# Single-character TF-IDF (within word boundaries) followed by class-balanced logistic regression.
steps_LiDiRus = [('tfidf', TfidfVectorizer(analyzer= 'char_wb', max_df= 0.6, min_df=0.091, ngram_range = (1, 1))),
         ('logreg', LogisticRegression(C = 1.01, class_weight='balanced'))]

pipeline_LiDiRus = Pipeline(steps_LiDiRus)

pipeline_LiDiRus.fit(LiDiRus_train.merged, LiDiRus_train.label)
y_pred_LiDiRus = pipeline_LiDiRus.predict(LiDiRus_test.merged)

In [None]:
# Pair the LiDiRus idx values with the predictions and write them to LiDiRus.jsonl.
tfidf_lidirus = Tfidf_Submisssion(LiDiRus_test, y_pred_LiDiRus, 'LiDiRus')
tfidf_lidirus.test_output()

# Heuristics

In [None]:
# One output folder per fallback strategy of the heuristic solvers.
# pathlib instead of `!mkdir` so that re-running the cell does not fail when a folder already exists.
output_dir_heuristics_random = Path("heuristics_random_submission")
output_dir_heuristics_random.mkdir(exist_ok=True)
output_dir_heuristics_majority = Path("heuristics_majority_submission")
output_dir_heuristics_majority.mkdir(exist_ok=True)
output_dir_heuristics_rw = Path("heuristics_rw_submission")
output_dir_heuristics_rw.mkdir(exist_ok=True)

In [None]:
import json

class Heuristic_submission():
  """Write submission files produced by a heuristic solver for one task.

  The solver class is instantiated once in the constructor. Each
  ``get_scores_*`` method calls its ``heuristics_all`` with a different
  fallback (``random_choice``, ``majority_class``, ``random_balanced_choice``),
  stores the predictions in ``self.scores`` and writes ``<dataset>.jsonl``
  into the matching module-level output directory.

  Args:
    dataset: task folder name under ``/content/combined/``.
    solver: solver class, called as ``solver(path=..., path_valid=..., path_test=...)``.
    path: train file; defaults to ``/content/combined/<dataset>/train.jsonl``.
    path_valid: validation file; defaults to ``/content/combined/<dataset>/val.jsonl``.
    path_test: test file; defaults to ``/content/combined/<dataset>/test.jsonl``.
  """

  def __init__(self, dataset, solver, path = None, path_valid = None, path_test = None):
    self.dataset = dataset
    self.path = '/content/combined/' + dataset + '/train.jsonl' if path is None else path
    self.path_valid = '/content/combined/' + dataset + '/val.jsonl' if path_valid is None else path_valid
    self.path_test = '/content/combined/' + dataset + '/test.jsonl' if path_test is None else path_test
    self.solver = solver(path=self.path, path_valid= self.path_valid, path_test = self.path_test)

  def test_output(self):
    """Pair every test ``idx`` with the prediction at the same position in ``self.scores``.

    Returns:
      A list of ``{"idx": ..., "label": ...}`` dicts; labels are lower-cased strings.
    """
    test = JSONL_handler(self.path_test).to_pandas()
    test_pred = [{"idx": idx, "label": str(label).lower()} for idx, label in zip(test.idx, self.scores)]
    return test_pred

  def _score_and_save(self, fallback, target_dir):
    """Run ``heuristics_all`` with the solver method named ``fallback`` and save into ``target_dir``."""
    self.scores = self.solver.heuristics_all(final_decision=getattr(self.solver, fallback))
    filename = self.dataset + ".jsonl"
    self.save_output(self.test_output(), target_dir / filename)

  def get_scores_random(self):
    """Heuristics with ``random_choice`` as fallback; saved into ``output_dir_heuristics_random``."""
    self._score_and_save("random_choice", output_dir_heuristics_random)

  def get_scores_majority(self):
    """Heuristics with ``majority_class`` as fallback; saved into ``output_dir_heuristics_majority``."""
    self._score_and_save("majority_class", output_dir_heuristics_majority)

  def get_scores_random_weighted(self):
    """Heuristics with ``random_balanced_choice`` as fallback; saved into ``output_dir_heuristics_rw``."""
    self._score_and_save("random_balanced_choice", output_dir_heuristics_rw)

  def save_output(self, data, path):
    """Write ``data`` to ``path`` as JSON Lines, ordered by integer ``idx``.

    Each dict's ``idx`` is converted to ``int`` in place before it is written.
    """
    # Explicit UTF-8: ensure_ascii=False may emit non-ASCII characters.
    with open(path, mode="w", encoding="utf-8") as file:
        for line in sorted(data, key=lambda x: int(x.get("idx"))):
            line["idx"] = int(line["idx"])
            file.write(f"{json.dumps(line, ensure_ascii=False)}\n")

## Datasets

In [None]:
%%capture
# Packages imported by the heuristic solver cells below.
!pip install pymorphy2[fast]
!pip install razdel
!pip install natasha
!pip3 install jsonlines

In [None]:
%%capture
# Download the helper modules: LiDiRus.py from the RussianSuperGLUE TF-IDF baseline,
# utils.py and MuSeRCSolver.py from the project repository.
!wget "https://github.com/RussianNLP/RussianSuperGLUE/raw/master/tfidf_baseline/LiDiRus.py" -O LiDiRus.py
!wget "https://github.com/tatiana-iazykova/2020_HACK_RUSSIANSUPERGLUE/raw/main/utils.py" -O utils.py
!wget "https://github.com/tatiana-iazykova/2020_HACK_RUSSIANSUPERGLUE/raw/main/Solvers/MuSeRCSolver.py" -O MuSeRCSolver.py

In [None]:
import re
from pymorphy2 import MorphAnalyzer
import nltk
from functools import lru_cache
from base import BaseSolverSubmit
from scipy import stats
from string import punctuation
from razdel import tokenize as razdel_tokenize
from base import BaseSolver
from utils import RSG_MorphAnalyzer
from sklearn.metrics.pairwise import cosine_similarity
import jsonlines
import numpy as np
from collections import Counter
import string
import sys


# Shared pymorphy2 analyzer; it is the default `pymorphy` argument of the solvers' lemmatize_word methods.
m = MorphAnalyzer()

In [None]:
import numpy as np
from natasha import (
    Segmenter,
    MorphVocab,  
    NewsEmbedding,
    NewsMorphTagger,
    Doc
)

# natasha components shared by the notebook: ParusSolver.lemmatize segments, tags and lemmatizes with them.
segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)

In [None]:
import json
def save_output(data, path):
    """Write ``data`` to ``path`` as JSON Lines, ordered by integer ``idx``.

    Args:
        data: iterable of dicts, each with an ``idx`` key convertible to ``int``.
        path: destination file; it is overwritten.

    Each dict's ``idx`` is converted to ``int`` in place before it is written.
    """
    # Explicit UTF-8: ensure_ascii=False writes non-ASCII text verbatim, which fails
    # under a non-UTF-8 default locale encoding.
    with open(path, mode="w", encoding="utf-8") as file:
        for line in sorted(data, key=lambda x: int(x.get("idx"))):
            line["idx"] = int(line["idx"])
            file.write(f"{json.dumps(line, ensure_ascii=False)}\n")

### TERRa

In [None]:
class TERRaSolver(BaseSolverSubmit):
    """Rule-based TERRa classifier using lemma overlap and premise length.

    NOTE(review): the constructor forwards ``path_test`` before ``path_valid`` to the
    base class; the rows scored by ``heuristics_all`` are whatever the base class
    stores in ``self.valid`` — confirm against base.py.
    """

    # NOTE(review): the ``A-z`` range also matches the ASCII characters between "Z" and "a"
    # (brackets, backslash, caret, underscore, backtick), and the Cyrillic ranges do not
    # include "ё"/"Ё". The pattern is left unchanged to keep the predictions identical.
    _WORD_RE = re.compile("[А-Яа-яA-z]+")

    def __init__(self, path: str, path_valid=None, path_test=None):
        super(TERRaSolver, self).__init__(path, path_test, path_valid)

    def preprocess(self, columns):
        """Add a ``<column>_lemmas`` column (list of lemmas) to ``self.train`` and ``self.valid``."""
        for column in columns:
            self.train[f"{column}_lemmas"] = self.train[column].apply(self.clean_text)
            self.valid[f"{column}_lemmas"] = self.valid[column].apply(self.clean_text)

    def words_only(self, text):
        """Return the lower-cased letter runs of ``text``; ``[]`` when ``text`` is not a string."""
        try:
            return self._WORD_RE.findall(text.lower())
        except (AttributeError, TypeError):
            # Non-string cells (e.g. missing values) have no usable text.
            return []

    # The cache key includes ``self``, so the cache keeps a reference to the solver instance.
    @lru_cache(maxsize=128)
    def lemmatize_word(self, token, pymorphy=m):
        """Return the normal form of the first parse pymorphy gives for ``token``."""
        return pymorphy.parse(token)[0].normal_form

    def lemmatize_text(self, text):
        """Lemmatize every token of an iterable of tokens."""
        return [self.lemmatize_word(w) for w in text]

    def clean_text(self, text):
        """Tokenize ``text`` with ``words_only`` and return the list of lemmas."""
        tokens = self.words_only(text)
        lemmas = self.lemmatize_text(tokens)
        return lemmas

    def heuristics_all(self, final_decision=None):
        """Label every row of ``self.valid`` as 'entailment' or 'not_entailment'.

        Rules are tried in order; ``final_decision(test_size=1)[0]`` is used when
        none applies.

        Args:
            final_decision: callable accepting ``test_size`` and returning a sequence of labels.

        Returns:
            List of labels, one per row of ``self.valid``.
        """
        y_pred = []
        self.preprocess(columns=['premise', "hypothesis"])
        indic_non_ent = set(['только', 'мужчина'])

        for i, row in self.valid.iterrows():

            hyp = row.hypothesis.lower()
            hyp_lem = set(row['hypothesis_lemmas'])
            prem_lem = set(row['premise_lemmas'])
            premise_len = len(row['premise'].split())
            # Share of hypothesis lemmas that also occur in the premise. A hypothesis
            # without any word token counts as zero overlap instead of dividing by zero.
            overlap = len(prem_lem & hyp_lem) / len(hyp_lem) if hyp_lem else 0.0

            if hyp in row['premise'].lower():
                y_pred.append('entailment')
            elif overlap <= 1/3 or premise_len < 29 or len(indic_non_ent & hyp_lem) > 0:
                y_pred.append('not_entailment')
            elif overlap == 0.75 or overlap == 1 or overlap == 2/3:
                y_pred.append('entailment')
            elif premise_len > 32:
                y_pred.append('entailment')
            else:
                y_pred.append(final_decision(test_size=1)[0])

        return y_pred

In [None]:
# Build the TERRa solver once and write one submission per fallback strategy.
terra_heuristics = Heuristic_submission('TERRa', TERRaSolver)
terra_heuristics.get_scores_random()
terra_heuristics.get_scores_majority()
terra_heuristics.get_scores_random_weighted()

### DaNetQA

In [None]:
class DaNetQASolver(BaseSolverSubmit):
    """Rule-based yes/no classifier for DaNetQA driven by question wording and text lengths.

    NOTE(review): the constructor forwards ``path_test`` before ``path_valid`` to the
    base class — confirm the base-class signature in base.py.
    """

    def __init__(self, path: str, path_valid=None, path_test=None):
        super(DaNetQASolver, self).__init__(path, path_test, path_valid)

    def heuristics_all(self, final_decision=None):
        """Predict a boolean for every row of ``self.valid``.

        Keyword patterns in the lower-cased question are checked first, then the
        question and passage word counts; otherwise the label comes from
        ``final_decision(test_size=1)[0]``.
        """
        predictions = []

        for _, sample in self.valid.iterrows():
            q_text = sample.question.lower()
            q_words = len(q_text.split())
            p_words = len(sample.passage.split())

            if re.search("был|(^есть)", q_text):
                answer = True
            elif re.search("^входит|едят|правда ли", q_text):
                answer = False
            elif q_words > 5 or p_words >= 90:
                # Long questions and long passages are both answered with False.
                answer = False
            else:
                answer = final_decision(test_size=1)[0]
            predictions.append(answer)

        return predictions

In [None]:
# Build the DaNetQA solver once and write one submission per fallback strategy.
danetqa_heuristics = Heuristic_submission('DaNetQA', DaNetQASolver)
danetqa_heuristics.get_scores_random()
danetqa_heuristics.get_scores_majority()
danetqa_heuristics.get_scores_random_weighted()

### RCB

In [None]:
class RCBSolver(BaseSolverSubmit):
    """Rule-based RCB classifier using indicator lemmas, lemma overlap and text lengths.

    NOTE(review): the constructor forwards ``path_test`` before ``path_valid`` to the
    base class; the rows scored by ``heuristics_all`` are whatever the base class
    stores in ``self.valid`` — confirm against base.py.
    """

    # NOTE(review): the ``A-z`` range also matches the ASCII characters between "Z" and "a"
    # (brackets, backslash, caret, underscore, backtick), and the Cyrillic ranges do not
    # include "ё"/"Ё". The pattern is left unchanged to keep the predictions identical.
    _WORD_RE = re.compile("[А-Яа-яA-z]+")

    def __init__(self, path: str, path_valid=None, path_test=None):
        super(RCBSolver, self).__init__(path, path_test, path_valid)

    def preprocess(self, columns):
        """Add a ``<column>_lemmas`` column (list of lemmas) to ``self.train`` and ``self.valid``."""
        for column in columns:
            self.train[f"{column}_lemmas"] = self.train[column].apply(self.clean_text)
            self.valid[f"{column}_lemmas"] = self.valid[column].apply(self.clean_text)

    def words_only(self, text):
        """Return the lower-cased letter runs of ``text``; ``[]`` when ``text`` is not a string."""
        try:
            return self._WORD_RE.findall(text.lower())
        except (AttributeError, TypeError):
            # Non-string cells (e.g. missing values) have no usable text.
            return []

    # The cache key includes ``self``, so the cache keeps a reference to the solver instance.
    @lru_cache(maxsize=128)
    def lemmatize_word(self, token, pymorphy=m):
        """Return the normal form of the first parse pymorphy gives for ``token``."""
        return pymorphy.parse(token)[0].normal_form

    def lemmatize_text(self, text):
        """Lemmatize every token of an iterable of tokens."""
        return [self.lemmatize_word(w) for w in text]

    def clean_text(self, text):
        """Tokenize ``text`` with ``words_only`` and return the list of lemmas."""
        tokens = self.words_only(text)
        lemmas = self.lemmatize_text(tokens)
        return lemmas

    def heuristics_all(self, final_decision=None):
        """Label every row of ``self.valid`` as 'entailment', 'neutral' or 'contradiction'.

        Rules are tried in order; ``final_decision(test_size=1)[0]`` is used when
        none applies.

        Args:
            final_decision: callable accepting ``test_size`` and returning a sequence of labels.

        Returns:
            List of labels, one per row of ``self.valid``.
        """
        y_pred = []
        self.preprocess(columns=['premise', "hypothesis"])
        # NOTE(review): the first letter of the second entry looks like a Latin "c" rather
        # than the Cyrillic one; if so it cannot match a Cyrillic lemma — confirm.
        indic_neutral = set(['подозревать', 'cчитать', 'говорить', 'думать', 'надеяться', 'понять', 'уверять'])
        indic_ent = set(['признать'])

        for i, row in self.valid.iterrows():

            hyp = row.hypothesis.lower()
            hyp_lem = set(row['hypothesis_lemmas'])
            prem_lem = set(row['premise_lemmas'])
            hyp_len = len(row.hypothesis.split())
            # Share of hypothesis lemmas that also occur in the premise. A hypothesis
            # without any word token counts as zero overlap instead of dividing by zero.
            overlap = len(prem_lem & hyp_lem) / len(hyp_lem) if hyp_lem else 0.0

            if hyp in row['premise'].lower() and len(indic_ent & prem_lem) > 0:
                y_pred.append('entailment')
            elif overlap == 0.6:
                y_pred.append('entailment')
            elif len(indic_neutral & prem_lem) > 0:
                y_pred.append('neutral')
            elif hyp_len < 4:
                y_pred.append('contradiction')
            elif hyp_len >= 5 and hyp_len <= 7:
                y_pred.append('neutral')
            elif len(row['premise'].split()) > 15:
                y_pred.append('entailment')
            else:
                y_pred.append(final_decision(test_size=1)[0])

        return y_pred

In [None]:
# Build the RCB solver once and write one submission per fallback strategy.
rcb_heuristics = Heuristic_submission('RCB', RCBSolver)
rcb_heuristics.get_scores_random()
rcb_heuristics.get_scores_majority()
rcb_heuristics.get_scores_random_weighted()

### PARus

In [None]:
class ParusSolver(BaseSolverSubmit):
    """PARus solver that picks the choice sharing more lemmas with the premise.

    NOTE(review): the constructor forwards ``path_test`` before ``path_valid`` to the
    base class — confirm the base-class signature in base.py.
    """

    def __init__(self, path: str, path_valid=None, path_test=None):
        super(ParusSolver, self).__init__(path, path_test, path_valid)

    def preprocess(self, columns):
        """Store the lemma list of each given column in a new ``<column>_lemmas`` column
        of both ``self.train`` and ``self.valid``."""
        for name in columns:
            target = f"{name}_lemmas"
            self.train[target] = self.train[name].apply(self.lemmatize)
            self.valid[target] = self.valid[name].apply(self.lemmatize)

    def lemmatize(self, text):
        """
        Segment, tag and lemmatize a text with the module-level natasha components.
        param text: str
        return: List of lemmas (strings)
        """
        parsed = Doc(text)
        parsed.segment(segmenter)
        parsed.tag_morph(morph_tagger)

        lemmas = []
        for tok in parsed.tokens:
            tok.lemmatize(morph_vocab)
            lemmas.append(tok.lemma)
        return lemmas

    def heuristics_all(self, final_decision=None):
        """
        Choose, for every row of self.valid, the option that has more lemmas in
        common with the premise: 0 for choice1, 1 for choice2.
        When both options share the same number of lemmas, the prediction is
        final_decision(test_size=1)[0].
        param: final_decision (function)
        return: List of predictions
        """
        self.preprocess(columns=['premise', 'choice1', 'choice2'])
        predictions = []

        for _, sample in self.valid.iterrows():
            first_lemmas = set(sample.choice1_lemmas)
            second_lemmas = set(sample.choice2_lemmas)
            premise_lemmas = set(sample.premise_lemmas)
            shared_first = len(premise_lemmas & first_lemmas)
            shared_second = len(premise_lemmas & second_lemmas)

            if shared_first == shared_second:
                predictions.append(final_decision(test_size=1)[0])
            else:
                predictions.append(0 if shared_first > shared_second else 1)
        return predictions

In [None]:
# Build the PARus solver once and write one submission per fallback strategy.
parus_heuristics = Heuristic_submission('PARus', ParusSolver)
parus_heuristics.get_scores_random()
parus_heuristics.get_scores_majority()
parus_heuristics.get_scores_random_weighted()

### RUSSE

In [None]:
class RusseSolver(BaseSolverSubmit):
    """RUSSE solver based on the token overlap of the two sentences.

    NOTE(review): the constructor forwards ``path_test`` before ``path_valid`` to the
    base class — confirm the base-class signature in base.py.
    """

    def __init__(self, path: str, path_valid=None, path_test=None):
        super(RusseSolver, self).__init__(path, path_test, path_valid)

    def heuristics_all(self, final_decision=None):
        """Predict a label for every row of ``self.valid``.

        ``True`` is predicted when the shared whitespace tokens of ``sentence1``
        and ``sentence2`` make up more than 10% of all their distinct tokens.
        Otherwise ``final_decision(test_size=1)[0]`` is drawn three times and the
        most frequent value wins (the smallest value on a tie).

        Args:
            final_decision: callable accepting ``test_size`` and returning a sequence of labels.

        Returns:
            List of predictions, one per row of ``self.valid``.
        """
        y_pred = []

        for _, row in self.valid.iterrows():
            tokens1 = set(row.sentence1.split())
            tokens2 = set(row.sentence2.split())
            all_tokens = tokens1 | tokens2
            # Two empty sentences count as zero overlap instead of dividing by zero.
            overlap = len(tokens1 & tokens2) / len(all_tokens) if all_tokens else 0.0

            if overlap > 0.10:
                y_pred.append(True)
            else:
                votes = np.array([final_decision(test_size=1)[0] for _ in range(3)])
                # Majority vote via np.unique: values come back sorted and argmax takes
                # the first maximum, i.e. the smallest most frequent value. This replaces
                # stats.mode(...)[0][0], whose double indexing depends on the SciPy version.
                values, counts = np.unique(votes, return_counts=True)
                y_pred.append(values[np.argmax(counts)])
        return y_pred

In [None]:
# Build the RUSSE solver once and write one submission per fallback strategy.
russe_heuristics = Heuristic_submission('RUSSE', RusseSolver)
russe_heuristics.get_scores_random()
russe_heuristics.get_scores_majority()
russe_heuristics.get_scores_random_weighted()

### LiDiRus

In [None]:
class LiDiRusSolver(BaseSolver):
    """Rule-based solver for the LiDiRus sentence pairs.

    Each pair is compared on surface tokens and on lemmas. A small set of
    boolean heuristics then votes for "entailment" or "not_entailment". When no
    heuristic fires, the label comes from the ``final_decision`` callable given
    to :meth:`heuristics_all`.
    """

    def __init__(self, path: str, path_valid=None):
        self.e_words = {"чтобы", 'будет', "от", "он"}  # -> entailment
        self.ne_words = {'и', "не", "никогда", "вовсе", 'что', "это"}  # -> not_entailment
        self.morph = RSG_MorphAnalyzer()  # PyMorphy + caching
        super(LiDiRusSolver, self).__init__(path, path_valid)

    def preprocess(self):
        """Add the token and lemma columns the heuristics read to ``self.valid``.

        Creates ``sentence1_words`` / ``sentence2_words`` (whitespace split) and
        ``sentence1_lemmas`` / ``sentence2_lemmas`` (output of
        ``self.morph.lemmantize_sentences``).
        """
        # Dictionary for lemmas; the attribute keeps its original spelling so
        # any existing reader of `cashe` still finds it.
        self.cashe = {}
        self.valid["sentence1_words"] = self.valid['sentence1'].str.split()
        self.valid["sentence2_words"] = self.valid['sentence2'].str.split()
        self.valid["sentence1_lemmas"] = self.morph.lemmantize_sentences(self.valid.sentence1.to_list())
        self.valid["sentence2_lemmas"] = self.morph.lemmantize_sentences(self.valid.sentence2.to_list())

    def get_heuristics(self, non_intersect, intersect, non_intersect_lemmas, heuristic) -> dict:
        """Evaluate the heuristics for one sentence pair.

        param: non_intersect - tokens present in exactly one of the sentences
        param: intersect - tokens present in both sentences
        param: non_intersect_lemmas - lemmas present in exactly one sentence
        param: heuristic - ``None`` for all heuristics, or a one-item dict
            ``{"label": "heuristic name"}`` selecting a single one
        returns: ``{label: {heuristic name: bool}}``
        """
        heuristics = {
            "not_entailment": {
                "little overlap": len(non_intersect) > 10,

                # catches if there is an extra clause inside
                "extra clause": " ".join(non_intersect).count(",") > 1,

                "keyword": len(non_intersect) == 2,

                # negated word, e.g: необычный, незапланированно (disabled)
                # "negated words": re.search(r'(?<=\s)не\w+', " ".join(non_intersect)) != None ,

                # has one of the words from the list
                "wordlist": not self.ne_words.isdisjoint(non_intersect)},

            "entailment": {
                "all lemmas overlap": len(non_intersect_lemmas) == 0,

                "wordlist": not self.e_words.isdisjoint(intersect)}}

        if heuristic is None:
            return heuristics

        # Return a single heuristic only: the first (label, name) pair given.
        label = list(heuristic)[0]
        name = heuristic[label]
        return {label: {name: heuristics[label][name]}}

    def heuristics_all(self, final_decision=None, heuristic=None):
        """
            apply heuristics to a dataset
            To check on a single heuristic, pass
                        heuristic = {"label": "heuristic name"}
            to this function

            "entailment" heuristics take precedence over "not_entailment"
            ones; if none fires, ``final_decision(test_size=1)[0]`` is used.
            Requires :meth:`preprocess` to have been called first.
        """
        y_pred = []

        for _, row in self.valid.iterrows():
            words1 = set(row['sentence1_words'])
            words2 = set(row['sentence2_words'])

            fired = self.get_heuristics(
                words1 ^ words2,
                words1 & words2,
                set(row.sentence1_lemmas) ^ set(row.sentence2_lemmas),
                heuristic,
            )

            # Every heuristic value is a plain bool, so any() is an exact
            # replacement for "True in list(values)".
            if any(fired.get('entailment', {}).values()):
                y_pred.append('entailment')
            elif any(fired.get('not_entailment', {}).values()):
                y_pred.append('not_entailment')
            else:
                y_pred.append(final_decision(test_size=1)[0])

        return y_pred

In [None]:
# The same LiDiRus.jsonl file is passed as both `path` and `path_valid`.
# preprocess() adds the token/lemma columns that heuristics_all() reads.
solver_LiDiRus = LiDiRusSolver(path='/content/combined/LiDiRus/LiDiRus.jsonl', path_valid='/content/combined/LiDiRus/LiDiRus.jsonl')
solver_LiDiRus.preprocess()

In [None]:
# Load the LiDiRus file as a DataFrame; its `idx` column is paired with the predictions below.
test_LiDiRus = JSONL_handler('/content/combined/LiDiRus/LiDiRus.jsonl').to_pandas()

In [None]:
# One submission (list of {"idx", "label"} records) per fallback strategy.
# The strategies are evaluated in the order majority -> random -> random balanced.
LiDiRus_majority, LiDiRus_random, LiDiRus_random_b = (
    [
        {"idx": idx, "label": str(label).lower()}
        for idx, label in zip(test_LiDiRus.idx, solver_LiDiRus.heuristics_all(final_decision=fallback))
    ]
    for fallback in (
        solver_LiDiRus.majority_class,
        solver_LiDiRus.random_choice,
        solver_LiDiRus.random_balanced_choice,
    )
)

In [None]:
# Write each LiDiRus prediction list into its heuristics submission directory.
# (save_output and the output_dir_heuristics_* paths are defined in earlier cells.)
save_output(LiDiRus_majority, output_dir_heuristics_majority / "LiDiRus.jsonl")
save_output(LiDiRus_random, output_dir_heuristics_random / "LiDiRus.jsonl")
save_output(LiDiRus_random_b, output_dir_heuristics_rw / "LiDiRus.jsonl")

### RuCoS

In [None]:
def normalize_answer(s):
    """Canonicalise an answer string before comparing it.

    The text is lower-cased, every character listed in ``string.punctuation``
    is dropped, and runs of whitespace are collapsed to single spaces (leading
    and trailing whitespace disappears).
    """
    lowered = s.lower()
    without_punctuation = lowered.translate(str.maketrans('', '', string.punctuation))
    return ' '.join(without_punctuation.split())


def f1_score(prediction, ground_truth):
    """Token-level F1 between a prediction and a single reference answer.

    Both strings go through ``normalize_answer`` and are split on whitespace;
    the overlap is the multiset intersection of the two token lists.
    Returns 0 when no token is shared.
    """
    predicted = normalize_answer(prediction).split()
    expected = normalize_answer(ground_truth).split()
    shared = sum((Counter(predicted) & Counter(expected)).values())
    if shared == 0:
        return 0
    precision = 1.0 * shared / len(predicted)
    recall = 1.0 * shared / len(expected)
    return (2 * precision * recall) / (precision + recall)


def exact_match_score(prediction, ground_truth):
    """Return True when both strings are equal after ``normalize_answer``."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Best score of ``prediction`` against any of the reference answers.

    ``metric_fn(prediction, ground_truth)`` is evaluated for every reference;
    the result is the maximum of those scores and a floor of 0, so an empty
    ``ground_truths`` yields 0.
    """
    # The leading 0 is the floor and must stay first: max() keeps the first of
    # equal values, so a lone False score still comes back as the int 0.
    scores = [0] + [metric_fn(prediction, truth) for truth in ground_truths]
    return max(scores)


def evaluate(dataset, predictions):
    """Average exact-match and F1 of passage-level predictions.

    ``predictions`` and ``dataset`` are walked in parallel: each prediction's
    ``"label"`` string is scored against every question in the passage's
    ``"qas"`` list, using the ``"text"`` of each entry in the question's
    ``"answers"`` (a question without answers scores 0).

    Returns ``(exact_match, f1)``, each averaged over all scored questions;
    ``(0.0, 0.0)`` when there is nothing to score.
    """
    f1 = exact_match = total = 0
    for prediction, passage in zip(predictions, dataset):
        label = prediction["label"]
        for qa in passage['qas']:
            total += 1
            ground_truths = [answer['text'] for answer in qa.get("answers", [])]
            exact_match += metric_max_over_ground_truths(exact_match_score, label, ground_truths)
            f1 += metric_max_over_ground_truths(f1_score, label, ground_truths)

    # An empty dataset (or passages without questions) would otherwise divide by zero.
    if total == 0:
        return 0.0, 0.0
    return exact_match / total, f1 / total


def eval_RuCoS(train_path, val_path, test_path, vect):
    """Score the RuCoS heuristic on all three splits.

    Returns ``(None, results)`` where ``results`` holds the ``evaluate`` score
    of each split under "train", "val" and "test", plus the test predictions
    under "test_pred".
    """
    # Evaluation order (test, train, val) is kept: eval_part draws from the
    # global NumPy RNG, so reordering would change the predictions.
    test_score, test_pred = eval_part(test_path, vect)
    train_score, _ = eval_part(train_path, vect)
    val_score, _ = eval_part(val_path, vect)
    results = {
        "train": train_score,
        "val": val_score,
        "test": test_score,
        "test_pred": test_pred,
    }
    return None, results


def eval_part(path, vect):
    """Predict and score one RuCoS split.

    Reads every record of the jsonlines file at ``path``, predicts a label per
    record with ``get_row_pred`` and returns ``(evaluate(...) result, predictions)``,
    where predictions is a list of ``{"idx", "label"}`` dicts.
    """
    with jsonlines.open(path) as reader:
        rows = list(reader)
    predictions = [
        {"idx": row["idx"], "label": get_row_pred(row, vect)}
        for row in rows
    ]
    return evaluate(rows, predictions), predictions


def get_row_pred(row, vect):
    """Pick one passage entity per query of a RuCoS row.

    An entity is shortlisted for a query when its stem (the entity without its
    last two characters) does not occur in the query text, or when that stem
    appears at least twice among the whitespace tokens of the passage. The
    answer is the only shortlisted entity, or a uniformly random one when
    several qualify; with an empty shortlist the choice falls back to all
    entities.

    param: row (dict) - record with ``passage.text``, ``passage.entities``
        (``start``/``end`` character offsets) and ``qas`` (each with ``query``)
    param: vect - unused, kept for interface compatibility
    returns: the per-query answers joined with single spaces
    """
    passage_text = row["passage"]["text"]
    entities = [
        passage_text[span["start"]: span["end"]]
        for span in row["passage"]["entities"]]
    passage_tokens = passage_text.split()

    answers = []
    for qa in row["qas"]:
        query = qa["query"]
        # The stem must come from the current entity: counting the entity
        # *list* inside the token list can never match anything.
        shortlisted = [
            entity for entity in entities
            if entity[:-2] not in query or passage_tokens.count(entity[:-2]) >= 2
        ]
        pool = shortlisted or entities
        if not pool:
            # No entities at all: there is nothing to choose from.
            answers.append("")
        elif len(pool) == 1:
            answers.append(pool[0])
        else:
            # Draw over every index, 0 included, so the first candidate is
            # reachable and a short pool cannot yield an empty range.
            answers.append(pool[np.random.choice(len(pool))])
    return " ".join(answers)

In [None]:
# Locations of the RuCoS splits inside the unpacked `combined/` archive (Colab paths).
train_path_RuCos = "/content/combined/RuCoS/train.jsonl"
val_path_RuCos = "/content/combined/RuCoS/val.jsonl"
test_path_RuCos = "/content/combined/RuCoS/test.jsonl"

In [None]:
 # Score all three RuCoS splits; 'No vect' is only a placeholder because get_row_pred never reads `vect`.
 _, RuCoS_scores_heuristics = eval_RuCoS(train_path_RuCos, val_path_RuCos, test_path_RuCos, 'No vect')

In [None]:
# Keep only the test-split predictions (list of {"idx", "label"} dicts) for the submission.
rucos_scores = RuCoS_scores_heuristics["test_pred"]

In [None]:
# The RuCoS heuristic has no fallback variants, so the same predictions are
# written into all three heuristics submission directories.
save_output(rucos_scores, output_dir_heuristics_majority / "RuCoS.jsonl")
save_output(rucos_scores, output_dir_heuristics_random / "RuCoS.jsonl")
save_output(rucos_scores, output_dir_heuristics_rw / "RuCoS.jsonl")

### MuSeRC

In [None]:
# NOTE(review): MuSeRCSolver.py must already be on the import path (not shown in this cell).
from MuSeRCSolver import MuSeRCSolver

solver_MuSeRC = MuSeRCSolver(path='/content/combined/MuSeRC/train.jsonl',
                      path_valid='/content/combined/MuSeRC/val.jsonl') # pass a dataset to get stats
solver_MuSeRC.preprocess_data('/content/combined/MuSeRC/test.jsonl') # pass a dataset to solve
solver_MuSeRC.get_stats_MuSeRC() # collect statistics for majority and random balanced

In [None]:
# heuristics() returns a 3-tuple; only its first element (the predictions) is kept.
# NOTE(review): the default mode is presumably the majority fallback (its output
# is saved to the majority directory below); 'RANDOM' and 'RB' select the
# random and random-balanced fallbacks — confirm in MuSeRCSolver.
scores_muserc, _, _ = solver_MuSeRC.heuristics()
scores_muserc_r, _, _ = solver_MuSeRC.heuristics('RANDOM')
scores_muserc_rb, _, _ = solver_MuSeRC.heuristics('RB')

Heuristics appears for 5909 samples, 2001 of them correct
Heuristics appears for 5909 samples, 2001 of them correct
Heuristics appears for 5909 samples, 2001 of them correct


In [None]:
# Write each MuSeRC prediction set into its heuristics submission directory.
save_output(scores_muserc, output_dir_heuristics_majority / "MuSeRC.jsonl")
save_output(scores_muserc_r, output_dir_heuristics_random / "MuSeRC.jsonl")
save_output(scores_muserc_rb, output_dir_heuristics_rw / "MuSeRC.jsonl")

### RWSD

In [None]:
class RWSDSolver(BaseSolverSubmit):
    """Length/distance heuristic solver for the RWSD task.

    Predictions are made for the rows of the file given as ``path_valid``
    (loaded into ``self.data``). A row is labelled ``False`` when any of the
    "False" heuristics fires; otherwise the label comes from the
    ``final_decision`` callable passed to :meth:`heuristics_all`.
    """

    def __init__(self, path: str, path_test = None, path_valid=None):
        self.morph = RSG_MorphAnalyzer() # PyMorphy + caching
        super(RWSDSolver, self).__init__(path, path_test, path_valid)
        # Flattened copy of the validation file; heuristics_all iterates over it.
        self.data = self.to_pandas(path_valid)

    def get_heuristics(self, length, distance, heuristic) -> dict:
        """ all heuristics at once or one of them

        param: length (int) - length of the text left after removing the spans
        param: distance (int) - span2 index minus span1 index
        param: heuristic - ``None`` for all heuristics, or a one-item dict
            ``{"label": "heuristic name"}`` selecting a single one
        returns: ``{"True" | "False": {heuristic name: bool}}``
        """

        heuristics = {
            "True": {
                # no heuristic currently votes for True
                'placeholder': False},

            "False": {
                'odd length': length % 2 == 1,

                'remainder 3': length % 4 == 3,

                # NOTE(review): fires when the remainder is NOT 2, despite the name.
                'remainder 2': (distance + length) % 3 != 2}
                }

        if heuristic is not None:
            # return a single heuristic only
            key = list(heuristic.keys())[0]
            value = heuristic[key]

            return({
                key: { # key = "True" or "False"
                      value: heuristics[key][value] # "heuristic name": Boolean
                      }
                    })

        return(heuristics)

    def heuristics_all(self, final_decision = None, heuristic = None):
        """
            apply heuristics to a dataset
            To check on a single heuristic, pass
                        heuristic = {"label": "heuristic name"}
            to this function

            "True" heuristics take precedence over "False" ones; if none
            fires, ``final_decision(test_size=1)[0]`` is used.
        """
        y_pred = []

        for i, row in self.data.iterrows():
            unique = self.unique(row)
            distance = self.distance(row)
            # length, in characters, of the text without the antecedent and the proform
            length = len(unique)

            heuristics = self.get_heuristics(length, distance, heuristic)

            if ('True' in heuristics.keys() and
                (True in list(heuristics['True'].values()))):
                    y_pred.append(True)

            elif ('False' in heuristics.keys() and
                (True in list(heuristics['False'].values()))):
                y_pred.append(False)

            else:
                y_pred.append(final_decision(test_size=1)[0])

        return y_pred

    def to_pandas(self, path):
        """ get jsonl file content as a pandas DataFrame

        Nested objects are flattened by ``pd.json_normalize`` (e.g. the
        ``target.span1_text`` columns read below). A ``label`` column filled
        with ``False`` is added when the file has none.
        """
        # Explicit UTF-8 so the text is read the same way regardless of the
        # platform default encoding; blank lines are skipped.
        with open(path, encoding="utf-8") as f:
            data = [json.loads(line) for line in f if line.strip()]

        data = pd.json_normalize(data)

        if 'label' not in data.columns:
            data['label'] = False

        return data

    def unique(self, row):
        """ removes span1 and span2 words from a text """
        without_span1 = row.text.replace(row['target.span1_text'], '')
        # str.replace returns a new string, so its result has to be returned;
        # discarding it would leave span2 in the text.
        # NOTE(review): this changes the length fed to the parity heuristics
        # above — re-check their thresholds if they were tuned on the old value.
        return without_span1.replace(row['target.span2_text'], '')


    def distance(self, row):
        """ calculate the distance between an antecedent and a proform (span2 index minus span1 index)"""
        return row['target.span2_index'] - row['target.span1_index']

In [None]:
# Run the RWSD solver through Heuristic_submission, once per fallback strategy
# (random, majority, random weighted).
# NOTE(review): presumably each get_scores_* call also writes its submission
# file — confirm in Heuristic_submission (defined in an earlier cell).
rwsd_heuristics = Heuristic_submission('RWSD', RWSDSolver)
rwsd_heuristics.get_scores_random()
rwsd_heuristics.get_scores_majority()
rwsd_heuristics.get_scores_random_weighted()

# Make submission file

In [None]:
# Archive each baseline submission directory (shell command; needs the 7z binary).
!7z a "random_submission.zip" $output_dir
!7z a "majority_submission.zip" $output_dir_majority
!7z a "random_weighted_submission.zip" $output_dir_random_weighted

In [None]:
# Archive the TF-IDF submission directory (output_dir_tfidf is defined in an earlier cell).
!7z a "random_tfidf_submission.zip" $output_dir_tfidf


7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.30GHz (306F0),ASM,AES-NI)

Scanning the drive:
  0M Scan           1 folder, 8 files, 1453406 bytes (1420 KiB)

Creating archive: random_tfidf_submission.zip

Items to compress: 9

  0%     99% 8 + tfidf_submission/TERRa.jsonl                                     
Files read from disk: 8
Archive size: 150284 bytes (147 KiB)
Everything is Ok


In [None]:
# Archive the three heuristics submission directories, one zip per fallback strategy.
!7z a "heuristics_random_submission.zip" $output_dir_heuristics_random
!7z a "heuristics_majority_submission.zip" $output_dir_heuristics_majority
!7z a "heuristics_rw_submission.zip" $output_dir_heuristics_rw


7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.30GHz (306F0),ASM,AES-NI)

Scanning the drive:
  0M Scan           1 folder, 9 files, 1412372 bytes (1380 KiB)

Creating archive: heuristics_random_submission.zip

Items to compress: 10

  0%    
Files read from disk: 9
Archive size: 136081 bytes (133 KiB)
Everything is Ok

7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.30GHz (306F0),ASM,AES-NI)

Scanning the drive:
  0M Scan           1 folder, 9 files, 1421995 bytes (1389 KiB)

Creating archive: heuristics_majority_submission.zip

Items to compress: 10

  0%    
Files read from disk: 9
Archive size: 130632 bytes (128 KiB)
Everything is Ok

7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pav