<a href="https://colab.research.google.com/github/tatiana-iazykova/2020_HACK_RUSSIANSUPERGLUE/blob/main/generate_json.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch base.py from the project's GitHub repository into the working directory;
# later cells import BaseSolverSubmit from it.
!wget -q --show-progress "https://raw.githubusercontent.com/tatiana-iazykova/2020_HACK_RUSSIANSUPERGLUE/main/base.py" -O base.py



In [None]:
%%capture
# Download the RussianSuperGLUE task archive, unpack it, then delete the archive
# and two directories that are not needed. Later cells read the task files from
# /content/combined/<task>/.
!wget https://russiansuperglue.com/tasks/download
!unzip download
!rm download
!rm -r /content/__MACOSX
!rm -r sample_data/

In [None]:
from pathlib import Path
# Relative path of the task data folder - presumably the one produced by the unzip
# step above. NOTE(review): the cells visible here build absolute
# '/content/combined/...' paths instead of using this variable.
data_dir = Path("combined/")

In [None]:
import pandas as pd

class JSONL_handler():
    """Thin wrapper around a JSON Lines file that exposes its content in other formats."""

    def __init__(self, path):
        # Location of the .jsonl file; nothing is read until a conversion method is called.
        self.path = path

    def to_pandas(self):
        """Load the file into a pandas DataFrame, one row per JSON line."""
        frame = pd.read_json(self.path, lines=True)
        return frame

# Baseline

In [None]:
# Folder that receives the random-baseline predictions (one <task>.jsonl per dataset).
output_dir = Path("random_submission")
# Created from Python with exist_ok=True so that re-running the cell is harmless
# when the folder already exists.
output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Folder that receives the majority-class baseline predictions.
output_dir_majority = Path("majority_submission")
# Created from Python with exist_ok=True so that re-running the cell is harmless
# when the folder already exists.
output_dir_majority.mkdir(parents=True, exist_ok=True)

In [None]:
# Folder that receives the class-weighted random baseline predictions.
output_dir_random_weighted = Path("random_weighted_submission")
# Created from Python with exist_ok=True so that re-running the cell is harmless
# when the folder already exists.
output_dir_random_weighted.mkdir(parents=True, exist_ok=True)

In [None]:
from base import BaseSolverSubmit
import json

class Random_submission():
  """Writes the non-learned baseline predictions for one task as JSONL files.

  Three baselines are available, each backed by a BaseSolverSubmit method:
  `random_choice`, `majority_class` and `random_balanced_choice`.
  Paths default to /content/combined/<dataset>/{train,val,test}.jsonl and can
  be overridden individually.
  """

  def __init__(self, dataset, path = None, path_valid = None, path_test = None):
    self.dataset = dataset  # task name; also the stem of the output file
    self.path = '/content/combined/' + dataset + '/train.jsonl' if path is None else path
    self.path_valid = '/content/combined/' + dataset + '/val.jsonl' if path_valid is None else path_valid
    self.path_test = '/content/combined/' + dataset + '/test.jsonl' if path_test is None else path_test

  def test_output(self):
    """Pair the ids of the test file with `self.scores` and return the records.

    Each record is {"idx": ..., "label": ...}; labels are stringified and
    lower-cased (e.g. True becomes "true").
    """
    test = JSONL_handler(self.path_test).to_pandas()
    # NOTE(review): zip() stops at the shorter sequence, and the labels are
    # generated for len(solver.valid) rows - confirm that this matches the
    # number of rows in the test file, otherwise rows are silently dropped.
    return [{"idx": idx, "label": str(label).lower()} for idx, label in zip(test.idx, self.scores)]

  def _run_baseline(self, strategy, target_dir):
    """Generate labels with the solver method named `strategy` and save them in `target_dir`."""
    solver = BaseSolverSubmit(path = self.path, path_valid = self.path_valid, path_test = self.path_test)
    self.scores = getattr(solver, strategy)(len(solver.valid))
    self.save_output(self.test_output(), target_dir / (self.dataset + ".jsonl"))

  def get_scores_random(self):
    """Write labels from `solver.random_choice` to `output_dir`."""
    self._run_baseline("random_choice", output_dir)

  def get_scores_majority(self):
    """Write labels from `solver.majority_class` to `output_dir_majority`."""
    self._run_baseline("majority_class", output_dir_majority)

  def get_scores_random_weighted(self):
    """Write labels from `solver.random_balanced_choice` to `output_dir_random_weighted`."""
    self._run_baseline("random_balanced_choice", output_dir_random_weighted)

  def save_output(self, data, path):
    """Write `data` (iterable of dicts with an "idx" key) to `path`, one JSON object per line.

    Records are ordered by integer idx and idx is stored as a plain int so that
    json can serialise it. The input records are left unmodified.
    """
    # Sort before opening so a malformed record cannot leave a truncated file behind.
    ordered = sorted(data, key=lambda x: int(x.get("idx")))
    # ensure_ascii=False may emit non-ASCII text, hence the explicit encoding.
    with open(path, mode="w", encoding="utf-8") as file:
        for line in ordered:
            record = dict(line, idx=int(line["idx"]))
            file.write(f"{json.dumps(record, ensure_ascii=False)}\n")

## Datasets

### DaNetQA

In [None]:
# One Random_submission instance writes all three baseline files for DaNetQA;
# every get_scores_* call builds its own solver, so the calls are independent.
random = Random_submission('DaNetQA')
random.get_scores_random()
random.get_scores_majority()
random.get_scores_random_weighted()

### RCB

In [None]:
# One Random_submission instance writes all three baseline files for RCB;
# every get_scores_* call builds its own solver, so the calls are independent.
random_RCB = Random_submission('RCB')
random_RCB.get_scores_random()
random_RCB.get_scores_majority()
random_RCB.get_scores_random_weighted()

### PARus

In [None]:
# One Random_submission instance writes all three baseline files for PARus;
# every get_scores_* call builds its own solver, so the calls are independent.
random_PARus = Random_submission('PARus')
random_PARus.get_scores_random()
random_PARus.get_scores_majority()
random_PARus.get_scores_random_weighted()

### TERRa

In [None]:
# One Random_submission instance writes all three baseline files for TERRa;
# every get_scores_* call builds its own solver, so the calls are independent.
random_TERRa = Random_submission('TERRa')
random_TERRa.get_scores_random()
random_TERRa.get_scores_majority()
random_TERRa.get_scores_random_weighted()

### RUSSE

In [None]:
# One Random_submission instance writes all three baseline files for RUSSE;
# every get_scores_* call builds its own solver, so the calls are independent.
random_RUSSE = Random_submission('RUSSE')
random_RUSSE.get_scores_random()
random_RUSSE.get_scores_majority()
random_RUSSE.get_scores_random_weighted()

### RWSD

In [None]:
# One Random_submission instance writes all three baseline files for RWSD;
# every get_scores_* call builds its own solver, so the calls are independent.
random_RWSD = Random_submission('RWSD')
random_RWSD.get_scores_random()
random_RWSD.get_scores_majority()
random_RWSD.get_scores_random_weighted()

### LiDiRus

In [None]:
# LiDiRus is scored with a solver built from the TERRa train/val files; only the
# test ids come from the LiDiRus file. One instance writes all three baselines.
random_LiDiRus = Random_submission('LiDiRus', path = '/content/combined/TERRa/train.jsonl', path_valid='/content/combined/TERRa/val.jsonl',
                                   path_test = '/content/combined/LiDiRus/LiDiRus.jsonl')
random_LiDiRus.get_scores_random()
random_LiDiRus.get_scores_majority()
random_LiDiRus.get_scores_random_weighted()

# Optimised Tfidf

In [None]:
# Folder that receives the TF-IDF model predictions (one <task>.jsonl per dataset).
output_dir_tfidf = Path("tfidf_submission")
# Created from Python with exist_ok=True so that re-running the cell is harmless
# when the folder already exists.
output_dir_tfidf.mkdir(parents=True, exist_ok=True)

In [None]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

In [None]:
def unite(path1, path2):
  """Read two JSONL files and stack their rows into one DataFrame (rows of path1 first).

  Row labels are kept as read from each file, so the result can contain
  repeated index values.
  """
  frames = [JSONL_handler(source).to_pandas() for source in (path1, path2)]
  return pd.concat(frames)

In [None]:
import json

class Tfidf_Submisssion():
  """Pairs test-set ids with model predictions and writes them as a JSONL submission file.

  The file is written to the notebook-level `output_dir_tfidf` folder as
  `<filename>.jsonl`.
  """

  def __init__(self, test, predictions, filename):
    self.test = test  # test split; only its `idx` column is read
    self.predictions = predictions  # predicted labels, paired positionally with the rows of `test`
    self.filename = filename + '.jsonl'

  def test_output(self):
    """Build the {"idx", "label"} records and save them; returns None.

    Labels are stringified and lower-cased (e.g. True becomes "true").
    """
    test_pred = [{"idx": idx, "label": str(label).lower()} for idx, label in zip(self.test.idx, self.predictions)]
    self.save_output(test_pred, output_dir_tfidf / self.filename)

  def save_output(self, data, path):
    """Write `data` (iterable of dicts with an "idx" key) to `path`, one JSON object per line.

    Records are ordered by integer idx and idx is stored as a plain int so that
    json can serialise it. The input records are left unmodified.
    """
    # Sort before opening so a malformed record cannot leave a truncated file behind.
    ordered = sorted(data, key=lambda x: int(x.get("idx")))
    # ensure_ascii=False may emit non-ASCII text, hence the explicit encoding.
    with open(path, mode="w", encoding="utf-8") as file:
        for line in ordered:
            record = dict(line, idx=int(line["idx"]))
            file.write(f"{json.dumps(record, ensure_ascii=False)}\n")

## Datasets

### RCB

In [None]:
# The model is fitted on train + val combined; predictions are made for the test split.
RCB_train = unite('/content/combined/RCB/train.jsonl', '/content/combined/RCB/val.jsonl')
RCB_test = JSONL_handler('/content/combined/RCB/test.jsonl').to_pandas()

In [None]:
# Word-level TF-IDF (10k features) -> dense array -> logistic-regression SGD classifier.
# Only the hypothesis text is used as model input.
# - toarray() yields a plain ndarray; todense() returns np.matrix, which recent
#   scikit-learn estimators reject.
# - loss="log_loss" is the current name of the logistic loss; the older alias
#   "log" was removed in scikit-learn 1.3.
steps_RCB = [('tfidf', TfidfVectorizer(analyzer='word', max_features=10000)),
          ('func', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)),
         ('sgd', SGDClassifier(loss="log_loss", n_jobs=-1, alpha=0.00001, class_weight='balanced', random_state=42))]

pipeline_RCB = Pipeline(steps_RCB)

pipeline_RCB.fit(RCB_train.hypothesis, RCB_train.label)
y_pred_RCB = pipeline_RCB.predict(RCB_test.hypothesis)

In [None]:
# Write the RCB predictions to RCB.jsonl inside output_dir_tfidf.
tfidf_rcb = Tfidf_Submisssion(RCB_test, y_pred_RCB, 'RCB')
tfidf_rcb.test_output()

### MuSeRC

In [None]:
# NOTE(review): this cell sits under the MuSeRC heading but reloads the RCB files;
# no MuSeRC data is read here. Looks like a copy of the RCB cell - confirm
# whether a MuSeRC model was intended.
RCB_train = unite('/content/combined/RCB/train.jsonl', '/content/combined/RCB/val.jsonl')
RCB_test = JSONL_handler('/content/combined/RCB/test.jsonl').to_pandas()

In [None]:
# NOTE(review): this rewrites RCB.jsonl from the RCB predictions computed earlier;
# no MuSeRC file is produced by this section.
tfidf_rcb = Tfidf_Submisssion(RCB_test, y_pred_RCB, 'RCB')
tfidf_rcb.test_output()

### TERRa

In [None]:
# The model is fitted on train + val combined; predictions are made for the test split.
TERRa_train = unite('/content/combined/TERRa/train.jsonl', '/content/combined/TERRa/val.jsonl')
TERRa_test = JSONL_handler('/content/combined/TERRa/test.jsonl').to_pandas()

In [None]:
# Character 2-4-gram counts (within word boundaries) fed to a logistic-regression
# SGD classifier. Only the hypothesis text is used as model input.
# loss="log_loss" is the current name of the logistic loss; the older alias
# "log" was removed in scikit-learn 1.3.
steps_TERRa = [('countvect', CountVectorizer(min_df=15, max_df=0.4, lowercase=True, analyzer ='char_wb', decode_error = 'ignore', ngram_range = (2, 4))),
         ('sgd', SGDClassifier(alpha = 1e-08, loss="log_loss", n_jobs=-1, class_weight='balanced', random_state=42))]

pipeline_TERRa = Pipeline(steps_TERRa)

pipeline_TERRa.fit(TERRa_train.hypothesis, TERRa_train.label)
y_pred_TERRa = pipeline_TERRa.predict(TERRa_test.hypothesis)

In [None]:
# Write the TERRa predictions to TERRa.jsonl inside output_dir_tfidf.
tfidf_terra = Tfidf_Submisssion(TERRa_test, y_pred_TERRa, 'TERRa')
tfidf_terra.test_output()

### DaNetQA

In [None]:
# The model is fitted on train + val combined; predictions are made for the test split.
DaNetQA_train = unite('/content/combined/DaNetQA/train.jsonl', '/content/combined/DaNetQA/val.jsonl')
DaNetQA_test = JSONL_handler('/content/combined/DaNetQA/test.jsonl').to_pandas()

In [None]:
# Default TF-IDF features fed to a logistic-regression SGD classifier.
# Only the question text is used as model input (the passage is ignored).
# loss="log_loss" is the current name of the logistic loss; the older alias
# "log" was removed in scikit-learn 1.3.
steps_DaNetQA = [('vectorizer', TfidfVectorizer()),
              ('sgd', SGDClassifier(loss="log_loss", n_jobs=-1, alpha=0.15, class_weight='balanced', random_state=42))]

pipeline_DaNetQA = Pipeline(steps_DaNetQA)

pipeline_DaNetQA.fit(DaNetQA_train.question, DaNetQA_train.label)
y_pred_DaNetQA = pipeline_DaNetQA.predict(DaNetQA_test.question)

In [None]:
# The file stem uses the same spelling as the task folder and the DaNetQA.jsonl
# written by the other submission writers in this notebook ('DaNetQA', not 'DaNetQa').
tfidf_danetqa = Tfidf_Submisssion(DaNetQA_test, y_pred_DaNetQA, 'DaNetQA')
tfidf_danetqa.test_output()

### RUSSE

In [None]:
# The model is fitted on train + val combined; predictions are made for the test split.
RUSSE_train = unite('/content/combined/RUSSE/train.jsonl', '/content/combined/RUSSE/val.jsonl')
RUSSE_test = JSONL_handler('/content/combined/RUSSE/test.jsonl').to_pandas()

In [None]:
def build_feature_RUSSE(row):
    """Join the two context sentences and the target word of a RUSSE example into one string.

    Each of the three fields is stripped of surrounding whitespace and the parts
    are separated by single spaces, in the order sentence1, sentence2, word.
    """
    parts = [row[field].strip() for field in ("sentence1", "sentence2", "word")]
    return " ".join(parts)

# Add the model-input column to both splits (the frames are modified in place).
train_concat = [build_feature_RUSSE(example) for _, example in RUSSE_train.iterrows()]
RUSSE_train['concatenated'] = train_concat

# Same feature for the test split.
valid_concat = [build_feature_RUSSE(example) for _, example in RUSSE_test.iterrows()]
RUSSE_test['concatenated'] = valid_concat

In [None]:
# Word uni-/bi-gram TF-IDF over the concatenated text, fed to a class-balanced
# logistic regression.
steps_RUSSE = [('tfidf', TfidfVectorizer(analyzer = 'word', max_df = 0.6, min_df= 0.001, ngram_range =  (1,2))),
         ('logreg', LogisticRegression(C = 1.01, class_weight='balanced'))]

pipeline_RUSSE = Pipeline(steps_RUSSE)

pipeline_RUSSE.fit(RUSSE_train.concatenated, RUSSE_train.label)
y_pred_RUSSE = pipeline_RUSSE.predict(RUSSE_test.concatenated)

In [None]:
# Write the RUSSE predictions to RUSSE.jsonl inside output_dir_tfidf.
tfidf_russe = Tfidf_Submisssion(RUSSE_test, y_pred_RUSSE, 'RUSSE')
tfidf_russe.test_output()

### PARus

In [None]:
# The model is fitted on train + val combined; predictions are made for the test split.
PARus_train = unite('/content/combined/PARus/train.jsonl', '/content/combined/PARus/val.jsonl')
PARus_test = JSONL_handler('/content/combined/PARus/test.jsonl').to_pandas()

In [None]:
def build_feature_PARus(row):
    """Build the text input for one PARus example.

    The result is "<premise> <question> <choice1> <choice2>", where the premise
    is stringified and stripped, and the question is a fixed Russian prompt
    chosen by row["question"]: the "cause" prompt when it equals "cause",
    otherwise the "result" prompt. The label is not part of the feature.
    """
    premise = str(row["premise"]).strip()
    choice1 = row["choice1"]
    choice2 = row["choice2"]
    question = "Что было ПРИЧИНОЙ этого?" if row["question"] == "cause" else "Что случилось в РЕЗУЛЬТАТЕ?"
    return f"{premise} {question} {choice1} {choice2}"


# Add the model-input column to both splits (the frames are modified in place).
train_concat = [build_feature_PARus(example) for _, example in PARus_train.iterrows()]
PARus_train['concatenated'] = train_concat

# Same feature for the test split.
valid_concat = [build_feature_PARus(example) for _, example in PARus_test.iterrows()]
PARus_test['concatenated'] = valid_concat

In [None]:
# Word uni-/bi-gram TF-IDF over the concatenated text, fed to a class-balanced
# logistic regression.
# NOTE(review): C is the inverse regularisation strength, so C = 1e-10 applies
# extremely strong regularisation - confirm this value is intended.
steps_PARus = [('tfidf', TfidfVectorizer(analyzer= 'word', max_df= 0.6, min_df= 0.04, ngram_range= (1, 2))),
         ('logreg', LogisticRegression(C = 1e-10, class_weight='balanced'))]

pipeline_PARus = Pipeline(steps_PARus)

pipeline_PARus.fit(PARus_train.concatenated, PARus_train.label)
y_pred_PARus = pipeline_PARus.predict(PARus_test.concatenated)

In [None]:
# Write the PARus predictions to PARus.jsonl inside output_dir_tfidf.
tfidf_parus = Tfidf_Submisssion(PARus_test, y_pred_PARus, 'PARus')
tfidf_parus.test_output()

### LiDiRus

In [None]:
# The LiDiRus model is trained on TERRa train + val (presumably because LiDiRus
# provides no training split - TODO confirm). Both frames get a `merged` text
# column: premise + "\n" + hypothesis for TERRa, sentence1 + "\n" + sentence2 for LiDiRus.
LiDiRus_train = unite('/content/combined/TERRa/train.jsonl', '/content/combined/TERRa/val.jsonl').assign(merged=lambda x: x.premise + "\n" + x.hypothesis)
LiDiRus_test = JSONL_handler('/content/combined/LiDiRus/LiDiRus.jsonl').to_pandas().assign(merged=lambda x: x.sentence1 + "\n" + x.sentence2)

In [None]:
# Single-character TF-IDF (char_wb analyzer, 1-grams only) over the merged text,
# fed to a class-balanced logistic regression.
steps_LiDiRus = [('tfidf', TfidfVectorizer(analyzer= 'char_wb', max_df= 0.6, min_df=0.091, ngram_range = (1, 1))),
         ('logreg', LogisticRegression(C = 1.01, class_weight='balanced'))]

pipeline_LiDiRus = Pipeline(steps_LiDiRus)

pipeline_LiDiRus.fit(LiDiRus_train.merged, LiDiRus_train.label)
y_pred_LiDiRus = pipeline_LiDiRus.predict(LiDiRus_test.merged)

In [None]:
# Write the LiDiRus predictions to LiDiRus.jsonl inside output_dir_tfidf.
tfidf_lidirus = Tfidf_Submisssion(LiDiRus_test, y_pred_LiDiRus, 'LiDiRus')
tfidf_lidirus.test_output()

# Heuristics

In [None]:
# Folder for heuristic predictions that fall back to the random-choice strategy.
output_dir_heuristics_random = Path("heuristics_random_submission")
# Created from Python with exist_ok=True so that re-running the cell is harmless
# when the folder already exists.
output_dir_heuristics_random.mkdir(parents=True, exist_ok=True)

In [None]:
# Folder for heuristic predictions that fall back to the majority-class strategy.
output_dir_heuristics_majority = Path("heuristics_majority_submission")
# Created from Python with exist_ok=True so that re-running the cell is harmless
# when the folder already exists.
output_dir_heuristics_majority.mkdir(parents=True, exist_ok=True)

In [None]:
# Folder for heuristic predictions that fall back to the class-weighted random strategy.
output_dir_heuristics_rw = Path("heuristics_rw_submission")
# Created from Python with exist_ok=True so that re-running the cell is harmless
# when the folder already exists.
output_dir_heuristics_rw.mkdir(parents=True, exist_ok=True)

In [None]:
import json

class Heuristic_submission():
  """Runs a task-specific heuristic solver and writes its predictions as JSONL files.

  `solver` is a class that accepts `path`, `path_valid` and `path_test` keyword
  arguments and exposes `heuristics_all(final_decision=...)` together with the
  fallback strategies `random_choice`, `majority_class` and
  `random_balanced_choice`. It is instantiated once, in the constructor.
  Paths default to /content/combined/<dataset>/{train,val,test}.jsonl.
  """

  def __init__(self, dataset, solver, path = None, path_valid = None, path_test = None):
    self.dataset = dataset  # task name; also the stem of the output file
    self.path = '/content/combined/' + dataset + '/train.jsonl' if path is None else path
    self.path_valid = '/content/combined/' + dataset + '/val.jsonl' if path_valid is None else path_valid
    self.path_test = '/content/combined/' + dataset + '/test.jsonl' if path_test is None else path_test
    self.solver = solver(path=self.path, path_valid= self.path_valid, path_test = self.path_test)

  def test_output(self):
    """Pair the ids of the test file with `self.scores` and return the records.

    Each record is {"idx": ..., "label": ...}; labels are stringified and
    lower-cased (e.g. True becomes "true").
    """
    test = JSONL_handler(self.path_test).to_pandas()
    # NOTE(review): zip() stops at the shorter sequence - confirm the solver
    # returns one prediction per row of the test file.
    return [{"idx": idx, "label": str(label).lower()} for idx, label in zip(test.idx, self.scores)]

  def _run_heuristics(self, final_decision, target_dir):
    """Predict with the solver using `final_decision` as fallback and save under `target_dir`."""
    self.scores = self.solver.heuristics_all(final_decision=final_decision)
    self.save_output(self.test_output(), target_dir / (self.dataset + ".jsonl"))

  def get_scores_random(self):
    """Heuristics with the `random_choice` fallback, written to `output_dir_heuristics_random`."""
    self._run_heuristics(self.solver.random_choice, output_dir_heuristics_random)

  def get_scores_majority(self):
    """Heuristics with the `majority_class` fallback, written to `output_dir_heuristics_majority`."""
    self._run_heuristics(self.solver.majority_class, output_dir_heuristics_majority)

  def get_scores_random_weighted(self):
    """Heuristics with the `random_balanced_choice` fallback, written to `output_dir_heuristics_rw`."""
    self._run_heuristics(self.solver.random_balanced_choice, output_dir_heuristics_rw)

  def save_output(self, data, path):
    """Write `data` (iterable of dicts with an "idx" key) to `path`, one JSON object per line.

    Records are ordered by integer idx and idx is stored as a plain int so that
    json can serialise it. The input records are left unmodified.
    """
    # Sort before opening so a malformed record cannot leave a truncated file behind.
    ordered = sorted(data, key=lambda x: int(x.get("idx")))
    # ensure_ascii=False may emit non-ASCII text, hence the explicit encoding.
    with open(path, mode="w", encoding="utf-8") as file:
        for line in ordered:
            record = dict(line, idx=int(line["idx"]))
            file.write(f"{json.dumps(record, ensure_ascii=False)}\n")

## Datasets

In [None]:
%%capture
# Install the NLP packages needed by the heuristic solvers (pymorphy2 and natasha
# are imported in the following cells).
# NOTE(review): versions are not pinned, so the installed releases depend on
# what pip resolves at run time.
!pip install pymorphy2
!pip install razdel
!pip install natasha

In [None]:
import pymorphy2
import re
from pymorphy2 import MorphAnalyzer
import nltk
from functools import lru_cache
from base import BaseSolverSubmit
from scipy import stats

# Shared pymorphy2 analyzer; it is bound as the default `pymorphy` argument of
# the solvers' lemmatize_word methods defined below.
m = MorphAnalyzer()

In [None]:
import numpy as np
from natasha import (
    Segmenter,
    MorphVocab,  
    NewsEmbedding,
    NewsMorphTagger,
    Doc
)

# Shared natasha components, created once and used by ParusSolver.lemmatize:
# the segmenter splits text into tokens, the tagger assigns morphology, and the
# vocabulary is used to lemmatize each token.
segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)

### TERRa

In [None]:
class TERRaSolver(BaseSolverSubmit):
    """Rule-based solver for TERRa (labels 'entailment' / 'not_entailment').

    Predictions are produced row by row for `self.valid` from hand-written
    rules on substring containment, lemma overlap and premise length; rows that
    no rule decides are delegated to a fallback strategy passed by the caller.
    """

    def __init__(self, path: str, path_valid=None, path_test=None):
        # NOTE(review): path_test is forwarded in the second positional slot and
        # path_valid in the third, i.e. swapped relative to this signature.
        # Presumably intentional so that `self.valid` holds the split the
        # submission is built from - confirm against base.py.
        super(TERRaSolver, self).__init__(path, path_test, path_valid)

    def preprocess(self, columns):
        """Add a `<column>_lemmas` column (list of lemmas) to the train and valid frames."""
        for column in columns:
            self.train[f"{column}_lemmas"] = self.train[column].apply(self.clean_text)
            self.valid[f"{column}_lemmas"] = self.valid[column].apply(self.clean_text)

    def words_only(self, text):
        """Return the lower-cased alphabetic (Cyrillic/Latin) tokens of `text`.

        Input that is not a string (e.g. a missing value) yields an empty list.
        """
        # "A-Za-z" rather than "A-z": the latter also matches [ \ ] ^ _ and `.
        # "Ёё" is listed explicitly because it lies outside the А-я range, so
        # words containing it would otherwise be split in two.
        rg = re.compile("[А-Яа-яЁёA-Za-z]+")
        try:
            return rg.findall(text.lower())
        except (AttributeError, TypeError):
            return []

    @lru_cache(maxsize=128)
    def lemmatize_word(self, token, pymorphy=m):
        """Return the normal form of `token` from the first pymorphy parse (memoised)."""
        return pymorphy.parse(token)[0].normal_form

    def lemmatize_text(self, text):
        """Lemmatize every token of an iterable of tokens."""
        return [self.lemmatize_word(w) for w in text]

    def clean_text(self, text):
        """Tokenize `text` and return the list of lemmas."""
        tokens = self.words_only(text)
        lemmas = self.lemmatize_text(tokens)
        return lemmas

    def heuristics_all(self, final_decision=None):
        """Predict a label for every row of `self.valid`.

        param final_decision: callable invoked as final_decision(test_size=1);
            its first element is used when no rule applies. It must be supplied
            whenever such a row exists.
        return: list of labels in row order.
        """
        y_pred = []
        self.preprocess(columns=['premise', "hypothesis"])
        # Lemmas whose presence in the hypothesis is treated as a sign of non-entailment.
        indic_non_ent = set(['только', 'мужчина'])

        for i, row in self.valid.iterrows():

            hyp = row.hypothesis.lower()
            hyp_lem = set(row['hypothesis_lemmas'])
            prem_lem = set(row['premise_lemmas'])
            premise_len = len(row['premise'].split())
            # Share of hypothesis lemmas also found in the premise; 0 when the
            # hypothesis has no lemmas (avoids a division by zero).
            overlap = len(prem_lem & hyp_lem) / len(hyp_lem) if hyp_lem else 0.0

            if hyp in row['premise'].lower():
                y_pred.append('entailment')
            elif overlap <= 1/3 or premise_len < 29 or len(indic_non_ent & hyp_lem) > 0:
                y_pred.append('not_entailment')
            elif overlap == 0.75 or overlap == 1 or overlap == 2/3:
                y_pred.append('entailment')
            elif premise_len > 32:
                y_pred.append('entailment')
            else:
                y_pred.append(final_decision(test_size=1)[0])

        return y_pred

In [None]:
# Run the TERRa heuristics once per fallback strategy; each call writes
# TERRa.jsonl into the output folder of that strategy.
terra_heuristics = Heuristic_submission('TERRa', TERRaSolver)
terra_heuristics.get_scores_random()
terra_heuristics.get_scores_majority()
terra_heuristics.get_scores_random_weighted()

### DaNetQA

In [None]:
class DaNetQASolver(BaseSolverSubmit):
    """Rule-based solver for DaNetQA (boolean labels).

    Predictions are produced row by row for `self.valid` from keyword patterns
    in the question and from question/passage word counts; rows that no rule
    decides are delegated to a fallback strategy passed by the caller.
    """

    def __init__(self, path: str, path_valid=None, path_test=None):
        # NOTE(review): path_test is forwarded in the second positional slot and
        # path_valid in the third, i.e. swapped relative to this signature.
        # Presumably intentional so that `self.valid` holds the split the
        # submission is built from - confirm against base.py.
        super(DaNetQASolver, self).__init__(path, path_test, path_valid)

    def heuristics_all(self, final_decision=None):
        """Predict True/False for every row of `self.valid`.

        param final_decision: callable invoked as final_decision(test_size=1);
            its first element is used when no rule applies. It must be supplied
            whenever such a row exists.
        return: list of labels in row order.
        """
        y_pred = []

        for i, row in self.valid.iterrows():

            question = row.question.lower()
            question_w_count = len(question.split())
            passage_w_count = len(row.passage.split())

            if re.search("был|(^есть)", question):
                y_pred.append(True)
            elif re.search("^входит|едят|правда ли", question):
                y_pred.append(False)
            elif question_w_count > 5:
                y_pred.append(False)
            elif passage_w_count >= 90:
                y_pred.append(False)
            else:
                y_pred.append(final_decision(test_size=1)[0])

        return y_pred

In [None]:
# Run the DaNetQA heuristics once per fallback strategy; each call writes
# DaNetQA.jsonl into the output folder of that strategy.
danetqa_heuristics = Heuristic_submission('DaNetQA', DaNetQASolver)
danetqa_heuristics.get_scores_random()
danetqa_heuristics.get_scores_majority()
danetqa_heuristics.get_scores_random_weighted()

### RCB

In [None]:
class RCBSolver(BaseSolverSubmit):
    """Rule-based solver for RCB (labels 'entailment' / 'neutral' / 'contradiction').

    Predictions are produced row by row for `self.valid` from hand-written
    rules on substring containment, indicator lemmas, lemma overlap and text
    length; rows that no rule decides are delegated to a fallback strategy
    passed by the caller.
    """

    def __init__(self, path: str, path_valid=None, path_test=None):
        # NOTE(review): path_test is forwarded in the second positional slot and
        # path_valid in the third, i.e. swapped relative to this signature.
        # Presumably intentional so that `self.valid` holds the split the
        # submission is built from - confirm against base.py.
        super(RCBSolver, self).__init__(path, path_test, path_valid)

    def preprocess(self, columns):
        """Add a `<column>_lemmas` column (list of lemmas) to the train and valid frames."""
        for column in columns:
            self.train[f"{column}_lemmas"] = self.train[column].apply(self.clean_text)
            self.valid[f"{column}_lemmas"] = self.valid[column].apply(self.clean_text)

    def words_only(self, text):
        """Return the lower-cased alphabetic (Cyrillic/Latin) tokens of `text`.

        Input that is not a string (e.g. a missing value) yields an empty list.
        """
        # "A-Za-z" rather than "A-z": the latter also matches [ \ ] ^ _ and `.
        # "Ёё" is listed explicitly because it lies outside the А-я range, so
        # words containing it would otherwise be split in two.
        rg = re.compile("[А-Яа-яЁёA-Za-z]+")
        try:
            return rg.findall(text.lower())
        except (AttributeError, TypeError):
            return []

    @lru_cache(maxsize=128)
    def lemmatize_word(self, token, pymorphy=m):
        """Return the normal form of `token` from the first pymorphy parse (memoised)."""
        return pymorphy.parse(token)[0].normal_form

    def lemmatize_text(self, text):
        """Lemmatize every token of an iterable of tokens."""
        return [self.lemmatize_word(w) for w in text]

    def clean_text(self, text):
        """Tokenize `text` and return the list of lemmas."""
        tokens = self.words_only(text)
        lemmas = self.lemmatize_text(tokens)
        return lemmas

    def heuristics_all(self, final_decision=None):
        """Predict a label for every row of `self.valid`.

        param final_decision: callable invoked as final_decision(test_size=1);
            its first element is used when no rule applies. It must be supplied
            whenever such a row exists.
        return: list of labels in row order.
        """
        y_pred = []
        self.preprocess(columns=['premise', "hypothesis"])
        # Premise lemmas treated as signs of a neutral / entailment relation.
        # 'считать' is spelled entirely in Cyrillic here; an entry starting with
        # a Latin "c" can never match a lemma of Russian text.
        indic_neutral = set(['подозревать', 'считать', 'говорить', 'думать', 'надеяться', 'понять', 'уверять'])
        indic_ent = set(['признать'])

        for i, row in self.valid.iterrows():

            hyp = row.hypothesis.lower()
            hyp_lem = set(row['hypothesis_lemmas'])
            prem_lem = set(row['premise_lemmas'])
            hyp_len = len(row.hypothesis.split())
            # Share of hypothesis lemmas also found in the premise; 0 when the
            # hypothesis has no lemmas (avoids a division by zero).
            overlap = len(prem_lem & hyp_lem) / len(hyp_lem) if hyp_lem else 0.0

            if hyp in row['premise'].lower() or len(indic_ent & prem_lem) > 0:
                y_pred.append('entailment')
            elif overlap == 0.75:
                y_pred.append('entailment')
            elif len(indic_neutral & prem_lem) > 0:
                y_pred.append('neutral')
            elif hyp_len < 5:
                y_pred.append('contradiction')
            elif 5 <= hyp_len <= 7:
                y_pred.append('neutral')
            elif len(row['premise'].split()) > 30:
                y_pred.append('entailment')
            else:
                y_pred.append(final_decision(test_size=1)[0])

        return y_pred

In [None]:
# Run the RCB heuristics once per fallback strategy; each call writes
# RCB.jsonl into the output folder of that strategy.
rcb_heuristics = Heuristic_submission('RCB', RCBSolver)
rcb_heuristics.get_scores_random()
rcb_heuristics.get_scores_majority()
rcb_heuristics.get_scores_random_weighted()

### PARus

In [None]:
class ParusSolver(BaseSolverSubmit):
    """Rule-based solver for PARus: picks the choice sharing more lemmas with the premise."""

    def __init__(self, path: str, path_valid=None, path_test=None):
        # NOTE(review): path_test is forwarded in the second positional slot and
        # path_valid in the third, i.e. swapped relative to this signature.
        # Presumably intentional so that `self.valid` holds the split the
        # submission is built from - confirm against base.py.
        super(ParusSolver, self).__init__(path, path_test, path_valid)

    def preprocess(self, columns):
        """Add a `<column>_lemmas` column (list of lemmas) to the train and valid frames."""
        for column in columns:
            self.train[f"{column}_lemmas"] = self.train[column].apply(self.lemmatize)
            self.valid[f"{column}_lemmas"] = self.valid[column].apply(self.lemmatize)

    def lemmatize(self, text):
        """
        Lemmatize a text with the shared natasha segmenter, tagger and vocabulary.
        param text: str
        return: List of lemmas (strings), one per token
        """
        doc = Doc(text)
        doc.segment(segmenter)
        doc.tag_morph(morph_tagger)

        for token in doc.tokens:
            token.lemmatize(morph_vocab)
        lemmas = [token.lemma for token in doc.tokens]
        return lemmas

    def heuristics_all(self, final_decision=None):
        """
        This heuristic chooses the option that has more lemmas in common with the
        premise (label 0 for choice1, 1 for choice2). If both choices share the
        same number of lemmas with the premise, the prediction is taken from
        `final_decision` (one of the BaseSolver strategies).
        param final_decision: callable invoked as final_decision(test_size=1);
            must be supplied whenever a tie occurs
        return: list of labels in row order
        """
        y_pred = []
        self.preprocess(columns=['premise', 'choice1', 'choice2'])

        for i, row in self.valid.iterrows():
            words1 = set(row.choice1_lemmas)
            words2 = set(row.choice2_lemmas)
            premise = set(row.premise_lemmas)
            overlap1 = len(premise & words1)
            overlap2 = len(premise & words2)
            if overlap1 > overlap2:
                y_pred.append(0)
            elif overlap2 > overlap1:
                y_pred.append(1)
            else:
                y_pred.append(final_decision(test_size=1)[0])
        return y_pred

In [None]:
# Run the PARus heuristics once per fallback strategy; each call writes
# PARus.jsonl into the output folder of that strategy.
parus_heuristics = Heuristic_submission('PARus', ParusSolver)
parus_heuristics.get_scores_random()
parus_heuristics.get_scores_majority()
parus_heuristics.get_scores_random_weighted()

### RUSSE

In [None]:
class RusseSolver(BaseSolverSubmit):
    """Rule-based solver for RUSSE based on the token overlap of the two sentences."""

    def __init__(self, path: str, path_valid=None, path_test=None):
        # NOTE(review): path_test is forwarded in the second positional slot and
        # path_valid in the third, i.e. swapped relative to this signature.
        # Presumably intentional so that `self.valid` holds the split the
        # submission is built from - confirm against base.py.
        super(RusseSolver, self).__init__(path, path_test, path_valid)

    def heuristics_all(self, final_decision=None):
        """Predict a label for every row of `self.valid`.

        A row is labelled True when the Jaccard similarity of the whitespace
        tokens of sentence1 and sentence2 exceeds 0.10. Otherwise the fallback
        strategy is queried three times and the most frequent answer is used.

        param final_decision: callable invoked as final_decision(test_size=1);
            must be supplied whenever the overlap rule does not fire.
        return: list of labels in row order.
        """
        # Local import: a plain vote count replaces scipy.stats.mode, whose
        # result for a 1-D input is a scalar in current SciPy releases and can
        # no longer be indexed with [0][0].
        from collections import Counter

        y_pred = []

        for i, row in self.valid.iterrows():
            tokens1 = set(row.sentence1.split())
            tokens2 = set(row.sentence2.split())
            union = tokens1 | tokens2
            # Two empty sentences share nothing; avoids a division by zero.
            similarity = len(tokens1 & tokens2) / len(union) if union else 0.0

            if similarity > 0.10:
                y_pred.append(True)
            else:
                votes = [final_decision(test_size=1)[0] for _ in range(3)]
                y_pred.append(Counter(votes).most_common(1)[0][0])
        return y_pred

In [None]:
# Run the RUSSE heuristics once per fallback strategy; each call writes
# RUSSE.jsonl into the output folder of that strategy.
russe_heuristics = Heuristic_submission('RUSSE', RusseSolver)
russe_heuristics.get_scores_random()
russe_heuristics.get_scores_majority()
russe_heuristics.get_scores_random_weighted()

# Make submission file

In [None]:
# Pack each baseline output folder into its own zip archive.
!7z a "random_submission.zip" $output_dir
!7z a "majority_submission.zip" $output_dir_majority
!7z a "random_weighted_submission.zip" $output_dir_random_weighted

In [None]:
!7z a "random_weighted_submission.zip" $output_dir_tfidf