# modeling15-FiD-encoder-sentence-level-classifier-prediction
- Prediction

## CHECKING PARSER

In [1]:
from pprint import pprint
import os

In [2]:
os.environ['CUDA_VISIBLE_DEVICES']='1'

In [3]:
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import set_seed
import heapq
import pickle
import pathlib
import shutil
from sklearn.utils.class_weight import compute_class_weight
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from pprint import pprint
from tqdm.auto import tqdm
from src.data import (
    BinaryCustomDatasetShuffle,
    BinarySentenceDataset,
    BinaryCustomDatasetDecisiveBinaryGold,
    BinaryCustomDatasetPredictionShuffle,
    SentenceClassificationDataset,
    EncoderSentenceClassificationDataset
)

import re
from functools import partial
import json
import math
import os
import logging
import sys
import evaluate
from util import utils
import argparse

import transformers
import torch
import numpy as np
import random
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoModel,
    AutoConfig,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    set_seed,
    get_scheduler,
)
from util.arguments import ModelArguments, DataTrainingArguments, CustomTrainingArguments
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from sentence_transformers import SentenceTransformer
from FiD.src.model import FiDT5
from src.model import SentenceLSTM

NEW_LINE = "\n"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

DATASET_MAPPING = {
    "BinaryCustomDatasetShuffle" : BinaryCustomDatasetShuffle,
    "BinarySentenceDataset" : BinarySentenceDataset,
    'BinaryCustomDatasetDecisiveBinaryGold' : BinaryCustomDatasetDecisiveBinaryGold,
    'BinaryCustomDatasetPredictionShuffle' : BinaryCustomDatasetPredictionShuffle,
    'SentenceClassificationDataset' : SentenceClassificationDataset,
    'EncoderSentenceClassificationDataset' : EncoderSentenceClassificationDataset
}
EMBEDDING_ARC_MAPPING = {
    "SentenceTransformer" : SentenceTransformer,
     "FiDT5" : FiDT5
}

In [4]:
def custom_collate(batch, padding):
    train_lst = [b['input_embedding'] for b in batch]
    label_lst = [b['em_pattern'] for b in batch]
    seq_len_lst = [b['em_pattern'].shape[0] for b in batch]
    max_seq_len = max(seq_len_lst)

    padding_train_lst = []
    for embedding in train_lst:
        if embedding.shape[0] < max_seq_len:
            post_pad = torch.full(size=(max_seq_len - embedding.shape[0], embedding.shape[1]), fill_value=padding)
            # post_pad = torch.full(size=(max_seq_len - embedding.shape[0], embedding.shape[1]), fill_value=-100)
            padding_train_lst.append(torch.concat([embedding, post_pad]))
        else:
            padding_train_lst.append(embedding)

    inputs = torch.stack(padding_train_lst)

    padding_label_lst = []
    for label in label_lst:
        if label.shape[0] < max_seq_len:
            post_pad = torch.full(size=(max_seq_len - label.shape[0],), fill_value=padding)
            # post_pad = torch.full(size=(max_seq_len - label.shape[0],), fill_value=-100)
            torch.concat([label, post_pad])
            padding_label_lst.append(torch.concat([label, post_pad]))
        else:
            padding_label_lst.append(label)

    labels = torch.stack(padding_label_lst)

    return {
        'inputs': inputs,
        'labels': labels,
        'sequence_len': torch.tensor(seq_len_lst)
    }

In [5]:
# parser = HfArgumentParser((ModelArguments, DataTrainingArguments, CustomTrainingArguments))

# model_args, data_args, train_args = parser.parse_args_into_dataclasses([])

In [6]:
# vars(model_args)

In [7]:
# vars(data_args)

In [8]:
# train_dict = vars(train_args)

In [9]:
# train_dict

In [10]:
parser = argparse.ArgumentParser(description='sentence_encoder_predict')

parser.add_argument('--config_path', default='/data/philhoon-relevance/binary-classification/results/NQ-DEV-DPR/5-fold/1/FiD-Encoder-lstm-12layers-sequence_exclude_no_answer_partial_decisive' ,type=str)
parser.add_argument('--model_path', default='step_2160', type=str)
parser.add_argument('--eval_file', default='/data/philhoon-relevance/binary-classification/NQ-TEST-DPR/ctx100id.pickle', type=str)
parser.add_argument('--original_eval_file', default='/data/philhoon-relevance/binary-classification/NQ-TEST-DPR/ctx100id.json', type=str)
parser.add_argument('--per_device_eval_batch_size', default=32, type=int)
# parser.add_argument('--model_name_or_path', type=str, required=True)


# eval_file = '/data/philhoon-relevance/binary-classification/NQ-TEST-DPR/ctx100id.pickle'


args = parser.parse_args([])

In [11]:
print(args.config_path)
print(args.model_path)
print(args.eval_file)
print(args.per_device_eval_batch_size)

/data/philhoon-relevance/binary-classification/results/NQ-DEV-DPR/5-fold/1/FiD-Encoder-lstm-12layers-sequence_exclude_no_answer_partial_decisive
step_2160
/data/philhoon-relevance/binary-classification/NQ-TEST-DPR/ctx100id.pickle
32


## modeling

In [12]:
# config_path = '/data/philhoon-relevance/binary-classification/results/NQ-DEV-DPR/5-fold/1/FiD-Encoder-lstm-12layers-sequence_include_all'


In [13]:
data_args_path = os.path.join(args.config_path, 'data_args.json')
model_args_path = os.path.join(args.config_path, 'model_args.json')
train_args_path = os.path.join(args.config_path, 'train_args.json')

In [14]:
data_args = utils.open_json(data_args_path)
model_args = utils.open_json(model_args_path)
train_args = utils.open_json(train_args_path)

In [15]:
# pprint(data_args)

In [16]:
# pprint(model_args)

In [17]:
# pprint(train_args)

In [18]:
# print(train_args['num_layers'])
# print(model_args['embedding'])
# print(data_args['num_labels'])
# print(train_args['drop_out_rate'])
# print(train_args['padding'])

In [19]:
args.num_layers = train_args['num_layers']
args.embedding = model_args['embedding']
args.num_labels = data_args['num_labels']
args.drop_out_rate = train_args['drop_out_rate']
args.padding = train_args['padding']

In [20]:
print(args.num_layers)
print(args.embedding)
print(args.num_labels)
print(args.drop_out_rate)
print(args.padding)

12
1024
2
0.2
-100


In [21]:
# vars(args)

In [22]:
# model = SentenceLSTM(num_layers = args.num_layers, 
#                      embedding_size = args.embedding, 
#                      num_labels = args.num_labels,
#                      drop_out_rate = args.drop_out_rate
#                     )

## Loading model from Torch

In [23]:
model_saved_path = os.path.join(args.config_path, args.model_path, 'pytorch_model.bin')
args.model_saved_path = model_saved_path
print(args.model_saved_path)

/data/philhoon-relevance/binary-classification/results/NQ-DEV-DPR/5-fold/1/FiD-Encoder-lstm-12layers-sequence_exclude_no_answer_partial_decisive/step_2160/pytorch_model.bin


In [24]:
# state_dict_ = torch.load(args.model_saved_path)

In [25]:
# model.load_state_dict(state_dict_)

## Prediction Code

In [26]:
print(args.eval_file)

/data/philhoon-relevance/binary-classification/NQ-TEST-DPR/ctx100id.pickle


In [27]:
# with open(args.eval_file, 'rb') as f:
#     eval_data = pickle.load(f)

In [28]:
# eval_dataset = EncoderSentenceClassificationDataset(eval_data, shuffle = False)

In [29]:
# eval_dataloader = DataLoader(eval_dataset,
#                              shuffle=False,
#                              collate_fn=partial(custom_collate, padding=args.padding),
#                              batch_size=args.per_device_eval_batch_size,
#                              )

## Implementation torch script

In [30]:
logger = get_logger(__name__)
accelerator = Accelerator()

In [31]:
# Make one log on every process with the configuration for debugging.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)

In [32]:
logger.info(accelerator.state, main_process_only=False)
if accelerator.is_local_main_process:
    transformers.utils.logging.set_verbosity_info()
else:
    transformers.utils.logging.set_verbosity_error()

01/10/2023 11:22:03 - INFO - __main__ - Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda
Mixed precision type: no



In [33]:
args.output_dir = os.path.join(args.config_path, args.model_path, 'test_prediction')
print(args.output_dir)

/data/philhoon-relevance/binary-classification/results/NQ-DEV-DPR/5-fold/1/FiD-Encoder-lstm-12layers-sequence_exclude_no_answer_partial_decisive/step_2160/test_prediction


In [34]:
if accelerator.is_main_process and args.output_dir is not None:
    os.makedirs(args.output_dir, exist_ok=True)
accelerator.wait_for_everyone()

In [35]:
model = SentenceLSTM(num_layers = args.num_layers, 
                     embedding_size = args.embedding, 
                     num_labels = args.num_labels,
                     drop_out_rate = args.drop_out_rate
                    )

state_dict_ = torch.load(args.model_saved_path)
model.load_state_dict(state_dict_)

<All keys matched successfully>

In [36]:
with open(args.eval_file, 'rb') as f:
    eval_data = pickle.load(f)

eval_dataset = EncoderSentenceClassificationDataset(eval_data, shuffle = False)

eval_dataloader = DataLoader(eval_dataset,
                             shuffle=False,
                             collate_fn=partial(custom_collate, padding=args.padding),
                             batch_size=args.per_device_eval_batch_size,
                             )

In [37]:
# for index in random.sample(range(len(eval_dataset)), 5):
#     logger.info(f"Sample {index} of the eval set: {eval_dataset[index]}.")

In [38]:
# Prepare everything with our `accelerator`.
model, eval_dataloader = accelerator.prepare(
    model, eval_dataloader
)

In [39]:
# Get the metric function
metric_acc = evaluate.load("accuracy")
metric_pre = evaluate.load('precision')
metric_re = evaluate.load('recall')
metric_f1 = evaluate.load('f1')

In [40]:
# Eval
logger.info("***** Running evaluation *****")
logger.info(f"  Num examples = {len(eval_dataset)}")
logger.info(f"  Instantaneous batch size per device = {args.per_device_eval_batch_size}")
logger.info(f"  Steps = {math.ceil(len(eval_dataset) / args.per_device_eval_batch_size) + 1}")

01/10/2023 11:22:17 - INFO - __main__ - ***** Running evaluation *****
01/10/2023 11:22:17 - INFO - __main__ -   Num examples = 3610
01/10/2023 11:22:17 - INFO - __main__ -   Instantaneous batch size per device = 32
01/10/2023 11:22:17 - INFO - __main__ -   Steps = 114


In [41]:
args_dict = vars(args)
logger.info(f"  Saving training_args = {args_dict}")
with open(os.path.join(args.output_dir, "prediction_args.json"), "w") as f:
    json.dump(args_dict, f, indent=4)

01/10/2023 11:22:18 - INFO - __main__ -   Saving training_args = {'config_path': '/data/philhoon-relevance/binary-classification/results/NQ-DEV-DPR/5-fold/1/FiD-Encoder-lstm-12layers-sequence_exclude_no_answer_partial_decisive', 'model_path': 'step_2160', 'eval_file': '/data/philhoon-relevance/binary-classification/NQ-TEST-DPR/ctx100id.pickle', 'original_eval_file': '/data/philhoon-relevance/binary-classification/NQ-TEST-DPR/ctx100id.json', 'per_device_eval_batch_size': 32, 'num_layers': 12, 'embedding': 1024, 'num_labels': 2, 'drop_out_rate': 0.2, 'padding': -100, 'model_saved_path': '/data/philhoon-relevance/binary-classification/results/NQ-DEV-DPR/5-fold/1/FiD-Encoder-lstm-12layers-sequence_exclude_no_answer_partial_decisive/step_2160/pytorch_model.bin', 'output_dir': '/data/philhoon-relevance/binary-classification/results/NQ-DEV-DPR/5-fold/1/FiD-Encoder-lstm-12layers-sequence_exclude_no_answer_partial_decisive/step_2160/test_prediction'}


In [42]:
eval_progress_bar = tqdm(range(len(eval_dataloader)), disable=not accelerator.is_local_main_process)

  0%|          | 0/113 [00:00<?, ?it/s]

In [43]:
eval_loss = 0
model.eval()
samples_seen = 0
prediction_lst = []
reference_lst = []

In [44]:
for step, batch in enumerate(eval_dataloader):
    with torch.no_grad():
        logits = model(batch['inputs'], batch['sequence_len'])
        criterion = torch.nn.CrossEntropyLoss(ignore_index=args.padding).cuda()
        loss = criterion(logits.view(-1, logits.shape[-1]), batch['labels'].view(-1))

    
    eval_loss += loss.detach().float()

    predictions = logits.argmax(dim=-1)
    references = batch['labels']

    # Get mask for target values != padding index
    nonpad_mask = references != args.padding

    # Slice out non-pad values
    references = references[nonpad_mask]
    predictions = predictions[nonpad_mask]

    predictions, references = accelerator.gather((predictions, references))
    # If we are in a multiprocess environment, the last batch has duplicates
    if accelerator.num_processes > 1:
        if step == len(eval_dataloader) - 1:
            predictions = predictions[: len(eval_dataloader.dataset) - samples_seen]
            references = references[: len(eval_dataloader.dataset) - samples_seen]
        else:
            samples_seen += references.shape[0]

    metric_acc.add_batch(
        predictions=predictions,
        references=references,
    )
    metric_pre.add_batch(
        predictions=predictions,
        references=references,
    )
    metric_re.add_batch(
        predictions=predictions,
        references=references,
    )
    metric_f1.add_batch(
        predictions=predictions,
        references=references,
    )
    eval_progress_bar.update(1)
    prediction_lst.extend(predictions.detach().cpu().tolist())
    reference_lst.extend(references.detach().cpu().tolist())

In [45]:
eval_metric = metric_acc.compute()
eval_metric_pre = metric_pre.compute()
eval_metric_re = metric_re.compute()
eval_metric_f1 = metric_f1.compute()

logger.info(f"Accuracy : {eval_metric['accuracy']} Precision : {eval_metric_pre['precision']}")
logger.info(f"Recall : {eval_metric_re['recall']} F1 : {eval_metric_f1['f1']}")
logger.info(f"Eval_loss : {eval_loss.item() / len(eval_dataloader)}")

result_log = {
    "eval_accuracy": eval_metric['accuracy'],
    "eval_precision": eval_metric_pre['precision'],
    "eval_recall": eval_metric_re['recall'],
    "eval_f1": eval_metric_f1['f1'],
    "eval_loss": eval_loss.item() / len(eval_dataloader),
}

output_result_path = os.path.join(args.output_dir, f"prediction_results.json")
with open(output_result_path, "w") as f:
    json.dump(result_log, f, indent=4)

## Extra
prediction_np = np.array(prediction_lst)
reference_np = np.array(reference_lst)
y_actu = pd.Series(reference_np, name='Actual')
y_pred = pd.Series(prediction_np, name='Predicted')

reversey_pred = y_pred.map(lambda x: 0 if x == 1 else 1)
reversey_actu = y_actu.map(lambda x: 0 if x == 1 else 1)
rev_accuracy = accuracy_score(reversey_actu, reversey_pred)
rev_precision = precision_score(reversey_actu, reversey_pred)
rev_recall = recall_score(reversey_actu, reversey_pred)
rev_f1 = f1_score(reversey_actu, reversey_pred)

logger.info(f"rev_Accuracy : {rev_accuracy} rev_Precision : {rev_precision}")
logger.info(f"rev_Recall : {rev_recall} rev_F1 : {rev_f1}")
logger.info(f"Eval_loss : {eval_loss.item() / len(eval_dataloader)}")

result_rev_log = {
    "eval_rev_accuracy": rev_accuracy,
    "eval_rev_precision": rev_precision,
    "eval_rev_recall": rev_recall,
    "eval_rev_f1": rev_f1,
    "eval_loss": eval_loss.item() / len(eval_dataloader),
}

output_result_path = os.path.join(args.output_dir, f"prediction_rev_results.json")
with open(output_result_path, "w") as f:
    json.dump(result_rev_log, f, indent=4)

01/10/2023 11:22:34 - INFO - __main__ - Accuracy : 0.48325484764542936 Precision : 0.5339932736848785
01/10/2023 11:22:34 - INFO - __main__ - Recall : 0.2192577461412596 F1 : 0.31087156488620116
01/10/2023 11:22:34 - INFO - __main__ - Eval_loss : 1.3341707212735066
01/10/2023 11:22:34 - INFO - __main__ - rev_Accuracy : 0.48325484764542936 rev_Precision : 0.4690880742722489
01/10/2023 11:22:34 - INFO - __main__ - rev_Recall : 0.782853729789826 rev_F1 : 0.5866524264186146
01/10/2023 11:22:34 - INFO - __main__ - Eval_loss : 1.3341707212735066


In [46]:
prediction_np = np.array(prediction_lst)
reference_np = np.array(reference_lst)

prediction_np = prediction_np.reshape((-1,100))
reference_np = reference_np.reshape((-1,100))

In [48]:
# print(prediction_np.shape)
# print(reference_np.shape)

In [49]:
original_eval_data = utils.open_json(args.original_eval_file)

In [50]:
# original_eval_file[0]['em_pattern']

In [52]:
args.original_eval_file

'/data/philhoon-relevance/binary-classification/NQ-TEST-DPR/ctx100id.json'

In [51]:
for ins, p_, r_ in zip(original_eval_data, prediction_np, reference_np):
    reference_em = ''.join([str(i) for i in r_.tolist()])
    if reference_em != ins['em_pattern']:
        logger.info(f"Reference EM Not Matching Instance EM")
    else:
        prediction_em = ''.join([str(i) for i in p_.tolist()])
    ins['sentence_inference'] = prediction_em

In [53]:
predcition_output_path = os.path.join(args.output_dir, 'ctx100id_test_prediction.json')
with open(predcition_output_path, "w") as f:
    json.dump(original_eval_data, f)   

## ===================================
### Testing Decisive Part

In [None]:
original_eval_data[0].keys()

In [None]:
original_eval_data[0]['conversion']

In [None]:
def get_definite_pos_neg_2(test_em):
    positive_pos = []
    negative_pos = []
    
    # pos
    if test_em.startswith('1'):
        positive_pos.append(0)
    iter_ = re.finditer(r'01', test_em)
    for m in iter_:
        pos_ = m.start() + 1
        positive_pos.append(pos_)
        
        # extra
#         neg_ = m.start()
#         negative_pos.append(neg_)
        
    # neg
    iter_ = re.finditer(r'10', test_em)
    for m in iter_:
        pos_ = m.start() + 1
        if pos_ not in negative_pos:
            negative_pos.append(pos_)
    
    return positive_pos, negative_pos

In [None]:
cnt = 0
cnt_ref = 0
for ins in original_eval_data:
    if ins['em_pattern'] == '0'*100:
        cnt += 1
    if ins['cumulative_em'] == '1' and ins['sentence_inference'] == '0'*100:
        cnt_ref += 1
print(cnt)
print(cnt_ref)

In [None]:
len(original_eval_data)

In [None]:
decisive_ref = []
decisive_pred = []
for ins in original_eval_data:
    positive_pos, negative_pos = get_definite_pos_neg_2(ins['em_pattern'])
    decisive_merge = positive_pos + negative_pos
    decisive_em_pattern_ref = [int(ins['em_pattern'][d]) for d in decisive_merge]
    decisive_em_pattern_pred = [int(ins['sentence_inference'][d]) for d in decisive_merge]
    
    decisive_ref.extend(decisive_em_pattern_ref)
    decisive_pred.extend(decisive_em_pattern_pred)
#     print(ins['em_pattern'])
#     print(decisive_merge)
#     print(decisive_em_pattern)
#     print('===')


In [None]:
print(len(decisive_ref))
print(len(decisive_pred))

In [None]:
prediction_np = np.array(decisive_pred)
reference_np = np.array(decisive_ref)

In [None]:
reference_np

In [None]:
y_actu = pd.Series(reference_np, name='Actual')
y_pred = pd.Series(prediction_np, name='Predicted')

In [None]:
# reversey_pred = y_pred.map(lambda x: 0 if x == 1 else 1)
# reversey_actu = y_actu.map(lambda x: 0 if x == 1 else 1)
rev_accuracy = accuracy_score(y_actu, y_pred)
rev_precision = precision_score(y_actu, y_pred)
rev_recall = recall_score(y_actu, y_pred)
rev_f1 = f1_score(y_actu, y_pred)

logger.info(f"rev_Accuracy : {rev_accuracy} rev_Precision : {rev_precision}")
logger.info(f"rev_Recall : {rev_recall} rev_F1 : {rev_f1}")
logger.info(f"Eval_loss : {eval_loss.item() / len(eval_dataloader)}")

In [None]:
reversey_pred = y_pred.map(lambda x: 0 if x == 1 else 1)
reversey_actu = y_actu.map(lambda x: 0 if x == 1 else 1)
rev_accuracy = accuracy_score(reversey_actu, reversey_pred)
rev_precision = precision_score(reversey_actu, reversey_pred)
rev_recall = recall_score(reversey_actu, reversey_pred)
rev_f1 = f1_score(reversey_actu, reversey_pred)

logger.info(f"rev_Accuracy : {rev_accuracy} rev_Precision : {rev_precision}")
logger.info(f"rev_Recall : {rev_recall} rev_F1 : {rev_f1}")
logger.info(f"Eval_loss : {eval_loss.item() / len(eval_dataloader)}")

In [None]:
print(original_eval_data[i]['em_pattern'])
print(original_eval_data[i]['sentence_inference'])

In [None]:
print('0'*100)

In [None]:
print(type(original_eval_data[i]['cumulative_em']))
print(original_eval_data[i]['sentence_inference'])

## ===================================

# Get top model

In [54]:
path = '/data/philhoon-relevance/binary-classification/results/NQ-DEV-DPR/5-fold/1/FiD-Encoder-lstm-12layers-sequence_exclude_no_answer_partial_decisive'



In [55]:
import pathlib
from pprint import pprint

In [56]:
print(path)

/data/philhoon-relevance/binary-classification/results/NQ-DEV-DPR/5-fold/1/FiD-Encoder-lstm-12layers-sequence_exclude_no_answer_partial_decisive


In [57]:
path = pathlib.Path(path)

In [58]:
files = path.glob('*/*[0-9]_results.json')
# pprint(len(list(files)))
# pprint(list(files))

In [59]:
step_result = []

for file in files:
    step = str(file).split('/')[-2]
    result = utils.open_json(file)
    step_result.append(result)
    
# pprint(step_result)

In [60]:
score_dict = {}
for dict_ins in step_result:
    score_dict[dict_ins['step']] = 0
print(score_dict)

{3380: 0, 2640: 0, 3640: 0, 3381: 0, 2160: 0, 2380: 0, 1980: 0, 2200: 0, 2880: 0, 3000: 0, 1960: 0, 2360: 0, 3360: 0, 3580: 0, 1920: 0, 2220: 0, 2480: 0, 2400: 0, 2780: 0, 2240: 0}


In [61]:
top_n = len(step_result)

In [62]:
# sorted(step_result, key=lambda x: x['eval_accuracy'], reverse = True) 

In [63]:
def sort_by(result, key, top_k, score_dict):
    newlist = sorted(result, key=lambda d: d[key], reverse = True) 
    print(f'sorting by {key}')
    for dic_ in newlist[:top_k]:
        print(f"step : {dic_['step']}, key : {dic_[key]}")
    for dic_, score_ in zip(newlist, list(range(len(step_result) ,0 , -1))):
        score_dict[dic_['step']] += score_
    print(score_dict)

In [64]:
sort_by(step_result, 'eval_accuracy', top_n, score_dict)

sorting by eval_accuracy
step : 2160, key : 0.8049915397631133
step : 2400, key : 0.8045685279187818
step : 2880, key : 0.8032994923857868
step : 2220, key : 0.8032994923857868
step : 2200, key : 0.8028764805414551
step : 2360, key : 0.8028764805414551
step : 3360, key : 0.8028764805414551
step : 2780, key : 0.8028764805414551
step : 2240, key : 0.8024534686971235
step : 3380, key : 0.8020304568527918
step : 1980, key : 0.8020304568527918
step : 1960, key : 0.8020304568527918
step : 3580, key : 0.8020304568527918
step : 1920, key : 0.8020304568527918
step : 2480, key : 0.8020304568527918
step : 2640, key : 0.8016074450084603
step : 3640, key : 0.8016074450084603
step : 3381, key : 0.8016074450084603
step : 2380, key : 0.8016074450084603
step : 3000, key : 0.8016074450084603
{3380: 11, 2640: 5, 3640: 4, 3381: 3, 2160: 20, 2380: 2, 1980: 10, 2200: 16, 2880: 18, 3000: 1, 1960: 9, 2360: 15, 3360: 14, 3580: 8, 1920: 7, 2220: 17, 2480: 6, 2400: 19, 2780: 13, 2240: 12}


In [65]:
sort_by(step_result, 'eval_f1', top_n, score_dict)

sorting by eval_f1
step : 2240, key : 0.832796276405299
step : 2200, key : 0.8314037626628076
step : 2220, key : 0.8303538854432688
step : 2400, key : 0.8292682926829268
step : 2880, key : 0.8292324641939038
step : 3000, key : 0.8283937065495792
step : 1920, key : 0.8283198826118855
step : 2160, key : 0.8279208659947742
step : 3360, key : 0.8277900960827791
step : 3381, key : 0.8273831431726169
step : 1960, key : 0.8266666666666667
step : 1980, key : 0.8264094955489615
step : 2360, key : 0.8262490678598061
step : 2780, key : 0.8261194029850747
step : 2480, key : 0.8258928571428571
step : 3380, key : 0.8257632166790767
step : 2640, key : 0.8257153474544779
step : 3580, key : 0.8243243243243243
step : 2380, key : 0.8230856280648812
step : 3640, key : 0.8229520573801434
{3380: 16, 2640: 9, 3640: 5, 3381: 14, 2160: 33, 2380: 4, 1980: 19, 2200: 35, 2880: 34, 3000: 16, 1960: 19, 2360: 23, 3360: 26, 3580: 11, 1920: 21, 2220: 35, 2480: 12, 2400: 36, 2780: 20, 2240: 32}


In [66]:
sort_by(step_result, 'eval_precision', top_n, score_dict)

sorting by eval_precision
step : 3640, key : 0.8128262490678598
step : 2380, key : 0.8123603871928519
step : 3580, key : 0.8097345132743363
step : 2160, key : 0.8088986141502553
step : 2780, key : 0.8068513119533528
step : 2360, key : 0.8064046579330422
step : 3380, key : 0.8047895500725689
step : 2480, key : 0.8043478260869565
step : 2640, key : 0.8033261026753434
step : 1980, key : 0.8025936599423631
step : 2400, key : 0.8025751072961373
step : 1960, key : 0.8017241379310345
step : 3360, key : 0.8011444921316166
step : 2880, key : 0.7978798586572439
step : 3381, key : 0.7977288857345636
step : 1920, key : 0.7961918194640338
step : 3000, key : 0.7943859649122808
step : 2220, key : 0.7941381716678297
step : 2200, key : 0.7891483516483516
step : 2240, key : 0.7831649831649832
{3380: 30, 2640: 21, 3640: 25, 3381: 20, 2160: 50, 2380: 23, 1980: 30, 2200: 37, 2880: 41, 3000: 20, 1960: 28, 2360: 38, 3360: 34, 3580: 29, 1920: 26, 2220: 38, 2480: 25, 2400: 46, 2780: 36, 2240: 33}


In [67]:
sort_by(step_result, 'eval_recall', top_n, score_dict)

sorting by eval_recall
step : 2240, key : 0.8891437308868502
step : 2200, key : 0.8784403669724771
step : 2220, key : 0.8700305810397554
step : 3000, key : 0.8654434250764526
step : 2880, key : 0.8631498470948012
step : 1920, key : 0.8631498470948012
step : 3381, key : 0.8593272171253823
step : 2400, key : 0.8577981651376146
step : 3360, key : 0.8562691131498471
step : 1960, key : 0.8532110091743119
step : 1980, key : 0.8516819571865444
step : 2640, key : 0.849388379204893
step : 2480, key : 0.8486238532110092
step : 3380, key : 0.8478593272171254
step : 2160, key : 0.8478593272171254
step : 2360, key : 0.8470948012232415
step : 2780, key : 0.8463302752293578
step : 3580, key : 0.8394495412844036
step : 2380, key : 0.8340978593272171
step : 3640, key : 0.8333333333333334
{3380: 37, 2640: 30, 3640: 26, 3381: 34, 2160: 56, 2380: 25, 1980: 40, 2200: 56, 2880: 57, 3000: 37, 1960: 39, 2360: 43, 3360: 46, 3580: 32, 1920: 41, 2220: 56, 2480: 33, 2400: 59, 2780: 40, 2240: 53}


In [68]:
def sort_by(result, key, top_k, score_dict):
    newlist = sorted(result, key=lambda d: d[key], reverse = True) 
    print(f'sorting by {key}')
    for dic_ in newlist[:top_k]:
        print(f"step : {dic_['step']}, key : {dic_[key]}")
    for dic_, score_ in zip(newlist, list(range(len(step_result) ,0 , -1))):
        score_dict[dic_['step']] += score_
    print(score_dict)

In [69]:
sorted_score_dict = sorted(score_dict.items(), key=lambda x:x[1], reverse = True)
for k, v in sorted_score_dict:
    print(f'step {k} : scores : {v}')

step 2400 : scores : 59
step 2880 : scores : 57
step 2160 : scores : 56
step 2200 : scores : 56
step 2220 : scores : 56
step 2240 : scores : 53
step 3360 : scores : 46
step 2360 : scores : 43
step 1920 : scores : 41
step 1980 : scores : 40
step 2780 : scores : 40
step 1960 : scores : 39
step 3380 : scores : 37
step 3000 : scores : 37
step 3381 : scores : 34
step 2480 : scores : 33
step 3580 : scores : 32
step 2640 : scores : 30
step 3640 : scores : 26
step 2380 : scores : 25


## Reverse REsult

In [70]:
files = path.glob('*/*rev_results.json')
# pprint(list(files))

In [71]:
step_result = []

for file in files:
    step = str(file).split('/')[-2]
    result = utils.open_json(file)
    step_result.append(result)

In [72]:
score_dict = {}
for dict_ins in step_result:
    score_dict[dict_ins['step']] = 0
# print(score_dict)

In [73]:
top_n = len(step_result)

In [74]:
sort_by(step_result, 'eval_rev_accuracy', top_n, score_dict)

sorting by eval_rev_accuracy
step : 2160, key : 0.8049915397631133
step : 2400, key : 0.8045685279187818
step : 2880, key : 0.8032994923857868
step : 2220, key : 0.8032994923857868
step : 2200, key : 0.8028764805414551
step : 2360, key : 0.8028764805414551
step : 3360, key : 0.8028764805414551
step : 2780, key : 0.8028764805414551
step : 2240, key : 0.8024534686971235
step : 3380, key : 0.8020304568527918
step : 1980, key : 0.8020304568527918
step : 1960, key : 0.8020304568527918
step : 3580, key : 0.8020304568527918
step : 1920, key : 0.8020304568527918
step : 2480, key : 0.8020304568527918
step : 2640, key : 0.8016074450084603
step : 3640, key : 0.8016074450084603
step : 3381, key : 0.8016074450084603
step : 2380, key : 0.8016074450084603
step : 3000, key : 0.8016074450084603
{3380: 11, 2640: 5, 3640: 4, 3381: 3, 2160: 20, 2380: 2, 1980: 10, 2200: 16, 2880: 18, 3000: 1, 1960: 9, 2360: 15, 3360: 14, 3580: 8, 1920: 7, 2220: 17, 2480: 6, 2400: 19, 2780: 13, 2240: 12}


In [75]:
sort_by(step_result, 'eval_rev_precision', top_n, score_dict)

sorting by eval_rev_precision
step : 2240, key : 0.8350398179749715
step : 2200, key : 0.8248898678414097
step : 2220, key : 0.8174006444683136
step : 3000, key : 0.8125665601703941
step : 2880, key : 0.8113804004214963
step : 1920, key : 0.8107822410147991
step : 2400, key : 0.8074534161490683
step : 3381, key : 0.8073298429319372
step : 3360, key : 0.8053830227743272
step : 1960, key : 0.8024691358024691
step : 1980, key : 0.8012295081967213
step : 2160, key : 0.7995971802618328
step : 2640, key : 0.799184505606524
step : 2480, key : 0.7987804878048781
step : 3380, key : 0.7981744421906694
step : 2360, key : 0.797979797979798
step : 2780, key : 0.7973790322580645
step : 3580, key : 0.7916666666666666
step : 2380, key : 0.7874632713026445
step : 3640, key : 0.7869012707722385
{3380: 17, 2640: 13, 3640: 5, 3381: 16, 2160: 29, 2380: 4, 1980: 20, 2200: 35, 2880: 34, 3000: 18, 1960: 20, 2360: 20, 3360: 26, 3580: 11, 1920: 22, 2220: 35, 2480: 13, 2400: 33, 2780: 17, 2240: 32}


In [76]:
sort_by(step_result, 'eval_rev_recall', top_n, score_dict)

sorting by eval_rev_recall
step : 3640, key : 0.7623106060606061
step : 2380, key : 0.7613636363636364
step : 3580, key : 0.7556818181818182
step : 2160, key : 0.7518939393939394
step : 2780, key : 0.7490530303030303
step : 2360, key : 0.7481060606060606
step : 3380, key : 0.7452651515151515
step : 2480, key : 0.7443181818181818
step : 2640, key : 0.7424242424242424
step : 1980, key : 0.740530303030303
step : 1960, key : 0.7386363636363636
step : 2400, key : 0.7386363636363636
step : 3360, key : 0.7367424242424242
step : 3381, key : 0.7301136363636364
step : 2880, key : 0.7291666666666666
step : 1920, key : 0.7263257575757576
step : 3000, key : 0.7225378787878788
step : 2220, key : 0.7206439393939394
step : 2200, key : 0.709280303030303
step : 2240, key : 0.6950757575757576
{3380: 31, 2640: 25, 3640: 25, 3381: 23, 2160: 46, 2380: 23, 1980: 31, 2200: 37, 2880: 40, 3000: 22, 1960: 30, 2360: 35, 3360: 34, 3580: 29, 1920: 27, 2220: 38, 2480: 26, 2400: 42, 2780: 33, 2240: 33}


In [77]:
sort_by(step_result, 'eval_rev_f1', top_n, score_dict)

sorting by eval_rev_f1
step : 2160, key : 0.7750122010736944
step : 3640, key : 0.7744107744107743
step : 2380, key : 0.7741935483870969
step : 3580, key : 0.7732558139534884
step : 2780, key : 0.7724609375
step : 2360, key : 0.772238514173998
step : 2400, key : 0.7715133531157269
step : 3380, key : 0.7708129285014692
step : 2480, key : 0.7705882352941177
step : 2640, key : 0.7697594501718213
step : 1980, key : 0.7696850393700787
step : 3360, key : 0.7695351137487635
step : 1960, key : 0.7692307692307692
step : 2880, key : 0.7680798004987531
step : 3381, key : 0.7667826951765291
step : 1920, key : 0.7662337662337662
step : 2220, key : 0.7659788626069453
step : 3000, key : 0.7649122807017544
step : 2200, key : 0.7627291242362524
step : 2240, key : 0.7586563307493539
{3380: 44, 2640: 36, 3640: 44, 3381: 29, 2160: 66, 2380: 41, 1980: 41, 2200: 39, 2880: 47, 3000: 25, 1960: 38, 2360: 50, 3360: 43, 3580: 46, 1920: 32, 2220: 42, 2480: 38, 2400: 56, 2780: 49, 2240: 34}


In [78]:
def sort_by(result, key, top_k, score_dict):
    newlist = sorted(result, key=lambda d: d[key], reverse = True) 
    print(f'sorting by {key}')
    for dic_ in newlist[:top_k]:
        print(f"step : {dic_['step']}, key : {dic_[key]}")
    for dic_, score_ in zip(newlist, list(range(len(step_result) ,0 , -1))):
        score_dict[dic_['step']] += score_
    print(score_dict)

In [79]:
sorted_score_dict = sorted(score_dict.items(), key=lambda x:x[1], reverse = True)
for k, v in sorted_score_dict:
    print(f'step {k} : scores : {v}')

step 2160 : scores : 66
step 2400 : scores : 56
step 2360 : scores : 50
step 2780 : scores : 49
step 2880 : scores : 47
step 3580 : scores : 46
step 3380 : scores : 44
step 3640 : scores : 44
step 3360 : scores : 43
step 2220 : scores : 42
step 2380 : scores : 41
step 1980 : scores : 41
step 2200 : scores : 39
step 1960 : scores : 38
step 2480 : scores : 38
step 2640 : scores : 36
step 2240 : scores : 34
step 1920 : scores : 32
step 3381 : scores : 29
step 3000 : scores : 25
