# NQ-5-HandsOnAnalysis_Method3Topk0sDataset

- NQ dev with FiD retrieved passages
    - Method3. '00011011000'  -> 
<span style="color:blue">0</span><span style="color:blue">0</span><span style="color:blue">0</span><span style="color:red">1</span><span style="color:red">1</span>0<span style="color:red">1</span><span style="color:red">1</span>000  (__0.57839__)     

- Both cases show that including consecutive 0s improved the overall output
- So the question is: how many consecutive 0s are needed?
    - __Creating DATASET with Method3 with Top0s from 1 ~ 5__ on ctx20
    - __Creating DATASET with Method3 with Top0s from 1 ~ 5__ on ctx100
    - __Creating DATASET with Method3 with Top0s from 1 ~ 10__ on ctx100

In [49]:
import numpy as np
import pathlib
from pprint import pprint
import pandas as pd
from util import utils
import re
import os

# Implementation

In [50]:
def build_method_data(input_file, option, option_p, option_d, sample_size, top_zeros):
    '''
    Filter the passages of every instance according to its EM pattern and
    return the result in FiD input json format.

    Each passage ``ctxs[i]`` is labelled from ``em_pattern[i]`` and the
    character before it (presumably the exact-match result of the incremental
    inference after adding passage i -- confirm against the producer):

        first '1', or '01'  : positive
        first '0'           : irrelevant
        '10'                : damaging
        '11'                : relevant (option_p='strict') or positive ('naive')
        '00'                : irrelevant if no '1' occurred earlier, otherwise
                              irrelevant (option_d='strict') or damaging ('naive')

    Characters other than '0' / '1' match no rule and the passage is dropped.

    input_file : iterable of instances (dicts with 'answers', 'question',
        'em_pattern', 'ctxs' and optionally 'id'); incremental inference
        result from FiD from KILT-5-1
        path : /data/philhoon-relevance/FiD/results/KILT_BM25_NQ/incremental_result/pos1_ctx5.json

    option(required) : removing strategies
        op1 : removes damaging; keeps positive + relevant + at most the first
              `top_zeros` irrelevant passages
        op2 : removes damaging + irrelevant
        op3 : removes damaging + relevant
        op4 : removes damaging + irrelevant + relevant

    option_p(required) : positive passage selection options
        strict : strict positive
            e.g.) 11 pattern
                1st '1' is positive, 2nd '1' is relevant
        naive : naive positive
            e.g.) 11 pattern
                1st '1' is positive, 2nd '1' is positive

    option_d(required) : damaging passage selection options
        strict : strict negative
            e.g.) A00 pattern
                if there is at least one '1' occurred in A, 2nd '0' is irrelevant
        naive : naive damaging
            e.g.) A00 pattern
                if there is at least one '1' occurred in A, 2nd '0' is damaging

    sample_size : length of the all-zero EM pattern; an instance whose
        em_pattern equals '0' * sample_size keeps all of its passages.

    top_zeros : maximum number of irrelevant passages kept by 'op1'
        (applied as a slice, so the earliest ones are kept).

    output : list of dicts with keys 'id', 'answers', 'question',
        'em_pattern', 'ctxs'. When 'id' is missing, the 1-based position in
        `input_file` is used (as a string).
        If an invalid option / option_p / option_d value is encountered while
        processing, a message is printed and None is returned. The values are
        only checked when a pattern that needs them is reached.
    '''

    output_format = []
    null_em = '0' * sample_size

    for id_, instance in enumerate(input_file, 1):
        template_dict = {
            'id': instance['id'] if 'id' in instance else str(id_),
            'answers': instance['answers'],
            'question': instance['question'],
            'em_pattern': instance['em_pattern'],
        }
        em_pattern = instance['em_pattern']

        # when there is no EM in the accumulated inference, keep every passage
        if em_pattern == null_em:
            template_dict['ctxs'] = instance['ctxs']
            output_format.append(template_dict)
            continue

        # relevant vs positive
        positive_ctx_lst = []
        relevant_ctx_lst = []

        # irrelevant vs damaging
        # (damaging passages are removed by every option, so the list is
        #  only kept to make the classification explicit)
        damaging_ctx_lst = []
        irrelevant_ctx_lst = []

        prev_em = None      # EM of the previous passage, None for the first one
        seen_one = False    # whether a '1' occurred before the current passage

        for idx_, ctx in enumerate(instance['ctxs']):
            cur_em = em_pattern[idx_]

            if cur_em == '1':
                # 11 pattern : Strict Positive(relevant) or Naive Positive(positive)
                if prev_em == '1':
                    if option_p == 'strict':
                        relevant_ctx_lst.append(ctx)
                    elif option_p == 'naive':
                        positive_ctx_lst.append(ctx)
                    else:
                        print('option_p should be either \'strict\' or \'naive\'')
                        return
                # first 1 or 01 pattern : positive
                elif prev_em is None or prev_em == '0':
                    positive_ctx_lst.append(ctx)

            elif cur_em == '0':
                # first 0 : irrelevant
                if prev_em is None:
                    irrelevant_ctx_lst.append(ctx)
                # 10 pattern : damaging
                elif prev_em == '1':
                    damaging_ctx_lst.append(ctx)
                # 00 pattern : Strict Damaging(irrelevant) or Naive Damaging(damaging)
                elif prev_em == '0':
                    # no '1' occurred so far, or strict mode : irrelevant
                    if not seen_one or option_d == 'strict':
                        irrelevant_ctx_lst.append(ctx)
                    # naive : consider it as damaging
                    elif option_d == 'naive':
                        damaging_ctx_lst.append(ctx)
                    else:
                        print('option_d should be either \'strict\' or \'naive\'')
                        return

            seen_one = seen_one or cur_em == '1'
            prev_em = cur_em

        # op1 : removes damaging, keeps at most `top_zeros` irrelevant passages
        if option == 'op1':
            # slicing already handles lists shorter than top_zeros
            new_ctx = positive_ctx_lst + relevant_ctx_lst + irrelevant_ctx_lst[:top_zeros]

        # op2 : removes damaging + irrelevant
        elif option == 'op2':
            new_ctx = positive_ctx_lst + relevant_ctx_lst

        # op3 : removes damaging + relevant
        elif option == 'op3':
            new_ctx = positive_ctx_lst + irrelevant_ctx_lst

        # op4 : removes damaging + irrelevant + relevant
        elif option == 'op4':
            new_ctx = list(positive_ctx_lst)

        else:
            print('option should be op1, op2, op3, op4')
            return

        template_dict['ctxs'] = new_ctx
        output_format.append(template_dict)

    print('==============instance finished======================')
    return output_format
        


## Method
- option1, option2 are not needed
- Method1 : option4_naive_naive
- Method2 : option4_strict_naive 
- Method3 : option3_naive_naive == option1_strict_naive
- Method4 : option3_strict_naive
- Method5 : option3_naive_strict
- Method6 : option3_strict_strict

In [51]:
# Removal strategies to generate datasets for. The keys are iterated by the
# generation loop and passed to build_method_data as `option`; the values are
# labels only and are not read by the code visible in this notebook.
option_dict = {
    'op1' : 'remove_damage',
}

In [52]:
# Positive-passage selection modes to generate datasets for. The keys are
# passed to build_method_data as `option_p`; the values are labels only.
option_p_dict = {
    'strict' : 'strict_positive',
}

In [53]:
# Damaging-passage selection modes to generate datasets for. The keys are
# passed to build_method_data as `option_d`; the values are labels only.
option_d_dict = {
    'naive' : 'naive_damaging',
}

In [54]:
# input_file = utils.open_json('/data/philhoon-relevance/FiD/results/KILT_BM25_NQ/incremental_result/pos1_ctx5.json')
# output_path = '/data/philhoon-relevance/FiD/open_domain_data/NQ_KILT_BM25_SELECTION'

## NQ dev set

In [55]:
# Used to pick the incremental-result directory/file and the output directory,
# and passed to build_method_data as the length of the all-zero EM pattern
# (presumably the number of retrieved passages per question -- confirm).
sample_size = 100
# incremental_result_100

In [56]:
# Incremental inference result to read, and the directory under which the
# generated datasets are written; both are derived from sample_size.
input_file = f'/data/philhoon-relevance/FiD/results/NQ_DPR/DEV/incremental_result_{sample_size}/ctx{sample_size}.json'
output_path = f'/data/philhoon-relevance/FiD/open_domain_data/NQ_DPR_DEV_SELECTION_METHOD3/ctx_{sample_size}'

In [57]:
input_file

'/data/philhoon-relevance/FiD/results/NQ_DPR/DEV/incremental_result_100/ctx100.json'

In [58]:
# Load the incremental inference result; it is passed to build_method_data as
# `input_file` (presumably a list of instance dicts -- confirm against utils.open_json).
input_ = utils.open_json(input_file)

In [59]:
output_path

'/data/philhoon-relevance/FiD/open_domain_data/NQ_DPR_DEV_SELECTION_METHOD3/ctx_100'

In [60]:
len(input_)

8757

In [61]:
# Build and save one dataset per top_zeros value (6..10) and per combination
# of the configured option / option_p / option_d keys. Each dataset is written
# to <output_path>/top_zeros_<k>/method3_topzeros<k>.json.
for top_zeros in range(6, 11):
    for option in option_dict:
        for option_p in option_p_dict:
            for option_d in option_d_dict:
                out_path = output_path + f'/top_zeros_{top_zeros}'
                os.makedirs(out_path, exist_ok=True)

                filename = f'method3_topzeros{top_zeros}.json'
                output_file = os.path.join(out_path, filename)
                output_format = build_method_data(input_, option, option_p, option_d, sample_size, top_zeros)

                # build_method_data prints the reason and returns None when it
                # meets an invalid option value; do not write that to disk.
                if output_format is None:
                    print(f'{filename} skipped : build_method_data returned None')
                    continue

                utils.save_json(output_format, output_file)
                print(f'{filename} save on \n {out_path}')

method3_topzeros6.json save on 
 /data/philhoon-relevance/FiD/open_domain_data/NQ_DPR_DEV_SELECTION_METHOD3/ctx_100/top_zeros_6
method3_topzeros7.json save on 
 /data/philhoon-relevance/FiD/open_domain_data/NQ_DPR_DEV_SELECTION_METHOD3/ctx_100/top_zeros_7
method3_topzeros8.json save on 
 /data/philhoon-relevance/FiD/open_domain_data/NQ_DPR_DEV_SELECTION_METHOD3/ctx_100/top_zeros_8
method3_topzeros9.json save on 
 /data/philhoon-relevance/FiD/open_domain_data/NQ_DPR_DEV_SELECTION_METHOD3/ctx_100/top_zeros_9
method3_topzeros10.json save on 
 /data/philhoon-relevance/FiD/open_domain_data/NQ_DPR_DEV_SELECTION_METHOD3/ctx_100/top_zeros_10
