# Analysis3 - extracting FiD encoder embedding 3

## Checking Embedding instance preserved
## Converting pickle -> pickle per instance
## Converting files -> jsonl

### Checking Embedding instance preserved

In [1]:
from pathlib import Path
from util import utils
import pickle
from tqdm import tqdm

#### TQA SPLIT1 train/dev + TQA test

In [2]:
def checking_ins(embedding_path, json_path, data_type):
    """Check that the embedding pickles hold as many instances as the JSON split.

    Sums ``len()`` of every pickle matching ``ctx100id_embedding_{data_type}*``
    directly under ``embedding_path`` and compares the total with the number
    of entries loaded from ``json_path``. The result is printed, not returned.

    Args:
        embedding_path: Directory containing the embedding pickle files.
        json_path: JSON file read with ``utils.open_json``.
        data_type: One of ``'train'``, ``'dev'`` or ``'test'``.

    Raises:
        TypeError: If ``data_type`` is not one of the accepted values.
    """
    # Validate first so a typo fails immediately instead of after the JSON load.
    if data_type not in ('train', 'dev', 'test'):
        raise TypeError('data_type should be train or dev or test')

    json_data = utils.open_json(json_path)

    print(f'json file : {json_path}')
    print(f'total instances in json : {len(json_data)}')

    files = sorted(Path(embedding_path).glob(f'ctx100id_embedding_{data_type}*'))

    cnt = 0
    for file in tqdm(files):
        # NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
        with open(file, 'rb') as f:
            data = pickle.load(f)
        cnt += len(data)

    print(f'embedding path : {embedding_path}')
    print(f'total instances in embedding : {cnt}')
    # Report the split that was actually checked (the message used to say "train" for every split).
    print(f'Checking {data_type} instances are matched : {cnt==len(json_data)}')

In [3]:
# Directory passed to checking_ins as the embedding location for the TQA split-1 train/dev checks.
tqa_split1_embedding = '/scratch/philhoon-relevance/decoder-classification/TQA-DEV-DPR/5-fold/1/embedding'

In [4]:
# JSON files of the TQA split-1 train and dev sets; their lengths are compared with the embedding counts.
tqa_split1_train_json = '/scratch/philhoon-relevance/decoder-classification/TQA-DEV-DPR/5-fold/1/ctx100id_split_train_1.json'
tqa_split1_dev_json = '/scratch/philhoon-relevance/decoder-classification/TQA-DEV-DPR/5-fold/1/ctx100id_split_dev_1.json'

In [9]:
# Compare the TQA split-1 dev JSON instance count with the dev embedding pickles.
checking_ins(tqa_split1_embedding, tqa_split1_dev_json, 'dev')

json file : /scratch/philhoon-relevance/decoder-classification/TQA-DEV-DPR/5-fold/1/ctx100id_split_dev_1.json
total instances in json : 1768


100%|██████████| 18/18 [02:19<00:00,  7.74s/it]


embedding path : /scratch/philhoon-relevance/decoder-classification/TQA-DEV-DPR/5-fold/1/embedding
total instances in embedding : 1768
Checking train instances are matched : True


In [5]:
# Compare the TQA split-1 train JSON instance count with the train embedding pickles.
checking_ins(tqa_split1_embedding, tqa_split1_train_json, 'train')

json file : /scratch/philhoon-relevance/decoder-classification/TQA-DEV-DPR/5-fold/1/ctx100id_split_train_1.json
total instances in json : 7069


100%|██████████| 71/71 [08:34<00:00,  7.25s/it]


embedding path : /scratch/philhoon-relevance/decoder-classification/TQA-DEV-DPR/5-fold/1/embedding
total instances in embedding : 7069
Checking train instances are matched : True


In [6]:
# Embedding directory and JSON file used for the TQA test check.
tqa_test_embedding = '/scratch/philhoon-relevance/decoder-classification/TQA-TEST-DPR/embedding/'
tqa_test_json = '/scratch/philhoon-relevance/decoder-classification/TQA-TEST-DPR/ctx100id_test.json'

In [8]:
# Compare the TQA test JSON instance count with the test embedding pickles.
checking_ins(tqa_test_embedding, tqa_test_json, 'test')

json file : /scratch/philhoon-relevance/decoder-classification/TQA-TEST-DPR/ctx100id_test.json
total instances in json : 8837


100%|██████████| 89/89 [11:36<00:00,  7.82s/it]


embedding path : /scratch/philhoon-relevance/decoder-classification/TQA-TEST-DPR/embedding/
total instances in embedding : 8837
Checking train instances are matched : True


#### NQ SPLIT1 train/dev + NQ test

In [10]:
# Directory passed to checking_ins as the embedding location for the NQ split-1 train/dev checks.
nq_split1_embedding = '/scratch/philhoon-relevance/decoder-classification/NQ-DEV-DPR/5-fold/1/embedding'

In [11]:
# JSON files of the NQ split-1 train and dev sets; their lengths are compared with the embedding counts.
nq_split1_train_json = '/scratch/philhoon-relevance/decoder-classification/NQ-DEV-DPR/5-fold/1/ctx100id_split_train_1.json'
nq_split1_dev_json = '/scratch/philhoon-relevance/decoder-classification/NQ-DEV-DPR/5-fold/1/ctx100id_split_dev_1.json'

In [12]:
# Compare the NQ split-1 dev JSON instance count with the dev embedding pickles.
checking_ins(nq_split1_embedding, nq_split1_dev_json, 'dev')

json file : /scratch/philhoon-relevance/decoder-classification/NQ-DEV-DPR/5-fold/1/ctx100id_split_dev_1.json
total instances in json : 1752


100%|██████████| 18/18 [02:37<00:00,  8.75s/it]


embedding path : /scratch/philhoon-relevance/decoder-classification/NQ-DEV-DPR/5-fold/1/embedding
total instances in embedding : 1752
Checking train instances are matched : True


In [13]:
# Compare the NQ split-1 train JSON instance count with the train embedding pickles.
checking_ins(nq_split1_embedding, nq_split1_train_json, 'train')

json file : /scratch/philhoon-relevance/decoder-classification/NQ-DEV-DPR/5-fold/1/ctx100id_split_train_1.json
total instances in json : 7005


100%|██████████| 71/71 [11:52<00:00, 10.04s/it]


embedding path : /scratch/philhoon-relevance/decoder-classification/NQ-DEV-DPR/5-fold/1/embedding
total instances in embedding : 7005
Checking train instances are matched : True


In [14]:
# Embedding directory and JSON file used for the NQ test check.
# NOTE(review): the JSON lives under 'binary-classification' while every other path in this
# notebook is under 'decoder-classification' — confirm this is the intended file.
nq_test_embedding = '/scratch/philhoon-relevance/decoder-classification/NQ-TEST-DPR/embedding'
nq_test_json = '/scratch/philhoon-relevance/binary-classification/NQ-TEST-DPR/ctx100id.json'

In [15]:
# Compare the NQ test JSON instance count with the test embedding pickles.
checking_ins(nq_test_embedding, nq_test_json, 'test')

json file : /scratch/philhoon-relevance/binary-classification/NQ-TEST-DPR/ctx100id.json
total instances in json : 3610


100%|██████████| 37/37 [05:46<00:00,  9.38s/it]


embedding path : /scratch/philhoon-relevance/decoder-classification/NQ-TEST-DPR/embedding
total instances in embedding : 3610
Checking train instances are matched : True


### Converting pickle -> pickle per instance
    - file checking

In [None]:
import pickle
from pathlib import Path

In [None]:
# Single embedding pickle that the following cells load to inspect the stored format.
test_file = '/scratch/philhoon-relevance/decoder-classification/NQ-DEV-DPR/5-fold/1/ctx100id_embedding_train_1_1.pickle'

In [None]:
# NOTE(review): the inspection cells that follow do not read output_path, and it is
# reassigned in the per-instance conversion section — this cell looks redundant.
output_path = '/scratch/philhoon-relevance/decoder-classification/NQ-DEV-DPR/5-fold/1/train'

In [None]:
# Load the sample pickle. NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
with open(test_file, 'rb') as f: 
    data = pickle.load(f)

In [None]:
# Inspect the loaded pickle: keys of the first instance, number of instances,
# and the id / embedding shape of the first instance.
print(data[0].keys())
# A bare `len(data)` in the middle of a cell is evaluated and discarded, so print it.
print(len(data))
print(data[0]['id'])
print(data[0]['embedding'].shape)

### Converting pickle -> pickle per instance
    - Converting Pickle by instance
    - train_pickle -> train directory
        '/scratch/philhoon-relevance/decoder-classification/NQ-DEV-DPR/5-fold/1/train'
    - dev_pickle -> dev directory
    - output_file_name : {id}.pickle
        '/scratch/philhoon-relevance/decoder-classification/NQ-DEV-DPR/5-fold/1/dev'

In [None]:
# Conversion settings: which split to convert, where the per-instance pickles go,
# the directory that is globbed for input pickles, and the glob pattern itself.
file_type = 'train'
output_path = f'/scratch/philhoon-relevance/decoder-classification/NQ-DEV-DPR/5-fold/1/{file_type}'
path_name = '/scratch/philhoon-relevance/decoder-classification/NQ-DEV-DPR/5-fold/1'
file_filter = f'ctx100id_embedding_{file_type}_*.pickle'

In [None]:
# Echo the resolved settings before running the conversion.
print(output_path)
print(path_name)
print(file_filter)

In [None]:
def save_by_instance(data, output_path):
    """Write every instance in ``data`` to its own pickle file.

    Each file is named ``{id}.pickle`` and holds a dict with only the
    ``'id'`` and ``'embedding'`` entries of the instance.

    Args:
        data: Iterable of mappings that provide ``'id'`` and ``'embedding'``.
        output_path: Target directory (``str`` or ``Path``). It is created,
            including missing parents, when it does not exist yet.
    """
    out_dir = Path(output_path)
    # String concatenation required an existing directory and a str path;
    # Path handling accepts both and mkdir makes the function safe on a fresh run.
    out_dir.mkdir(parents=True, exist_ok=True)

    for instance in data:
        record = {'id': instance['id'], 'embedding': instance['embedding']}
        with open(out_dir / f"{instance['id']}.pickle", 'wb') as f:
            pickle.dump(record, f)
    

In [None]:
# Directory that is globbed for the input pickles.
path = Path(path_name)

In [None]:
# Materialise the glob: a bare generator is exhausted after one pass, so re-running
# the conversion cell would silently process zero files. Sorting fixes the order.
file_lst = sorted(path.glob(file_filter))

In [None]:
# Split every matched pickle into per-instance pickles, then report how many files were handled.
cnt = 0
for cnt, file in enumerate(file_lst, start=1):
    print(file)
    with open(file, 'rb') as f:
        data = pickle.load(f)

    save_by_instance(data, output_path)

print(cnt)

## File Check

In [None]:
# One per-instance pickle from the train output directory, used as a spot check.
test_output_file = '/scratch/philhoon-relevance/decoder-classification/NQ-DEV-DPR/5-fold/1/train/98.pickle'

In [None]:
# Load the spot-check file to confirm it unpickles.
with open(test_output_file, 'rb') as f: 
    file_check = pickle.load(f)

In [None]:
# Directory whose files are all re-loaded in the check loop.
test_output_path = '/scratch/philhoon-relevance/decoder-classification/NQ-DEV-DPR/5-fold/1/train'

In [None]:
# Rebinds `path` to the per-instance output directory.
path = Path(test_output_path)

In [None]:
# Every entry in the output directory. glob() returns a one-shot generator, so this
# cell has to be re-run before the check loop can be repeated.
file_lst = path.glob('*')

In [None]:
# Spot-check every per-instance pickle. Only the id and the embedding shape are
# printed; dumping each full embedding floods the notebook output.
for file in file_lst:
    with open(file, 'rb') as f:
        file_check = pickle.load(f)
    print(file_check['id'], file_check['embedding'].shape)

In [None]:
# argparse is used here before any earlier cell imports it, so import it locally;
# otherwise this cell raises NameError on a fresh kernel.
import argparse

# Notebook stand-in for command-line options: directory to scan, split name, output directory.
parser = argparse.ArgumentParser(description = 'Merging pickles to one jsonl file')

parser.add_argument('--path', type = str, 
                    default = '/scratch/philhoon-relevance/decoder-classification/NQ-DEV-DPR/5-fold/1')
parser.add_argument('--ft', type = str, 
                    default = 'train')
parser.add_argument('--output', type = str, 
                    default = '/scratch/philhoon-relevance/decoder-classification/NQ-DEV-DPR/5-fold/1')

# An empty argv makes parse_args ignore the kernel's own command line and use the defaults.
args = parser.parse_args([])

## Testing Files

In [None]:
from pprint import pprint
import pickle
from pathlib import Path
import argparse
import json
import numpy as np
import gc
import torch

from torch.utils.data import Dataset, ConcatDataset

In [None]:
# Notebook stand-in for command-line options: directory to scan, split name, output directory.
parser = argparse.ArgumentParser(description='Merging pickles to one jsonl file')

parser.add_argument(
    '--path',
    type=str,
    default='/scratch/philhoon-relevance/decoder-classification/NQ-DEV-DPR/5-fold/1',
)
parser.add_argument(
    '--ft',
    type=str,
    default='train',
)
parser.add_argument(
    '--output',
    type=str,
    default='/scratch/philhoon-relevance/decoder-classification/NQ-DEV-DPR/5-fold/1',
)

# An empty argv makes parse_args ignore the kernel's own command line and use the defaults.
args = parser.parse_args([])

In [None]:
# Show the parsed options as a dict.
pprint(vars(args))

In [None]:
# Rebinds `path` to the directory given by --path.
path = Path(args.path)

In [None]:
# Match only the embedding pickles of the requested split. The looser '*{ft}*' pattern
# also matches other entries in this directory whose names contain the split name
# (the ctx100id_split_{ft}_1.json file and the per-instance output directory), which
# the loading loop cannot unpickle.
file_type = f'ctx100id_embedding_{args.ft}_*.pickle'
file_lst = path.glob(file_type)

In [None]:
# Materialise the glob generator so the matches can be iterated more than once.
target_files = list(file_lst)

In [None]:
class CustomDataSet(Dataset):
    """Dataset over the records stored in a single pickle file.

    The whole file is unpickled in ``__init__``; each item is returned as an
    ``(id, embedding)`` tuple read from the record's ``'id'`` and
    ``'embedding'`` keys.
    """

    def __init__(self, file_name):
        self.file_name = file_name
        # NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
        with open(file_name, 'rb') as handle:
            self.data = pickle.load(handle)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        return record['id'], record['embedding']

In [None]:
# Build a CustomDataSet from one embedding pickle; the whole file is loaded in the constructor.
input_file = '/scratch/philhoon-relevance/decoder-classification/NQ-DEV-DPR/5-fold/1/ctx100id_embedding_train_1_1.pickle'
dataset = CustomDataSet(input_file)

In [None]:
# NOTE(review): this same file is read with pickle.load elsewhere in the notebook;
# confirm torch.load can actually read it — TODO verify how the file was written.
testing_file = torch.load(input_file)

In [None]:
# output_file = f'ctx100id_embedding_{args.ft}.jsonl'

In [None]:
# output_path = args.path + '/' +output_file
# print(output_path)

In [None]:
# f = open(output_path, 'a', encoding="utf-8")

In [None]:
# input_path = '/scratch/philhoon-relevance/decoder-classification/NQ-DEV-DPR/5-fold/1/ctx100id_embedding_train_1_1.pickle'

# with open(input_path, 'rb') as f:
#     data = pickle.load(f)

In [None]:
# print(type(data))

In [None]:
# data[3]

In [None]:
# Match only the dev embedding pickles. The looser '*dev*' pattern also matches other
# entries in this directory whose names contain 'dev' (e.g. the ctx100id_split_dev_1.json
# file), which the loading loop cannot unpickle.
dev = 'dev'
file_type = f'ctx100id_embedding_{dev}_*.pickle'
file_lst = path.glob(file_type)

In [None]:
# Materialise the glob generator so the matches can be iterated more than once.
dev_target_files = list(file_lst)

In [None]:
# Concatenate the records of every dev pickle into dev_data, printing each file
# and a running file count as progress.
dev_data = []
cnt = 0
for cnt, file in enumerate(dev_target_files, start=1):
    print(file)
    with open(file, 'rb') as f:
        data = pickle.load(f)
    dev_data.extend(data)
    # Drop the per-file list right away and force a collection before the next load.
    del data
    gc.collect()
    print(cnt)
#     if cnt == 10:
#         break

In [None]:
# Concatenate the records of every file in target_files into total_data,
# printing each file and a running file count as progress.
total_data = []
cnt = 0
for file in target_files:
    cnt += 1
    print(file)
    with open(file, 'rb') as f:
        data = pickle.load(f)
    total_data.extend(data)
    # Drop the per-file list right away and force a collection before the next load.
    # Note that `data` is therefore undefined once this loop has run.
    del data
    gc.collect()
    print(cnt)
# Debug aid: uncomment to stop after 10 files.
#     if cnt == 10:
#         break

In [None]:
# Number of records collected across all loaded files.
len(total_data)

In [None]:
# Display the first collected record.
total_data[0]

In [None]:
# Scratch check of list.extend.
# NOTE(review): this prints k (the source list), not d (the extended list) — confirm which was intended.
d = []
k = [1,2,3]
d.extend(k)
print(k)

In [None]:
# Inspect one record: its type, its embedding, and the embedding converted with .tolist().
# `data` is deleted at the end of the loading loops, so iterating it raises NameError on a
# top-to-bottom run; take the sample from total_data instead. The [:1] slice replaces the
# old unconditional `break` and still leaves py_array defined for the next cell.
for instance in total_data[:1]:
    print(type(instance))
    print(instance['embedding'])
    print('------')
    print('------')
    py_array = instance['embedding'].tolist()
    print(py_array)

In [None]:
# Length of the outermost list produced by .tolist().
print(len(py_array))

In [None]:

import json
from collections import OrderedDict

# Minimal JSONL writing example: one JSON object followed by a newline.
my_data = OrderedDict([("id", "this is id"), ("title", "this is title")])
with open("target_data.jsonl", "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII text (e.g. Korean) from being escaped in the file.
    f.write(json.dumps(my_data, ensure_ascii=False))
    # Several records go into one file, so each record is terminated with "\n".
    f.write("\n")
    

In [None]:
# NOTE(review): in top-to-bottom execution f is the handle of the preceding with-block,
# which is already closed, so this call has no effect — likely removable.
f.close()

In [None]:
# List the files that were matched for the selected split.
for files in target_files:
    print(files)

In [None]:
# output_file is only assigned in a commented-out cell, so printing it raises NameError
# on a fresh kernel; define it here with the same expression before printing.
output_file = f'ctx100id_embedding_{args.ft}.jsonl'
print(output_file)

In [None]:

# input_path is only assigned in a commented-out cell, so this raised NameError on a
# fresh kernel; input_file holds the same pickle path and is defined.
with open(input_file, 'rb') as f:
    data = pickle.load(f)

In [None]:
# Number of records in the reloaded pickle.
len(data)

In [None]:

# Print the Python type of every record in the reloaded pickle.
for i in data:
    print(type(i))
    

In [None]:
# Shape of the embedding stored in the record at index 7.
data[7]['embedding'].shape

In [None]:

import json
from collections import OrderedDict

# Minimal JSONL writing example: one JSON object followed by a newline.
my_data = OrderedDict()
my_data["id"] = "this is id"
my_data["title"] = "this is title"
with open("target_data.jsonl", "w", encoding="utf-8") as f:
    json.dump(my_data, f, ensure_ascii=False) # ensure_ascii=False keeps Korean text from being garbled when saved
    f.write("\n") # same as writing JSON, but since multiple lines are written, append "\n"
    