### Author: Dmitrii Iakushechkin

In [1]:
import h5py
import os
import errno
import json
import jsonlines
import numpy as np

---

#### Helper functions

In [14]:
def convertSciTLDRforPacSum(path,task,mode):
    '''
    Convert one SciTLDR split into JSON records in the layout used for PacSum.

    Reads ``<path>/SciTLDR-<task>/<mode>.jsonl`` and emits one record per
    (article, TLDR) pair: an entry whose ``target`` is a list produces one
    record per list item, an entry whose ``target`` is a plain string
    produces exactly one record. Entries with any other ``target`` type are
    reported on stdout and skipped (they still count as articles).

    Args:
        path: directory that contains the ``SciTLDR-<task>`` folders.
        task: ['A', 'AIC', 'FullText']
        mode: ['train','test','dev']

    Returns:
        1-D numpy array of byte strings. Each element is a serialized JSON
        object with the keys ``article`` (the entry's ``source`` value) and
        ``abstract`` (a one-element list holding a single TLDR). The array
        has a byte-string dtype even when no record was produced.
    '''
    src_file = os.path.join(path,'SciTLDR-{}'.format(task),'{}.jsonl'.format(mode))

    tldr_list = []
    counter = 0
    counter_tldrs = 0
    with jsonlines.open(src_file) as reader:
        print('1. {} dataset is loaded'.format(mode))
        for obj in reader:
            counter += 1
            article = obj['source']
            target = obj['target']

            # Normalise both supported layouts to a list so that the
            # serialisation below exists only once.
            if isinstance(target, list):
                tldrs = target
            elif isinstance(target, str):
                tldrs = [target]
            else:
                print('Unknown type of target data.')
                continue

            for tldr in tldrs:
                counter_tldrs += 1
                record = {'article': article, 'abstract': [tldr]}
                # json.dumps escapes non-ASCII characters by default, so the
                # encoded bytes are plain ASCII.
                tldr_list.append(json.dumps(record).encode('UTF-8'))

        print('2. The {} dataset has {} articles'.format(mode, counter))
        print('3. The {} dataset has {} TLDRs'.format(mode, counter_tldrs))

    # An explicit byte-string dtype keeps the result type stable: without it
    # an empty list would become a float64 array. For non-empty input the item
    # size is still derived from the longest record.
    arr = np.array(tldr_list, dtype='S')
    print('Array is ready for saving!')
    return arr
    

In [15]:
def saveSciTLDRforPacSum(path,arr,task,mode):
    '''
    Save an array of byte strings as an HDF5 file.

    The file is written to ``<path>/SciTLDR-<task>/forPacSum/tldr_<mode>.h5df``
    and contains a single dataset named "dataset" stored with an ASCII string
    dtype. The file is opened in 'w' mode. The output folder (including any
    missing parent folders) is created when it does not exist yet.

    Args:
        path: directory that contains the ``SciTLDR-<task>`` folders.
        arr: array of byte strings to store.
        task: ['A', 'AIC', 'FullText']
        mode: ['train','test','dev']

    Raises:
        FileExistsError: if the output folder path exists but is not a
            directory.
    '''
    out_dir = os.path.join(path,'SciTLDR-{}'.format(task),'forPacSum')
    # exist_ok=True makes repeated runs a no-op, while a regular file sitting
    # at this path still raises instead of failing later inside h5py.
    os.makedirs(out_dir, exist_ok=True)

    with h5py.File(os.path.join(out_dir,'tldr_{}.h5df'.format(mode)), 'w') as f:
        f.create_dataset("dataset", data=arr,dtype = h5py.string_dtype(encoding='ascii'))
    

In [16]:
def SciTLDRforPacSum(path,task,mode):
    '''
    Convert one SciTLDR split and store the result in a single step.

    Runs convertSciTLDRforPacSum for the given split, passes the returned
    array to saveSciTLDRforPacSum and prints a confirmation line.

    Args:
        path: forwarded unchanged to both helper functions.
        task: ['A', 'AIC', 'FullText']
        mode: ['train','test','dev']
    '''
    saveSciTLDRforPacSum(path, convertSciTLDRforPacSum(path,task,mode), task, mode)
    confirmation = 'The {} dataset for {} task is saved. \n'.format(mode,task)
    print(confirmation)

---

In [17]:
# Change the path if necessary: it is resolved relative to the current
# working directory and is passed as `path` to the helper functions.
scitldr_path = os.path.join(os.getcwd(), '../../scitldr/SciTLDR-Data/')

In [21]:
# Convert and save every split of the 'A' task, in the order test, train, dev.
for _mode in ('test', 'train', 'dev'):
    SciTLDRforPacSum(scitldr_path,'A',_mode)

1. test dataset is loaded
2. The test dataset has 618 articles
3. The test dataset has 1324 TLDRs
Array is ready for saving!
The test dataset for A task is saved. 

1. train dataset is loaded
2. The train dataset has 1992 articles
3. The train dataset has 1992 TLDRs
Array is ready for saving!
The train dataset for A task is saved. 

1. dev dataset is loaded
2. The dev dataset has 619 articles
3. The dev dataset has 619 TLDRs
Array is ready for saving!
The dev dataset for A task is saved. 

