In [10]:
import base64
import json
import os
import re
import struct

import pandas as pd
from google.protobuf import json_format
from tensorflow.core.example import example_pb2

In [11]:
# Read from processed binary data of the cnn-dm dataset.
# The processing is in a format expected by the tensorflow code.
# https://github.com/tatami-galaxy/pointer-generator/blob/master/data.py
# Convert it to text data and store in csv format expected by torchtext
def data_generator(data_path):
    """Yield every serialized example stored in a length-prefixed binary file.

    The file is read as a sequence of records. Each record is an 8-byte
    native-order signed integer (struct format ``'q'``) holding the payload
    size, followed by that many payload bytes, which are parsed with
    ``example_pb2.Example.FromString``.

    Args:
        data_path: Path of the binary file to read.

    Yields:
        One parsed ``example_pb2.Example`` per record, in file order.

    Raises:
        struct.error: If a length prefix or a payload is shorter than expected
            (i.e. the file is truncated).
    """
    # The context manager guarantees the handle is released when the generator
    # is exhausted, closed before the end, or interrupted by an exception.
    with open(data_path, 'rb') as reader:
        while True:
            len_bytes = reader.read(8)
            if not len_bytes:
                # Clean end of file: no further record starts here.
                break
            str_len = struct.unpack('q', len_bytes)[0]
            # Unpacking with an exact-size format makes a short read fail
            # loudly instead of handing a partial payload to the parser.
            example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
            yield example_pb2.Example.FromString(example_str)

In [12]:
def write_to_csv(input_file, output_file):
    """Convert a binary file of serialized examples into a two-column CSV.

    Every example yielded by ``data_generator(input_file)`` becomes one row
    holding its ``article`` and ``abstract`` features, each decoded as UTF-8.
    The ``<s>`` and ``</s>`` sentence delimiters are removed from the
    abstract; all other characters, including surrounding whitespace, are
    kept. Prints ``Done`` once the CSV has been written.

    Args:
        input_file: Path of the binary file passed to ``data_generator``.
        output_file: Destination handed to ``DataFrame.to_csv``. When it is a
            filesystem path, missing parent directories are created first.
    """
    # Create the destination folder before the conversion starts, so a missing
    # directory cannot discard the work after the whole input was processed.
    if isinstance(output_file, (str, os.PathLike)):
        parent_dir = os.path.dirname(output_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    articles = []
    abstracts = []
    for example in data_generator(input_file):
        features = example.features.feature
        # Read the raw bytes straight from the message instead of serializing
        # it to JSON and base64-decoding the values back.
        article = features['article'].bytes_list.value[0].decode("utf-8")
        abstract = features['abstract'].bytes_list.value[0].decode("utf-8")
        # Remove sentence delimiters from abstract (opening tag first, then
        # closing tag, matching the original substitution order).
        abstract = abstract.replace("<s>", "").replace("</s>", "")
        articles.append(article)
        abstracts.append(abstract)

    # Create pandas dataframe and write to csv file
    df = pd.DataFrame(data={'article': articles, 'abstract': abstracts})
    df.to_csv(output_file, index=False)
    print('Done')

In [13]:
# Convert each dataset split in turn: (binary input, csv output).
for bin_path, csv_path in (
    ('finished_files/train.bin', 'datasets/train.csv'),
    ('finished_files/val.bin', 'datasets/val.csv'),
    ('finished_files/test.bin', 'datasets/test.csv'),
):
    write_to_csv(bin_path, csv_path)

Done
Done
Done
