In [12]:
import sys
import os
import hashlib
import struct
import subprocess
import collections
import tensorflow as tf
from tensorflow.core.example import example_pb2
import json
import io
import shutil
import subprocess
import pathlib
from nltk import sent_tokenize 

In [3]:
# Write mapping.txt (one "input <tab> output" path pair per line) and run the Stanford PTBTokenizer over it
def tokenize_stories(stories_dir, tokenized_stories_dir):
    """Maps a whole directory of .story files to a tokenized version using Stanford CoreNLP Tokenizer

    Args:
        stories_dir: directory whose files are tokenized (every entry returned
            by os.listdir is submitted).
        tokenized_stories_dir: directory that receives one tokenized file per
            input file, under the same file name. It is deleted and recreated
            on every call, so previous results are discarded.

    Raises:
        subprocess.CalledProcessError: if the java command exits non-zero.
        OSError: if `java` cannot be started or a directory cannot be prepared.

    Side effects:
        Overwrites the CLASSPATH environment variable of this process and
        temporarily creates "mapping.txt" in the current working directory.
    """
    # Operate on the directory that was passed in (not on notebook globals),
    # and always start from an empty output directory.
    if os.path.exists(tokenized_stories_dir):
        shutil.rmtree(tokenized_stories_dir)
    os.makedirs(tokenized_stories_dir)

    print("Preparing to tokenize %s to %s..." % (stories_dir, tokenized_stories_dir))
    stories = os.listdir(stories_dir)

    # make IO list file
    print("Making list of files to tokenize...")
    # Use one absolute path both for writing the list and for the command, so
    # the tokenizer always reads the file that was just written.
    mapping_path = os.path.abspath("mapping.txt")
    corenlp_jar = "/home/xw0078/my_programs/lib/stanford-corenlp-full-2018-10-05/stanford-corenlp-3.9.2.jar"
    try:
        with open(mapping_path, "w") as f:
            for s in stories:
                f.write("%s \t %s\n" % (os.path.join(stories_dir, s), os.path.join(tokenized_stories_dir, s)))

        # The JVM locates PTBTokenizer through CLASSPATH.
        os.environ['CLASSPATH'] = corenlp_jar
        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed through unchanged.
        command = ["java", "edu.stanford.nlp.process.PTBTokenizer",
                   "-ioFileList", "-preserveLines", mapping_path]
        subprocess.check_output(command)
    finally:
        # Remove the temporary list even when the tokenizer fails.
        if os.path.exists(mapping_path):
            os.remove(mapping_path)
    print("Tokenization Finished")

In [4]:
# load and separate Json file to abstract and body text
def separate_json(input_path, separated_dir):
    """Split a JSON-lines file into one abstract file and one body file per record.

    Every non-blank line of `input_path` is parsed as a JSON object with the
    keys 'introduction' (used as the abstract) and 'textbody'. Records are
    skipped when the abstract has fewer than 50 characters, the body has fewer
    than 1000 characters, or the body is less than 3 times as long as the
    abstract. Each kept record is written, UTF-8 encoded, to
    `<separated_dir>/<ID>.abs` and `<separated_dir>/<ID>.body`, where ID counts
    the kept records from 0 without gaps.

    Args:
        input_path: path of the UTF-8 encoded JSON-lines file.
        separated_dir: output directory; deleted and recreated on every call.

    Raises:
        ValueError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks 'introduction' or 'textbody'.
    """
    # Always start from an empty output directory.
    if os.path.exists(separated_dir):
        shutil.rmtree(separated_dir)
    os.makedirs(separated_dir)

    next_id = 0
    # The context manager closes the input even if a record fails to parse.
    with io.open(input_path, 'r', encoding='utf8') as json_file:
        for line in json_file:
            if not line.strip():
                # Tolerate blank lines instead of failing in json.loads.
                continue
            record = json.loads(line)
            abstract = record['introduction']
            body = record['textbody']
            len_abs = len(abstract)
            len_body = len(body)
            if len_abs < 50 or len_body < 1000:
                continue
            # Same test as `len_body / len_abs < 3`, written without division
            # so it does not depend on integer vs. true division.
            if len_body < 3 * len_abs:
                continue

            # os.path.join works whether or not separated_dir ends with a slash.
            with open(os.path.join(separated_dir, "%d.abs" % next_id), "wb") as f:
                f.write(abstract.encode("utf-8", 'ignore'))
            with open(os.path.join(separated_dir, "%d.body" % next_id), "wb") as f:
                f.write(body.encode("utf-8", 'ignore'))
            next_id += 1

In [34]:
def get_art_abs(input_dir,ID):
    """Read the body and abstract text stored for one record.

    Args:
        input_dir: directory holding `<ID>.abs` and `<ID>.body`.
        ID: record identifier used as the file stem.

    Returns:
        A `(body, abstract)` tuple with the raw contents of both files.

    Raises:
        ValueError: if either file is empty. (This used to call `exit(0)`,
            which ended the whole process with a success status.)
        IOError/OSError: if either file cannot be opened.
    """
    # os.path.join works whether or not input_dir ends with a path separator.
    with open(os.path.join(input_dir, "%s.abs" % ID), 'r') as absFile:
        abstract = absFile.read()
    with open(os.path.join(input_dir, "%s.body" % ID), 'r') as bodyFile:
        body = bodyFile.read()
    if not (body and abstract):
        raise ValueError("empty abstract or body for ID %s in %s" % (ID, input_dir))
    return body,abstract

# Markers placed around every summary sentence.
SENTENCE_START = '<s>'
SENTENCE_END = '</s>'

def wrap_sentence_token(sent):
    """Return `sent` enclosed by the start and end markers, with no padding in between."""
    pieces = (SENTENCE_START, sent, SENTENCE_END)
    return "".join(pieces)

def get_art_abs_with_st(input_dir,ID):
    """Read one record and mark up the sentences of its abstract.

    The abstract is reduced to ASCII, split into sentences with nltk's
    `sent_tokenize`, and each sentence is passed through
    `wrap_sentence_token`; the wrapped sentences are concatenated without a
    separator. The body is returned exactly as read.

    Args:
        input_dir: directory holding `<ID>.abs` and `<ID>.body`.
        ID: record identifier used as the file stem.

    Returns:
        A `(body, st_abstract)` tuple.

    Raises:
        ValueError: if either file is empty. (This used to call `exit(0)`,
            which ended the whole process with a success status.)
        IOError/OSError: if either file cannot be opened.
    """
    # os.path.join works whether or not input_dir ends with a path separator.
    with open(os.path.join(input_dir, "%s.abs" % ID), 'r') as absFile:
        abstract = absFile.read()
    with open(os.path.join(input_dir, "%s.body" % ID), 'r') as bodyFile:
        body = bodyFile.read()
    if not (body and abstract):
        raise ValueError("empty abstract or body for ID %s in %s" % (ID, input_dir))

    # Python 2 byte-string idiom: decoding with errors="ignore" drops every
    # non-ASCII byte, and encode() turns the result back into a plain str.
    ascii_abstract = abstract.decode("ascii",errors="ignore").encode()
    st_abstract = "".join(wrap_sentence_token(sent) for sent in sent_tokenize(ascii_abstract))
    return body,st_abstract

In [35]:
# Write to bin file
# Typographic closing quotes (U+2019 and U+201D), listed in END_TOKENS below.
dm_single_close_quote = u'\u2019' # unicode
dm_double_close_quote = u'\u201d'
# NOTE(review): END_TOKENS is not referenced by any code visible in this notebook.
END_TOKENS = ['.', '!', '?', '...', "'", "`", '"', dm_single_close_quote, dm_double_close_quote, ")"] # acceptable ways to end a sentence
# Upper bound on the number of entries write_to_bin puts into the vocab file.
VOCAB_SIZE = 200000
# We use these to separate the summary sentences in the .bin datafiles
# (same values as the definitions in the earlier cell; this rebinds the names).
SENTENCE_START = '<s>'
SENTENCE_END = '</s>'
def write_to_bin(input_dir, out_dir, makevocab=False):
    """Serialize every tokenized article/abstract pair into `<out_dir>/train.bin`.

    For each ID from 0 up to the number of `.abs` files in `input_dir`, the
    pair returned by `get_art_abs_with_st` is stored in a tf.Example with the
    features 'article' and 'abstract'. Each example is written as an 8-byte
    length (struct format 'q') followed by the serialized bytes.

    Args:
        input_dir: directory with `<ID>.abs` / `<ID>.body` files, IDs
            numbered consecutively from 0.
        out_dir: output directory; deleted and recreated on every call.
        makevocab: when true, also count whitespace-separated tokens of all
            articles and abstracts and write the VOCAB_SIZE most common ones
            to `<out_dir>/vocab` as "word count" lines.
    """
    # Always start from an empty output directory.
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.makedirs(out_dir)
    out_file = os.path.join(out_dir, "train.bin")

    vocab_counter = collections.Counter() if makevocab else None

    # Count the abstracts directly. The previous `len(listdir) / 2 - 1`
    # skipped the last pair and was thrown off by any unrelated file.
    num_files = sum(1 for name in os.listdir(input_dir) if name.endswith(".abs"))

    with open(out_file, 'wb') as writer:
        for ID in range(num_files):
            # Get the strings to write to .bin file
            article, abstract = get_art_abs_with_st(input_dir,str(ID))

            # Write to tf.Example
            tf_example = example_pb2.Example()
            tf_example.features.feature['article'].bytes_list.value.extend([article])
            tf_example.features.feature['abstract'].bytes_list.value.extend([abstract])
            tf_example_str = tf_example.SerializeToString()
            str_len = len(tf_example_str)
            if not str_len:
                print("MyError-0")
            writer.write(struct.pack('q', str_len))
            writer.write(struct.pack('%ds' % str_len, tf_example_str))

            # Write the vocab to file, if applicable
            if makevocab:
                art_tokens = article.split(' ')
                # The sentence tags are glued to the neighbouring words (no
                # spaces), so comparing whole tokens against the tags never
                # matched. Blank the tags out before splitting instead.
                untagged = abstract.replace(SENTENCE_START, ' ').replace(SENTENCE_END, ' ')
                abs_tokens = untagged.split(' ')
                tokens = [t.strip() for t in art_tokens + abs_tokens] # strip
                tokens = [t for t in tokens if t!=""] # remove empty
                vocab_counter.update(tokens)

    print("Finished writing file %s\n" % out_file)

    # write vocab to file
    if makevocab:
        print("Writing vocab file...")
        with open(os.path.join(out_dir, "vocab"), 'w+') as writer:
            for word, count in vocab_counter.most_common(VOCAB_SIZE):
                writer.write(word + ' ' + str(count) + '\n')
        print("Finished writing vocab file")

In [9]:
CHUNK_SIZE = 1000 # num examples per chunk, for the chunked data


def _read_examples(reader):
    """Yield `(length, payload)` for each length-prefixed record of an open binary file."""
    while True:
        len_bytes = reader.read(8)
        if not len_bytes:
            return
        str_len = struct.unpack('q', len_bytes)[0]
        # unpack raises struct.error if the file ends in the middle of a record
        example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
        yield str_len, example_str


def chunk_file(input_dir, set_name, chunks_dir=None, chunk_size=None):
    """Split `<input_dir>/<set_name>.bin` into files of at most `chunk_size` examples.

    Chunks are named `<set_name>_000.bin`, `<set_name>_001.bin`, ... and use
    the same record layout as the input (8-byte length, then payload). A chunk
    file is only created once there is an example to put in it, so no empty
    trailing chunk is produced.

    Args:
        input_dir: directory containing `<set_name>.bin`.
        set_name: data set name, e.g. 'train'.
        chunks_dir: existing directory that receives the chunk files. When
            omitted, the module-level `chunks_dir` variable is used, which is
            what two-argument callers relied on before.
        chunk_size: examples per chunk; defaults to CHUNK_SIZE.

    Raises:
        ValueError: if no chunk directory can be determined or `chunk_size`
            is not positive.
    """
    if chunks_dir is None:
        # Legacy fallback for callers that pass only two arguments.
        try:
            chunks_dir = globals()["chunks_dir"]
        except KeyError:
            raise ValueError("chunks_dir was not given and no module-level chunks_dir is defined")
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive, got %r" % (chunk_size,))

    # os.path.join works whether or not input_dir ends with a path separator.
    in_file = os.path.join(input_dir, '%s.bin' % set_name)
    print("In File:" + in_file)

    writer = None
    chunk = 0
    try:
        with open(in_file, "rb") as reader:
            for index, (str_len, example_str) in enumerate(_read_examples(reader)):
                if index % chunk_size == 0:
                    # Current chunk is full (or this is the first example): start a new one.
                    if writer is not None:
                        writer.close()
                    chunk_fname = os.path.join(chunks_dir, '%s_%03d.bin' % (set_name, chunk)) # new chunk
                    writer = open(chunk_fname, 'wb')
                    chunk += 1
                writer.write(struct.pack('q', str_len))
                writer.write(struct.pack('%ds' % str_len, example_str))
    finally:
        if writer is not None:
            writer.close()


def chunk_all(input_dir, chunks_dir, chunk_size=None):
    """Recreate `chunks_dir` and fill it with the chunked 'train' data set.

    Args:
        input_dir: directory containing `train.bin`.
        chunks_dir: output directory; deleted and recreated on every call.
        chunk_size: examples per chunk; defaults to CHUNK_SIZE.
    """
    # Make a dir to hold the chunks
    if os.path.exists(chunks_dir):
        shutil.rmtree(chunks_dir)
    os.makedirs(chunks_dir)
    # Chunk the data
    for set_name in ['train']:
        print("Splitting %s data into chunks..." % set_name)
        # Pass the target directory explicitly instead of relying on a global.
        chunk_file(input_dir, set_name, chunks_dir, chunk_size)
    print("Saved chunked data in %s" % chunks_dir)

In [36]:
# Pipeline run for the "wiki_thesis_st" data set.
# input_path is the JSON-lines source; the four directories hold the output of
# the successive stages (separate -> tokenize -> train.bin/vocab -> chunks).
# Keep the trailing "/" on the directories: helpers in this notebook may build
# file names by appending directly to these strings.
input_path = "/mnt/6t/wiki/processed/Th_related_wiki_gensim_v5.json"
separated_dir = "/mnt/6t/DeepLearningPreTrainModels/wiki_thesis_st/separated_files/"
tokenized_dir = "/mnt/6t/DeepLearningPreTrainModels/wiki_thesis_st/tokenized_separated_files/"
output_dir = "/mnt/6t/DeepLearningPreTrainModels/wiki_thesis_st/finished_files/"
chunks_dir = "/mnt/6t/DeepLearningPreTrainModels/wiki_thesis_st/finished_files/chunked/"
    
# The first two stages are disabled here, so the steps below read whatever
# files already exist in tokenized_dir.
# separate_json(input_path,separated_dir)

# tokenize_stories(separated_dir,tokenized_dir)

# Writes train.bin and the vocab file into output_dir (the directory is recreated first).
write_to_bin(tokenized_dir,output_dir, makevocab=True)

# Splits output_dir's train.bin into chunk files under chunks_dir.
chunk_all(output_dir,chunks_dir)

Finished writing file /mnt/6t/DeepLearningPreTrainModels/wiki_thesis_st/finished_files/train.bin

Writing vocab file...
Finished writing vocab file
Splitting train data into chunks...
In File:/mnt/6t/DeepLearningPreTrainModels/wiki_thesis_st/finished_files/train.bin
Saved chunked data in /mnt/6t/DeepLearningPreTrainModels/wiki_thesis_st/finished_files/chunked/


In [None]:
# Full pipeline run for the "wiki_all_st" data set.
# This rebinds the same global path variables used by the previous run cell.
# Keep the trailing "/" on the directories: helpers in this notebook may build
# file names by appending directly to these strings.
input_path = "/mnt/6t/wiki/processed/All_related_wiki_gensim_v5.0.json"
separated_dir = "/mnt/6t/DeepLearningPreTrainModels/wiki_all_st/separated_files/"
tokenized_dir = "/mnt/6t/DeepLearningPreTrainModels/wiki_all_st/tokenized_separated_files/"
output_dir = "/mnt/6t/DeepLearningPreTrainModels/wiki_all_st/finished_files/"
chunks_dir = "/mnt/6t/DeepLearningPreTrainModels/wiki_all_st/finished_files/chunked/"
    
# Stage 1: split the JSON-lines file into per-record .abs / .body files.
separate_json(input_path,separated_dir)

# Stage 2: tokenize every separated file (runs java; needs the CoreNLP jar).
tokenize_stories(separated_dir,tokenized_dir)

# Stage 3: pack the tokenized pairs into train.bin and write the vocab file.
write_to_bin(tokenized_dir,output_dir, makevocab=True)

# Stage 4: split train.bin into chunk files under chunks_dir.
chunk_all(output_dir,chunks_dir)