In [7]:
import struct
from tensorflow.core.example import example_pb2
from google.protobuf import json_format
import json
import base64
import pandas as pd
import re
import numpy as np
from fastText import load_model
import pickle

### Functions

In [2]:
# Read from processed binary data of the cnn-dm dataset.
# The processing is in a format expected by the tensorflow code.
# https://github.com/tatami-galaxy/pointer-generator/blob/master/data.py
# Convert it to text data and store in csv format expected by torchtext
def data_generator(data_path):
    """Yield the ``tf.Example`` records stored in a length-prefixed binary file.

    Each record is laid out as an 8-byte native ``long long`` holding the
    payload length, followed by that many bytes of a serialized Example.

    Args:
        data_path: Path of the binary file to read.

    Yields:
        One ``example_pb2.Example`` per record, in file order.

    Raises:
        struct.error: If the file ends in the middle of a length header or
            of a payload (truncated file).
    """
    # The context manager closes the file once the generator is exhausted,
    # closed, or garbage-collected, instead of leaking the handle.
    with open(data_path, 'rb') as reader:
        while True:
            len_bytes = reader.read(8)
            if not len_bytes:
                # Clean end of file: no further records
                break
            str_len = struct.unpack('q', len_bytes)[0]
            # Unpacking with an explicit size doubles as a length check:
            # a short read raises struct.error rather than passing on
            # a partial payload.
            example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
            yield example_pb2.Example.FromString(example_str)

In [3]:
def create_dict(input_file):
    """Collect cleaned articles and abstracts from a binary dataset file.

    Every example read from ``input_file`` is converted to JSON, its
    base64-encoded 'article' and 'abstract' features are decoded as UTF-8,
    and unwanted tokens are stripped from both.

    Args:
        input_file: Path of the binary file handed to ``data_generator``.

    Returns:
        A dict with keys 'article' and 'abstract', each mapping to a list
        of strings; entries at the same index come from the same example.
    """
    # Patterns are applied one after another, in this exact order
    article_patterns = (r"-rrb-", r"-lrb-", r"-lsb-", r"-rsb-", r"cnn")
    # Sentence delimiters removed from the abstract
    abstract_patterns = (r"<s>", r"</s>")

    articles = []
    abstracts = []
    # https://github.com/dsindex/textsum/blob/master/check_data.py
    for example in data_generator(input_file):
        parsed = json.loads(json_format.MessageToJson(example))
        features = parsed['features']['feature']

        # Article: decode, then drop the unnecessary tokens
        raw_article = features['article']['bytesList']['value'][0]
        article_text = base64.b64decode(raw_article).decode("utf-8")
        for pattern in article_patterns:
            article_text = re.sub(pattern, "", article_text)
        articles.append(article_text)

        # Abstract: decode, then drop the sentence delimiters
        raw_abstract = features['abstract']['bytesList']['value'][0]
        abstract_text = base64.b64decode(raw_abstract).decode("utf-8")
        for pattern in abstract_patterns:
            abstract_text = re.sub(pattern, "", abstract_text)
        abstracts.append(abstract_text)

    return {'article': articles, 'abstract': abstracts}

In [4]:
def write_to_text(input_files, output_file):
    """Append the abstracts and articles of several binary files to a text file.

    For every example, the abstract is written on one line followed by the
    article on the next line.

    Args:
        input_files: Iterable of binary file paths, each passed to
            ``create_dict``.
        output_file: Path of the text file. It is opened in append mode, so
            existing content is kept and re-running adds the data again.
    """
    # The context manager guarantees the buffered writes are flushed and the
    # handle is closed, even if reading one of the inputs fails.
    with open(output_file, 'a') as out:
        for input_path in input_files:
            dictionary = create_dict(input_path)
            pairs = zip(dictionary['abstract'], dictionary['article'])
            for abstract, article in pairs:
                out.write(abstract + '\n')
                out.write(article + '\n')

    print('Done')

In [5]:
def write_to_csv(input_file, output_file):
    """Convert one binary dataset file into a csv file.

    Args:
        input_file: Path of the binary file, passed to ``create_dict``.
        output_file: Destination csv path; written without the index column.
    """
    # The dict returned by create_dict becomes the frame's data
    frame = pd.DataFrame(data=create_dict(input_file))
    frame.to_csv(output_file, index=False)
    print('Done')

### Operations

In [30]:
# Create datasets: convert each binary split into its csv counterpart
for bin_path, csv_path in (
    ('finished_files/train.bin', 'datasets/train.csv'),
    ('finished_files/val.bin', 'datasets/val.csv'),
    ('finished_files/test.bin', 'datasets/test.csv'),
):
    write_to_csv(bin_path, csv_path)

Done
Done
Done


In [None]:
# Build the plain-text corpus used to train the word embeddings
# (train and val splits only)
input_files = [
    'finished_files/train.bin',
    'finished_files/val.bin',
]
write_to_text(input_files, 'finished_files/text.txt')

In [10]:
# Fasttext embeddings trained on train and val sets
# ./fasttext skipgram -input input_text_file -output output_model -dim 128 (fastText-0.1.0)
fasttext_model = load_model('word_vectors/fasttext_model.bin')
num_dims = 128

# vocab contains frequent words appearing in the text along with their frequencies
# minimum frequency = 6
# Store appearing words (the frequency is kept as the raw string read from the file)
vocab_words = {}
# Context manager so the vocab file handle is closed after reading
with open('finished_files/vocab') as vocab_file:
    for line in vocab_file:
        li = line.split()
        # Only lines of the form "<word> <freq>" are used; anything else is skipped
        if len(li) == 2:
            word, freq = li
            vocab_words[word] = freq

# Final word to id dictionary; the special tokens take the first ids
word2id = {}
tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
for token in tokens:
    word2id[token] = len(word2id)

# Retrieve words from fasttext model and keep only those which are also present in 'vocab'
fasttext_words = fasttext_model.get_words()
for word in fasttext_words:
    if word in vocab_words:
        word2id[word] = len(word2id)
vocab_size = len(word2id)

# Reverse dictionary
id2word = {idx: word for word, idx in word2id.items()}

# Embeddings
embeddings = np.zeros((vocab_size, num_dims))
# <pad> token vector contains all zeros. Rest sampled from a normal distribution
mu, sigma = 0, 0.05
# A dedicated, seeded generator makes the special-token vectors reproducible
# across kernel restarts without touching numpy's global random state.
rng = np.random.RandomState(0)
for i in range(1, len(tokens)):
    embeddings[i] = rng.normal(mu, sigma, num_dims)

# Get word vectors from fasttext model and store in embeddings matrix
for i in range(len(tokens), vocab_size):
    embeddings[i] = fasttext_model.get_word_vector(id2word[i])

# Pickle vocab dictionaries and save; the handles are closed even if dumping fails
with open('colab_files/word2id.pickle', 'wb') as pickled_word2id:
    pickle.dump(word2id, pickled_word2id)
with open('colab_files/id2word.pickle', 'wb') as pickled_id2word:
    pickle.dump(id2word, pickled_id2word)

np.save('colab_files/embeddings', embeddings)