# Testing how to read data in the new file format

In [44]:
import os
import re
import logging
from pprint import pprint
import numpy as np

# TODO : split data into val

def _tokenize(raw):
    tokens = re.findall(r"[\w]+", raw) # finds every word
    normalized_tokens = [token.lower() for token in tokens]
    return normalized_tokens


# Line-type markers for the data files. read_babi_files applies each pattern
# with .match(), so all three only ever match at the start of a line.
_s_re = re.compile("^F:")  # story sentence (part of the paragraph)
_q_re = re.compile("Q:")  # question
_a_re = re.compile("A:")  # answer

def read_babi_files(file_paths):
    """Parse ``F:``/``Q:``/``A:`` formatted files into model inputs.

    Per the original note this is meant to be called from
    ``read_babi_split`` (not visible here).

    Each line of a file is classified by its two-character prefix:

    * ``F:`` -- a story sentence; tokenized and appended to the paragraph
      currently being built.
    * ``Q:`` -- a question; tokenized and stored, and the paragraph built so
      far is closed and stored alongside it.
    * ``A:`` -- an answer; stored verbatim (stripped, but neither tokenized
      nor lower-cased) and added to the vocabulary as a single entry.

    Any other line (including a blank one) is reported through
    ``logging.error`` and skipped.  Sentences that are not followed by a
    question before the end of their file are discarded, because a paragraph
    is only stored when a question is reached.

    Args:
        file_paths: iterable of paths to the text files to read.

    Returns:
        A tuple ``(vocab_set, paragraphs, questions, answers)`` where
        ``vocab_set`` is a set of strings, ``paragraphs`` is a list of
        paragraphs (each a list of token lists), ``questions`` is a list of
        token lists and ``answers`` is a list of strings.
    """
    vocab_set = set()
    paragraphs = []
    questions = []
    answers = []
    for file_path in file_paths:
        # Remember where this file starts so the summary below reports the
        # number of examples from this file, not the running total.
        examples_before = len(paragraphs)
        with open(file_path, 'r') as fh:
            paragraph = []
            for line_num, line in enumerate(fh, start=1):
                line = line.strip('\n')

                if _q_re.match(line):
                    # Peel off the 'Q:' prefix; strip() removes the space after it.
                    question = _tokenize(line[2:].strip())
                    questions.append(question)
                    vocab_set |= set(question)
                    # A question marks the end of the "story": store the
                    # paragraph and start a fresh one for the next example.
                    paragraphs.append(paragraph)
                    paragraph = []
                elif _s_re.match(line):
                    # Story sentence: peel off the 'F:' prefix.
                    sentence = _tokenize(line[2:].strip())
                    paragraph.append(sentence)
                    vocab_set |= set(sentence)
                elif _a_re.match(line):
                    # Answers are kept as a single raw string.
                    answer = line[2:].strip()
                    answers.append(answer)
                    vocab_set.add(answer)
                else:
                    logging.error("Invalid line encountered: line %d in %s",
                                  line_num, file_path)

        print("Loaded %d examples from: %s"
              % (len(paragraphs) - examples_before, os.path.basename(file_path)))

    return vocab_set, paragraphs, questions, answers


In [45]:
# Load the custom training file. NOTE: the path is absolute and machine-specific.
vocab_set, paragraphs, questions, answers = read_babi_files(["/Users/williamcosby/Documents/metis/Passion_Project_Stratus/data/custom/custom_train.txt"])

Loaded 40 examples from: custom_train.txt


In [47]:
# Display the vocabulary collected by read_babi_files (notebook cell output follows).
vocab_set

{'a',
 'black',
 'blue',
 'car',
 'color',
 'does',
 'engine',
 'green',
 'has',
 'have',
 'is',
 'red',
 'the',
 'v12',
 'v4',
 'v6',
 'v8',
 'what',
 'white'}