In [None]:
import nltk
from nltk.corpus import reuters

In [None]:
# Preview the word tokens of two Reuters training documents.
# NOTE(review): presumably requires the NLTK "reuters" corpus to be available
# locally — confirm the setup step for a fresh environment.
reuters.words(['training/9865', 'training/9880'])

In [None]:
# Count how many sentences the corpus reader returns for one training document.
len(reuters.sents(['training/9880']))

In [None]:
# Display the tokenised sentences of the same training document.
reuters.sents(['training/9880'])

In [42]:
import random
import re
import string
from typing import List

from nltk.tokenize import sent_tokenize, word_tokenize

## Data

In [43]:
import json

In [44]:
def read_file(file):
    """Return the non-blank lines of a text file.

    Args:
        file: Path of the text file to read.

    Returns:
        A list with one entry per line that still has content after
        surrounding whitespace is stripped, in file order.
    """
    with open(file) as handle:
        # Strip lazily, then keep only the lines that are not empty.
        stripped_lines = (raw_line.strip() for raw_line in handle)
        return [text for text in stripped_lines if text]

In [45]:
def load_json(file):
    """Parse the JSON document stored in ``file`` and return the decoded object.

    Args:
        file: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content.
    """
    with open(file) as source:
        return json.load(source)

In [46]:
# Path of the raw text corpus, relative to the notebook's working directory.
data_file = "data/corpus.txt"

In [47]:
# Load the corpus as a list of stripped, non-blank lines.
data = read_file(data_file)

In [48]:
# Number of non-blank corpus lines that were loaded.
len(data)

10804

In [49]:
class DatasetGenerator:
    """Turn raw text into token-level sentence-boundary samples.

    A text is split into sentences with ``sent_tokenize``. The sentences are
    consumed in consecutive groups of 2-4 (the last group of a text may be
    shorter), and every group becomes one sample: a flat list of
    ``[token, is_end_word]`` pairs built from ``word_tokenize`` output.
    Tokens equal to ``.``, ``!`` or ``?`` are never emitted.
    """

    # Tokens treated as sentence terminators; they are dropped from the output.
    END_OF_SENT_PUNCT = ['.', '!', '?']

    # Candidate group sizes and their relative sampling weights.
    SENTS_PER_SAMPLE = [2, 3, 4]
    SENTS_PER_SAMPLE_WEIGHTS = [12, 4, 3]

    # NOTE(review): the class previously read ``self.word_pattern`` without
    # ever defining it, so ``__is_end_word_in_sent`` raised AttributeError.
    # This definition (token starts with a word character) is an assumption
    # about the intended pattern — confirm before relying on that helper.
    word_pattern = re.compile(r"\w")

    def generate(self, data: List[str]) -> List[list]:
        """Generate samples for every text in ``data``.

        Args:
            data: Texts to process; each one is handled independently.

        Returns:
            All samples of all texts, concatenated in input order.
        """
        result = []
        for data_line in data:
            result.extend(self.generate_for_text(data_line))
        return result

    def generate_for_text(self, text: str) -> List[list]:
        """Split one text into sentence groups and build a sample per group.

        Args:
            text: Raw text containing zero or more sentences.

        Returns:
            A list of samples; groups that yield no tokens are omitted.
        """
        sentences = sent_tokenize(text)
        size = len(sentences)

        result = []
        i = 0
        while i < size:
            # Clamp the randomly chosen group so it never runs past the text.
            end = min(i + self.get_random_num_of_sentences(), size)

            data = self.__generate_data_for_sentences(sentences[i:end])
            i = end

            if data:
                result.append(data)

        return result

    def __generate_data_for_sentences(self, sentences: List[str]) -> List[list]:
        """Build the ``[token, is_end_word]`` pairs for one group of sentences.

        The word preceding a sentence-final terminator is labelled ``True``
        unless the sentence is the last one of the group; every other emitted
        token is labelled ``False``.
        """
        result = []
        last_sent_idx = len(sentences) - 1
        for i, sentence in enumerate(sentences):
            tokens = word_tokenize(sentence)
            last_token_idx = len(tokens) - 1
            should_set_end_word = False

            # Walk the tokens right-to-left so the terminator is seen before
            # the word it closes. Pairs are collected reversed and flipped
            # once at the end, instead of paying for insert(0, ...) per token.
            reversed_pairs = []
            for j in range(last_token_idx, -1, -1):
                token = tokens[j]

                if token in self.END_OF_SENT_PUNCT:
                    # Only a terminator in final position marks a boundary,
                    # and only when another sentence follows in this group.
                    if j == last_token_idx and i < last_sent_idx:
                        should_set_end_word = True
                    continue

                if should_set_end_word:
                    word = self.__randomly_lowercase_word(token, word_idx=j, sent_idx=i)
                    reversed_pairs.append([word, True])
                    should_set_end_word = False
                else:
                    reversed_pairs.append([token, False])

            reversed_pairs.reverse()
            result.extend(reversed_pairs)

        return result

    def __randomly_lowercase_word(self, word: str, word_idx: int, sent_idx: int):
        """Lowercase ``word`` with probability 1/2 when it opens a non-first sentence.

        Words of the first sentence (``sent_idx == 0``) and words that are not
        the first token of their sentence (``word_idx > 0``) are returned
        unchanged.

        NOTE(review): this helper is only called for a sentence's end word, so
        in practice it can only lowercase one-word sentences. Confirm whether
        the intent was to lowercase the first word of each following sentence.
        """
        if sent_idx == 0 or word_idx > 0:
            return word

        should_be_lower = random.choice([True, False])
        if should_be_lower:
            return word.lower()

        return word

    def __is_punctuation(self, token: str) -> bool:
        """Return True when ``token`` occurs in ``string.punctuation``.

        Currently unused by the generation pipeline.
        """
        return token in string.punctuation

    def __is_end_word_in_sent(self, word: str, word_idx: int, num_token_in_sents: int) -> bool:
        """Return True when ``word`` matches ``word_pattern`` and is the last token.

        Currently unused by the generation pipeline.
        """
        if self.word_pattern.match(word) is None:
            return False

        return word_idx == num_token_in_sents - 1

    def get_random_num_of_sentences(self) -> int:
        """Draw the number of sentences for the next sample (2, 3 or 4, weighted)."""
        return random.choices(
            self.SENTS_PER_SAMPLE, weights=self.SENTS_PER_SAMPLE_WEIGHTS, k=1
        )[0]

In [50]:
# Single generator instance reused by the cells below.
dataset_generator = DatasetGenerator()

In [51]:
# Smoke-test on a short three-sentence text. Sentence grouping is random, so
# the way the sentences are split into samples can differ between runs.
dataset_generator.generate_for_text("Thanks for talking to me. Let's meet again tomorrow. Bye.")

[[['Thanks', False],
  ['for', False],
  ['talking', False],
  ['to', False],
  ['me', True],
  ['Let', False],
  ["'s", False],
  ['meet', False],
  ['again', False],
  ['tomorrow', False]],
 [['Bye', False]]]

Generate the dataset for the full data corpus.

In [52]:
# Seed the global RNG immediately before generation so the random sentence
# grouping and casing (and therefore the saved dataset) are reproducible on a
# fresh "Restart & Run All", regardless of how many demo calls ran earlier.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
dataset = dataset_generator.generate(data)

Save the dataset to a file.

In [53]:
import json

In [54]:
def write_to_file(data: str, file: str):
    """Write ``data`` to ``file``, replacing any content the file already had.

    Args:
        data: Text to write.
        file: Destination path; opened in ``'w'`` mode.
    """
    with open(file, mode='w') as sink:
        sink.write(data)

In [55]:
# Serialise the generated samples as a single JSON document and write it to disk.
dataset_file = "data/dataset.json"
write_to_file(data=json.dumps(dataset), file=dataset_file)