## Data

In [6]:
import json

In [7]:
def read_file(file):
    """Read a text file and return its non-blank lines.

    Every line has leading and trailing whitespace removed; lines that are
    empty after stripping are left out.

    Args:
        file: Path of the text file to read.

    Returns:
        List of stripped, non-empty lines in file order.
    """
    with open(file) as handle:
        stripped_lines = [raw_line.strip() for raw_line in handle]
    return [text for text in stripped_lines if text]

In [8]:
def load_json(file):
    """Parse the JSON document stored at ``file``.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The decoded Python object produced by ``json.load``.
    """
    with open(file) as source:
        return json.load(source)

In [9]:
# Path of the raw text corpus, relative to the notebook's working directory.
data_file = "data/corpus.txt"

In [10]:
# Load the corpus: one list entry per non-empty, whitespace-stripped line.
data = read_file(data_file)

In [11]:
# Number of non-empty lines read from the corpus file.
len(data)

10804

In [96]:
class DatasetGenerator:
    """Builds sentence-boundary samples as lists of ``[token, is_sentence_end]`` pairs.

    Consecutive sentences are merged into groups of random size (2, 3 or 4,
    see ``get_random_num_of_sentences``). Inside a group, every '.', '!' and
    '?' token is removed from all sentences except the last one. When the
    removed token was the final token of its sentence, the nearest preceding
    kept token is flagged ``True``. The last sentence of a group keeps its
    punctuation and all of its tokens are flagged ``False``.
    """

    # Tokens treated as sentence-terminating punctuation.
    END_OF_SENT_PUNCT = ['.', '!', '?']

    def generate(self, data: List[str]):
        """Generate samples for every text in ``data``.

        Args:
            data: Raw texts; each one is processed by ``generate_for_text``.

        Returns:
            Flat list with the samples of all texts, in input order.
        """
        result = []
        for data_line in data:
            result.extend(self.generate_for_text(data_line))
        return result

    def generate_for_text(self, text: str):
        """Split ``text`` into sentences and generate samples from them.

        Args:
            text: Raw text to split with ``sent_tokenize``.

        Returns:
            List of samples, or an empty list when fewer than two sentences
            are detected (a single sentence has no interior boundary).
        """
        sentences = sent_tokenize(text)

        if len(sentences) < 2:
            return []

        return self.generate_for_tokenized_sentences(sentences)

    def generate_for_tokenized_sentences(self, tokenized_sentences, tokenized_words = False):
        """Group consecutive sentences at random and build one sample per group.

        Args:
            tokenized_sentences: Sequence of sentences supporting ``len`` and
                slicing. Each sentence is a string, or a sequence of word
                tokens when ``tokenized_words`` is true.
            tokenized_words: When true, sentences are used as token sequences
                directly; otherwise each one is split with ``word_tokenize``.

        Returns:
            List of samples; each sample is a list of
            ``[token, is_sentence_end]`` pairs. Groups that yield no tokens
            are skipped.
        """
        size = len(tokenized_sentences)
        result = []
        i = 0
        while i < size:
            # The final group is truncated to whatever sentences remain.
            end = min(i + self.get_random_num_of_sentences(), size)

            data = self.__generate_data_for_sentences(tokenized_sentences[i:end], tokenized_words)
            i = end

            if data:
                result.append(data)

        return result

    def __generate_data_for_sentences(self, sentences: List[str], tokenized_words = False):
        """Build the ``[token, is_sentence_end]`` pairs for one group of sentences.

        Args:
            sentences: The sentences of a single group.
            tokenized_words: See ``generate_for_tokenized_sentences``.

        Returns:
            Flat list of ``[token, bool]`` pairs for the whole group.
        """
        result = []
        last_sent_idx = len(sentences) - 1
        for i, sentence in enumerate(sentences):
            tokens = sentence if tokenized_words else word_tokenize(sentence)
            last_token_idx = len(tokens) - 1

            # Tokens are visited right-to-left so that a trailing punctuation
            # mark can flag the word before it. Pairs are collected in that
            # reversed order and flipped once at the end, which avoids the
            # quadratic cost of inserting at the front of a list.
            reversed_sent_data = []
            should_set_end_word = False
            for j in range(last_token_idx, -1, -1):
                token = tokens[j]

                # Only sentences before the last one of the group lose their
                # end-of-sentence punctuation.
                if token in self.END_OF_SENT_PUNCT and i < last_sent_idx:
                    if j == last_token_idx:
                        should_set_end_word = True
                    continue

                word = token
                if j == 0 and i > 0:
                    word = self.__randomly_lowercase_word(token, word_idx=j, sent_idx=i)

                reversed_sent_data.append([word, should_set_end_word])
                should_set_end_word = False

            reversed_sent_data.reverse()
            result.extend(reversed_sent_data)

        return result

    def __randomly_lowercase_word(self, word: str, word_idx: int, sent_idx: int):
        """Lowercase the first word of a non-initial sentence with probability 1/2.

        Words that are not at index 0, or that belong to the first sentence
        of the group, are returned unchanged.
        """
        if sent_idx == 0 or word_idx > 0:
            return word

        return word.lower() if random.choice([True, False]) else word

    def __is_punctuation(self, token: str):
        """Return True when ``token`` is non-empty and made only of ASCII punctuation.

        A plain ``token in string.punctuation`` is a substring test: it
        accepts the empty string and rejects tokens such as ``"..."``, so
        every character is checked instead.
        """
        return bool(token) and all(char in string.punctuation for char in token)

    def __is_end_word_in_sent(self, word: str, word_idx: int, num_token_in_sents: int):
        """Return True when ``word`` matches ``word_pattern`` and is the last token.

        NOTE(review): ``word_pattern`` is not defined anywhere in this class,
        so calling this method raises AttributeError unless the attribute is
        set externally. Nothing in the class calls it; confirm whether it can
        be removed or which pattern was intended.
        """
        if self.word_pattern.match(word) is None:
            return False

        return word_idx == num_token_in_sents - 1

    def get_random_num_of_sentences(self):
        """Draw the size of the next sentence group: 2, 3 or 4 with weights 12:4:3."""
        return random.choices([2, 3, 4], weights=[12, 4, 3], k=1)[0]

In [97]:
# Single generator instance reused for every dataset built in this notebook.
dataset_generator = DatasetGenerator()

In [98]:
# Quick check on a three-sentence text. Sentence-final punctuation is removed
# from every sentence but the last of a group and the word before it is
# flagged True. Grouping and the lowercasing of sentence-initial words are
# random, so the exact output can differ between runs.
dataset_generator.generate_for_text("Thanks for talking to me. Let's meet again tomorrow. Bye.")

[[['Thanks', False],
  ['for', False],
  ['talking', False],
  ['to', False],
  ['me', True],
  ['let', False],
  ["'s", False],
  ['meet', False],
  ['again', False],
  ['tomorrow', True],
  ['Bye', False],
  ['.', False]]]

Generate the dataset for the full data corpus

In [28]:
# Build samples for every corpus line; lines in which fewer than two
# sentences are detected contribute nothing.
dataset = dataset_generator.generate(data)

In [29]:
# Number of generated samples (sentence grouping is random, so this can vary between runs).
len(dataset)

4533

Save the dataset to a file

In [30]:
import json

In [32]:
# Serialize all samples as one JSON document.
# NOTE(review): write_to_file is not defined in the visible cells; confirm it
# is defined earlier in the notebook so this cell runs on a fresh kernel.
dataset_file = "data/dataset.json"
write_to_file(data=json.dumps(dataset), file=dataset_file)

## Generate datasets for the Brown corpus

In [56]:
from nltk.corpus import brown

In [57]:
# Make sure the Brown corpus data is available locally.
# NOTE(review): the only visible `import nltk` is in a later cell (n-gram
# section); confirm nltk is imported before this cell on a fresh kernel.
nltk.download('brown')

[nltk_data] Downloading package brown to /home/dbabenko/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [113]:
# Category names of the Brown corpus; sliced below into train/validation/test subsets.
categories = brown.categories()

In [114]:
# Split the Brown categories into disjoint train / validation / test subsets.
# The test slice starts at index 12 (where the validation slice ends) so that
# no validation category is also evaluated as test data.
train_categories = categories[:9]
val_categories = categories[9:12]
test_categories = categories[12:15]

Generate train dataset

In [115]:
# Sentences of the training categories; passed on below as already word-tokenized input.
train_brown_sentences = brown.sents(categories=train_categories)

In [116]:
# tokenized_words=True: each sentence is used as a token sequence, so word_tokenize is skipped.
train_brown_dataset = dataset_generator.generate_for_tokenized_sentences(train_brown_sentences, tokenized_words=True)

In [117]:
# Persist the training samples as one JSON document.
# NOTE(review): write_to_file is not defined in the visible cells; confirm it exists.
train_brown_dataset_file = "data/brown-dataset-train.json"
write_to_file(data=json.dumps(train_brown_dataset), file=train_brown_dataset_file)

Generate validation dataset

In [120]:
# Sentences of the validation categories.
val_brown_sentences = brown.sents(categories=val_categories)

In [121]:
# Same generation settings as for the training split (sentences used as token sequences).
val_brown_dataset = dataset_generator.generate_for_tokenized_sentences(val_brown_sentences, tokenized_words=True)

In [122]:
# Persist the validation samples as one JSON document.
# NOTE(review): write_to_file is not defined in the visible cells; confirm it exists.
val_brown_dataset_file = "data/brown-dataset-val.json"
write_to_file(data=json.dumps(val_brown_dataset), file=val_brown_dataset_file)

Generate test dataset

In [123]:
# Sentences of the test categories.
test_brown_sentences = brown.sents(categories=test_categories)

In [124]:
# Same generation settings as for the training split (sentences used as token sequences).
test_brown_dataset = dataset_generator.generate_for_tokenized_sentences(test_brown_sentences, tokenized_words=True)

In [125]:
# Persist the test samples as one JSON document.
# NOTE(review): write_to_file is not defined in the visible cells; confirm it exists.
test_brown_dataset_file = "data/brown-dataset-test.json"
write_to_file(data=json.dumps(test_brown_dataset), file=test_brown_dataset_file)

# Generate POS-tag n-gram probabilities

In [165]:
import nltk
import pandas as pd

In [131]:
# POS-tag every training sentence; generate_ngram_dict reads the second
# element (the tag) of each tagged token.
pos_tag_sents = nltk.pos_tag_sents(train_brown_sentences)

In [195]:
def convert_tuple_ngram_to_str(ngram):
    """Join the items of an n-gram into one underscore-separated string.

    Args:
        ngram: Iterable of n-gram items (for example POS tags); each item is
            formatted the same way an f-string would format it.

    Returns:
        The items joined with ``_``, e.g. ``("DT", "NN") -> "DT_NN"``.
        An empty n-gram yields an empty string.
    """
    return "_".join(f"{item}" for item in ngram)

In [210]:
def generate_ngram_dict(pos_tag_sents, n):
    """Estimate the relative frequency of every POS-tag n-gram.

    Args:
        pos_tag_sents: Iterable of tagged sentences; each sentence is a
            sequence of items whose second element is the POS tag.
        n: Length of the n-grams to count.

    Returns:
        Dict mapping the underscore-joined n-gram (see
        ``convert_tuple_ngram_to_str``) to its count divided by the total
        number of n-gram occurrences, so the values sum to 1. The dict is
        empty when no n-gram is found.
    """
    ngram_dict = dict()
    total_ngrams = 0
    for pos_tag_sent in pos_tag_sents:
        # Keep only the tag of each tagged token; an empty sentence simply
        # yields no tags instead of failing on an index lookup.
        tags = [tagged_token[1] for tagged_token in pos_tag_sent]

        for ngram in nltk.ngrams(tags, n):
            ngram_str = convert_tuple_ngram_to_str(ngram)
            ngram_dict[ngram_str] = ngram_dict.get(ngram_str, 0) + 1
            total_ngrams += 1

    # Normalize by the number of observed n-grams (not by the number of
    # distinct n-grams) so the values form a probability distribution.
    for ngram_str in ngram_dict:
        ngram_dict[ngram_str] /= total_ngrams

    return ngram_dict

    

In [211]:
# Normalized counts of POS-tag bigrams over the tagged training sentences.
bi_gram_dict = generate_ngram_dict(pos_tag_sents, 2)

In [213]:
# Persist the bigram table as JSON.
# NOTE(review): write_to_file is not defined in the visible cells; confirm it exists.
write_to_file(data=json.dumps(bi_gram_dict), file="data/pos-bigrams.json")

In [214]:
# Normalized counts of POS-tag trigrams over the tagged training sentences.
three_gram_dict = generate_ngram_dict(pos_tag_sents, 3)

In [216]:
# Persist the trigram table as JSON.
# NOTE(review): write_to_file is not defined in the visible cells; confirm it exists.
write_to_file(data=json.dumps(three_gram_dict), file="data/pos-threegrams.json")