In [310]:
import pandas as pd
import random
from nltk.tokenize import sent_tokenize, word_tokenize

In [242]:
import string
import re
from typing import List
import json

In [249]:
def write_to_file(data: str, file: str):
    with open(file, 'w') as f:
        f.write(data)

In [177]:
def read_file(file):
    result = []
    with open(file) as f:
        lines = f.readlines()
        for line in lines:
            l = line.strip()
            if len(l) > 0:
                result.append(l)
    return result

In [259]:
def load_json(file):
    with open(file) as json_file:
        data = json.load(json_file)
    
    return data

# Data

In [178]:
data_file = "data/corpus.txt"

In [180]:
data = read_file(data_file)

## Generate dataset

In [229]:
class DatasetGenerator:
    
    END_OF_SENT_PUNCT = ['.', '!', '?']
    
    
    def generate(self, data: List[str]):
        result = []
        for data_line in data:
            result += self.generate_for_text(data_line)
        return result

    def generate_for_text(self, text: str):
        sentences = sent_tokenize(text)

        size = len(sentences)
        start = 0
        result = []
        i = 0
        while i < size:
            num_of_sents = self.get_random_num_of_sentences()
            end = i + num_of_sents if i + num_of_sents < len(sentences) else size

            data = self.__generate_data_for_sentences(sentences[i:end])
            i = end

            result.append(data)

        return result

    def __generate_data_for_sentences(self, sentences: List[str]):

        result = []
        sentences_size = len(sentences)
        for i in range(0, sentences_size):

            sent_data = []
            tokens = word_tokenize(sentences[i])
            tokens_size = len(tokens) 
            should_set_end_word = False
            for j in range(tokens_size- 1, -1, -1):
                token = tokens[j]
                
                if token in self.END_OF_SENT_PUNCT:
                    if j == tokens_size - 1 and i < sentences_size - 1:
                        should_set_end_word = True
                    continue

                word = self.__randomly_lowercase_word(token, word_idx=j, sent_idx=i)
                
                if should_set_end_word:
                    sent_data.insert(0, [word, True])
                    should_set_end_word = False
                else:
                    sent_data.insert(0, [word, False])

            result += sent_data

        return result

    def __randomly_lowercase_word(self, word: str, word_idx: int, sent_idx: int):
        if sent_idx == 0 or word_idx > 0:
            return word

        should_be_lower = random.choice([True, False])
        if should_be_lower:
            return word.lower()

        return word

    def __is_punctuation(self, token: str):
        if token in string.punctuation:
            return True
        return False

    def __is_end_word_in_sent(self, word: str, word_idx: int, num_token_in_sents: int):
        if self.word_pattern.match(word) is None:
            return False

        if word_idx == num_token_in_sents - 1:
            return True

    def get_random_num_of_sentences(self):
        num = random.choices([1, 2, 3, 4], weights=[1, 12, 4, 3], k=1)[0]
        return num


In [230]:
dataset_generator = DatasetGenerator()

In [231]:
text = "Suicide won't oure shyness problems.  [Headline over Beth Winship's teen-advice column, Morning Union , Springfield, (Massachusetts), .  Submitted by .]"

In [232]:
dataset_generator.generate_for_text("Suicide won't oure shyness problems.  [Headline over Beth Winship's teen-advice column, Morning Union , Springfield, (Massachusetts), .  Submitted by .]")

[[['Suicide', False],
  ['wo', False],
  ["n't", False],
  ['oure', False],
  ['shyness', False],
  ['problems', True],
  ['[', False],
  ['Headline', False],
  ['over', False],
  ['Beth', False],
  ['Winship', False],
  ["'s", False],
  ['teen-advice', False],
  ['column', False],
  [',', False],
  ['Morning', False],
  ['Union', False],
  [',', False],
  ['Springfield', False],
  [',', False],
  ['(', False],
  ['Massachusetts', False],
  [')', False],
  [',', True],
  ['submitted', False],
  ['by', False],
  [']', False]]]

In [236]:
dataset = dataset_generator.generate(data)

Save dataet to json file

In [250]:
dataset_file = "data/dataset.json"
write_to_file(data=json.dumps(dataset), file=dataset_file)

## N-Gram

In [328]:
def read_n_gram_file(ngram_file):
    result = dict()
    lines = read_file(ngram_file)
    for line in lines:
        line_split = line.split('\t')
        key = ' '.join(line_split[1:])
        result[key] = line_split[0]
        
    return result

Here, we use ngrams downloaded from here: https://www.ngrams.info/download_coca.asp

In [329]:
five_gram_dict = read_n_gram_file("data/w5.txt")

In [330]:
five_gram_dict

{'a babe in the woods': '16',
 'a baby at her breast': '6',
 'a baby brother or sister': '9',
 'a baby crying in the': '6',
 'a baby girl was born': '6',
 'a baby in a stroller': '8',
 'a baby in her arms': '28',
 'a baby in his arms': '6',
 'a baby in the house': '10',
 'a baby into the world': '12',
 'a baby of her own': '12',
 'a baby on her hip': '11',
 'a baby on the way': '41',
 'a baby out of wedlock': '16',
 'a bachelor of arts degree': '16',
 'a Bachelor of Arts degree': '9',
 'a bachelor of fine arts': '9',
 'a bachelor of science degree': '27',
 'a bachelor of science in': '6',
 'a back corner of the': '6',
 'a back room of the': '8',
 'a back seat to a': '7',
 'a back seat to other': '7',
 'a back seat to the': '66',
 'a background check on him': '7',
 'a background report on the': '6',
 'a backlog of more than': '11',
 'a backup plan in case': '7',
 'a bad call from the': '6',
 'a bad case of indigestion': '6',
 'a bad case of the': '43',
 'a bad day at the': '16',
 'a bad

In [319]:
d[0].split('\t')[1:]

['a', 'babe', 'in', 'the', 'woods']

# Testing

In [269]:
from sklearn.metrics import classification_report

- Напишіть базове рішення та метрику для тестування якості.
- Для тестування використайте корпус run-on-test.json. Формат корпусу:
```
[
  [
    ["Thanks", false],
    ["for", false],
    ["talking", false],
    ["to", false],
    ["me", true],
    ["let", false],
    ["'s", false],
    ["meet", false],
    ["again", false],
    ["tomorrow", true],
    ["Bye", false],
    [".", false]
  ],
...
]
```

In [260]:
test_file = "../../../tasks/06-language-as-sequence/run-on-test.json"

In [262]:
test_data = load_json(test_file)

## Baseline

In [294]:
class StubBaselineRunOnSentences:
    
    def find_sentences(self, merged_sentences: List[str]):
        result = []
        sentence_tokens_list = []
        for merged_sentence in merged_sentences:
            tokens = word_tokenize(merged_sentence)
            sentence_tokens_list.append(tokens)
            
            
        return self.classify_stub(sentence_tokens_list)
        
        
    
    
    def classify_stub(self, sentence_tokens_list):
        #TODO: insert call of classification model
        result = []
        for tokens in sentence_tokens_list:
            sent_result = [[token, False] for token in tokens]
            result.append(sent_result)
        return result

## Metric

In [303]:
class Metric:
    
    @staticmethod
    def measure(result_true, result_predicted):
        assert len(result_true) == len(result_predicted)
        
        size = len(result_true)
        y_pred_flatten, y_true_flatten = [], []
        num_of_mathced_sentences = 0
        
        for i in range(0, size):
            sent_true = result_true[i]
            sent_pred = result_predicted[i]
            
            
            y_true = np.array(sent_true)[:, 1]
            y_pred = np.array(sent_pred)[:, 1]
            
            y_true_flatten += y_true.tolist()
            y_pred_flatten += y_pred.tolist()
            
            if sum(y_true == y_pred) == len(y_pred):
                num_of_mathced_sentences += 1
            
            
        return classification_report(y_true_flatten, y_pred_flatten), num_of_mathced_sentences / size

Let's show example of this metric on test data vs predicted result in stub baseline

In [304]:
stub_baseline_run_on_sentecnes = StubBaselineRunOnSentences()

In [305]:
sentence_tokens_list = [ np.array(item)[:, 0].tolist() for item in test_data]

In [306]:
test_pred_data = stub_baseline_run_on_sentecnes.classify_stub(sentence_tokens_list)

In [307]:
clf_report, matched_sents = Metric.measure(test_data, test_pred_data)

In [308]:
print(clf_report)

              precision    recall  f1-score   support

       False       0.97      1.00      0.98      4542
        True       0.00      0.00      0.00       155

    accuracy                           0.97      4697
   macro avg       0.48      0.50      0.49      4697
weighted avg       0.94      0.97      0.95      4697



In [309]:
print(f"There are matched {100 * matched_sents}% sentences.")

There are matched 25.0% sentences.
