In [1]:
import os
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
import pandas as pd
# from simpletransformers.ner import NERArgs, NERModel
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
# from torch.utils.data import IterableDataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'

In [2]:
# class TokenizedExamplesDataset(IterableDataset):
#     def __init__(self, examples):
#         self.examples = examples

#     def __iter__(self):
#         for example in self.examples:
#             yield example

#     def __len__(self):
#         return len(self.examples)

In [75]:
class ModelForInference:
    def __init__(self, model_path, max_seq_len=256, use_sliding_window=True, stride=0.8, device='cpu',  batch_size=1): 
        self.model = AutoModelForTokenClassification.from_pretrained(model_path)
        self.model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.use_sliding_window = use_sliding_window
        self.stride = stride
        self.max_seq_len = max_seq_len
        self.batch_size = batch_size
        self.device = device
        self.model.to(self.device)

    def prepare_example(self, example, tokenizer):
        tokenized_example = []
        token_mappings = []
        window_counts = []

        tokens = []
        token_id_to_word_id = []
        word_id_to_tokenized_len = []
        num_windows = 0

        for idx, word in enumerate(example):
            tokenized_word = tokenizer.tokenize(word)
            word_id_to_tokenized_len.append(len(tokenized_word))

            if len(tokens) + len(tokenized_word) >= self.max_seq_len - 1:
                if not self.use_sliding_window:
                    break
                tokenized_example.append(
                    tokenizer.convert_tokens_to_ids([tokenizer.cls_token] + tokens + [tokenizer.sep_token])
                )
                num_windows += 1
                token_mappings.append([-1] + token_id_to_word_id + [-1])  # encounter for cls and sep tokens

                first_word_id = token_id_to_word_id[0]
                last_word_id = token_id_to_word_id[-1]
                jump_to_id = first_word_id + int((last_word_id - first_word_id) * self.stride)
                offset = sum(word_id_to_tokenized_len[first_word_id:jump_to_id])

                tokens = tokens[offset:]
                token_id_to_word_id = token_id_to_word_id[offset:]

            tokens.extend(tokenized_word)
            token_id_to_word_id.extend([idx] * len(tokenized_word))

        if tokens:
            tokenized_example.append(
                tokenizer.convert_tokens_to_ids([tokenizer.cls_token] + tokens + [tokenizer.sep_token])
            )
            num_windows += 1
            token_mappings.append([-1] + token_id_to_word_id + [-1])  # encounter for cls and sep tokens

        window_counts.append(num_windows)
        # print('tokenized_example', tokenized_example)

        return tokenized_example, window_counts, token_mappings


    @staticmethod
    def merge_predictions(windows_preds, windows_scores, windows_token_mappings):
        # TODO: refactor
        result_size = windows_token_mappings[-1][-2] + 1  # element at -1 corresponds to </s> token, at -2 to last word idx
        results = [0] * result_size
        result_score = [-100] * result_size

        prev_word_id = -1

        for window_preds, window_scores, window_token_mappings in \
                zip(windows_preds, windows_scores, windows_token_mappings):
            first_word_id=min([x for x in window_token_mappings if x>0])
            last_word_id=max(window_token_mappings)
            #print('first_word_id',first_word_id)
            #print('last_word_id',last_word_id)
            for pred, score, word_id in zip(window_preds, window_scores, window_token_mappings):
                if word_id == prev_word_id:
                    continue  # only check prediction for first token of the word

                prev_word_id = word_id
                if word_id == -1:
                    continue

                if pred != 0:
                    context=min(word_id-first_word_id, last_word_id-word_id)
                    #print('context', context, first_word_id, word_id, last_word_id, pred)
                    if context >= result_score[word_id]:
                        result_score[word_id] = context
                        results[word_id] = pred
        return results

    def predict(self, to_predict: [str]): 
        tokenizer = self.tokenizer

        example, window_counts, token_mappings = self.prepare_example(to_predict, tokenizer)

        tokenized_example = tokenizer.pad({'input_ids': example},  return_tensors='pt') #padding='longest',
        # print('tokenized_example', tokenized_example)

        preds = []
        scores = []
        
        with torch.no_grad():
                logits = self.model(**tokenized_example)[0]

                batch_score, batch_pred = torch.max(logits, dim=2)

                batch_score = batch_score.detach().cpu().numpy()
                batch_pred = batch_pred.detach().cpu().numpy()
                
                scores.extend(batch_score)
                preds.extend(batch_pred)
                print(preds)

        offset = 0

        concated_preds = []
        for window in window_counts:
            concated_preds.append(
                self.merge_predictions(preds[offset:offset + window],
                                   scores[offset:offset + window],
                                   token_mappings[offset:offset + window])
            )
        print(window_counts)
        print(concated_preds)
        return concated_preds
        # return preds, scores, token_mappings

In [3]:
def ids_to_labels(preds, labels):
    return [labels[p] for p in preds]

In [4]:
# path_to_model = '/content/drive/MyDrive/Semestr_III/INL/punctuation_restoration/best_model/'
path_to_model = 'best_model'
path_to_test = '2021-punctuation-restoration\\test-C\\in.tsv'
path_to_result = 'predictions\\test-C-predictions.tsv'

In [76]:
model = ModelForInference(path_to_model)
tokenizer = AutoTokenizer.from_pretrained(path_to_model)

In [5]:

model = AutoModelForTokenClassification.from_pretrained(path_to_model)
tokenizer = AutoTokenizer.from_pretrained(path_to_model)

In [6]:
pred_labels = ['B', ':', ';', ',', '.', '-', '...', '?', '!']

In [22]:
original_text = "Szef MSWiA dodał, że operacja policji przebiegła bardzo profesjonalnie, a słowa Marcina Mastalerka, prezydenckiego doradcy, o tym, że Mariusz Kamiński został popchnięty na futrynę, są niepotrzebnym eskalowaniem napięcia. To był niebywały skandal, że posłowie PiS dobijali się do bramy więzienia w sprawie kolegów. Tam ma być cicho i spokojnie, dlatego należy pochwalić decyzję służb o przewiezieniu do innych więzień Kamińskiego i Wąsika - komentuje dr Paweł Moczydłowski, były szef więziennictwa. W rozmowie z WP wskazuje, że planowana w Warszawie demonstracja w obronie prawomocnie skazanych polityków, mogła źle wpłynąć na stan emocjonalny więziennej społeczności. Jak poinformowała Wirtualna Polska, Maciej Wąsik trafił do zakładu karnego w Przytułach Starych niedaleko Ostrołęki, z kolei Mariusz Kamiński do Radomia. Decyzję uzasadniano względami bezpieczeństwa. Od razu okazało się, że pokrzyżowało to plany sympatyków osadzonych w więzieniu polityków. Syn Mariusza Kamińskiego odwołał planowane na czwartek zgromadzenie pod Aresztem Śledczym Warszawa-Grochów, gdzie dotąd przebywali skazani."
text = "Szef MSWiA dodał że operacja policji przebiegła bardzo profesjonalnie a słowa Marcina Mastalerka prezydenckiego doradcy o tym że Mariusz Kamiński został popchnięty na futrynę są niepotrzebnym eskalowaniem napięcia To był niebywały skandal że posłowie PiS dobijali się do bramy więzienia w sprawie kolegów Tam ma być cicho i spokojnie dlatego należy pochwalić decyzję służb o przewiezieniu do innych więzień Kamińskiego i Wąsika komentuje dr Paweł Moczydłowski były szef więziennictwa W rozmowie z WP wskazuje że planowana w Warszawie demonstracja w obronie prawomocnie skazanych polityków mogła źle wpłynąć na stan emocjonalny więziennej społeczności Jak poinformowała Wirtualna Polska Maciej Wąsik trafił do zakładu karnego w Przytułach Starych niedaleko Ostrołęki z kolei Mariusz Kamiński do Radomia Decyzję uzasadniano względami bezpieczeństwa Od razu okazało się że pokrzyżowało to plany sympatyków osadzonych w więzieniu polityków Syn Mariusza Kamińskiego odwołał planowane na czwartek zgromadzenie pod Aresztem Śledczym Warszawa Grochów gdzie dotąd przebywali skazani"

In [7]:
original_text = "Obserwując polityczny rollercoaster ostatnich dni, jak mantrę powtarzamy, że Polska znalazła się na granicy ustrojowego chaosu. A wolne państwo, którego wyglądały kolejne pokolenia, pogrążą się w głębokim kryzysie - pisze dla Wirtualnej Polski prof. Sławomir Sowiński."
text = "Obserwując polityczny rollercoaster ostatnich dni jak mantrę powtarzamy że Polska znalazła się na granicy ustrojowego chaosu A wolne państwo którego wyglądały kolejne pokolenia pogrążą się w głębokim kryzysie pisze dla Wirtualnej Polski prof Sławomir Sowiński"

In [77]:
# one sentence example
idx=0
output = model.predict(text.split())
preds = ids_to_labels(output[0], pred_labels)

for _, (word, label) in enumerate(zip(text.split(), preds)):
    print(f'{"" if idx == 0 else " "}{word}{label if label != "B" else ""}', end=' ')

[array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 3, 3, 0, 0, 0, 0,
       0, 0, 0, 0, 4, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0,
       0, 4, 0, 4, 0, 0], dtype=int64)]
[1]
[[0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0, 4]]
Obserwując polityczny rollercoaster ostatnich dni, jak mantrę powtarzamy, że Polska znalazła się na granicy ustrojowego chaosu. A wolne państwo, którego wyglądały kolejne pokolenia, pogrążą się w głębokim kryzysie- pisze dla Wirtualnej Polski prof. Sławomir Sowiński. 

In [24]:
print(output)

[[0, 0, 3, 0, 0, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 3, 0, 0, 4, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 4]]


In [8]:
def prepare_token_mappings(example, tokenizer):
        use_sliding_window = True
        max_seq_len = 256
        stride = 0.8
        token_mappings = []

        tokens = []
        token_id_to_word_id = []
        word_id_to_tokenized_len = []

        for idx, word in enumerate(example):
            tokenized_word = tokenizer.tokenize(word)
            word_id_to_tokenized_len.append(len(tokenized_word))

            if len(tokens) + len(tokenized_word) >= max_seq_len - 1:
                if not use_sliding_window:
                    break
                
                token_mappings.append([-1] + token_id_to_word_id + [-1])  # encounter for cls and sep tokens

                first_word_id = token_id_to_word_id[0]
                last_word_id = token_id_to_word_id[-1]
                jump_to_id = first_word_id + int((last_word_id - first_word_id) * stride)
                offset = sum(word_id_to_tokenized_len[first_word_id:jump_to_id])

                tokens = tokens[offset:]
                token_id_to_word_id = token_id_to_word_id[offset:]

            tokens.extend(tokenized_word)
            token_id_to_word_id.extend([idx] * len(tokenized_word))

        if tokens:
            token_mappings.append([-1] + token_id_to_word_id + [-1])  # encounter for cls and sep tokens

        return token_mappings

In [9]:
token_mappings = prepare_token_mappings(text, tokenizer)

In [10]:
print(token_mappings)

[[-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 50, 51, 52, 54, 55, 56, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 72, 73, 75, 76, 77, 78, 79, 80, 82, 83, 84, 85, 86, 87, 88, 89, 91, 92, 93, 95, 96, 98, 99, 100, 101, 102, 103, 104, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 118, 119, 120, 121, 122, 123, 125, 127, 128, 129, 130, 131, 133, 134, 135, 136, 137, 138, 139, 141, 142, 143, 144, 145, 146, 147, 149, 150, 151, 152, 153, 154, 155, 156, 157, 159, 160, 161, 162, 163, 164, 165, 167, 168, 169, 170, 171, 172, 173, 174, 175, 177, 178, 179, 180, 181, 182, 183, 185, 186, 187, 189, 191, 192, 193, 194, 195, 196, 197, 198, 200, 201, 202, 203, 204, 205, 206, 207, 209, 210, 211, 212, 213, 215, 216, 217, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 230, 231, 232, 233, 234, 235, 237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 251, 2

In [66]:
@staticmethod
def merge_predictions(windows_preds, windows_scores, windows_token_mappings):
        # TODO: refactor
        result_size = windows_token_mappings[-1][-2] + 1  # element at -1 corresponds to </s> token, at -2 to last word idx
        results = [0] * result_size
        result_score = [-100] * result_size

        prev_word_id = -1

        for window_preds, window_scores, window_token_mappings in \
                zip(windows_preds, windows_scores, windows_token_mappings):
            first_word_id=min([x for x in window_token_mappings if x>0])
            last_word_id=max(window_token_mappings)
            #print('first_word_id',first_word_id)
            #print('last_word_id',last_word_id)
            for pred, score, word_id in zip(window_preds, window_scores, window_token_mappings):
                if word_id == prev_word_id:
                    continue  # only check prediction for first token of the word

                prev_word_id = word_id
                if word_id == -1:
                    continue

                if pred != 0:
                    context=min(word_id-first_word_id, last_word_id-word_id)
                    #print('context', context, first_word_id, word_id, last_word_id, pred)
                    if context >= result_score[word_id]:
                        result_score[word_id] = context
                        results[word_id] = pred
        return results

In [67]:
idx=0
output, scores, _ = model.predict(text.split())
output = model(**tokenizer.batch_encode_plus([(text)], padding='longest', add_special_tokens=True, return_tensors='pt'))
offset = 0

concated_preds = merge_predictions(output[offset:offset + 1], scores[offset:offset + 1], token_mappings[offset:offset + 1])
preds = ids_to_labels(concated_preds, pred_labels)

for _, (word, (_,label)) in enumerate(zip(text.split(), enumerate(preds))):
    print(f'{"" if idx == 0 else " "}{word}{label if label != "B" else ""}', end=' ')

[array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 3, 3, 0, 0, 0, 0,
       0, 0, 0, 0, 4, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0,
       0, 4, 0, 4, 0, 0], dtype=int64)]
Obserwując polityczny rollercoaster ostatnich dni jak mantrę powtarzamy że Polska, znalazła się na granicy ustrojowego chaosu, A, wolne, państwo którego wyglądały kolejne pokolenia pogrążą się w głębokim kryzysie. pisze dla Wirtualnej, Polski prof Sławomir Sowiński, 

In [68]:
print(concated_preds)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
