In [8]:
import os
import json
import stanza
import string
from progress.bar import IncrementalBar
import sys

In [2]:
# Integer id attached to the B- tag of each annotation label. The tokenizing
# cell of this notebook uses this value for B- tags and this value + 1 for
# the matching I- tags; 0 is used for tokens outside any annotation.
LABEL_MAPPER = {
    **dict.fromkeys(('PF', 'PF_reprezentat', 'PF_delegat'), 1),
    **dict.fromkeys(('PJ', 'PJ_reprezentat', 'PJ_delegat'), 3),
    **dict.fromkeys(('STAT', 'STAT_reprezentat', 'STAT_delegat'), 5),
    'Locatie_PJ': 7,
}

# Characters accepted in a token in addition to `string.printable`.
ROM_SPECIAL_CHARS = 'ǎǍăĂâÂȃȂșȘîÎțȚãÃȋȊ'
PUNCTUATION_SPECIAL_CHARS = '„”“«»’‘…´″‚'
MONEY_CHARS = '€$£'

# Tokens containing any character outside this string are reported and skipped.
ALLOWED_CHARS = ''.join((
    string.printable,
    ROM_SPECIAL_CHARS,
    PUNCTUATION_SPECIAL_CHARS,
    MONEY_CHARS,
))

# Romanian pipeline with only the tokenizer loaded.
# NOTE(review): `tokenize_no_ssplit=True` is expected to turn off sentence
# splitting inside a fragment - confirm against the stanza documentation.
nlp = stanza.Pipeline(
    lang='ro',
    processors='tokenize',
    tokenize_no_ssplit=True,
)

2022-05-13 09:54:02 INFO: Loading these models for language: ro (Romanian):
| Processor | Package |
-----------------------
| tokenize  | rrt     |

2022-05-13 09:54:03 INFO: Use device: cpu
2022-05-13 09:54:03 INFO: Loading: tokenize
2022-05-13 09:54:03 INFO: Done loading processors!


In [6]:
def clean_text(text):
    """Lowercase `text` and normalise a handful of look-alike characters.

    The replacements below are applied, in order, to the lowercased text.
    Each one swaps a single character for a single character, so the
    replacements themselves never change the length of the string.

    NOTE(review): because the text is lowercased first, the two uppercase
    entries ('Ţ' and 'Ş') can no longer occur when they are applied.

    Args:
        text: the string to clean.

    Returns:
        The lowercased string with the replacements applied.
    """
    replacements = (
        ('Ţ', 'Ț'),
        ('ţ', 'ț'),
        ('–', '-'),
        ('Ş', 'Ș'),
        ('ş', 'ș'),
        ('ˮ', '“'),
        ('ʼ', '’'),
    )

    cleaned = text.lower()
    for old_char, new_char in replacements:
        cleaned = cleaned.replace(old_char, new_char)
    return cleaned

In [3]:
# Flat list of annotated examples; every source file contributes a list of
# entries that is appended to it.
json_dataset = []

# Earlier data sources, kept for reference:
# for nume_fisier in ['bogdan', 'ioana', 'madalina', 'marius', 'sergiu']:
#     with open(f'dosare/{nume_fisier}.json') as json_file:
#         json_var = json.load(json_file)
#         json_dataset.extend(json_var)

# for nume_fisier in os.listdir(os.path.join('dosare','new')):
#     with open(os.path.join('dosare','new',nume_fisier)) as json_file:
#         json_doc = json.load(json_file)
#         json_dataset.extend(json_doc)

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale codec, which can garble or reject the Romanian diacritics
# in the data on systems whose default is not UTF-8.
with open(os.path.join('dosare', 'bejuri_prelucrate.json'), encoding='utf-8') as json_file:
    json_doc = json.load(json_file)

json_dataset.extend(json_doc)

In [4]:
# Number of entries currently held in `json_dataset`.
len(json_dataset)

1351

In [16]:
# Texts of the examples whose annotation does not end on the last character
# index of the text. Each example is expected to carry a single annotation
# (a dict) in the `result` field of its last annotation record.
lista_exemple_incorecte = [
    example_text
    for example_text, annotation in (
        (entry['data']['ner'], entry['annotations'][-1]['result'])
        for entry in json_dataset
    )
    if annotation['value']['end'] != len(example_text) - 1
]

In [20]:
def _append_fragment_tokens(sentence_dict, text, fragment, offset, label=None):
    """Tokenize `fragment` and append its tokens to `sentence_dict` in place.

    Args:
        sentence_dict: dict holding the lists 'tokens', 'ner_tags', 'ner_ids'
            and 'space_after'; one item is appended to each per kept token.
        text: the full example text, used only to look at the character that
            follows each token.
        fragment: the piece of text to tokenize.
        offset: index in `text` at which `fragment` starts.
        label: annotation label of the fragment, or None when the fragment
            lies outside every annotation (tokens are then tagged 'O', id 0).

    Raises:
        KeyError: if `label` is not a key of `LABEL_MAPPER`.
    """
    # Nothing to tokenize; this also keeps empty strings away from the tokenizer.
    if not fragment.strip():
        return

    # Tracked per fragment (not per sentence) so an annotation gets exactly
    # one B- tag, on its first kept token, even if the first token is skipped.
    is_first_kept_token = True
    for sentence in nlp(fragment).sentences:
        for token in sentence.tokens:
            # A byte-order mark is invisible; drop it and judge what is left.
            token_text = token.text.replace('\ufeff', '')
            if not token_text or set(token_text).difference(ALLOWED_CHARS):
                print('Am gasit caractere dubioase')
                continue

            if label is None:
                tag, tag_id = 'O', 0
            elif is_first_kept_token:
                tag, tag_id = 'B-' + label, LABEL_MAPPER[label]
            else:
                tag, tag_id = 'I-' + label, LABEL_MAPPER[label] + 1
            is_first_kept_token = False

            # `token.end_char` is relative to the fragment; shift it into `text`.
            next_char_idx = offset + token.end_char
            sentence_dict['tokens'].append(token_text)
            sentence_dict['ner_tags'].append(tag)
            sentence_dict['ner_ids'].append(tag_id)
            sentence_dict['space_after'].append(
                next_char_idx < len(text) and text[next_char_idx] == ' '
            )


output_json = []
id_idx = 0

# Debug subset: only the first five entries are tokenized. Replace with
# `json_dataset` to process everything; the progress bar follows this list.
entries_to_process = json_dataset[0:5]

with IncrementalBar('Tokenizing...', max=len(entries_to_process)) as bar:
    for json_entry in entries_to_process:
        sentence_dict = {'id': id_idx, 'ner_tags': [], 'ner_ids': [], 'tokens': [], 'space_after': []}

        text = json_entry['data']['ner']

        # `result` is either a list of annotations or a single annotation
        # dict (one annotation per example); handle both as a list.
        annotations = json_entry['annotations'][-1]['result']
        if isinstance(annotations, dict):
            annotations = [annotations]
        sorted_annotations = sorted(annotations, key=lambda x: x['value']['start'])

        current_idx = 0
        for annotation in sorted_annotations:
            annot = annotation['value']
            start_idx = annot['start']

            # Text between the previous annotation and this one (may be empty).
            _append_fragment_tokens(sentence_dict, text, text[current_idx:start_idx], current_idx)

            # The annotated span itself.
            _append_fragment_tokens(sentence_dict, text, annot['text'], start_idx, label=annot['labels'][0])

            # NOTE(review): other cells of this notebook compare `end` with
            # len(text)-1, i.e. some entries seem to store an inclusive end.
            # Advancing by the span text as well keeps the span's last
            # character from being tokenized a second time as 'O'.
            current_idx = max(annot['end'], start_idx + len(annot['text']))

        # Whatever follows the last annotation.
        _append_fragment_tokens(sentence_dict, text, text[current_idx:], current_idx)

        output_json.append(sentence_dict)

        id_idx += 1

        bar.next()

SC FITOCOM SRL TÂRGU MURES


In [1]:
# Scratch check: the string has 20 characters, so using 19 as the (exclusive)
# end of a slice drops its last character.
a = "BEJ ARDELEANU BIANCA"
a[0:19]

'BEJ ARDELEANU BIANC'

In [3]:
# Scratch check: the string has 16 characters, so using 15 as the (exclusive)
# end of a slice drops its last character.
b = 'BEJ NEAGOS DOREL'
b[0:15]

'BEJ NEAGOS DORE'

In [4]:
# Scratch check: the string has 63 characters, so the slice [32:63] runs to
# the very end of the string and keeps the last character.
c = 'POPA IONEL PRIMAR- PRESEDINTELE COMISIEI LOCALE DE FOND FUNCIAR'
c[32:63]

'COMISIEI LOCALE DE FOND FUNCIAR'

In [27]:
# c: single-annotation entries whose span does not equal the text and whose
#    length still differs from the text when `end` is treated as inclusive.
# t: total number of entries visited.
c = 0
t = 0

for nume_fisier in os.listdir(os.path.join('dosare', 'new')):
    # Explicit UTF-8 so the diacritics decode the same way on every platform.
    with open(os.path.join('dosare', 'new', nume_fisier), encoding='utf-8') as json_file:
        json_doc = json.load(json_file)

    for json_var in json_doc:
        t += 1
        text = clean_text(json_var['data']['ner'])

        annotations = json_var['annotations'][-1]['result']

        # `result` is either a list of annotations or one annotation dict.
        # Test the type directly instead of catching whatever sorted() raises.
        if not isinstance(annotations, dict):
            sorted_annotations = sorted(annotations, key=lambda x: x['value']['start'])
            continue

        begin_idx, end_idx = annotations['value']['start'], annotations['value']['end']
        if text[begin_idx:end_idx] != text:
            # cu / fara ("with" / "without"): the span with one extra
            # character at the end versus the whole text.
            cu = text[begin_idx:end_idx+1]
            fara = text

            if len(cu) != len(fara):
                c += 1
        else:
            sorted_annotations = [annotations]

print(f'{c=}')
print(f'{t=}')

c=0
t=32752


In [23]:
# c: single-annotation entries whose span does not equal the whole text
#    (the scan stops at the first one found).
# t: number of entries visited before stopping.
c = 0
t = 0

# Explicit UTF-8 so the diacritics decode the same way on every platform.
with open(os.path.join('dosare', 'bejuri_prelucrate.json'), encoding='utf-8') as json_file:
    json_doc = json.load(json_file)

for json_var in json_doc:
    t += 1
    text = clean_text(json_var['data']['ner'])

    annotations = json_var['annotations'][-1]['result']

    # `result` is either a list of annotations or one annotation dict.
    # Test the type directly instead of catching whatever sorted() raises.
    if not isinstance(annotations, dict):
        sorted_annotations = sorted(annotations, key=lambda x: x['value']['start'])
        continue

    begin_idx, end_idx = annotations['value']['start'], annotations['value']['end']
    if text[begin_idx:end_idx] == text:
        continue

    c += 1

    # cu / fara ("with" / "without"): the annotated span versus the whole text.
    cu = text[begin_idx:end_idx]
    fara = text

    # Report when the span is not exactly one character shorter than the text.
    if len(cu) + 1 != len(fara):
        print('caca')
    break

print(f'{c=}')
print(f'{t=}')

c=1
t=1
