# Loading original RuLegalNER datasets

Data source:
- https://github.com/zeino8/RuLegalNER

Label types:
- **IND [2]** - Individual
- **LE [4]** - Legal Entity
- **PEN [9]** - Penalty
- **LAW [13]** - Law
- **CR [17]** - Crime

In [1]:
# Mount Google Drive in the Colab runtime; the dataset files read below live under /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
import pandas as pd
import numpy as np
import json
import ast

In [3]:
# All three splits are loaded the same way: header=None makes pandas treat the
# first line as data, and the two columns are named explicitly.
_read_options = {"header": None, "names": ["text", "entities"]}

RuLegalNER_train = pd.read_csv(
    '/content/drive/MyDrive/coursework2024/RuLegalNER/train.csv',
    **_read_options,
)
RuLegalNER_test = pd.read_csv(
    '/content/drive/MyDrive/coursework2024/RuLegalNER/test.csv',
    **_read_options,
)
RuLegalNER_validation = pd.read_csv(
    '/content/drive/MyDrive/coursework2024/RuLegalNER/validation.csv',
    **_read_options,
)

In [4]:
# Show the column labels of the loaded training frame.
RuLegalNER_train.columns

Index(['text', 'entities'], dtype='object')

In [5]:
# Parse every cell of the 'entities' column with ast.literal_eval, turning the
# stored literal into the corresponding Python object, for each split in turn.
for _split in (RuLegalNER_train, RuLegalNER_test, RuLegalNER_validation):
    _split['entities'] = _split['entities'].apply(ast.literal_eval)

In [6]:
# Field names of one entity record. The first four are zipped with the raw
# entity values in order; "entity_type" is the label name appended from `types`.
keys = ["start",
        "end",
        "entity_text",
        "entity_id",
        "entity_type"
        ]
# Numeric entity id -> short label name (see the label list at the top).
types = {2:"IND",
         4:"LE",
         9:"PEN",
         13:"LAW",
         17:"CR",
         }


def _to_entity_dicts(entities):
    """Convert a document's raw entity records into dicts keyed by ``keys``.

    Args:
        entities: iterable of raw records (list or tuple) whose item at index 3
            is the numeric entity id. Records that are already dicts are passed
            through unchanged, so re-running this cell is harmless.

    Returns:
        list of dicts with the fields named in ``keys``.

    Raises:
        KeyError: if a record carries an entity id that is missing from ``types``.
    """
    records = []
    for values in entities:
        if isinstance(values, dict):
            # Already converted by a previous run of this cell.
            records.append(values)
            continue
        values = list(values)  # accept tuples as well as lists
        records.append(dict(zip(keys, values + [types[values[3]]])))
    return records


# Same conversion for every split; the 'entities' column is replaced in place.
for _frame in (RuLegalNER_train, RuLegalNER_test, RuLegalNER_validation):
    _frame['entities'] = _frame['entities'].apply(_to_entity_dicts)

In [7]:
# Peek at five randomly chosen training rows (no random_state is passed, so the selection differs between runs).
RuLegalNER_train.sample(5)

Unnamed: 0,text,entities
56576,\r\n\r\n\t \tРешение по административном...,"[{'start': 1923, 'end': 1931, 'entity_text': '..."
3119,\r\n\r\n\t \tРешение по административном...,"[{'start': 319, 'end': 332, 'entity_text': 'ми..."
45050,\r\n\r\n\t \tРешение по гражданскому дел...,"[{'start': 1097, 'end': 1102, 'entity_text': '..."
44025,\r\n\r\n\t \tРешение по административном...,"[{'start': 732, 'end': 741, 'entity_text': 'гр..."
46591,\r\n\r\n\t \tРешение по административном...,"[{'start': 267, 'end': 280, 'entity_text': 'ми..."


In [8]:
# One sample surface form per entity id: every occurrence overwrites the
# previous one, so the last entity seen for each id is the one kept.
examples = {
    entity["entity_id"]: entity["entity_text"]
    for doc_entities in RuLegalNER_train['entities']
    for entity in doc_entities
}
examples

{2: 'мировой судья',
 4: 'ЗАО',
 9: 'доход',
 13: 'освидетельствование',
 17: 'уклонение'}

In [9]:
%%capture
!pip install datasets transformers seqeval
!pip install accelerate -U
!pip install razdel

In [10]:
from datasets import load_metric

In [11]:
from razdel import tokenize

def extract_labels(item, tokenizer=None):
    """Tokenize a document and project its character-span entities onto tokens as BIO tags.

    Args:
        item: object exposing ``text`` (the document string) and ``entities``
            (an iterable of dicts with ``start``/``end`` character offsets and
            an ``entity_type`` string), e.g. one row of the loaded DataFrames.
        tokenizer: optional callable ``text -> iterable of tokens`` in which
            every token has ``start``, ``stop`` and ``text`` attributes.
            Defaults to the ``tokenize`` function imported from razdel.

    Returns:
        dict with two parallel lists: ``tokens`` (token strings) and ``labels``
        (``'O'``, ``'B-<type>'`` or ``'I-<type>'`` for each token).
    """
    if tokenizer is None:
        tokenizer = tokenize
    raw_toks = list(tokenizer(item.text))
    words = [tok.text for tok in raw_toks]
    word_labels = ['O'] * len(raw_toks)

    # char2word[c] is the index of the token covering character c, or None when
    # the character belongs to no token (e.g. whitespace between tokens).
    char2word = [None] * len(item.text)
    for i, word in enumerate(raw_toks):
        # Fill exactly the token's own span so the list never changes length.
        char2word[word.start:word.stop] = [i] * (word.stop - word.start)

    for e in item.entities:
        e_words = sorted({idx for idx in char2word[e['start']:e['end']] if idx is not None})
        if not e_words:
            # The span touches no token (whitespace only, or offsets outside the
            # text); skip it instead of failing with IndexError on e_words[0].
            continue
        word_labels[e_words[0]] = 'B-' + e['entity_type']  # beginning
        for idx in e_words[1:]:
            word_labels[idx] = 'I-' + e['entity_type']  # inside

    return {'tokens': words, 'labels': word_labels}

In [12]:
# Inspect the entity annotations stored for the training row labelled 0.
RuLegalNER_train.loc[0,'entities']

[{'start': 173,
  'end': 186,
  'entity_text': 'мировой судья',
  'entity_id': 2,
  'entity_type': 'IND'},
 {'start': 988,
  'end': 996,
  'entity_text': 'ответчик',
  'entity_id': 2,
  'entity_type': 'IND'},
 {'start': 348,
  'end': 351,
  'entity_text': 'ООО',
  'entity_id': 4,
  'entity_type': 'LE'},
 {'start': 353,
  'end': 360,
  'entity_text': 'капитал',
  'entity_id': 9,
  'entity_type': 'PEN'},
 {'start': 729,
  'end': 733,
  'entity_text': 'пени',
  'entity_id': 9,
  'entity_type': 'PEN'}]

In [13]:
# Tokenize the row once and reuse the result; calling extract_labels again for
# every printed label would repeat the tokenization of the same document.
first_doc = extract_labels(RuLegalNER_train.loc[0])
for token, label in zip(first_doc['tokens'], first_doc['labels']):
  if label != 'O':
    print(token + ": " + label)

Мировой: B-IND
судья: I-IND
ООО: B-LE
Капитал: B-PEN
пени: B-PEN
Ответчик: B-IND


# Saving tokenized pandas datasets

In [14]:
def _tokenize_split(frame):
    """Apply extract_labels to every row of ``frame`` and collect the results.

    Rows are visited positionally with ``itertuples``, so the result does not
    depend on the frame having a default 0..n-1 index, and no intermediate
    Series is built per row. Each row tuple exposes the ``text`` and
    ``entities`` attributes that extract_labels reads.

    Args:
        frame: DataFrame with ``text`` and ``entities`` columns.

    Returns:
        DataFrame with one row per input row, built from the dicts returned by
        extract_labels.
    """
    return pd.DataFrame([extract_labels(row) for row in frame.itertuples(index=False)])


# Each source frame is dropped right after its tokenized version is saved.
ner_train = _tokenize_split(RuLegalNER_train)
ner_train.to_pickle("/content/drive/MyDrive/coursework2024/RuLegalNER_train.pkl")
del RuLegalNER_train

ner_test = _tokenize_split(RuLegalNER_test)
ner_test.to_pickle("/content/drive/MyDrive/coursework2024/RuLegalNER_test.pkl")
del RuLegalNER_test

ner_validation = _tokenize_split(RuLegalNER_validation)
ner_validation.to_pickle("/content/drive/MyDrive/coursework2024/RuLegalNER_validation.pkl")
# del RuLegalNER_validation

ner_validation.sample(5)

Unnamed: 0,tokens,labels
12766,"[Решение, по, административному, делу, Дело, №...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
9248,"[Решение, по, гражданскому, делу, РЕШЕНИЕ, Име...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
6534,"[Решение, по, гражданскому, делу, Дело, №, NNN...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
105,"[Решение, по, гражданскому, делу, Дело, №, 2-,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
6543,"[Решение, по, административному, делу, Дело, №...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [17]:
# Take the validation rows labelled 5 through 7 as a small example set (.loc slicing includes both endpoints).
example = ner_validation.loc[5:7]
example

Unnamed: 0,tokens,labels
5,"[Решение, по, гражданскому, делу, Дело, №, 2-5...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
6,"[Решение, по, гражданскому, делу, Дело, №, 2, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
7,"[Решение, по, административному, делу, Адм, .,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [18]:
# Save the small example frame as a pickle in the same coursework2024 folder as the full splits.
example.to_pickle("/content/drive/MyDrive/coursework2024/RuLegalNER_example.pkl")