<a href="https://colab.research.google.com/github/wizard339/education/blob/main/misis/nlp/token_classification/token_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets evaluate

In [42]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification
from sklearn.preprocessing import LabelEncoder

In [None]:
!wget 'https://raw.githubusercontent.com/wizard339/education/main/misis/nlp/token_classification/train.txt'
!wget 'https://raw.githubusercontent.com/wizard339/education/main/misis/nlp/token_classification/dev.txt'
!wget 'https://raw.githubusercontent.com/wizard339/education/main/misis/nlp/token_classification/test.txt'

In [5]:
with open('/content/train.txt', 'r') as f:
    train_txt = f.read()

with open('/content/dev.txt', 'r') as f:
    val_txt = f.read()

with open('/content/test.txt', 'r') as f:
    test_txt = f.read()

In [6]:
def text_preproc(text: str) -> pd.DataFrame:
    '''
    preprocessing input texts 
    '''
    splitted_text = text.split('\n')
    sentences, tags = [], []
    sentence, tag = [], []
    
    for text in splitted_text:

        if text != '':
            text = text.split(' ')
            sentence.append(text[0])
            tag.append(text[1])
        else:
            sentences.append(sentence)
            tags.append(tag)
            sentence, tag = [], []
    
    df = pd.DataFrame({'Text': sentences,
                       'Tags': tags})

    return df

In [7]:
train_data = text_preproc(train_txt)
print(train_data.shape)
train_data.head()

(7747, 2)


Unnamed: 0,Text,Tags
0,"["", Если, Миронов, занял, столь, оппозиционную...","[O, O, B-PER, O, O, O, O, O, O, O, O, O, O, O,..."
1,"[Источник, "", Ъ, '', в, руководстве, столичной...","[O, O, B-ORG, O, O, O, O, O, O, O, O, O, O, B-..."
2,"[В, Ханты-Мансийском, автономном, округе, с, д...","[O, B-LOC, I-LOC, I-LOC, O, O, O, O, B-ORG, B-..."
3,"[С, 1992, года, по, настоящее, время, является...","[O, O, O, O, O, O, O, O, B-ORG, I-ORG, I-ORG, ..."
4,"[Для, этого, ей, пришлось, выиграть, выборы, в...","[O, O, O, O, O, O, O, O, O, O, B-LOC, I-LOC, O..."


In [8]:
val_data = text_preproc(val_txt)
print(val_data.shape)
val_data.head()

(2583, 2)


Unnamed: 0,Text,Tags
0,"[как, акционерный, коммерческий, Московский, м...","[O, O, O, B-ORG, I-ORG, I-ORG, I-ORG, I-ORG, I..."
1,"[Управлять, ЦАО, и, САО, вместо, Алексея, Алек...","[O, B-LOC, O, B-LOC, O, B-PER, I-PER, O, B-PER..."
2,"[О, задержании, Шакирьянова, стало, известно, ...","[O, O, B-PER, O, O, O, O, O, O, O]"
3,"[После, майского, ухода, вице-премьера, Владис...","[O, O, O, O, B-PER, I-PER, O, O, O, O, O, B-PE..."
4,"[Армяне, со, мной, согласились, ,, с, Ильхамом...","[O, O, O, O, O, O, B-PER, I-PER, O, O, O, O, O..."


In [9]:
test_data = text_preproc(test_txt)
print(test_data.shape)
test_data.head()

(2583, 2)


Unnamed: 0,Text,Tags
0,"[Тогда, замешанные, в, скандале, прокуроры, от...","[O, O, O, O, O, O, O, O, O, O, O, O, O]"
1,"[Символичным, назвал, председатель, РФМ, ,, де...","[O, O, O, B-ORG, O, O, B-ORG, I-ORG, B-LOC, B-..."
2,"[На, посту, гендиректора, Yahoo, !, B-ORG, Кэр...","[O, O, O, O, O, O, B-PER, I-PER, O, B-PER, I-P..."
3,"[Считаю, невозможным, руководить, областью, с,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[Боксер, победила, Карли, Фиорину, (, Carly, F...","[B-PER, O, B-PER, I-PER, O, I-PER, O, O, O, O,..."


In [15]:
hg_train = Dataset.from_pandas(train_data)
assert hg_train.shape == train_data.shape
print(hg_train.shape)

(7747, 2)


In [16]:
hg_val = Dataset.from_pandas(val_data)
assert hg_val.shape == val_data.shape
print(hg_val.shape)

(2583, 2)


In [17]:
hg_test = Dataset.from_pandas(test_data)
assert hg_test.shape == test_data.shape
print(hg_test.shape)

(2583, 2)


In [19]:
print(hg_train)
print(hg_val)
print(hg_test)

Dataset({
    features: ['Text', 'Tags'],
    num_rows: 7747
})
Dataset({
    features: ['Text', 'Tags'],
    num_rows: 2583
})
Dataset({
    features: ['Text', 'Tags'],
    num_rows: 2583
})


In [40]:
ner_tags = []
for tag_list in hg_train['Tags']:
    for tag in tag_list:
        ner_tags.append(tag)
ner_tags = list(set(ner_tags))

print(ner_tags)

['O', 'B-LOC', 'B-ORG', 'I-LOC', 'I-ORG', 'I-PER', 'B-PER']


In [50]:
ner_tags = ['O', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']
le = LabelEncoder()
le.fit(ner_tags)
le.classes_
# for el in hg_train['Tags']:
#     le.transform(el)

array(['B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER', 'O'],
      dtype='<U5')

In [47]:
hg_train['Tags'][0]

['O',
 'O',
 'B-PER',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-ORG',
 'I-ORG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-PER',
 'O']

In [21]:
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large-finetuned-conll03-english")

Downloading (…)lve/main/config.json:   0%|          | 0.00/852 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [23]:
tokenizer.is_fast

True

In [33]:
inputs = tokenizer(hg_train['Text'][0], is_split_into_words=True)
inputs.tokens()

['<s>',
 '▁"',
 '▁Если',
 '▁Мир',
 'о',
 'нов',
 '▁за',
 'нял',
 '▁столь',
 '▁оп',
 'пози',
 'цион',
 'ную',
 '▁позицию',
 '▁',
 ',',
 '▁то',
 '▁мне',
 '▁представляет',
 'ся',
 '▁',
 ',',
 '▁что',
 '▁для',
 '▁него',
 '▁было',
 '▁бы',
 '▁поряд',
 'о',
 'чным',
 '▁и',
 '▁прави',
 'льным',
 '▁уйти',
 '▁в',
 '▁от',
 'ставку',
 '▁с',
 '▁занима',
 'емого',
 '▁им',
 '▁поста',
 '▁',
 ',',
 '▁поста',
 '▁',
 ',',
 '▁который',
 '▁предо',
 'ставлен',
 '▁ему',
 '▁сегодня',
 '▁"',
 '▁Един',
 'ой',
 '▁Россией',
 "▁''",
 '▁и',
 '▁ни',
 'кем',
 '▁больше',
 "▁''",
 '▁',
 ',',
 '▁-',
 '▁заключа',
 'ет',
 '▁Иса',
 'ев',
 '▁',
 '.',
 '</s>']

In [34]:
inputs.word_ids()

[None,
 0,
 1,
 2,
 2,
 2,
 3,
 3,
 4,
 5,
 5,
 5,
 5,
 6,
 7,
 7,
 8,
 9,
 10,
 10,
 11,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 17,
 17,
 18,
 19,
 19,
 20,
 21,
 22,
 22,
 23,
 24,
 24,
 25,
 26,
 27,
 27,
 28,
 29,
 29,
 30,
 31,
 31,
 32,
 33,
 34,
 35,
 35,
 36,
 37,
 38,
 39,
 39,
 40,
 41,
 42,
 42,
 43,
 44,
 44,
 45,
 45,
 46,
 46,
 None]

In [None]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as precious token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels