In [None]:
pip install ipymarkup

In [None]:
import numpy as np 
import pandas as pd 
import os
import torch
from torch import nn
import re
from tqdm import tqdm, tqdm_notebook
from torch.optim import Adam, AdamW
import torchtext
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
pd.options.mode.chained_assignment = None  # default='warn'
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import f1_score, accuracy_score
from ipymarkup import show_span_box_markup

In [None]:
# Dataset locations for Andrey's copy of the data (kept for reference)
# PATH1 = '/kaggle/input/rurebustraindata/train_part_1/train_part_1'
# PATH2 = '/kaggle/input/rurebustraindata/train_part_2/train_part_2'
# PATH3 = '/kaggle/input/rurebustraindata/train_part_3/train_part_3'
# Dataset locations for Harry's and Tanya's copy of the data
PATH1 = '/kaggle/input/rurebus-train-data/train_part_1/train_part_1'
PATH2 = '/kaggle/input/rurebus-train-data/train_part_2/train_part_2'
PATH3 = '/kaggle/input/rurebus-train-data/train_part_3/train_part_3'
PATH4 = '/kaggle/input/rurebus-train-data/test_ner_only' # for this to work the dataset must be updated to its latest version
# entity class names
NAMES = ['BIN', 'SOC', 'MET', 'CMP', 'ECO', 'INST', 'ACT', 'QUA']
# BIO tags used for the token-level markup (a B-/I- pair per lower-cased entity class)
TAG = ['B-bin', 'I-bin', 'B-soc','I-soc','B-met', 'I-met','B-cmp','I-cmp','B-eco', 'I-eco','B-inst','I-inst','B-act','I-act','B-qua','I-qua']

В данных присутствует множество текстов, разбитых по частям (part_1, part_2 и т. д.). Части одного текста представляют собой логически целое, однако аннотации не переходят из одной части в другую.
В аннотации присутствует следующая информация: именование сущности, её класс, а также местоположение в тексте (посимвольные координаты).
Что касается отношений, то даётся информация о номерах сущностей и классе связи.
Для дальнейшей работы необходимо понять, какой формат входных данных нужен для CNN-LSTM.


In [None]:
# Prepare the data for training
def make_data(lst,path):
    """Build one token/tag table from a list of documents.

    Args:
        lst: document names without extension; for each name both
            ``<path>/<name>.ann`` and ``<path>/<name>.txt`` are read.
        path: directory that contains the documents.

    Returns:
        DataFrame with columns ``word`` and ``tag``: one row per non-empty
        text line, each cell holding a list. The index is renumbered from 0.
        An empty ``lst`` yields an empty frame with the same two columns.
    """
    frames = []
    with tqdm(desc="n", total=len(lst)) as pbar_outer:
        for item in lst:
            ann_df = make_ann(item, path)
            frames.append(make_text(item, ann_df, path))
            pbar_outer.update(1)
    # Concatenate once at the end: growing a DataFrame inside the loop
    # re-copies all previously collected rows on every iteration.
    if not frames:
        return pd.DataFrame(columns = ['word', 'tag'])
    return pd.concat(frames, ignore_index=True)


# Prepare the annotations of one document
def make_ann(file,path):
    """Read ``<path>/<file>.ann`` and return its entity annotations.

    The file is parsed as tab-separated without a header; malformed lines are
    skipped by the parser. The second field is split on spaces into the entity
    class (first piece) and the remaining pieces (the coordinates). Rows with
    any missing value are dropped (the original author's intent: removing the
    relation rows). The entity text is tokenised with ``my_split``/``del_all``.

    Returns:
        DataFrame ordered by start offset with columns
        ``0, class, coords1, coords, words`` where ``coords1`` is the integer
        start offset and ``words`` is a list of tokens.
    """
    def _tokenize(entity_text):
        # same three steps the text lines go through: split, expand, filter
        pieces = del_all(my_split(entity_text.split(" ")))
        return [piece.strip() for piece in pieces if piece not in ['','»', '«',':',' ']]

    table = pd.read_csv(path+'/'+file+'.ann', sep='\t', engine='python', header=None, on_bad_lines='skip')
    table = table.rename(columns = {1:'class', 2:'words'})

    # the second field looks like "<CLASS> <start> <end>": separate the pieces
    header = table['class']
    table.insert(2, 'coords', header.apply(lambda cell: cell.split(" ")[1:]))
    table['class'] = header.apply(lambda cell: cell.split(" ")[0])

    # drop incomplete rows and renumber
    table = table.dropna().reset_index(drop=True)

    # integer start offset, for convenience
    table.insert(2, 'coords1', table['coords'].apply(lambda span: int(span[0])))
    table['words'] = table['words'].apply(_tokenize)

    return table.sort_values(by='coords1').reset_index(drop=True)


# Mark-up of one document
def make_text(file, df,path):
    """Tokenise ``<path>/<file>.txt`` line by line and tag the tokens.

    Args:
        file: document name without extension.
        df: annotation frame produced by ``make_ann`` for the same document;
            ``find_rows`` adds an ``idx`` column to it in place.
        path: directory that contains the document.

    Returns:
        DataFrame with columns ``word`` (lower-cased tokens of a line) and
        ``tag`` (one tag per token); lines without tokens are dropped.

    NOTE(review): the file is opened with the platform default encoding, and
    the annotation offsets are compared against line lengths as read here —
    confirm both match the way the offsets were produced.
    """
    # open the file and load its lines into a dataframe
    with open(path+'/'+file+'.txt') as f:
        lines = f.readlines()
    text_df = pd.DataFrame({'word':lines})
    # 'len' is the length of each line (newline included); 'new_len' is the
    # running total, i.e. the character offset just past the end of the line.
    # cumsum() replaces re-summing the whole prefix for every line.
    text_df.insert(1, 'len', text_df['word'].apply(len).astype('int64'))
    text_df.insert(2, 'new_len', text_df['len'].cumsum())
    # column that will receive the mark-up (0 means "not tagged yet")
    text_df.insert(3, 'tag', 0)
    # strip the trailing \n
    text_df['word'] = text_df['word'].apply(lambda x: re.split('\n',x)[0])
    # drop the empty lines
    idx = [i for i in range(len(text_df)) if len(text_df['word'][i])==0]
    text_df = text_df.drop(index=idx)
    text_df.reset_index(drop = True, inplace= True)
    # find the line index for every annotation
    df = find_rows(text_df, df)
    # turn each line into a list of tokens
    text_df['word'] = text_df['word'].apply(lambda x: my_split([item for item in re.split(' ',x) if item != '']))
    text_df['word'] = text_df['word'].apply(lambda x: del_all(x))
    text_df['word'] = text_df['word'].apply(lambda x: [item.strip() for item in x if item not in ['',' ']])
    # the mark-up itself
    text_df = make_markup(text_df, df)
    text_df = last_changes(text_df)
    # remove the helper columns
    del text_df['len']
    del text_df['new_len']
    text_df['word'] = text_df['word'].apply(lambda x: [item.lower() for item in x if item not in ['',' ']])
    # drop the lines that ended up without tokens
    idx = [i for i in range(len(text_df)) if len(text_df['word'][i])==0]
    text_df = text_df.drop(index=idx)
    text_df.reset_index(drop = True, inplace= True)
    return text_df


# Split off everything that can be split off
def my_split(lst):
    """Split tokens on punctuation and special characters, in place.

    Three passes are made over ``lst`` from the last element to the first.
    In each pass an element equal to ``''`` or ``' '`` is removed; any other
    element has non-breaking spaces and tabs replaced by spaces, runs of the
    ellipsis character removed, and is then cut at the first occurrence of
    every separator, working from the right-most cut to the left-most.
    A cut in the middle of a token yields ``head, char, tail``; a cut at
    position 0 yields ``char, tail``.

    Args:
        lst: list of strings; it is mutated.

    Returns:
        The same list object. It may still contain empty strings (a cut at
        the last character leaves an empty tail), so callers filter them out.
    """
    separators = ['+', ')', '»',';','.',',', '"','(', '«',':',' ', '-\t','\\', '/','”','“','-','–','_________','*','№','%']
    for _ in range(3):
        for i in range(len(lst)-1,-1,-1):
            if lst[i] in ['',' ']:
                lst.pop(i)
                continue
            token = lst[i].replace('\xa0', ' ').replace('\t', ' ')
            token = token.replace('………………','').replace('………','').replace('……','')
            lst[i] = token
            lens = len(token)
            # first occurrence of every separator (-1 when absent)
            idx = [token.find(sep) for sep in separators]
            # a trailing quote or dot is cut even when its first occurrence is earlier
            for tail in ['\xa0','"','.']:
                if token.endswith(tail) and lens-1 not in idx:
                    idx.append(lens-1)
            # cut from the right so that the remaining positions stay valid
            idx.sort(reverse=True)
            for item in idx:
                if item == -1:
                    continue
                if item != 0:
                    lst.insert(i+1, lst[i][item])
                    lst.insert(i+2, lst[i][item+1:])
                    lst[i] = lst[i][:item]
                else:
                    lst.insert(i+1, lst[i][item+1:])
                    lst[i] = lst[i][item]
                # The original had a third branch for `item == lens-1`; it could
                # never run because the two branches above cover every item != -1.
    return lst


def del_all(lst):
    """Replace, in place, every element by its pieces split on a single space.

    Elements without a space are kept as they are; consecutive spaces produce
    empty-string pieces. The same list object is returned.
    """
    lst[:] = [piece for element in lst for piece in element.split(' ')]
    return lst


# Handle the lines without entities and the words left untagged
def last_changes(df):
    """Finalise the ``tag`` column so every token carries a tag.

    A row whose tag cell is still ``0`` gets ``'O'`` for every token.
    In the other rows each entry that is not one of ``TAG`` is replaced by
    ``'O'``. After each row the lengths of ``word`` and ``tag`` are asserted
    to be equal. The frame is modified in place and returned.
    """
    for i in range(len(df)):
        current = df['tag'][i]
        if current == 0:
            df['tag'][i] = ['O'] * len(df['word'][i])
        else:
            df['tag'][i] = [mark if mark in TAG else 'O' for mark in current]
        assert (len(df['word'][i]) == len(df['tag'][i]))
    return df

# Find the line index for every annotation
def find_rows(txt_df, ann_df):
    """Add an ``idx`` column to ``ann_df``: the line that contains each annotation.

    Line ``j`` covers the half-open character interval
    ``[new_len[j] - len[j], new_len[j])``. The first line whose interval
    contains the annotation's start offset (``coords[i][0]``) is stored in
    ``idx``; annotations that fall into no line keep the initial value 0.

    Args:
        txt_df: frame with integer columns ``len`` and ``new_len``; rows are
            addressed by position, which matches the labels of the freshly
            reset index used by the caller.
        ann_df: frame with a ``coords`` column and index labels 0..n-1; it
            must have at least four columns and no ``idx`` column yet. It is
            modified in place.

    Returns:
        ``ann_df`` with the new integer column ``idx`` at position 4.
    """
    ann_df.insert(4, 'idx' , 0)
    ends = txt_df['new_len'].tolist()
    starts = [end - length for end, length in zip(ends, txt_df['len'].tolist())]
    for i in range(len(ann_df)):
        position = int(ann_df['coords'][i][0])
        for j, (start, end) in enumerate(zip(starts, ends)):
            # plain interval test instead of building np.arange(start, end) per check
            if start <= position < end:
                # .at writes into the frame itself (no chained assignment)
                ann_df.at[i, 'idx'] = j
                break
    return ann_df
    
# Main part of the mark-up
def make_markup(text_df, ann_df):
    """Write B-/I- tags for every annotation into ``text_df['tag']``.

    For annotation ``i`` the line at position ``ann_df['idx'][i]`` is taken.
    If its tag cell is still ``0`` the cell is first set to a copy of the
    line's token list. Then, for every token of the annotation, the first
    position of that token is looked up with ``list.index`` and the tag cell
    entry at that position is overwritten with ``'B-<class>'`` for the first
    token and ``'I-<class>'`` for the following ones (class lower-cased).

    Raises:
        ValueError: from ``list.index`` when an annotation token is not
            present in the searched list.

    NOTE(review): ``list.index`` returns the first match only, so a token
    that occurs several times in the line is presumably always tagged at its
    first occurrence — confirm against the annotation offsets.
    """
    for i in range(len(ann_df)):
        words_count = len(ann_df['words'][i])
        # NOTE(review): `lens` is computed but never used below.
        lens = [len(item) for item in ann_df['words'][i]] 
        # one-row frame for the line this annotation was assigned to
        row = text_df.iloc[[ann_df['idx'][i]]]
#         print(row['word'].item())
        if row['tag'].item() == 0:
                        # first annotation on this line: start the tags from a copy of the tokens
                        text_df['tag'][ann_df['idx'][i]] = text_df['word'][ann_df['idx'][i]].copy()
                        # the local one-row frame is searched below, so give it the tokens as well
                        row['tag'] = row['word']
        for k in range(words_count):
                if k == 0:
                    # first token of the entity -> B- tag
                    idx = (row['tag'].item()).index(ann_df['words'][i][k])
                    text_df['tag'][ann_df['idx'][i]][idx] = 'B-'+ ann_df['class'][i].lower()
                else:
                    # following tokens of the entity -> I- tag
                    idx = row['tag'].item().index(ann_df['words'][i][k])
                    text_df['tag'][ann_df['idx'][i]][idx] = 'I-'+ ann_df['class'][i].lower()
    return text_df

In [None]:
# List of all documents used for training.
# Every document is stored as two files (.ann and .txt); cutting the last four
# characters and collecting the names in a set leaves one entry per document.
train_names = {item[:-4] for item in os.listdir(PATH1)}
# '.stats_c' is not a document (presumably the truncated name of a service
# file in the folder); discard() does not fail when it is absent.
train_names.discard('.stats_c')
# Sort: the iteration order of a set of strings changes between kernel
# restarts, which would reorder the training rows from run to run.
data_lst = sorted(train_names)
train_df = make_data(data_lst, PATH1)
train_df

In [None]:
# Documents for validation: a single document from PATH2 is used.
val_names = {item[:-4] for item in os.listdir(PATH2)}
# not a document; it would also sort first because '.' precedes the digits
val_names.discard('.stats_c')
# Take the first name in sorted order so that the same document is chosen on
# every run (the first element of an unordered set is arbitrary).
data_lst = sorted(val_names)[0]
val_df = make_data([data_lst], PATH2)
val_df

In [None]:
# Token vocabulary. "<PAD>" is listed first so that it receives index 0, the
# value the sequences are padded with later on; previously index 0 was "<UNK>",
# which made padding and unknown words indistinguishable.
vocab = build_vocab_from_iterator(train_df['word'], min_freq=1, specials=["<PAD>", "<UNK>"])
# words that were not seen in training map to "<UNK>"
vocab.set_default_index(vocab["<UNK>"])
# Tag vocabulary, built from the training tags only.
# NOTE(review): it has no padding entry, so padded label positions (value 0)
# coincide with whichever tag has index 0 — confirm this is acceptable.
vocab_lables = build_vocab_from_iterator(train_df['tag'], min_freq=1)

In [None]:
# Sanity check: vocabulary indices of the tokens in the first training row
vocab(train_df['word'][0])

In [None]:
def make_tl(df):
    """Add index-encoded columns to ``df`` and report the longest row.

    ``tokens`` holds the ``vocab`` indices of ``word`` and ``lables`` holds
    the ``vocab_lables`` indices of ``tag``. The frame is modified in place.

    Returns:
        ``(df, max_len)`` where ``max_len`` is the largest number of words in
        a row (0 for an empty frame).
    """
    sentences = [df['word'][ind] for ind in df.index]
    tag_rows = [df['tag'][ind] for ind in df.index]
    df['tokens'] = [vocab(sentence) for sentence in sentences]
    df['lables'] = [vocab_lables(tags) for tags in tag_rows]
    max_len = max((len(sentence) for sentence in sentences), default=0)
    return df,max_len


def make_pad(df):
    """Pad the encoded rows of ``df`` to a common length.

    ``tokens`` and ``lables`` are passed to ``pad_sequences`` with its default
    settings, the shape of the padded token matrix is printed, and a new frame
    with the columns ``sentence`` and ``labels`` (lists of equal length) is
    returned. The input frame is not modified.
    """
    encoded_sentences = [df['tokens'][ind] for ind in df.index]
    encoded_labels = [df['lables'][ind] for ind in df.index]
    padded_sent = pad_sequences(encoded_sentences)
    padded_labels = pad_sequences(encoded_labels)
    print(padded_sent.shape)
    padd_df = pd.DataFrame(columns = ['sentence', 'labels'])
    for column, matrix in (('sentence', padded_sent), ('labels', padded_labels)):
        padd_df[column] = pd.Series(matrix.tolist())
    return padd_df

In [None]:
# Encode words and tags as vocabulary indices (adds 'tokens' and 'lables' columns)
# and remember the longest row of each split.
train_df,max_len1 = make_tl(train_df)
val_df,max_len2 = make_tl(val_df)
train_df

In [None]:
# Longest row (in tokens) of the training and of the validation split
print(max_len1, ' ',max_len2)

In [None]:
# Apply the padding.
# NOTE: train_df / val_df are rebound to the padded frames here, so this cell
# cannot be re-run on its own (make_pad needs the 'tokens'/'lables' columns).
train_df = make_pad(train_df)
val_df = make_pad(val_df)
train_df

In [None]:
# Padded row length of each split (each split is padded separately)
print(len(train_df['sentence'][4]),len(val_df['sentence'][4]))

In [None]:
def one_hot(x: np.ndarray, vocab_len: int) -> np.ndarray:
    """
    Args:
        x - one-dimensional sequence of vocabulary indices
        vocab_len - size of the vocabulary
    Returns:
        two-dimensional array encoded, where encoded[i] is the one-hot encoding of x[i]
    """
    encoded = np.zeros((len(x), vocab_len))
    # iterating a 2-D array yields row views, so writing to `row` fills `encoded`
    for row, position in zip(encoded, x):
        row[position] = 1
    return encoded

# Vocabulary sizes; used as default arguments of TokenDataset below
text_vocab_len = len(vocab)
target_vocab_len = len(vocab_lables)

class TokenDataset(torch.utils.data.Dataset):
    """Dataset over a frame whose two columns are padded token ids and label ids.

    Args:
        data: DataFrame with exactly two columns; each cell is a list of ints
            (first column: token indices, second column: label indices).
        text_vocab_len: size of the token vocabulary (stored, not used here).
        target_vocab_len: number of label classes used for the one-hot targets.
        classes, transform, target_transform: accepted for compatibility with
            existing callers and ignored.
    """
    def __init__(self, data, text_vocab_len = text_vocab_len, target_vocab_len = target_vocab_len, classes = None,
                 transform=None, target_transform=None):
        self.data = data
        self.text_vocab_len = text_vocab_len
        # keep the value that was passed in: __getitem__ used to read the
        # notebook-level variable and silently ignored this argument
        self.target_vocab_len = target_vocab_len
        # purely positional lookup (iloc[0][0] relied on an integer key falling
        # back to a position on a labelled Series); 0 for an empty frame
        self.sequence_len = len(data.iloc[0, 0]) if len(data) else 0
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        """Return ``(token ids as int32, one-hot labels as float32)`` for row ``idx``."""
        tokens, tag = self.data.iloc[idx]
        # build the integer tensor directly instead of converting through float32
        token_ids = torch.tensor(tokens, dtype=torch.int32)
        targets = torch.as_tensor(one_hot(tag, self.target_vocab_len), dtype=torch.float32)
        return token_ids, targets


In [None]:
# Wrap the padded frames as torch datasets, keyed by phase name
datasets = {
    'train': TokenDataset(train_df),
    'val': TokenDataset(val_df)
}

In [None]:
# Batch loaders per phase; only the training data is shuffled.
# train_model reads this dictionary as a notebook-level variable.
dataloader = {
    'train':
    torch.utils.data.DataLoader(datasets['train'],
                                batch_size=16,
                                shuffle=True,
                                num_workers=0),  # for Kaggle
    'val':
    torch.utils.data.DataLoader(datasets['val'],
                                batch_size=16,
                                shuffle=False,
                                num_workers=0)  # for Kaggle
}

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

In [None]:
class CNN_LSTM(nn.Module):
    """Token tagger: embedding -> 1-D convolution -> max-pool -> LSTM -> linear.

    Args:
        vocab_size: number of rows of the embedding table.
        n_classes: number of output classes per token.
        embedding_dim: size of the token embeddings.
        hidden_size: size of the LSTM hidden state.
        filters: accepted for compatibility with existing callers; unused.

    ``forward`` takes integer token ids of shape ``(batch, seq_len)`` and
    returns scores of shape ``(batch * seq_len, n_classes)``.
    """
    def __init__(self, vocab_size, n_classes, embedding_dim=250, hidden_size = 32, filters=((2, 10), (3, 8))):
        super().__init__()
        conv_channels = 200
        pool_size = 2

        self.embeddings_layer = nn.Embedding(vocab_size, embedding_dim)
        # kernel 3 with padding 1 keeps the sequence length unchanged
        self.conv1 = nn.Conv1d(in_channels=embedding_dim, out_channels=conv_channels, kernel_size=3,padding=1)
        self.pool1 = nn.MaxPool1d(pool_size)
        # the pooling is applied over the feature axis (see forward), so the
        # LSTM receives conv_channels // pool_size features per token
        input_size = conv_channels // pool_size
        self.hidden_size = hidden_size
        self.lstm_layer = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(self.hidden_size, n_classes)

    def forward(self, inputs):
        # call the module itself rather than .forward() so that hooks run
        projections = self.embeddings_layer(inputs)      # (batch, seq, emb)
        projections = projections.transpose(1, 2)        # (batch, emb, seq)
        projections = self.conv1(projections)            # (batch, channels, seq)
        projections = projections.transpose(1, 2)        # (batch, seq, channels)
        projections = self.pool1(projections)            # (batch, seq, channels // 2)
        output, _ = self.lstm_layer(projections)         # (batch, seq, hidden)
        # flatten batch and sequence so every token is one row for the classifier
        output = output.reshape(-1, self.hidden_size)
        return self.fc(output)


In [None]:
def train_model(model, criterion, optimizer, num_epochs=3):
    """Train ``model`` and evaluate it after every epoch.

    Batches come from the notebook-level ``dataloader`` dictionary (keys
    ``'train'`` and ``'val'``) and are moved to the notebook-level ``device``.
    Labels are expected as one-hot tensors of shape
    ``(batch, n_words, n_classes)``; the model output has one row per token.

    Args:
        model: network to train.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimizer over the model parameters.
        num_epochs: number of passes over the training data.

    Returns:
        ``(all_true_labels, all_preds, inputs_str, loss_list, acc_list, f1_list)``.
        The first two hold the class indices of the last phase that ran (the
        validation phase of the final epoch), ``inputs_str`` is always an
        empty list, and the three dictionaries hold one float per epoch under
        the keys ``'train'`` and ``'val'``.

    NOTE(review): every position of the padded sequences takes part in the
    loss and in the metrics — confirm that this is intended.
    """
    all_true_labels = []
    all_preds = []
    inputs_str = []
    loss_list = {'train' : [], 'val':[]}
    acc_list = {'train' : [], 'val':[]}
    f1_list = {'train' : [], 'val':[]}
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-' * 10)
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            if is_train:
                print('start train')
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            all_true_labels = []
            all_preds = []
            inputs_str = []
            for inputs, labels in tqdm(dataloader[phase]):
                batch_size, n_words, n_classes = labels.shape
                labels = labels.reshape(-1, n_classes).to(device)
                # gradients are tracked only while training; validation no
                # longer builds an autograd graph
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs.to(device))
                    loss = criterion(outputs, labels)
                if is_train:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                _, preds = torch.max(outputs, 1)
                running_loss += loss.item()
                # back from one-hot rows to class indices
                _, labels = torch.max(labels, 1)

                all_true_labels.extend(labels.tolist())
                all_preds.extend(preds.tolist())
            epoch_loss = running_loss /len(dataloader[phase])
            # scikit-learn expects (y_true, y_pred); float() accepts both the
            # numpy scalars and the plain floats returned by different versions
            epoch_acc = float(accuracy_score(all_true_labels, all_preds))
            epoch_f1 = float(f1_score(all_true_labels, all_preds, average='macro'))
            print('{} loss: {:.4f}, acc: {:.4f}, f1: {:.4f}'.format(phase,
                                                        epoch_loss,
                                                        epoch_acc,
                                                        epoch_f1
                                                        ))
            loss_list[phase].append(epoch_loss)
            acc_list[phase].append(epoch_acc)
            f1_list[phase].append(epoch_f1)
    return all_true_labels, all_preds, inputs_str, loss_list, acc_list, f1_list

In [None]:
# Model, loss and optimizer. The classifier has one output per entry of the
# tag vocabulary and the embedding table one row per entry of the token vocabulary.
n_classes = len(vocab_lables)
model = CNN_LSTM(len(vocab), n_classes = n_classes).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = Adam(model.parameters(), lr = 3e-4)

In [None]:
# Number of training epochs; the per-epoch histories are kept for plotting
epox_num = 15
all_true_labels, all_preds, inputs_str, loss, acc, f1 = train_model(model, criterion, optimizer, epox_num)

In [None]:
epox_list = [i for i in range(epox_num)]
def graf(loss, acc, f1_scores=None):
    """Plot loss, accuracy and F1 per epoch for the training and validation phases.

    Args:
        loss, acc: dictionaries with the keys ``'train'`` and ``'val'``, each
            holding one value per epoch.
        f1_scores: dictionary of the same form with the F1 values. When
            omitted, the notebook-level ``f1`` is used, which is what the
            function always read before this parameter existed.

    The x axis is derived from the length of each series, so the plot no
    longer depends on the notebook-level ``epox_list`` being in sync.
    """
    # imported here so that the function does not depend on a later cell
    import matplotlib.pyplot as plt

    if f1_scores is None:
        f1_scores = f1
    histories = (loss, acc, f1_scores)
    titles = {
        'train': ("Изменение потерь на обучающей выборке",
                  "Изменение точности на обучающей выборке",
                  "Изменение f1-score на обучающей выборке"),
        'val': ("Изменение потерь на валидационной выборке",
                "Изменение точности на валидационной выборке",
                "Изменение f1-score на валидационной выборке"),
    }

    fig, ax = plt.subplots(2, 3, figsize=(26, 13))
    for row_no, phase in enumerate(('train', 'val')):
        for col_no, history in enumerate(histories):
            values = history[phase]
            ax[row_no, col_no].plot(range(len(values)), values)
            ax[row_no, col_no].set_title(titles[phase][col_no])
    plt.show()

In [None]:
# NOTE(review): this import sits in the middle of the notebook; graf() reads
# `plt` when it is called, so it must stay before the call below.
import matplotlib.pyplot as plt
# Plot the training history (graf also reads the notebook-level `f1`)
graf(loss, acc)

In [None]:
def make_test(model,dataloader,dataset):
    """Run ``model`` over ``dataloader`` and print accuracy and macro F1.

    Args:
        model: trained network; it is switched to evaluation mode.
        dataloader: yields ``(inputs, labels)`` with one-hot labels of shape
            ``(batch, n_words, n_classes)``.
        dataset: accepted for compatibility with existing callers; unused.

    Returns:
        ``(all_true_labels, all_preds)``: flat lists of class indices, one
        entry per token position of every sequence, in loader order.
    """
    all_true_labels = []
    all_preds = []
    model.eval()
    # inference only: do not record operations for autograd
    with torch.no_grad():
        for inputs, labels in tqdm(dataloader):
            batch_size, n_words, n_classes = labels.shape
            labels = labels.reshape(-1, n_classes).to(device)
            outputs = model(inputs.to(device))
            _, preds = torch.max(outputs, 1)
            # back from one-hot rows to class indices
            _, labels = torch.max(labels, 1)
            all_true_labels.extend(labels.tolist())
            all_preds.extend(preds.tolist())
    # scikit-learn expects (y_true, y_pred); float() accepts both the numpy
    # scalars and the plain floats returned by different versions
    epoch_acc = float(accuracy_score(all_true_labels, all_preds))
    epoch_f1 = float(f1_score(all_true_labels, all_preds, average='macro'))
    print('acc: {:.4f}, f1: {:.4f}'.format(epoch_acc, epoch_f1))
    return all_true_labels,all_preds

In [None]:
# Test documents that are left out.
# NOTE(review): the reason for excluding these eight documents is not recorded
# in the notebook — presumably they fail during preprocessing; confirm.
EXCLUDED_TEST_DOCS = {
    '31339221025603182330049_24_part_1',
    '31339131024502051716072_24_part_1',
    '31339251033301001216016_20_part_2',
    '31339011021101006981035_9_part_1',
    '31339011021100987258005_5_part_2',
    '31339011061672000026002_4_part_2',
    '31339011026200597103018_8_part_1',
    '31339201027002952877006_14_part_2',
}
# A set difference does not fail when one of the names is absent, and sorting
# makes the 100-document subset below the same on every run (the order of a
# set of strings changes between kernel restarts).
data_lst = sorted({item[:-4] for item in os.listdir(PATH4)} - EXCLUDED_TEST_DOCS)
test_df = make_data(data_lst[:100], PATH4)
test_df,max_len1 = make_tl(test_df)
test_df = make_pad(test_df)

In [None]:
# Evaluate on the test subset; the order is kept (shuffle=False) so that the
# flat prediction list can be split back into rows afterwards.
dataset =  TokenDataset(test_df)
test_dataloader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=False, num_workers=0)
all_t,all_p = make_test(model,test_dataloader,dataset)

In [None]:
# Strip the padding and turn the indices back into words and tags.
# The flat label lists are cut into one equally sized chunk per test row.
data = {'sentence': test_df['sentence'], 'real_cat': np.array_split(all_t, len(test_df['sentence'])), 'pred_cat':np.array_split(all_p, len(test_df['sentence']))}
df_test_see = pd.DataFrame(data)
for ind in df_test_see.index:
    sent = df_test_see['sentence'][ind]
    # Number of leading zeros = position of the first non-zero token. A row
    # made only of zeros is stripped completely; previously such a row reused
    # the value left over from the preceding row (or failed on the first row).
    len_pad = next((pos for pos, token in enumerate(sent) if token != 0), len(sent))
    df_test_see.at[ind, 'sentence'] = vocab.lookup_tokens(sent[len_pad:])
    for column in ('real_cat', 'pred_cat'):
        # the chunks are numpy arrays; hand plain Python ints to the vocabulary
        label_ids = df_test_see[column][ind][len_pad:].tolist()
        df_test_see.at[ind, column] = vocab_lables.lookup_tokens(label_ids)
df_test_see

In [None]:
def make_show(row,lbl):
    """Build the text and the entity spans for display.

    Args:
        row: list of words.
        lbl: list of tags, one per word (entries of ``TAG`` or ``'O'``).

    Returns:
        ``(text, spans)``: ``text`` is every word followed by one space;
        ``spans`` is a list of ``(start, end, TAG_NAME)`` tuples with
        character offsets into ``text``. A ``B-`` tag opens a span and the
        following ``I-`` tags extend it.
    """
    spans = []
    pieces = []
    coords = 0              # offset of the current word inside the text
    c1, c2, tag = 0, 0, ''  # span being tracked
    last = 0                # start offset of the most recently stored span
    for i, word in enumerate(row):
        pieces.append(word + ' ')
        label = lbl[i]
        step = len(word) + 1
        if label in TAG:
            if 'B-' in label:
                c1, c2 = coords, coords + len(word)
                tag = label[2:].upper()
                coords += step
            elif 'I-' in label:
                c2 += step
                coords += step
        elif label == 'O':
            coords += step
        if (c1, c2, tag) == (0, 0, ''):
            continue
        # same start as the stored span: replace it with the updated one
        if last == c1:
            del spans[-1:]
        spans.append((c1, c2, tag))
        last = c1
    return ''.join(pieces), spans

In [None]:
# Show the reference and the predicted mark-up of one test row side by side.
# NOTE(review): 5971 is a hand-picked row number; it has to be smaller than
# the number of rows in df_test_see.
row_idx = 5971
row = df_test_see['sentence'][row_idx]
true_lbl = df_test_see['real_cat'][row_idx]
pred_lbl = df_test_see['pred_cat'][row_idx]
# the text is the same for both calls, only the spans differ
text,spans1 = make_show(row,true_lbl)
_,spans2 = make_show(row,pred_lbl)
print('true:')
show_span_box_markup(text, spans1)
print('pred:')
show_span_box_markup(text, spans2)