In [186]:
# Run configuration: every tunable of this notebook lives in this one mapping.
config = dict(
    # Input file with the tokens to be tagged.
    input_path="test_data/HIPE-data-v1.2-test-masked-de.tsv",
    # Alternative (French) model set, currently disabled:
    #     models=["ner-french-8-europeana", "ner-french-5-europeana",
    #             "ner-french-8", "ner-french-5"],
    # Model directory names; each one is appended to PREFIX to form a path.
    models=['ner-german-5', 'ner-german-5-europeana'],
    # Read into VOTE_THRESHOLD; find_best_label notes that it is not used yet.
    vote_threshold=2,
    tokenizer="bert-base-multilingual-cased",
    output_path="first_submissions/UvA.ILPS_bundle2_DE_2.tsv.nerc_only",
)

In [187]:
import os

# Directory that holds the fine-tuned model folders. The historical default is
# kept, but it can be overridden with the NER_MODELS_DIR environment variable so
# the notebook also runs outside the original machine. os.path.join(..., "")
# guarantees a trailing separator because model names are appended by plain
# string concatenation.
PREFIX = os.path.join(
    os.environ.get("NER_MODELS_DIR", "/ivi/ilps/personal/vprovat/good_models_for_clef/"),
    "",
)

In [188]:
# Unpack the configuration into module-level constants used by the cells below.
FILE_PATH = config['input_path']
OUTPUT_PATH = config['output_path']
TOKENIZER = config['tokenizer']
VOTE_THRESHOLD = config['vote_threshold']

# Full model locations: the shared prefix followed by each configured name.
MODELS = [f"{PREFIX}{model_name}" for model_name in config['models']]

Step 0: reading

In [189]:
from utils.data_processing import read_data_to_dfs, write_results, add_beginnings

In [190]:
import pickle
# The parsed input frames are cached under pickles/<input file name>.p.
filename = FILE_PATH.split('/')[-1]
# To (re)create the cache:
# pickle.dump(dfs, open('pickles/'+filename+'.p', 'wb'))
# NOTE: pickle.load can execute arbitrary code; only load cache files that were
# produced locally by this project.
with open('pickles/'+filename+'.p', 'rb') as cache_file:
    dfs = pickle.load(cache_file)

In [191]:
# dfs = read_data_to_dfs(FILE_PATH)

Step 1: NER

In [192]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch
import numpy as np 

# Single tokenizer (name taken from config['tokenizer']) used for every model
# passed to get_predictions.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)

# Lookup from predicted class id (list index) to tag string; get_predictions
# indexes this list with the argmax of the model output.
# NOTE(review): the order presumably has to match the label ids the fine-tuned
# models were trained with - confirm against the model configs before editing.
label_list = [
    'B-loc',
    'B-org',
    'B-pers',
    'B-prod',
    'B-time',
    'I-loc',
    'I-org',
    'I-pers',
    'I-prod',
    'I-time',
    'O'
]

# One token-classification model per path in MODELS (the ensemble members).
models = [AutoModelForTokenClassification.from_pretrained(MODEL_NAME) for MODEL_NAME in MODELS]


In [193]:
def get_predictions(sequence, model_):
    """Tag `sequence` with one model and return wordpiece-level predictions.

    Args:
        sequence: The text to tag, as a single string.
        model_: A callable model; `model_(inputs)[0]` is arg-maxed over dim 2
            to obtain one class id per input position.

    Returns:
        A list of `(wordpiece, label)` pairs, one per position of the encoded
        input. Whatever special tokens the tokenizer adds are included;
        detokenize_predictions is responsible for dropping them.
    """
    # NOTE(review): no truncation or windowing is applied here; presumably every
    # sequence fits the model's maximum input length - confirm.
    inputs = tokenizer.encode(sequence, return_tensors="pt")

    # Read the tokens straight from the ids that are fed to the model, so that
    # tokens and predictions are aligned position by position. (Decoding the ids
    # back to text and re-tokenizing is not guaranteed to reproduce them.)
    tokens = tokenizer.convert_ids_to_tokens(inputs[0].tolist())

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model_(inputs)
    predictions = torch.argmax(outputs[0], dim=2)

    return [
        (token, label_list[label_id])
        for token, label_id in zip(tokens, predictions[0].tolist())
    ]

In [194]:
def detokenize_predictions(predictions):
    """Merge wordpiece-level predictions back into word-level predictions.

    Args:
        predictions: Iterable of `(wordpiece, label)` pairs.

    Returns:
        A list of `(word, label)` pairs. Pieces prefixed with '##' are glued
        onto the preceding piece (prefix removed). The structural special
        tokens '[CLS]', '[SEP]' and '[PAD]' are dropped. Every other token,
        including a literal '[' or '#' from the text and '[UNK]', starts a
        new word so that the output stays aligned with the input words.

    NOTE(review): a merged word carries the label of its LAST wordpiece, so a
    'B-' tag on the first piece is lost when later pieces are tagged 'I-'. This
    may be why 'Buonaparte' shows up as 'I-pers' in the sample output - confirm
    whether the first piece's label should win instead.
    """
    special_tokens = {'[CLS]', '[SEP]', '[PAD]'}

    res = []
    cur_token = ''
    cur_label = ''
    for token, label in predictions:
        if token in special_tokens:
            continue
        if token.startswith('##'):
            # Continuation piece: drop exactly the '##' marker and extend.
            cur_token += token[2:]
            cur_label = label
        else:
            # A new word starts; flush the one collected so far.
            if cur_token:
                res.append((cur_token, cur_label))
            cur_token = token
            cur_label = label

    if cur_token:
        res.append((cur_token, cur_label))
    return res

In [195]:
def predict_for_df(df, model_):
    """Return word-level `(token, label)` predictions of `model_` for `df`.

    The values of the 'TOKEN' column are joined with single spaces into one
    sequence, tagged with get_predictions, and the wordpiece-level result is
    merged back into words with detokenize_predictions.
    """
    text = " ".join(df['TOKEN'].tolist())
    return detokenize_predictions(get_predictions(text, model_))

In [196]:
# Smoke test: print each model's predictions for the first frame, followed by
# a '+' line as a separator.
for model in models:
    print(predict_for_df(dfs[0], model), '+', sep='\n')

[('Neuigkeiten', 'O'), ('.', 'O'), ('(', 'O'), ('Mißhandlung', 'O'), ('der', 'O'), ('Franken', 'O'), ('in', 'O'), ('Rom', 'B-loc'), ('.', 'O'), (']', 'O'), ('Buonaparte', 'I-pers'), ("'", 'O'), ('s', 'O'), ('Bruder', 'O'), ('hat', 'O')]
+
[('Neuigkeiten', 'O'), ('.', 'O'), ('(', 'O'), ('Mißhandlung', 'O'), ('der', 'O'), ('Franken', 'O'), ('in', 'O'), ('Rom', 'B-loc'), ('.', 'O'), (']', 'O'), ('Buonaparte', 'I-pers'), ("'", 'O'), ('s', 'O'), ('Bruder', 'O'), ('hat', 'O')]
+


In [197]:
def zip_predictions(preds):
    """Transpose per-model prediction lists into per-token vote groups.

    Args:
        preds: A list with one entry per model; each entry is that model's
            list of `(token, label)` pairs.

    Returns:
        A list with one entry per token position; entry `i` is the list of
        the `i`-th pair of every model, in model order. An empty `preds`
        yields an empty list.

    Raises:
        ValueError: If the models produced different numbers of predictions,
            since the votes could not be matched up position by position.
    """
    if not preds:
        return []

    expected = len(preds[0])
    for model_idx, model_preds in enumerate(preds):
        if len(model_preds) != expected:
            raise ValueError(
                "prediction lists differ in length: model 0 has %d items, model %d has %d"
                % (expected, model_idx, len(model_preds))
            )

    return [list(group) for group in zip(*preds)]

In [198]:
import random
from collections import Counter 

def find_best_label(labels, rng=None):
    """Pick the label with the most votes; ties are broken at random.

    VOTE_THRESHOLD is not taken into account yet.

    Args:
        labels: Iterable of labels, one vote per model.
        rng: Optional object with a `choice(seq)` method (for example a seeded
            `random.Random`) used for tie-breaking. Defaults to the global
            `random` module. Passing a seeded generator makes the ensemble
            output reproducible.

    Returns:
        One of the labels that received the highest number of votes.

    Raises:
        ValueError: If `labels` is empty.
    """
    ranked = Counter(labels).most_common()
    if not ranked:
        raise ValueError("find_best_label() needs at least one label")

    top_votes = ranked[0][1]  # (label, votes) of the leader -> votes
    candidates = [label for label, cnt in ranked if cnt == top_votes]

    chooser = random if rng is None else rng
    return chooser.choice(candidates)

In [199]:
def combine_predictions(pred_list):
    """Merge the predictions of several models into one labelling.

    `pred_list` holds one list of `(token, label)` pairs per model. For every
    token position the labels of all models are collected and the winner is
    chosen by find_best_label; the token text is taken from the first model.

    Returns a list of `(token, best_label)` pairs.
    """
    combined = []
    # Each `votes` entry holds the M (token, label) pairs that the M models
    # produced for the same position.
    for votes in zip_predictions(pred_list):
        word = votes[0][0]
        winner = find_best_label([vote[-1] for vote in votes])
        combined.append((word, winner))
    return combined

In [200]:
def add_predictions_to_df(df, target_column='NE-COARSE-LIT'):
    """Return a copy of `df` whose `target_column` holds the ensemble labels.

    Every model in the module-level `models` list tags the frame; the
    per-model predictions are merged with combine_predictions and written,
    in order, into `target_column` of a copy. `df` itself is not modified.

    Args:
        df: Frame with a 'TOKEN' column and the target column.
        target_column: Name of the column that receives the labels.

    Returns:
        The labelled copy of `df`.
    """
    all_predictions = [predict_for_df(df, model) for model in models]
    best_predictions = combine_predictions(all_predictions)
    labels = [label for _, label in best_predictions]

    res_df = df.copy()
    # Replace the column in one assignment instead of the chained
    # `res_df[col][i] = label`, which raises SettingWithCopyWarning and does
    # not write through under pandas Copy-on-Write.
    column = res_df[target_column].tolist()
    # Predictions are matched to rows by position. If the counts differ, only
    # the overlapping prefix is written and the remaining rows keep their
    # original value.
    overlap = min(len(column), len(labels))
    column[:overlap] = labels[:overlap]
    res_df[target_column] = column

    return res_df

In [201]:
# Spot check: label a single frame and display the returned copy (not stored).
add_predictions_to_df(dfs[1])

Unnamed: 0,TOKEN,NE-COARSE-LIT,NE-COARSE-METO,NE-FINE-LIT,NE-FINE-METO,NE-FINE-COMP,NE-NESTED,NEL-LIT,NEL-METO,MISC
0,die,O,_,_,_,_,_,_,_,_
1,neulich,O,_,_,_,_,_,_,_,_
2,schon,O,_,_,_,_,_,_,_,_
3,erzählten,O,_,_,_,_,_,_,_,_
4,Vorfälle,O,_,_,_,_,_,_,_,_
5,in,O,_,_,_,_,_,_,_,_
6,Rom,B-loc,_,_,_,_,_,_,_,_
7,ans,O,_,_,_,_,_,_,_,EndOfLine|NoSpaceAfter
8,Directorium,O,_,_,_,_,_,_,_,_
9,berichtet,O,_,_,_,_,_,_,_,NoSpaceAfter


In [202]:
# Display the source frame for comparison with the labelled copy.
dfs[1]

Unnamed: 0,TOKEN,NE-COARSE-LIT,NE-COARSE-METO,NE-FINE-LIT,NE-FINE-METO,NE-FINE-COMP,NE-NESTED,NEL-LIT,NEL-METO,MISC
0,die,_,_,_,_,_,_,_,_,_
1,neulich,_,_,_,_,_,_,_,_,_
2,schon,_,_,_,_,_,_,_,_,_
3,erzählten,_,_,_,_,_,_,_,_,_
4,Vorfälle,_,_,_,_,_,_,_,_,_
5,in,_,_,_,_,_,_,_,_,_
6,Rom,_,_,_,_,_,_,_,_,_
7,ans,_,_,_,_,_,_,_,_,EndOfLine|NoSpaceAfter
8,Directorium,_,_,_,_,_,_,_,_,_
9,berichtet,_,_,_,_,_,_,_,_,NoSpaceAfter


In [203]:
# Full run: label every input frame with the ensemble, keeping the input order.
dfs_with_predictions = list(map(add_predictions_to_df, dfs))

In [204]:
# Inspect one labelled frame from the full run.
dfs_with_predictions[1]

Unnamed: 0,TOKEN,NE-COARSE-LIT,NE-COARSE-METO,NE-FINE-LIT,NE-FINE-METO,NE-FINE-COMP,NE-NESTED,NEL-LIT,NEL-METO,MISC
0,die,O,_,_,_,_,_,_,_,_
1,neulich,O,_,_,_,_,_,_,_,_
2,schon,O,_,_,_,_,_,_,_,_
3,erzählten,O,_,_,_,_,_,_,_,_
4,Vorfälle,O,_,_,_,_,_,_,_,_
5,in,O,_,_,_,_,_,_,_,_
6,Rom,B-loc,_,_,_,_,_,_,_,_
7,ans,O,_,_,_,_,_,_,_,EndOfLine|NoSpaceAfter
8,Directorium,O,_,_,_,_,_,_,_,_
9,berichtet,O,_,_,_,_,_,_,_,NoSpaceAfter


In [205]:
# from utils.data_processing import add_beginnings

In [206]:
# dfs_with_beginnings = [add_beginnings(add_predictions_to_df(df)) for df in dfs]

In [207]:
import pickle

# Keep a pickled copy of the labelled frames next to the submission:
# <first path component of OUTPUT_PATH>/pickles/<output file name>.p
run_name = OUTPUT_PATH.split('/')[-1]
folder = OUTPUT_PATH.split('/')[0]
pickle_path = folder + '/pickles/' + run_name + '.p'

# The context manager makes sure the file is flushed and closed.
with open(pickle_path, 'wb') as pickle_file:
    pickle.dump(dfs_with_predictions, pickle_file)

In [208]:
# Export the labelled frames to OUTPUT_PATH with the project helper.
write_results(dfs_with_predictions, OUTPUT_PATH)  

In [122]:
'''
TEST_bundle2_FR_1 – no Europeana yet, 10 epochs
TEST_bundle2_FR_2 – with Europeana, 5 epochs
TEST_bundle2_FR_3 – no Europeana, 5 epochs - just to compare

TEST_bundle2_FR_4 – no Europeana, 8 epochs (it was the best for FR so far)
TEST_bundle2_FR_5 – with Europeana, 8 epochs - is it better? probably yes!

testing_submissions/TEST_bundle2_FR_6 – ensembling: 5 or 8 epochs, with or without Europeana

TEST_bundle2_DE_2 – with Europeana, 5 epochs
TEST_bundle2_DE_3 – no Europeana, 7 epochs
TEST_bundle4_DE_4 – with Europeana, 7 epochs


TEST_bundle2_EN_1 – BERT finetuned on conll English

'''

'\nTEST_bundle2_FR_1 – no Europeana yet, 10 epochs\nTEST_bundle2_FR_2 – with Europeana, 5 epochs\nTEST_bundle2_FR_3 – no Europeana, 5 epochs - just to compare\n\nTEST_bundle2_FR_4 – no Europeana, 8 epochs (it was the best for FR so far)\nTEST_bundle2_FR_5 – with Europeana, 8 epochs - is it better?\n\nTEST_bundle2_DE_2 – with Europeana, 5 epochs\nTEST_bundle2_DE_3 – no Europeana, 7 epochs\nTEST_bundle4_DE_4 – with Europeana, 7 epochs\n\n\nTEST_bundle2_EN_1 – BERT finetuned on conll English\n\n'