Usage: specify the parameters in the config dictionary below.\
The available model options are "vera-pro/bert-mention-de", "vera-pro/bert-mention-fr" and "vera-pro/bert-mention-en". Alternatively, you can train your own transformer model (and probably achieve better results than I did right before the deadline :) ).

In [1]:
# Run settings: the input file, the token-classification model, the tokenizer
# used to feed it, and the path the results are written to.
config = dict(
    input_path="data/test_bundle2/HIPE-data-v1.2-test-masked-de.tsv",
    model="vera-pro/bert-mention-de",
    tokenizer="bert-base-multilingual-cased",
    output_path="results/mention_detection.tsv",
)

In [2]:
# Unpack the settings into the module-level constants used by the cells below.
FILE_PATH, MODEL = config['input_path'], config['model']
TOKENIZER, OUTPUT_PATH = config['tokenizer'], config['output_path']

Step 0: reading the data

In [3]:
from utils.data_processing import read_data_to_dfs, write_results, add_beginnings

In [5]:
# Load the input file; `dfs` is indexed and iterated below as a sequence of
# per-chunk DataFrames.
dfs = read_data_to_dfs(FILE_PATH)

In [6]:
# Peek at one chunk to check how the input was parsed.
dfs[1]

Unnamed: 0,TOKEN,NE-COARSE-LIT,NE-COARSE-METO,NE-FINE-LIT,NE-FINE-METO,NE-FINE-COMP,NE-NESTED,NEL-LIT,NEL-METO,MISC
0,die,_,_,_,_,_,_,_,_,_
1,neulich,_,_,_,_,_,_,_,_,_
2,schon,_,_,_,_,_,_,_,_,_
3,erzählten,_,_,_,_,_,_,_,_,_
4,Vorfälle,_,_,_,_,_,_,_,_,_
5,in,_,_,_,_,_,_,_,_,_
6,Rom,_,_,_,_,_,_,_,_,_
7,ans,_,_,_,_,_,_,_,_,EndOfLine|NoSpaceAfter
8,Directorium,_,_,_,_,_,_,_,_,_
9,berichtet,_,_,_,_,_,_,_,_,NoSpaceAfter


In [8]:
# import pickle

# data = OUTPUT_PATH.split('/')[-1]
# pickle_path = 'pickles/' + data + '.p'

# pickle.dump(dfs, open(pickle_path, 'wb'))

Step 1: Mention detection

In [13]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch
import numpy as np 

# Load the tokenizer and the fine-tuned token-classification model named in
# the config.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
model = AutoModelForTokenClassification.from_pretrained(MODEL)

# Position in this list = class index predicted by the model.
# NOTE(review): the order must match the label ids the model was trained
# with -- confirm against the model's config.
label_list = ['B-ent', 'I-ent', 'O']



In [17]:
def get_predictions(sequence):
    """Label every wordpiece of ``sequence`` with the token-classification model.

    Args:
        sequence: The text to tag, as a single string.

    Returns:
        A list of ``(wordpiece, label)`` tuples, one per input id passed to
        the model. This includes the special tokens the tokenizer adds around
        the text; labels are taken from ``label_list``.
    """
    inputs = tokenizer.encode(sequence, return_tensors="pt")

    # Read the wordpieces back from the very ids that are fed to the model.
    # Re-tokenizing a decoded string is not guaranteed to reproduce them
    # (decoding may clean up spaces around punctuation and contractions), and
    # a differing list would be silently mis-paired with the labels by zip().
    tokens = tokenizer.convert_ids_to_tokens(inputs[0].tolist())

    # Inference only: no need to record operations for autograd.
    with torch.no_grad():
        outputs = model(inputs)
    predictions = torch.argmax(outputs[0], dim=2)

    nice_predictions = [
        (token, label_list[prediction])
        for token, prediction in zip(tokens, predictions[0].tolist())
    ]
    return nice_predictions

In [18]:
def detokenize_predictions(predictions):
    """Merge wordpiece-level predictions back into word-level predictions.

    Args:
        predictions: Iterable of ``(wordpiece, label)`` tuples, e.g. the
            output of ``get_predictions``.

    Returns:
        A list of ``(word, label)`` tuples. Special tokens (``[CLS]``,
        ``[SEP]``, ``[PAD]``, ``[MASK]``) are dropped and ``##`` continuation
        pieces are glued onto the preceding piece. A merged word carries the
        label of its last wordpiece.
    """
    # Only real special tokens are skipped. A literal '[' from the text or an
    # '[UNK]' placeholder still stands for a word and must stay in the output,
    # otherwise every following label shifts by one position.
    special_tokens = {'[CLS]', '[SEP]', '[PAD]', '[MASK]'}

    cur_token = ''
    cur_label = ''
    res = []
    for token, label in predictions:
        if token in special_tokens:
            continue
        # A continuation piece is '##' followed by text; a bare '#' character
        # from the input is a word of its own.
        if token.startswith('##') and len(token) > 2:
            cur_token += token[2:]
            # NOTE(review): the last wordpiece's label wins, so a word tagged
            # B-ent + I-ent ends up as I-ent -- confirm this is intended.
            cur_label = label
            continue
        if cur_token:
            res.append((cur_token, cur_label))
        cur_token = token
        cur_label = label

    if cur_token:
        res.append((cur_token, cur_label))
    return res

In [19]:
def predict_for_df(df):
    """Return word-level ``(word, label)`` predictions for one DataFrame.

    The values of the ``TOKEN`` column are joined with single spaces, tagged
    with ``get_predictions`` and merged back into words with
    ``detokenize_predictions``.
    """
    words = df['TOKEN'].tolist()
    wordpiece_predictions = get_predictions(" ".join(words))
    return detokenize_predictions(wordpiece_predictions)

In [20]:
# Spot-check the word-level predictions on a single chunk.
predict_for_df(dfs[-100])

[('warme', 'O'),
 (',', 'O'),
 ('wohltuende', 'O'),
 (',', 'O'),
 ('menschliche', 'O'),
 ('Worte', 'O'),
 ('sind', 'O'),
 ('es', 'O'),
 ('.', 'O'),
 ('mit', 'O')]

In [21]:
def add_predictions_to_df(df, target_column='NE-COARSE-LIT'):
    """Return a copy of ``df`` with predicted labels written to ``target_column``.

    Args:
        df: DataFrame with a ``TOKEN`` column; it is not modified.
        target_column: Name of the existing column that receives the labels.

    Returns:
        A copy of ``df`` in which the n-th row of ``target_column`` holds the
        label of the n-th word-level prediction. Predictions are matched to
        rows by position; predictions beyond the last row are ignored, and
        rows without a prediction keep their original value.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    predictions = predict_for_df(df)

    res_df = df.copy()
    labels = [label for _token, label in predictions][:len(res_df)]
    column_position = res_df.columns.get_loc(target_column)
    if labels:
        # One positional write instead of chained `res_df[col][i] = ...`:
        # chained assignment triggers SettingWithCopyWarning, does nothing
        # under pandas Copy-on-Write, and indexes by label rather than position.
        res_df.iloc[:len(labels), column_position] = labels

    return res_df

In [22]:
from utils.data_processing import add_beginnings

In [23]:
# dfs_with_predictions = [add_beginnings(add_predictions_to_df(df)) for df in dfs]

In [24]:
# Tag every chunk; each result is a labelled copy of the corresponding input frame.
dfs_with_predictions = list(map(add_predictions_to_df, dfs))

In [25]:
# Inspect one tagged chunk.
dfs_with_predictions[1003]

Unnamed: 0,TOKEN,NE-COARSE-LIT,NE-COARSE-METO,NE-FINE-LIT,NE-FINE-METO,NE-FINE-COMP,NE-NESTED,NEL-LIT,NEL-METO,MISC
0,beitslosigkeit,O,_,_,_,_,_,_,_,_
1,hineinpaßt,O,_,_,_,_,_,_,_,NoSpaceAfter
2,.,O,_,_,_,_,_,_,_,_
3,Er,O,_,_,_,_,_,_,_,_
4,bringt,O,_,_,_,_,_,_,_,_
5,ferner,O,_,_,_,_,_,_,_,_
6,einige,O,_,_,_,_,_,_,_,EndOfLine|NoSpaceAfter
7,Wünsche,O,_,_,_,_,_,_,_,_
8,in,O,_,_,_,_,_,_,_,_
9,bezug,O,_,_,_,_,_,_,_,_


In [26]:
# Sanity check on a hand-written sentence; the raw output still contains the
# special tokens and the individual wordpieces.
get_predictions('John Smit and Maurice Bourgoin')

[('[CLS]', 'O'),
 ('John', 'B-ent'),
 ('Sm', 'I-ent'),
 ('##it', 'I-ent'),
 ('and', 'O'),
 ('Maurice', 'B-ent'),
 ('Bourgoin', 'I-ent'),
 ('[SEP]', 'O')]

In [27]:
# Inspect another tagged chunk.
dfs_with_predictions[4]

Unnamed: 0,TOKEN,NE-COARSE-LIT,NE-COARSE-METO,NE-FINE-LIT,NE-FINE-METO,NE-FINE-COMP,NE-NESTED,NEL-LIT,NEL-METO,MISC
0,reichs,O,_,_,_,_,_,_,_,_
1,Schutz,O,_,_,_,_,_,_,_,_
2,und,O,_,_,_,_,_,_,_,_
3,Beystand,O,_,_,_,_,_,_,_,_
4,erflehten,O,_,_,_,_,_,_,_,_
5,sehr,O,_,_,_,_,_,_,_,_
6,dringend,O,_,_,_,_,_,_,_,EndOfLine|NoSpaceAfter
7,„,O,_,_,_,_,_,_,_,NoSpaceAfter
8,und,O,_,_,_,_,_,_,_,_
9,derb,O,_,_,_,_,_,_,_,_


In [28]:
# import pickle

# run_name = OUTPUT_PATH.split('/')[-1]
# folder = OUTPUT_PATH.split('/')[0]
# pickle_path = folder + '/pickles/' + run_name + '.p'

# pickle.dump(dfs_with_predictions, open(pickle_path, 'wb'))

In [21]:
# Hand the tagged DataFrames to write_results together with the destination path.
write_results(dfs_with_predictions, OUTPUT_PATH)  