In [1]:
import torch
import numpy as np
import random
import gensim
import heapq
from tqdm import tqdm
from transformers import BertTokenizer, BertModel, BertForMaskedLM
import logging
import pickle
import requests, json
logging.basicConfig(level=logging.INFO)

import nltk
from nltk import tokenize

from transformers import LineByLineTextDataset
from transformers import Trainer, TrainingArguments
from transformers import BertTokenizer, BertForMaskedLM
from transformers import DataCollatorForLanguageModeling
from transformers import pipeline

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import NearestNeighbors

from transformers import AutoTokenizer, AutoModelWithLMHead

## Restructure Prolog query

e.g. writes(Who, hamlet) -> Who writes hamlet.

In [2]:
def restructure_prolog(query):
    # parse
    predicate = query.split('(')[0]
    arguments = query.split('(')[1][:-1]
    arguments = [arg.strip() for arg in arguments.split(',')]
    
    # remove underline
    # arguments = [arg.replace('_', ' ') for arg in arguments]
    # predicate = predicate.replace('_', ' ')
    
    rtn = ['[CLS]', arguments[0], predicate] + arguments[1:] + ['.', '[SEP]']
    
    # mask
    for i in range(len(rtn)):
        if rtn[i][0].isupper():
            rtn[i] = '[MASK]'
    
    return ' '.join(rtn)

In [3]:
restructure_prolog('is(X, the fifth planet from the Sun)')

'[CLS] [MASK] is the fifth planet from the Sun . [SEP]'

## Fine-tuning

In [2]:
print('loading data')
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="data/trainset.txt",
    block_size=128,
)
#train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=0)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.3
)
print('data loaded')

loading data


INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt from cache at C:\Users\Administrator/.cache\torch\transformers\9b3c03a36e83b13d5ba95ac965c9f9074a99e14340c523ab405703179e79fc46.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
INFO:transformers.data.datasets.language_modeling:Creating features from dataset file at ../dataset/geo/trainset.txt


data loaded


In [3]:
device = 'cuda'
fine_tuning_model = BertForMaskedLM.from_pretrained('bert-base-uncased').to(device)

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json from cache at C:\Users\Administrator/.cache\torch\transformers\6dfaed860471b03ab5b9acb6153bea82b6632fb9bbe514d3fff050fe1319ee6d.788fed32bb8481a9b15ce726d41c53d5d5066b04c667e34ce3a7a3826d1573d8
INFO:transformers.configuration_utils:Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

INFO:transformers.modeling_utils:loading weights file https://cdn.huggingface.co/bert-large-uncased-pytorch_model.bin from cache at C:\Users\Administrator/.cache\

In [4]:
training_args = TrainingArguments(
    output_dir="./myBERT",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_gpu_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
    learning_rate=2e-05,
    no_cuda=False
)

trainer = Trainer(
    model=fine_tuning_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    prediction_loss_only=True,
)
trainer.train()
trainer.save_model("./myBERT")

INFO:transformers.training_args:PyTorch: setting up devices
INFO:transformers.trainer:You are instantiating a Trainer but W&B is not installed. To use wandb logging, run `pip install wandb; wandb login` see https://docs.wandb.com/huggingface.
INFO:transformers.trainer:***** Running training *****
INFO:transformers.trainer:  Num examples = 43624
INFO:transformers.trainer:  Num Epochs = 3
INFO:transformers.trainer:  Instantaneous batch size per device = 8
INFO:transformers.trainer:  Total train batch size (w. parallel, distributed & accumulation) = 2
INFO:transformers.trainer:  Gradient Accumulation steps = 1
INFO:transformers.trainer:  Total optimization steps = 65436


HBox(children=(IntProgress(value=0, description='Epoch', max=3, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Iteration', max=21812, style=ProgressStyle(description_width=…

{"loss": 2.6600315257534386, "learning_rate": 1.984717892291705e-05, "epoch": 0.02292316156244269, "step": 500}
{"loss": 2.656905919849873, "learning_rate": 1.96943578458341e-05, "epoch": 0.04584632312488538, "step": 1000}
{"loss": 2.5357310264110566, "learning_rate": 1.954153676875115e-05, "epoch": 0.06876948468732808, "step": 1500}
{"loss": 2.5893159299492834, "learning_rate": 1.9388715691668196e-05, "epoch": 0.09169264624977076, "step": 2000}
{"loss": 2.6749846221208573, "learning_rate": 1.9235894614585243e-05, "epoch": 0.11461580781221346, "step": 2500}
{"loss": 2.559907356619835, "learning_rate": 1.9083073537502293e-05, "epoch": 0.13753896937465615, "step": 3000}
{"loss": 2.5717898807525637, "learning_rate": 1.8930252460419344e-05, "epoch": 0.16046213093709885, "step": 3500}
{"loss": 2.523244404554367, "learning_rate": 1.877743138333639e-05, "epoch": 0.18338529249954152, "step": 4000}
{"loss": 2.5903904271603095, "learning_rate": 1.8624610306253438e-05, "epoch": 0.2063084540619842

INFO:transformers.trainer:Saving model checkpoint to ./myBERT_large\checkpoint-10000
INFO:transformers.configuration_utils:Configuration saved in ./myBERT_large\checkpoint-10000\config.json
INFO:transformers.modeling_utils:Model weights saved in ./myBERT_large\checkpoint-10000\pytorch_model.bin


{"loss": 2.5578433197140695, "learning_rate": 1.6790757381258025e-05, "epoch": 0.4813863928112965, "step": 10500}
{"loss": 2.4669101606309414, "learning_rate": 1.663793630417507e-05, "epoch": 0.5043095543737393, "step": 11000}
{"loss": 2.451165301144123, "learning_rate": 1.6485115227092122e-05, "epoch": 0.5272327159361819, "step": 11500}
{"loss": 2.364134666264057, "learning_rate": 1.6332294150009172e-05, "epoch": 0.5501558774986246, "step": 12000}
{"loss": 2.4243748498335482, "learning_rate": 1.617947307292622e-05, "epoch": 0.5730790390610673, "step": 12500}
{"loss": 2.4457915949225426, "learning_rate": 1.6026651995843266e-05, "epoch": 0.59600220062351, "step": 13000}
{"loss": 2.4201494171917437, "learning_rate": 1.5873830918760316e-05, "epoch": 0.6189253621859527, "step": 13500}
{"loss": 2.4789063927344976, "learning_rate": 1.5721009841677367e-05, "epoch": 0.6418485237483954, "step": 14000}
{"loss": 2.3562137131989003, "learning_rate": 1.5568188764594414e-05, "epoch": 0.6647716853108

INFO:transformers.trainer:Saving model checkpoint to ./myBERT_large\checkpoint-20000
INFO:transformers.configuration_utils:Configuration saved in ./myBERT_large\checkpoint-20000\config.json
INFO:transformers.modeling_utils:Model weights saved in ./myBERT_large\checkpoint-20000\pytorch_model.bin


{"loss": 2.4539842493236064, "learning_rate": 1.3734335839598997e-05, "epoch": 0.9398496240601504, "step": 20500}
{"loss": 2.405084967076778, "learning_rate": 1.3581514762516047e-05, "epoch": 0.962772785622593, "step": 21000}
{"loss": 2.438625141851604, "learning_rate": 1.3428693685433096e-05, "epoch": 0.9856959471850357, "step": 21500}



HBox(children=(IntProgress(value=0, description='Iteration', max=21812, style=ProgressStyle(description_width=…

{"loss": 2.3101954451948403, "learning_rate": 1.3275872608350145e-05, "epoch": 1.0086191087474785, "step": 22000}
{"loss": 2.265027540564537, "learning_rate": 1.3123051531267192e-05, "epoch": 1.031542270309921, "step": 22500}
{"loss": 2.2659556266069414, "learning_rate": 1.2970230454184242e-05, "epoch": 1.0544654318723639, "step": 23000}
{"loss": 2.2776337320506572, "learning_rate": 1.281740937710129e-05, "epoch": 1.0773885934348066, "step": 23500}
{"loss": 2.329091015994549, "learning_rate": 1.266458830001834e-05, "epoch": 1.1003117549972492, "step": 24000}
{"loss": 2.2086111823022367, "learning_rate": 1.251176722293539e-05, "epoch": 1.123234916559692, "step": 24500}
{"loss": 2.343381143331528, "learning_rate": 1.2358946145852437e-05, "epoch": 1.1461580781221345, "step": 25000}
{"loss": 2.265583106189966, "learning_rate": 1.2206125068769485e-05, "epoch": 1.1690812396845773, "step": 25500}
{"loss": 2.245767152041197, "learning_rate": 1.2053303991686534e-05, "epoch": 1.19200440124702, "

INFO:transformers.trainer:Saving model checkpoint to ./myBERT_large\checkpoint-30000
INFO:transformers.configuration_utils:Configuration saved in ./myBERT_large\checkpoint-30000\config.json
INFO:transformers.modeling_utils:Model weights saved in ./myBERT_large\checkpoint-30000\pytorch_model.bin
INFO:transformers.trainer:Deleting older checkpoint [myBERT_large\checkpoint-10000] due to args.save_total_limit


{"loss": 2.333539779677987, "learning_rate": 1.0677914297939973e-05, "epoch": 1.3983128553090043, "step": 30500}
{"loss": 2.2039798449352386, "learning_rate": 1.0525093220857022e-05, "epoch": 1.4212360168714469, "step": 31000}
{"loss": 2.2049983873963357, "learning_rate": 1.0372272143774069e-05, "epoch": 1.4441591784338896, "step": 31500}
{"loss": 2.2573918953416867, "learning_rate": 1.0219451066691119e-05, "epoch": 1.4670823399963324, "step": 32000}
{"loss": 2.2568867580145597, "learning_rate": 1.0066629989608168e-05, "epoch": 1.490005501558775, "step": 32500}
{"loss": 2.2566232802569868, "learning_rate": 9.913808912525216e-06, "epoch": 1.5129286631212175, "step": 33000}
{"loss": 2.2700930643081665, "learning_rate": 9.760987835442265e-06, "epoch": 1.5358518246836603, "step": 33500}
{"loss": 2.236307323265821, "learning_rate": 9.608166758359314e-06, "epoch": 1.5587749862461031, "step": 34000}
{"loss": 2.2640343472361564, "learning_rate": 9.455345681276362e-06, "epoch": 1.58169814780854

INFO:transformers.trainer:Saving model checkpoint to ./myBERT_large\checkpoint-40000
INFO:transformers.configuration_utils:Configuration saved in ./myBERT_large\checkpoint-40000\config.json
INFO:transformers.modeling_utils:Model weights saved in ./myBERT_large\checkpoint-40000\pytorch_model.bin
INFO:transformers.trainer:Deleting older checkpoint [myBERT_large\checkpoint-20000] due to args.save_total_limit


{"loss": 2.3060937882810832, "learning_rate": 7.6214927562809475e-06, "epoch": 1.8567760865578582, "step": 40500}
{"loss": 2.188187908321619, "learning_rate": 7.468671679197995e-06, "epoch": 1.8796992481203008, "step": 41000}
{"loss": 2.2936088038384916, "learning_rate": 7.315850602115045e-06, "epoch": 1.9026224096827433, "step": 41500}
{"loss": 2.2142816158384084, "learning_rate": 7.1630295250320925e-06, "epoch": 1.925545571245186, "step": 42000}
{"loss": 2.204173205435276, "learning_rate": 7.010208447949142e-06, "epoch": 1.948468732807629, "step": 42500}
{"loss": 2.1612360605448484, "learning_rate": 6.857387370866191e-06, "epoch": 1.9713918943700715, "step": 43000}
{"loss": 2.2009097110033036, "learning_rate": 6.704566293783239e-06, "epoch": 1.9943150559325142, "step": 43500}



HBox(children=(IntProgress(value=0, description='Iteration', max=21812, style=ProgressStyle(description_width=…

{"loss": 2.1078995309770105, "learning_rate": 6.551745216700288e-06, "epoch": 2.017238217494957, "step": 44000}
{"loss": 2.1744367275834082, "learning_rate": 6.3989241396173365e-06, "epoch": 2.0401613790573996, "step": 44500}
{"loss": 2.1157472562491892, "learning_rate": 6.246103062534385e-06, "epoch": 2.063084540619842, "step": 45000}
{"loss": 2.1088514780700205, "learning_rate": 6.093281985451434e-06, "epoch": 2.086007702182285, "step": 45500}
{"loss": 2.1483673489689825, "learning_rate": 5.940460908368482e-06, "epoch": 2.1089308637447277, "step": 46000}
{"loss": 2.1313816441185773, "learning_rate": 5.787639831285531e-06, "epoch": 2.1318540253071703, "step": 46500}
{"loss": 2.0758882296085357, "learning_rate": 5.63481875420258e-06, "epoch": 2.1547771868696133, "step": 47000}
{"loss": 2.096497130870819, "learning_rate": 5.481997677119629e-06, "epoch": 2.177700348432056, "step": 47500}
{"loss": 2.0283549708127975, "learning_rate": 5.329176600036678e-06, "epoch": 2.2006235099944984, "st

INFO:transformers.trainer:Saving model checkpoint to ./myBERT_large\checkpoint-50000
INFO:transformers.configuration_utils:Configuration saved in ./myBERT_large\checkpoint-50000\config.json
INFO:transformers.modeling_utils:Model weights saved in ./myBERT_large\checkpoint-50000\pytorch_model.bin
INFO:transformers.trainer:Deleting older checkpoint [myBERT_large\checkpoint-30000] due to args.save_total_limit


{"loss": 2.1159435301516205, "learning_rate": 4.565071214621921e-06, "epoch": 2.3152393178067117, "step": 50500}
{"loss": 2.095642631396651, "learning_rate": 4.4122501375389696e-06, "epoch": 2.3381624793691547, "step": 51000}
{"loss": 2.13258510017395, "learning_rate": 4.259429060456018e-06, "epoch": 2.3610856409315972, "step": 51500}
{"loss": 2.089134459018707, "learning_rate": 4.106607983373067e-06, "epoch": 2.38400880249404, "step": 52000}
{"loss": 2.1241973213031886, "learning_rate": 3.9537869062901154e-06, "epoch": 2.406931964056483, "step": 52500}
{"loss": 2.1201589764542876, "learning_rate": 3.8009658292071645e-06, "epoch": 2.4298551256189254, "step": 53000}
{"loss": 2.0934932481348514, "learning_rate": 3.648144752124213e-06, "epoch": 2.452778287181368, "step": 53500}
{"loss": 2.147756223157048, "learning_rate": 3.495323675041262e-06, "epoch": 2.475701448743811, "step": 54000}
{"loss": 2.1293551406562328, "learning_rate": 3.342502597958311e-06, "epoch": 2.4986246103062535, "step

INFO:transformers.trainer:Saving model checkpoint to ./myBERT_large\checkpoint-60000
INFO:transformers.configuration_utils:Configuration saved in ./myBERT_large\checkpoint-60000\config.json
INFO:transformers.modeling_utils:Model weights saved in ./myBERT_large\checkpoint-60000\pytorch_model.bin
INFO:transformers.trainer:Deleting older checkpoint [myBERT_large\checkpoint-40000] due to args.save_total_limit


{"loss": 2.0714624010026457, "learning_rate": 1.5086496729628952e-06, "epoch": 2.7737025490555656, "step": 60500}
{"loss": 2.0736990784406664, "learning_rate": 1.3558285958799439e-06, "epoch": 2.7966257106180086, "step": 61000}
{"loss": 1.9879294981360436, "learning_rate": 1.2030075187969925e-06, "epoch": 2.819548872180451, "step": 61500}
{"loss": 2.089942114531994, "learning_rate": 1.0501864417140413e-06, "epoch": 2.8424720337428937, "step": 62000}
{"loss": 2.0671531477170064, "learning_rate": 8.9736536463109e-07, "epoch": 2.8653951953053367, "step": 62500}
{"loss": 2.0627119898572563, "learning_rate": 7.445442875481387e-07, "epoch": 2.8883183568677793, "step": 63000}
{"loss": 2.153741326868534, "learning_rate": 5.917232104651874e-07, "epoch": 2.911241518430222, "step": 63500}
{"loss": 2.0252604929804803, "learning_rate": 4.389021333822361e-07, "epoch": 2.934164679992665, "step": 64000}
{"loss": 2.1637557763159276, "learning_rate": 2.860810562992848e-07, "epoch": 2.9570878415551074, "

INFO:transformers.trainer:

Training completed. Do not forget to share your model on huggingface.co/models =)


INFO:transformers.trainer:Saving model checkpoint to ./myBERT_large
INFO:transformers.configuration_utils:Configuration saved in ./myBERT_large\config.json
INFO:transformers.modeling_utils:Model weights saved in ./myBERT_large\pytorch_model.bin


## POS tag + Sentence Position

In [2]:
def query2maskedQueries(query, tokenizer, mask_token='[MASK]'):
    # tokenize and POS tagging
    words = tokenizer.tokenize(query)
    tags = nltk.pos_tag(words)
    sentence_len = len(words)
    # mask each word
    for i in range(sentence_len):
        words[i] = mask_token
        if i <= sentence_len / 3:
            yield [' '.join(words), tags[i][0], tags[i][1] + ' head']
        elif i > sentence_len / 3 * 2:
            yield [' '.join(words), tags[i][0], tags[i][1] + ' tail']
        else:
            yield [' '.join(words), tags[i][0], tags[i][1] + ' middle']
        words[i] = tags[i][0]

In [3]:
with open('../dataset/geo/testset.txt', 'r', encoding='utf-8') as f:
    queries = f.readlines()

In [5]:
## GPU

def predict(queries, model, tokenizer, topk=10):
    
    with torch.no_grad():
        fill_mask = pipeline(
            "fill-mask",
            model=model,
            tokenizer=tokenizer,
            topk=topk,
            device=0
        )
        # predicting
        predict_tokens, ground_truth, tagsNpositions = [], [], []
        
        for query in tqdm(queries):
            _sentences = []
            for sentence, label, tagsNposition in query2maskedQueries(query, tokenizer):
                if label[0].isalpha():
                    _sentences.append(sentence)
                    ground_truth.append(label)
                    tagsNpositions.append(tagsNposition)
            for query_res in fill_mask(_sentences):
                predict_tokens += tokenizer.convert_ids_to_tokens([pred['token'] for pred in query_res])

    return predict_tokens, ground_truth, tagsNpositions

def predict_low_speed(queries, model, tokenizer, mask='[MASK]', topk=10):
    
    with torch.no_grad():
        fill_mask = pipeline(
            "fill-mask",
            model=model,
            tokenizer=tokenizer,
            topk=topk,
            device=0
        )
        # predicting
        predict_tokens, ground_truth, tagsNpositions = [], [], []
        
        _sentences = []
        for query in queries:
            for sentence, label, tagsNposition in query2maskedQueries(query, tokenizer, mask):
                if label[0].isalpha() or label[0] == '▁' or label[0] == 'Ġ':
                    _sentences.append(sentence)
                    ground_truth.append(label)
                    tagsNpositions.append(tagsNposition)
        
        stride = 10
        for i in tqdm(range(0, len(_sentences), stride)):
            end = i + stride
            if end > len(_sentences):
                end = len(_sentences)
            for query_res in fill_mask(_sentences[i:end]):
                if isinstance(query_res, list):
                    predict_tokens += tokenizer.convert_ids_to_tokens([pred['token'] for pred in query_res])
                else:
                    predict_tokens += tokenizer.convert_ids_to_tokens([query_res['token']])

    return predict_tokens, ground_truth, tagsNpositions

def embedding_cache(embedding_model, embedding_tokenizer, words):
    cache = {}
    embedding_model = embedding_model.to('cuda')
    stride = 100
    for i in tqdm(range(0, len(words), stride)):
        end = i + stride
        if end > len(words):
            end = len(words)
        ids = torch.tensor([embedding_tokenizer.convert_tokens_to_ids(words[i:end])]).to('cuda')
        embeds = embedding_model(ids)[0][0].cpu().detach().numpy()
        for j, word in enumerate(words[i:end]):
            cache[word] = embeds[j]
    return cache
    
def evaluate(predict_tokens, ground_truth, tagsNpositions, embedding_model, embedding_tokenizer):
    results = {}
    type_count = {}
    # embedding
    words = list(set(predict_tokens + ground_truth))
    cache = embedding_cache(embedding_model, embedding_tokenizer, words)
    
    for i in tqdm(range(len(ground_truth))):
        cos_sim = []
        ground_truth_embedding = cache[ground_truth[i]]
        tagsNposition = tagsNpositions[i]
        if tagsNposition not in results:
            results[tagsNposition] = 0
            type_count[tagsNposition] = 0
        for predict in predict_tokens[i*10:(i+1)*10]:
            predict_embedding = cache[predict]
            cos_sim.append(np.dot(ground_truth_embedding, predict_embedding) / np.linalg.norm(ground_truth_embedding) / np.linalg.norm(predict_embedding))
        results[tagsNposition] += max(cos_sim)
        type_count[tagsNposition] += 1

    for key in results.keys():
        results[key] /= type_count[key]
    return results

### BERT

In [6]:
pretrained_model = BertModel.from_pretrained('bert-base-uncased')
pretrained_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
embedding_model = pretrained_model
embedding_tokenizer = pretrained_tokenizer

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at C:\Users\Administrator/.cache\torch\transformers\4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
INFO:transformers.configuration_utils:Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

INFO:transformers.modeling_utils:loading weights file https://cdn.huggingface.co/bert-base-uncased-pytorch_model.bin from cache at C:\Users\Administrator/.cache\tor

In [17]:
pretrained_model = BertModel.from_pretrained('bert-large-uncased')
pretrained_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
embedding_model = BertModel.from_pretrained('bert-base-uncased')
embedding_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json from cache at C:\Users\Administrator/.cache\torch\transformers\6dfaed860471b03ab5b9acb6153bea82b6632fb9bbe514d3fff050fe1319ee6d.788fed32bb8481a9b15ce726d41c53d5d5066b04c667e34ce3a7a3826d1573d8
INFO:transformers.configuration_utils:Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

INFO:transformers.modeling_utils:loading weights file https://cdn.huggingface.co/bert-large-uncased-pytorch_model.bin from cache at C:\Users\Administrator/.cache\

In [47]:
# test on this model!
pretrained_model.config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

In [7]:
print('testing pretrained model')
predict_tokens, ground_truth, tagsNpositions = predict_low_speed(queries, pretrained_model, pretrained_tokenizer)
result = evaluate(predict_tokens, ground_truth, tagsNpositions, embedding_model, embedding_tokenizer)

testing pretrained model


100%|████████████████████████████████████████████████████████████████████████████| 27075/27075 [17:44<00:00, 25.43it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 140/140 [00:03<00:00, 39.65it/s]
100%|███████████████████████████████████████████████████████████████████████| 270745/270745 [00:24<00:00, 10925.69it/s]


### Fine-Tuning

In [17]:
fine_tuning_model = BertForMaskedLM.from_pretrained('./myBERT')
fine_tuning_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
embedding_model = BertModel.from_pretrained('bert-base-uncased')
embedding_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

INFO:transformers.configuration_utils:loading configuration file ./myBERT\config.json
INFO:transformers.configuration_utils:Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

INFO:transformers.modeling_utils:loading weights file ./myBERT\pytorch_model.bin
INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at C:\Users\Administrator/.cache\torch\transformers\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
INFO:transform

In [35]:
print('testing fine-tuned model')

predict_tokens, ground_truth, tagsNpositions = predict_low_speed(queries, fine_tuning_model, fine_tuning_tokenizer)
result = evaluate(predict_tokens, ground_truth, tagsNpositions, embedding_model, embedding_tokenizer)

testing fine-tuned model


100%|████████████████████████████████████████████████████████████████████████████| 27075/27075 [32:07<00:00, 14.05it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 203/203 [00:05<00:00, 38.43it/s]
100%|███████████████████████████████████████████████████████████████████████| 270745/270745 [00:24<00:00, 10972.14it/s]


### evaluate

In [12]:
counter = {}
POS_counter = {}
for tagsNposition in tagsNpositions:
    if tagsNposition not in counter:
        counter[tagsNposition] = 0
    POS = tagsNposition.split(' ')[0]
    if POS not in POS_counter:
        POS_counter[POS] = 0
    counter[tagsNposition] += 1
    POS_counter[POS] += 1

In [13]:
total_score = 0
for k in sorted(counter, key=lambda k: -counter[k]):
    print(k, counter[k], result[k])
    total_score += counter[k] * result[k]
total_score /= sum(counter.values())

NN head 22859 0.8578997150384888
NN middle 20637 0.8571619530033625
NN tail 20107 0.8576473644372985
IN head 15035 0.8350105194542786
DT head 13955 0.8118574367035934
IN middle 13521 0.8338719840083599
JJ head 13398 0.8542414535994244
IN tail 11677 0.8352068005241514
JJ middle 11555 0.854528875133786
JJ tail 10762 0.855617246914034
DT middle 10141 0.8118928954065486
DT tail 8008 0.8117925959316405
NNS head 6931 0.860105493680651
NNS tail 6712 0.8604160737561526
NNS middle 6476 0.8600790793924069
VBD head 5367 0.8544084299414745
CC middle 5146 0.8021190590516538
CC tail 4329 0.7994905543756915
RB head 4265 0.8557051723475753
VBD middle 4132 0.8560149538522769
RB middle 3292 0.8535131675577106
VBN head 3181 0.8649062462776421
CC head 2931 0.8034162332441407
VBN middle 2790 0.8654335799823952
RB tail 2704 0.8534517345740598
VBD tail 2684 0.8546848179864102
VBN tail 2192 0.8661425693137367
TO middle 2167 0.81955615754605
VBZ head 1915 0.8536311236436311
VB middle 1838 0.8517110127230074
TO

In [14]:
total_score

0.8435845623789262

In [52]:
POS_result = {}
POS_count = {}
position_result = {}
position_count = {}
for key in result.keys():
    POS, position = key.split(' ')
    if POS not in POS_result:
        POS_result[POS] = 0
        POS_count[POS] = 0
    if position not in position_result:
        position_result[position] = 0
        position_count[position] = 0
    POS_result[POS] += result[key]
    POS_count[POS] += 1
    position_result[position] += result[key]
    position_count[position] += 1

for key in POS_result.keys():
    POS_result[key] /= POS_count[key]
for key in position_result.keys():
    position_result[key] /= position_count[key]

In [53]:
POS_result

{'IN': 0.8164507228091029,
 'DT': 0.7590527545858299,
 'JJ': 0.854006506637432,
 'NN': 0.8548732210070132,
 'VBZ': 0.8516416271741537,
 'RB': 0.8278739336735561,
 'VBN': 0.8611208665360882,
 'NNS': 0.8564696598191972,
 'CC': 0.8151586679297947,
 'VBP': 0.7262765500757057,
 'TO': 0.8601523128120591,
 'VB': 0.8543623682908379,
 'PRP': 0.8486366122051187,
 'VBG': 0.867899539820035,
 'JJS': 0.8450265457934139,
 'RBS': 0.8071210239999979,
 'VBD': 0.8386600828467209,
 'PRP$': 0.8506894727507722,
 'SYM': 0.8522878110293828,
 'JJR': 0.8531515305006355,
 'FW': 0.8507318463701611,
 'WP': 0.8725918824923683,
 'MD': 0.8493478157863806,
 'WRB': 0.8354810706960251,
 'RP': 0.8587108361629259,
 'WDT': 0.8391820580773115,
 'CD': 0.8263817516744624,
 'WP$': 0.7968932658893255,
 'PDT': 0.8522027058356222,
 'NNP': 0.8536616961702088,
 'RBR': 0.8549123446479031,
 'EX': 0.7089451461718856,
 'NNPS': 0.8293821703504634,
 '$': 0.8557611788764143,
 'UH': 0.8578662392165927,
 'POS': 0.9132225513458252}

In [54]:
position_result

{'head': 0.8374526887491365,
 'middle': 0.836231625667916,
 'tail': 0.8350202946898442}

## KNN

In [86]:
pretrained_model = BertModel.from_pretrained('bert-base-uncased')
pretrained_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at C:\Users\Administrator/.cache\torch\transformers\4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
INFO:transformers.configuration_utils:Model config BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

INFO:transformers.modeling_utils:loading weights file https://cdn.huggingface.co/bert-base-uncased-pytorch_model.bin from cache at C:\Users\Administrator/.cache\torch\transformers\f2ee78bdd635b758cc0a12352586868b

In [2]:
pretrained_model = BertModel.from_pretrained('./myBERT')
pretrained_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

INFO:transformers.configuration_utils:loading configuration file ./myBERT\config.json
INFO:transformers.configuration_utils:Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

INFO:transformers.modeling_utils:loading weights file ./myBERT\pytorch_model.bin
INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at C:\Users\Administrator/.cache\torch\transformers\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [5]:
pretrained_model = BertModel.from_pretrained('./myBERT_large')
pretrained_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

INFO:transformers.configuration_utils:loading configuration file ./myBERT_large\config.json
INFO:transformers.configuration_utils:Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

INFO:transformers.modeling_utils:loading weights file ./myBERT_large\pytorch_model.bin
INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt from cache at C:\Users\Administrator/.cache\torch\transformers\9b3c03a36e83b13d5ba95ac965c9f9074a99e14340c523ab405703179e79fc46.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [48]:
pretrained_model = BertModel.from_pretrained('bert-large-uncased')
pretrained_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json from cache at C:\Users\Administrator/.cache\torch\transformers\6dfaed860471b03ab5b9acb6153bea82b6632fb9bbe514d3fff050fe1319ee6d.788fed32bb8481a9b15ce726d41c53d5d5066b04c667e34ce3a7a3826d1573d8
INFO:transformers.configuration_utils:Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

INFO:transformers.modeling_utils:loading weights file https://cdn.huggingface.co/bert-large-uncased-pytorch_model.bin from cache at C:\Users\Administrator/.cache\

In [4]:
with open('data/dataset', 'r', encoding='utf-8') as f:
    baseset = [line.strip() for line in f.readlines()]
with open('data/ann_set', 'r', encoding='utf-8') as f:
    queryset = [line.strip() for line in f.readlines()]

In [7]:
fill_mask = pipeline('feature-extraction', model=pretrained_model, tokenizer=pretrained_tokenizer, device=0)

In [8]:
base_embeddings = []
stride = 10
for i in tqdm(range(0, len(baseset), stride)):
    end = i + stride
    if end > len(baseset):
        end = len(baseset)
    for embed in fill_mask(baseset[i:end]):
        base_embeddings.append(embed[0])
print('n =', len(base_embeddings), 'dimension =', len(base_embeddings[0]))

100%|██████████████████████████████████████████████████████████████████████████████| 5462/5462 [08:09<00:00, 11.17it/s]


n = 54617 dimension = 1024


In [10]:
query_embeddings = []
stride = 10
for i in tqdm(range(0, len(queryset), stride)):
    end = i + stride
    if end > len(queryset):
        end = len(queryset)
    for embed in fill_mask(queryset[i:end]):
        query_embeddings.append(embed[0])
print('n =', len(query_embeddings), 'dimension =', len(query_embeddings[0]))

100%|██████████████████████████████████████████████████████████████████████████████| 5462/5462 [08:43<00:00, 10.43it/s]


n = 54618 dimension = 1024


In [93]:
nbrs = NearestNeighbors(n_neighbors=10, algorithm='ball_tree').fit(np.array(base_embeddings))
distances, indices = nbrs.kneighbors(np.array(query_embeddings))

In [95]:
count = 0
for i, index in enumerate(indices):
    if i in index:
        count += 1
print(count/len(indices))

0.5262367717602255
