In [1]:
from transformers import BertTokenizer, BertForPreTraining

In [2]:
import torch

In [3]:
from tqdm.notebook import tqdm as tq
import pandas as pd

In [4]:
import sys
sys.path.append('../')

from util import util

In [5]:
tokenize = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
model = BertForPreTraining.from_pretrained('bert-base-multilingual-uncased')

Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Data

In [6]:
epo_df = util.decompress_pickle('../data/epo_10_20_df_wo_pre')
epo_df

Unnamed: 0,name,titles,en,de
0,EP03703859NWB1.xml,de\tMODULARER LEISTUNGSWANDLER MIT FLÜSSIGGEKÜ...,A modular power converter comprising a thermal...,"Modularer Leistungswandler, der Folgendes umfa..."
1,EP03703859NWB1.xml,de\tMODULARER LEISTUNGSWANDLER MIT FLÜSSIGGEKÜ...,"The modular power converter of claim 1, wherei...","Modularer Leistungswandler nach Anspruch 1, wo..."
2,EP03703859NWB1.xml,de\tMODULARER LEISTUNGSWANDLER MIT FLÜSSIGGEKÜ...,"The modular power converter of claim 1, wherei...","Modularer Leistungswandler nach Anspruch 1, wo..."
3,EP03703859NWB1.xml,de\tMODULARER LEISTUNGSWANDLER MIT FLÜSSIGGEKÜ...,"The modular power converter of claim 1, wherei...","Modularer Leistungswandler nach Anspruch 1, wo..."
4,EP03703859NWB1.xml,de\tMODULARER LEISTUNGSWANDLER MIT FLÜSSIGGEKÜ...,"The modular power converter of claim 1, wherei...","Modularer Leistungswandler nach Anspruch 1, wo..."
...,...,...,...,...
545093,EP17000010NWB1.xml,de\tSTREULICHTBLENDE EINES BILDERFASSUNGSGERÄT...,"Lens hood according to Claim 10, characterized...","Streulichtblende nach Anspruch 10, dadurch gek..."
545094,EP17000010NWB1.xml,de\tSTREULICHTBLENDE EINES BILDERFASSUNGSGERÄT...,"Lens hood according to one of Claims 9 to 11, ...",Streulichtblende nach einem der Ansprüche 9 bi...
545095,EP17000010NWB1.xml,de\tSTREULICHTBLENDE EINES BILDERFASSUNGSGERÄT...,Lens hood according to one of the preceding cl...,Streulichtblende nach einem der vorherigen Ans...
545096,EP17000010NWB1.xml,de\tSTREULICHTBLENDE EINES BILDERFASSUNGSGERÄT...,Lens hood according to one of the preceding cl...,Streulichtblende nach einem der vorherigen Ans...


In [7]:
corpus_en = epo_df.en.to_list()
corpus_de = epo_df.de.to_list()

len(corpus_en + corpus_de)

1090196

In [8]:
def remove_short_claims(en_claims, de_claims):
    en_claims_proc, de_claims_proc = [], []
    for en, de in zip(en_claims, de_claims):
        if len(en.split(' ')) <= 13:
            continue
        en_claims_proc.append(en)
        de_claims_proc.append(de)
    return en_claims_proc, de_claims_proc

In [9]:
corpus_en_proc, corpus_de_proc = remove_short_claims(corpus_en, corpus_de)

In [10]:
corpus_en_proc = list(set(corpus_en_proc))
corpus_de_proc = list(set(corpus_de_proc))
len(corpus_en_proc+corpus_de_proc)

700023

In [11]:
bag = corpus_en_proc + corpus_de_proc

In [12]:
epo_group_df = epo_df.groupby('name')
epo_group_df.first()

Unnamed: 0_level_0,titles,en,de
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
EP00100977NWB1.xml,de\tAnhängekupplung\ten\tTrailer coupling\tfr\...,Trailer coupling for motor vehicles comprising...,Anhängekupplung für Kraftfahrzeuge umfassend e...
EP00100987NWB1.xml,"de\tAntriebssystem für Fahrzeuge, insbesondere...","A drive system 1 for vehicles, in particular c...","Antriebssystem 1 für Fahrzeuge, insbesondere N..."
EP00102859NWB1.xml,de\tZweikreis-Bremsventil für ein Lenkhilfssys...,A pressure balancing valve arrangement 22 comp...,"Druckausgleichventilanordnung 22, umfassend ei..."
EP00104159NWB1.xml,de\tMotorisch angetriebener Fensterheber mit F...,"A power window apparatus for an automobile, co...","Fensterhebervorrichtung für ein Automobil, mit..."
EP00104835NWB1.xml,de\tMotorregelungsystem für ein hybrides Fahrz...,An engine control system for a hybrid vehicle ...,Maschinensteuer Regelsystem für ein Hybridfahr...
...,...,...,...
EP99966772NWB1.xml,de\tSTOSSDÄMPFER\ten\tSHOCK ABSORBER\tfr\tAMOR...,A shock absorber comprising a first cylinder 1...,"Stoßdämpfer, umfassend einen ersten Zylinder 1..."
EP99968940NWB1.xml,de\tGERÄT UND VERFAHREN ZUM VERHINDERN EINER A...,A system 500 for performing an automatic seque...,Ein System 500 zur Ausführung einer automatisc...
EP99970064NWB1.xml,de\tFAHRHILFEVORRICHTUNG UND AUFZEICHNUNGSMEDI...,A driving-operation assist device in a vehicle...,"Fahrhilfevorrichtung in einem Fahrzeug, umfass..."
EP99971507NWB1.xml,de\tFAHRGASTDETECTOR\ten\tPASSENGER DETECTOR\t...,Passenger detector comprising a flexible suppo...,"Fahrgastdetektor, umfassend einen flexiblen Tr..."


In [13]:
names_list = list(set(epo_df.name.to_list()))

In [14]:
epo_filt = []
for name in tq(names_list):
    en_claims = epo_group_df.get_group(name).en.to_list()
    de_claims = epo_group_df.get_group(name).de.to_list()
    
    
    en_claims, de_claims = remove_short_claims(en_claims, de_claims)
    
    epo_filt.append(
        {
            'name': name,
            'en': en_claims,
            'de': de_claims
        }
    )
epo_filt_df = pd.DataFrame(epo_filt)
epo_filt_df

HBox(children=(FloatProgress(value=0.0, max=52660.0), HTML(value='')))




Unnamed: 0,name,en,de
0,EP11010288NWB1.xml,[A charging connector 10 which is to be connec...,"[Ein Ladestecker 10, der in einer Montagericht..."
1,EP06818602NWB1.xml,[A method for controlling an electrical system...,[Verfahren zum Steuern eines Bordnetzes eines ...
2,EP07120057NWB1.xml,[Tensioning device for tensioning a flexible e...,[Spannvorrichtung zum Spannen eines flexiblen ...
3,EP08863048NWB1.xml,"[Wiping system 32 having a fastening tube, by ...",[Wischanlage 32 mit einemdurch das ein Wischer...
4,EP09014669NWB1.xml,"[Connector device 32, in particular for an ele...","[Steckverbindungsvorrichtung 32, insbesondere ..."
...,...,...,...
52655,EP15775695NWB1.xml,[Pneumatic vehicle tyre of a radial design wit...,[Fahrzeugluftreifen in Radialbauart mit einem ...
52656,EP17204405NWB1.xml,"[Vehicle, comprising a drive motor 18 arrange...","[Fahrzeug, umfassend einen in einem Motorraum..."
52657,EP17181629NWB1.xml,"[Motor vehicle, with an interior 13 which is b...","[Kraftfahrzeug, mit einem durch eine Bodenstru..."
52658,EP16165359NWB1.xml,"[Battery charger, comprising three inductance...","[Batterieladegerät, aufweisend drei Induktore..."


In [15]:
bag_en = corpus_en_proc
bag_en_size = len(bag_en)

bag_de = corpus_de_proc
bag_de_size = len(bag_de)

print(bag_en_size, bag_de_size)

352136 347887


In [16]:
bag_size = len(bag)
bag_size

700023

In [17]:
print(epo_filt_df.loc[epo_filt_df['name'] == names_list[10]])

                  name                                                 en  \
10  EP05805437NWB1.xml  [A wiper blade 13, 41, 51, 61, 71, 81 for wipi...   

                                                   de  
10  [Wischerblatt 13, 41, 51, 61, 71, 81 zum Wisch...  


In [56]:
import random

sentence_a = []
sentence_b = []
labels = []

for name in tq(names_list):
    en_list = epo_filt_df.loc[epo_filt_df['name'] == name].en.to_list()[0]
    de_list = epo_filt_df.loc[epo_filt_df['name'] == name].de.to_list()[0]
    
    assert len(en_list) == len(de_list), 'List sizes are not consistent'
    num_sentences = len(en_list)
    
    if num_sentences > 1:
        for _ in range(num_sentences):
            sent_ind = random.randint(0, num_sentences-2)
            
            # EN
            sentence_a.append(en_list[sent_ind]) 
            if random.random() > 0.5:
                # for 50 % cases, select a random sentence from the list
                # label - 1 - IsNotNext
                sentence_b.append(bag[random.randint(0, bag_size-1)])
                labels.append(1)
            else:
                # for the other 50 % cases, select the next sentence from the list
                # label - 0 - IsNext
                # here, give the sentence from the other language in the smae index
                sentence_b.append(de_list[sent_ind])
                labels.append(0)
                
                
            # DE
            sentence_a.append(de_list[sent_ind]) 
            if random.random() > 0.5:
                # for 50 % cases, select a random sentence from the list
                # label - 1 - IsNotNext
                sentence_b.append(bag[random.randint(0, bag_size-1)])
                labels.append(1)
            else:
                # for the other 50 % cases, select the next sentence from the list
                # label - 0 - IsNext
                # here, give the sentence from the other language in the smae index                
                sentence_b.append(en_list[sent_ind])
                labels.append(0)            

HBox(children=(FloatProgress(value=0.0, max=52660.0), HTML(value='')))




In [57]:
len(sentence_a)

732376

In [58]:
for i in range(10):
    print('Label: %d\nSentences:\n%s\n%s\n-----' %(labels[i],
                                                  sentence_a[i],
                                                  sentence_b[i])
         )

Label: 1
Sentences:
A charging connector according to claim 1, wherein the connecting portion 13 includes a power supply terminal 20 for supplying power and the protection cap 30 60 includes at least one separation wall 35 38 extending from a facing surface of the facing wall 30A substantially opposite to the mounting direction MD to separate the power supply terminal 20 from another terminal.
Aktives Fahrwerk gemäß Anspruch 3, wobei das Trajektorieentwicklungssystem eine Glättungseinrichtung beinhaltet, um die Profildaten zur Entwicklung des Trajektorieplans zu glätten.
-----
Label: 0
Sentences:
Ein Ladestecker nach Anspruch 1, wobei der Anschlussabschnitt 13 zur Stromversorgung einen Stromversorgungsanschluss 20 beinhaltet und wobei die Schutzkappe 30 60 mindestens eine Trennwand 35 38 beinhaltet, die von einer Stirnfläche der Stirnwand 30A im Wesentlichen entgegen der Montagerichtung MD ausgeht, um den Stromversorgungsanschluss 20 von einem anderen Anschluss zu trennen.
A charging c

In [123]:
# %%time
# inputs = tokenize(sentence_a, sentence_b, return_tensors='pt',
#                  max_length=512, truncation=True, padding='max_length')
# inputs.keys()

CPU times: user 31min 33s, sys: 6.87 s, total: 31min 40s
Wall time: 31min 40s


dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [124]:
# inputs['labels'] = torch.LongTensor([labels]).T

In [125]:
inputs

{'input_ids': tensor([[  101, 10144, 94120,  ...,     0,     0,     0],
        [  101, 10350, 20736,  ...,     0,     0,     0],
        [  101, 10103, 94120,  ...,     0,     0,     0],
        ...,
        [  101, 48324, 10123,  ...,     0,     0,     0],
        [  101,   143, 33714,  ...,     0,     0,     0],
        [  101, 48324, 10123,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[1],
        [0],
        [1],
        ...,
        [0],
        [0],
        [1]])}

In [126]:
%%time
util.compress_pickle('../data/inputs_NSP', inputs)

CPU times: user 5min 52s, sys: 2.56 s, total: 5min 54s
Wall time: 5min 54s


## Reload

In [19]:
%%time
inputs = util.decompress_pickle('../data/inputs_NSP')

CPU times: user 54.7 s, sys: 3.91 s, total: 58.6 s
Wall time: 58.7 s


In [20]:
inputs['input_ids'].shape

torch.Size([732376, 512])

In [21]:
inputs

{'input_ids': tensor([[  101, 10144, 94120,  ...,     0,     0,     0],
        [  101, 10350, 20736,  ...,     0,     0,     0],
        [  101, 10103, 94120,  ...,     0,     0,     0],
        ...,
        [  101, 48324, 10123,  ...,     0,     0,     0],
        [  101,   143, 33714,  ...,     0,     0,     0],
        [  101, 48324, 10123,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[1],
        [0],
        [1],
        ...,
        [0],
        [0],
        [1]])}

In [22]:
# here, reloaded inputs was created for NSP. labels should be renamed as next_sentence_label
inputs['next_sentence_label'] = inputs['labels']
inputs

{'input_ids': tensor([[  101, 10144, 94120,  ...,     0,     0,     0],
        [  101, 10350, 20736,  ...,     0,     0,     0],
        [  101, 10103, 94120,  ...,     0,     0,     0],
        ...,
        [  101, 48324, 10123,  ...,     0,     0,     0],
        [  101,   143, 33714,  ...,     0,     0,     0],
        [  101, 48324, 10123,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[1],
        [0],
        [1],
        ...,
        [0],
        [0],
        [1]]), 'next_sentence_label': tensor([[1],
        [0

## MLM

In [23]:
mask_arr = torch.rand(inputs.input_ids.shape)
selection = (mask_arr < 0.15) * (inputs.input_ids != 101) * (inputs.input_ids != 102) * (inputs.input_ids != 0)
mask_indices = []
for idx in range(len(selection)):
    mask_indices.append(
        torch.flatten(selection[idx].nonzero()).tolist()
    )
inputs['labels'] = inputs.input_ids.detach().clone()
inputs['labels']

tensor([[  101, 10144, 94120,  ...,     0,     0,     0],
        [  101, 10350, 20736,  ...,     0,     0,     0],
        [  101, 10103, 94120,  ...,     0,     0,     0],
        ...,
        [  101, 48324, 10123,  ...,     0,     0,     0],
        [  101,   143, 33714,  ...,     0,     0,     0],
        [  101, 48324, 10123,  ...,     0,     0,     0]])

In [24]:
for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, mask_indices[i]] = 103

In [25]:
inputs

{'input_ids': tensor([[  101,   103, 94120,  ...,     0,     0,     0],
        [  101, 10350, 20736,  ...,     0,     0,     0],
        [  101, 10103, 94120,  ...,     0,     0,     0],
        ...,
        [  101, 48324, 10123,  ...,     0,     0,     0],
        [  101,   143, 33714,  ...,     0,     0,     0],
        [  101, 48324, 10123,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[  101, 10144, 94120,  ...,     0,     0,     0],
        [  101, 10350, 20736,  ...,     0,     0,     0],
        [  101, 10103, 

## PyTorch Dataset Class

In [15]:
class EpoDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
        
    def __len__(self):
        return self.encodings.input_ids.shape[0]
    
    def __getitem__(self, idx):
        return {key: tensor[idx] for key, tensor in self.encodings.items()}

In [16]:
dataset = EpoDataset(encodings=inputs)

In [17]:
dataloader = torch.utils.data.DataLoader(dataset, 
                                         batch_size=16,
                                         shuffle=True)

## Training loop

In [18]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [19]:
import torch.nn as nn
model= nn.DataParallel(model, device_ids = [0, 1])

In [20]:
model.to(device)
model.train()

DataParallel(
  (module): BertForPreTraining(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(105879, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
       

**Define optimizer** 

In [21]:
from transformers import AdamW

In [22]:
optim = AdamW(model.parameters(), lr=1e-5)

**Training loop**

In [23]:
from tqdm import tqdm

In [24]:
EPOCHS = 2
for epoch in range(EPOCHS):
    loop = tqdm(dataloader, leave=True)
    
    for batch in loop:
        optim.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        next_sentence_label = batch['next_sentence_label'].to(device)
        labels = batch['labels'].to(device)
        
        output = model(input_ids, 
                       token_type_ids=token_type_ids,
                       attention_mask=attention_mask, 
                       next_sentence_label=next_sentence_label,
                       labels=labels)
        
        loss = output.loss
        loss = loss.mean()
        loss.backward()
        
        optim.step()
        
        loop.set_description(f'Epoch: {epoch}')
        loop.set_postfix(loss=loss.item())

Epoch: 0:   0%|                                              | 60/45774 [00:52<11:12:58,  1.13it/s, loss=0.541]


KeyboardInterrupt: 

In [None]:
torch.save(model, '../data/model_MLM_NSP.pt')