In [1]:
import torch
from transformers import AutoModel, AutoTokenizer

In [2]:
tokenizer = AutoTokenizer.from_pretrained("redewiedergabe/bert-base-historical-german-rw-cased")

In [3]:
text = "schwul"
marked_text = "[CLS] " + text + " [SEP]"

# Tokenize our sentence with the BERT tokenizer.
tokenized_text = tokenizer.tokenize(marked_text)

# Print out the tokens.
print (tokenized_text)

['[CLS]', 'schw', '##ul', '[SEP]']


In [4]:
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

In [5]:
indexed_tokens

[3, 1789, 236, 4]

In [6]:
segments_ids = [1] * len(tokenized_text)

In [7]:
# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

# Load pre-trained model (weights)
model = AutoModel.from_pretrained("redewiedergabe/bert-base-historical-german-rw-cased")

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading:   0%|          | 0.00/419M [00:00<?, ?B/s]

Some weights of the model checkpoint at redewiedergabe/bert-base-historical-german-rw-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30000, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [8]:
tokens_tensor

tensor([[   3, 1789,  236,    4]])

In [9]:
# Predict hidden states features for each layer
with torch.no_grad():
    encoded_layers, _ = model(tokens_tensor, segments_tensors)

In [10]:
encoded_layers

'last_hidden_state'

In [11]:
from transformers import BertTokenizer, BertModel

import torch

tokenizer = BertTokenizer.from_pretrained("redewiedergabe/bert-base-historical-german-rw-cased")

model = BertModel.from_pretrained("redewiedergabe/bert-base-historical-german-rw-cased")

word = "schwul"

inputs = tokenizer(word, return_tensors="pt")

outputs = model(**inputs)

word_vect = outputs.pooler_output.detach().numpy()

Some weights of the model checkpoint at redewiedergabe/bert-base-historical-german-rw-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
w1 = model(**tokenizer("bank", return_tensors="pt")).pooler_output.detach().numpy()
w2 = model(**tokenizer("bank", return_tensors="pt")).pooler_output.detach().numpy()

In [13]:
w2.shape

(1, 768)

In [14]:
w1.shape

(1, 768)

In [22]:
import operator
def foo(w, l):
    w1 = model(**tokenizer(w, return_tensors="pt")).pooler_output.detach().numpy()

    output = {}

    for tw in l:
        w2 = model(**tokenizer(tw, return_tensors="pt")).pooler_output.detach().numpy()
        w2 = w2.T
        cos_sim = dot(w1, w2)/(norm(w1)*norm(w2))
        output[tw] = cos_sim[0][0] # schlauer mit np.squeeze

    return sorted(output.items(), key=lambda kv: kv[1], reverse=True)

In [23]:
l = ["lesbisch", "blöd", "nett", "normal"]
w = "schwul" 

In [24]:
foo(w, l)

[('blöd', 0.9539089),
 ('nett', 0.94451356),
 ('lesbisch', 0.93899447),
 ('normal', 0.8920188)]

In [27]:
w = "homosexuell"

schwul = foo(w, tokenizer.vocab)

In [28]:
schwul

[('Haare', 0.9445954),
 ('Schuhe', 0.9353191),
 ('Tier', 0.9337626),
 ('Doping', 0.9335826),
 ('Muskel', 0.93301064),
 ('Individual', 0.9325249),
 ('Fleisch', 0.9319345),
 ('Mode', 0.93151754),
 ('Hunde', 0.9291711),
 ('Fett', 0.9283857),
 ('Hund', 0.9276322),
 ('Gehirn', 0.9274954),
 ('Sex', 0.92734426),
 ('Körper', 0.9270321),
 ('Rasse', 0.9266794),
 ('Tuch', 0.92575765),
 ('##ologisch', 0.9257426),
 ('Rugby', 0.9257305),
 ('##mechanismus', 0.9246756),
 ('Kokain', 0.92444193),
 ('##izismus', 0.924358),
 ('Knochen', 0.92428106),
 ('Schönheit', 0.9242546),
 ('Krank', 0.923744),
 ('Trikot', 0.9234882),
 ('Messer', 0.9229476),
 ('Nerven', 0.9228459),
 ('Homosex', 0.92212844),
 ('Pflanze', 0.92203903),
 ('Kleider', 0.92188454),
 ('Schmutz', 0.9217084),
 ('Fahren', 0.9214889),
 ('Schweine', 0.9207692),
 ('Uniform', 0.92073226),
 ('Draht', 0.9207175),
 ('Weibchen', 0.92067057),
 ('Gift', 0.9202469),
 ('Tiere', 0.9202417),
 ('Nase', 0.92010725),
 ('Individuen', 0.9199351),
 ('Heirat', 0.9195