In [1]:
import torch
from transformers import AutoModel, AutoTokenizer

In [2]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-german-cased')

In [3]:
text = "schwul"
marked_text = "[CLS] " + text + " [SEP]"

# Tokenize our sentence with the BERT tokenizer.
tokenized_text = tokenizer.tokenize(marked_text)

# Print out the tokens.
print (tokenized_text)

['[CLS]', 'schw', '##ul', '[SEP]']


In [4]:
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

In [5]:
indexed_tokens

[3, 1789, 236, 4]

In [6]:
segments_ids = [1] * len(tokenized_text)

In [7]:
# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

# Load pre-trained model (weights)
model = AutoModel.from_pretrained('bert-base-german-cased')

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30000, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [24]:
tokens_tensor

tensor([[   3, 1789,  236,    4]])

In [10]:
# Predict hidden states features for each layer
with torch.no_grad():
    encoded_layers, _ = model(tokens_tensor, segments_tensors)

In [11]:
encoded_layers

'last_hidden_state'

bspwort: "schwul"

1. bert model laden ✅
2. embedding von bspwort extrahieren aus dem bert model (featur extraction) ✅
3. irgendwie most similar embeddings finden
4. diese embeddings in tokens umwandeln
5. profit

In [12]:
from transformers import BertTokenizer, BertModel

import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased')

model = BertModel.from_pretrained('bert-base-german-cased')

word = "schwul"

inputs = tokenizer(word, return_tensors="pt")

outputs = model(**inputs)

word_vect = outputs.pooler_output.detach().numpy()

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
w1 = model(**tokenizer("bank", return_tensors="pt")).pooler_output.detach().numpy()
w2 = model(**tokenizer("bank", return_tensors="pt")).pooler_output.detach().numpy()

In [14]:
w2.shape

(1, 768)

In [15]:
w1.shape

(1, 768)

In [16]:
from numpy import dot
from numpy.linalg import norm

a = w1
b = w2.T

cos_sim = dot(a, b)/(norm(a)*norm(b))

# 1. das hier in ne funktion packen
# 2. liste von allen wörter aus bert übergeben
# für jedes wort (bzw. subtoken), ähnlichkeit berechnen wir hier in der zelle

In [17]:
cos_sim[0][0]

0.99999994

In [21]:
def foo(w, l):
    
    w1 = model(**tokenizer(w, return_tensors="pt")).pooler_output.detach().numpy()

    output = {}

    for tw in l:
        w2 = model(**tokenizer(tw, return_tensors="pt")).pooler_output.detach().numpy()
        w2 = w2.T
        cos_sim = dot(w1, w2)/(norm(w1)*norm(w2))
        output[tw] = cos_sim[0][0] # schlauer mit np.squeeze

    return output

In [22]:
l = ["lesbisch", "blöd", "nett", "normal"]
w = "schwul"

In [23]:
foo(w, l)

{'lesbisch': 0.8946189,
 'blöd': 0.8641924,
 'nett': 0.9194622,
 'normal': 0.9110072}

- idee 1: feste liste von wörtern nehmen, die mit bert vergleichen wie oben. dann most_similar bei word2vec machen und gucken, wieviele der top 100 wörter oder so in den top 100 vorkommen
- idee 2: mit word2vec anfangen, top 100 wörter ausgeben. die liste extrahieren und bert als liste übergeben. gucken, ob die verteilung der wörter übereinstimmt, d.h. die top 100 gleiche reihenfolge
- idee 3: alle deutschen wörter nehmen haha: https://www.google.com/search?q=liste+aller+deutschen+w%C3%B6rter&rlz=1C1GCEU_deDE945DE945&oq=liste+aller+deutschen+w%C3%B6rter&aqs=chrome..69i57j0i22i30l2.6242j0j7&sourceid=chrome&ie=UTF-8