In [14]:
import itertools

In [16]:
def unlist(l):
    """Flatten one level of nesting.

    Args:
        l: an iterable whose elements are themselves iterables.

    Returns:
        A new list holding the items of every element of `l`, in order.
    """
    flattened = []
    for inner in l:
        flattened.extend(inner)
    return flattened

In [None]:
import torch
# Load the `tokenizer` entry point of the Hugging Face hub repo for `bert-base-cased`.
tokenizer = torch.hub.load(
    'huggingface/pytorch-transformers',
    'tokenizer',
    'bert-base-cased',
)

In [20]:
sentences = [
    "Who is Aimee Van Wynsberghe ?",
    "Aimee Van Wynsberghe is a Professor for Applied Ethics ."
]

# Encode both sentences together as ONE sentence pair, so the special tokens are laid out as
# [CLS] sentence A [SEP] sentence B [SEP]. Encoding each sentence on its own and concatenating
# the results would leave a second [CLS] in the middle of the sequence.
encoding = tokenizer(sentences[0], sentences[1], add_special_tokens=True)

# Plain Python lists: later cells overwrite single positions and wrap them in tensors.
indexed_tokens = list(encoding["input_ids"])

# Segment ids (0 for the first sentence, 1 for the second) as produced by the tokenizer itself.
segments_ids = list(encoding["token_type_ids"])

In [24]:
# Convert inputs to PyTorch tensors (the outer list adds a leading batch dimension of size 1)
segments_tensors = torch.tensor([segments_ids])
tokens_tensor = torch.tensor([indexed_tokens])

model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-cased')

# Inference only, so skip gradient tracking.
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)

# Index the output instead of tuple-unpacking it: recent transformers versions return a
# dict-like output object, and unpacking that (`a, b = model(...)`) yields its keys (strings)
# rather than tensors. Integer indexing works for both the object and the legacy tuple form.
encoded_layers = outputs[0]

Using cache found in /home/tim/.cache/torch/hub/huggingface_pytorch-transformers_master
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Mask a token that we will try to predict back with `BertForMaskedLM`

# NOTE(review): position 8 is hard-coded; which word piece it lands on depends on how the
# tokenizer splits `sentences` -- confirm it still points at the intended token.
masked_index = 8
# Overwrites that position in place with the tokenizer's mask-token id (`indexed_tokens` is mutated).
indexed_tokens[masked_index] = tokenizer.mask_token_id
# Rebuild the input tensor from the masked ids; the outer list adds a leading dimension of size 1.
tokens_tensor = torch.tensor([indexed_tokens])

In [3]:
# Load the `modelForMaskedLM` entry point for the same `bert-base-cased` checkpoint.
masked_lm_model = torch.hub.load('huggingface/pytorch-transformers', 'modelForMaskedLM', 'bert-base-cased')

# Inference only, so skip gradient tracking.
with torch.no_grad():
    predictions = masked_lm_model(tokens_tensor, token_type_ids=segments_tensors)

# Get the predicted token: take the scores of the first (only) sequence at the masked
# position and pick the best-scoring vocabulary id there.
predicted_index = torch.argmax(predictions[0][0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]

# Display the prediction. The previous `assert predicted_token == 'Jim'` belonged to a
# different example sentence and does not match the sentences used in this notebook.
predicted_token

Using cache found in /home/tim/.cache/torch/hub/huggingface_pytorch-transformers_master
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Questions

In [None]:
# Model and tokenizer are loaded from the same SQuAD-finetuned checkpoint so that the
# token ids produced by the tokenizer match what the model expects.
question_answering_model = torch.hub.load(
    'huggingface/pytorch-transformers',
    'modelForQuestionAnswering',
    'bert-large-uncased-whole-word-masking-finetuned-squad',
)
question_answering_tokenizer = torch.hub.load(
    'huggingface/pytorch-transformers',
    'tokenizer',
    'bert-large-uncased-whole-word-masking-finetuned-squad',
)

In [42]:
def answer_question(question, paragraph):
    """Extract the span of `paragraph` that the question-answering model picks for `question`.

    Uses the module-level `question_answering_tokenizer` and `question_answering_model`.

    Args:
        question: the question text (may be an empty string).
        paragraph: the context text the answer span is taken from.

    Returns:
        The decoded text of the tokens between the highest-scoring start position and the
        highest-scoring end position (inclusive). If the predicted end lies before the
        predicted start, the selected span is empty and its decoding is returned.
    """
    # Encode paragraph and question together as ONE pair, so the special tokens are laid out
    # as [CLS] paragraph [SEP] question [SEP] and the segment ids come from the tokenizer.
    # Encoding both texts separately and concatenating them would leave a second [CLS] in the
    # middle of the sequence.
    # NOTE(review): the paragraph-first order of this notebook is kept; confirm it is the
    # order the checkpoint was fine-tuned with.
    encoding = question_answering_tokenizer(
        paragraph, question, add_special_tokens=True, return_tensors="pt"
    )
    tokens_tensor = encoding["input_ids"]
    segments_tensors = encoding["token_type_ids"]

    # Predict the start and end positions logits
    with torch.no_grad():
        out = question_answering_model(tokens_tensor, token_type_ids=segments_tensors)

    # get the highest prediction, as plain ints so they can slice a Python list
    start_index = torch.argmax(out.start_logits).item()
    end_index = torch.argmax(out.end_logits).item()

    # Token ids of the single sequence in the batch.
    indexed_tokens = tokens_tensor[0].tolist()
    answer = question_answering_tokenizer.decode(indexed_tokens[start_index : end_index + 1])

    return answer

In [44]:
# Context passage that the question-answering calls are asked about.
# Each sentence fragment ends with a space so adjacent sentences do not run together
# (previously "2021." and "There" were joined without a separator).
paragraph = (
    "Aimee Van Wynsberghe is a Professor for Applied Ethics at the University of Bonn. "
    "She moved to Bonn in the beginning of 2021. "
    "There she launched the Sustainable AI Lab"
)

In [52]:
# Ask for a description of the person; the answer is a span taken from `paragraph`.
answer_question("Who is Aimee Van Wynsberghe?", paragraph)

'professor for applied ethics'

In [53]:
# Ask for the workplace mentioned in `paragraph`.
answer_question("Where does Aimee Van Wynsberghe work?", paragraph)

'university of bonn'

In [54]:
# Ask for the city mentioned in `paragraph`.
answer_question("In which city does Aimee Van Wynsberghe live?", paragraph)

'bonn'

In [55]:
# Ask for a date; the question uses only the first name and a lower-case "bonn".
answer_question("When did Aimee move to bonn?", paragraph)

'2021'

In [56]:
# Edge case: an empty question string. A span is still returned, so treat the
# result as meaningless rather than as an answer.
answer_question("", paragraph)

'applied ethics'

## Presentation

0. Who am I and why am I here?
    0.1. Who am I
    0.2. Challenged by Aimee -- This is why I'm here
1. Contents
2. A primer on AI using NLP
    2.1. AI -- Taking the magic out of it
    2.2. NLP -- Why it's useful and why it's already in your pockets (smartphones)
    2.3. Concepts BERT -- Specific models
    2.4. Example -- Hands on programming
3. The Project
    3.1 Concepts
    3.2 Current Status
    3.3 Future
4. Final Remarks
    4.1 Questions
    4.2 How to reach me

In [38]:
sentences = [
    "Aimee Van Wynsberghe is a Professor for Applied Ethics at the University of Bonn.",
    "Who is Aimee Van Wynsberghe ?",
]

# Encode paragraph and question together as ONE pair, so the special tokens are laid out as
# [CLS] paragraph [SEP] question [SEP]. Encoding each text on its own and concatenating the
# results would leave a second [CLS] in the middle of the sequence.
encoding = question_answering_tokenizer(sentences[0], sentences[1], add_special_tokens=True)

# Plain Python lists: the next cell wraps them in tensors and slices `indexed_tokens`.
indexed_tokens = list(encoding["input_ids"])

# Segment ids (0 for the paragraph, 1 for the question) as produced by the tokenizer itself.
segments_ids = list(encoding["token_type_ids"])

The format is paragraph first and then question:
text_1 = "Aimee van wynsberghe is a Professor for Applied Ethics of AI"
text_2 = "Who was Jim Henson ?"
indexed_tokens = question_answering_tokenizer.encode(text_1, text_2, add_special_tokens=True)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]

In [39]:
# Wrap the id lists in an outer list so each tensor gets a leading dimension of size 1.
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

# Run the question-answering model without tracking gradients (inference only).
with torch.no_grad():
    out = question_answering_model(tokens_tensor, token_type_ids=segments_tensors)

# Keep the tokens from the best-scoring start position up to and including the
# best-scoring end position, then turn them back into text.
span_start = torch.argmax(out.start_logits)
span_stop = torch.argmax(out.end_logits) + 1
answer = question_answering_tokenizer.decode(indexed_tokens[span_start:span_stop])

In [41]:
# Show the decoded answer span.
answer

'professor for applied ethics'

In [33]:
# Sanity-check the extracted answer. The previous expected value ("puppeteer") belonged to a
# different example paragraph; for the paragraph used here the recorded answer is
# 'professor for applied ethics'. Only the key phrase is checked because the exact span
# boundaries chosen by the model may vary.
assert "applied ethics" in answer, answer

# Or get the total loss which is the sum of the CrossEntropy loss for the start and end token positions (set model to train mode before if used for training)
# NOTE(review): 12 and 14 look like placeholder labels carried over from the earlier example --
# confirm they mark the real answer span in `indexed_tokens` before relying on the loss value.
start_positions, end_positions = torch.tensor([12]), torch.tensor([14])
# NOTE(review): despite its name, this holds whatever the model call returns when labels are
# passed (presumably an output object carrying the loss) -- confirm before using it as a scalar.
multiple_choice_loss = question_answering_model(
    tokens_tensor,
    token_type_ids=segments_tensors,
    start_positions=start_positions,
    end_positions=end_positions,
)

Using cache found in /home/tim/.cache/torch/hub/huggingface_pytorch-transformers_master


Downloading:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Using cache found in /home/tim/.cache/torch/hub/huggingface_pytorch-transformers_master


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]