In [None]:
import torch
import evaluate

from datasets import load_dataset
import numpy as np
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import pipeline
from transformers import AutoModel, AutoModelForSequenceClassification, BertConfig, BertModel
from transformers import AutoTokenizer, AdamW, DataCollatorWithPadding, TrainingArguments, Trainer
from transformers import get_scheduler, CamembertTokenizer, CamembertForMaskedLM, AutoModelForMaskedLM

In [None]:
# Sample two-paragraph English passage about engineering education.
# tutorial_1 passes this string to the summarization pipeline.
text = """
    America has changed dramatically during recent years. Not only has the number of 
    graduates in traditional engineering disciplines such as mechanical, civil, 
    electrical, chemical, and aeronautical engineering declined, but in most of 
    the premier American universities engineering curricula now concentrate on 
    and encourage largely the study of engineering science. As a result, there 
    are declining offerings in engineering subjects dealing with infrastructure, 
    the environment, and related issues, and greater concentration on high 
    technology subjects, largely supporting increasingly complex scientific 
    developments. While the latter is important, it should not be at the expense 
    of more traditional engineering.

    Rapidly developing economies such as China and India, as well as other 
    industrial countries in Europe and Asia, continue to encourage and advance 
    the teaching of engineering. Both China and India, respectively, graduate 
    six and eight times as many traditional engineers as does the United States. 
    Other industrial countries at minimum maintain their output, while America 
    suffers an increasingly serious decline in the number of engineering graduates 
    and a lack of well-educated engineers.
"""

def tutorial_1():
    """Tour the ``pipeline`` helper across several NLP tasks, printing each result.

    Tasks exercised, in order: sentiment analysis, zero-shot classification,
    text generation, mask filling, named-entity recognition, question
    answering, summarization (of the module-level ``text``) and French-to-English
    translation. Only the translation pipeline names an explicit model; the
    others rely on whatever default ``pipeline`` selects for the task.

    Returns:
        None. All results are written to stdout.
    """
    classifier = pipeline("sentiment-analysis")
    print(classifier("I've been waiting for a HuggingFace course my whole life."))
    print(classifier(["I've been waiting for a HuggingFace course my whole life.", "I hate this so much!"]))

    classifier = pipeline("zero-shot-classification")
    print(classifier("This is a course about the Transformers library", candidate_labels=["education", "politics", "business"],))

    generator = pipeline("text-generation")
    print(generator("In this course, we will teach you how to"))

    unmasker = pipeline("fill-mask")
    print(unmasker("This course will teach you all about <mask> models.", top_k=2))

    # `grouped_entities=True` is deprecated; `aggregation_strategy="simple"` is
    # the documented replacement and groups sub-tokens of one entity the same way.
    ner = pipeline("ner", aggregation_strategy="simple")
    print(ner("My name is Sylvain and I work at Hugging Face in Brooklyn."))

    question_answerer = pipeline("question-answering")
    print(question_answerer(question="Where do I work?", context="My name is Sylvain and I work at Hugging Face in Brooklyn",))

    summarizer = pipeline("summarization")
    print(summarizer(text))

    translator = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en")
    print(translator("Ce cours est produit par Hugging Face."))

In [None]:
def tutorial_2():
    """Compare fill-mask suggestions for two prompts that differ only in the subject.

    Loads a ``bert-base-uncased`` fill-mask pipeline once and, for each prompt,
    prints the list of ``token_str`` values from the returned candidates.
    """
    fill_mask = pipeline("fill-mask", model="bert-base-uncased")
    prompts = (
        "This man works as a [MASK].",
        "This woman works as a [MASK].",
    )
    for prompt in prompts:
        candidates = fill_mask(prompt)
        print([candidate["token_str"] for candidate in candidates])

In [None]:
def tutorial_3():
    """Walk through what a sentiment pipeline does by hand: tokenize, run, post-process.

    Prints, in order: the tokenizer output, the shape of the base model's last
    hidden state, the classification logits (shape then values), their softmax
    over the last dimension, and the model's ``id2label`` mapping.
    """
    checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
    sentences = [
        "I've been waiting for a HuggingFace course my whole life.",
        "I hate this so much!",
    ]

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    encoded = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    print(encoded)

    # Base model: exposes hidden states only.
    base_model = AutoModel.from_pretrained(checkpoint)
    hidden = base_model(**encoded)
    print(hidden.last_hidden_state.shape)

    # Same checkpoint loaded through the sequence-classification class: exposes logits.
    classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    logits = classifier(**encoded).logits
    print(logits.shape)
    print(logits)

    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    print(probabilities)

    print(classifier.config.id2label)

In [None]:
def tutorial_4():
    """Create a BERT model from a config, then load, save and run a pretrained one.

    Prints the default ``BertConfig`` and the output of the pretrained
    ``bert-base-cased`` model on a small batch of hard-coded token ids. The
    pretrained model is also saved under ``pretrained_models/bert-custom``.
    """
    default_config = BertConfig()
    # Built from the configuration alone (no pretrained weights are loaded
    # here); this instance is replaced by the pretrained model just below.
    bert = BertModel(default_config)
    print(default_config)

    bert = BertModel.from_pretrained("bert-base-cased")
    bert.save_pretrained("pretrained_models/bert-custom")

    # Kept for reference next to the ids; not used by the code below.
    sequences = ["Hello!", "Cool.", "Nice!"]
    token_id_rows = [
        [101, 7592, 999, 102],
        [101, 4658, 1012, 102],
        [101, 3835, 999, 102],
    ]

    id_tensor = torch.tensor(token_id_rows)
    print(bert(id_tensor))

In [None]:
def tutorial_5():
    """Show tokenization step by step with the ``bert-base-cased`` tokenizer.

    Prints a whitespace split of a sample sentence, the full tokenizer output
    for another sentence, that sentence's tokens, their ids, and finally the
    text decoded from a fixed list of ids. The tokenizer is also saved to
    ``directory_on_my_computer``.
    """
    checkpoint = "bert-base-cased"

    # Simplest possible tokenization: split on whitespace.
    print("Jim Henson was a puppeteer".split())

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    print(tokenizer("Using a Transformer network is simple"))
    tokenizer.save_pretrained("directory_on_my_computer")

    # Load again and go through the two stages separately: text -> tokens -> ids.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    sequence = "Using a Transformer network is simple"
    word_pieces = tokenizer.tokenize(sequence)
    print(word_pieces)

    piece_ids = tokenizer.convert_tokens_to_ids(word_pieces)
    print(piece_ids)

    # And back: ids -> text.
    print(tokenizer.decode([7993, 170, 11303, 1200, 2443, 1110, 3014]))

In [None]:
def tutorial_6():
    """Demonstrate batching, padding and attention masks with a classification model.

    Prints the input ids and logits for a single sentence wrapped in a batch of
    one, then the logits for two short id sequences run separately and as one
    padded batch without a mask, and finally the logits for the same padded
    batch when an attention mask marks the padding position with 0.
    """
    checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

    def logits_of(id_rows):
        """Run the model on a nested list of token ids and return its logits."""
        return model(torch.tensor(id_rows)).logits

    sequence = "I've been waiting for a HuggingFace course my whole life."
    sentence_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sequence))
    # The extra list level adds the batch dimension.
    input_ids = torch.tensor([sentence_ids])
    print("Input IDs:", input_ids)
    print("Logits:", model(input_ids).logits)

    long_row = [200, 200, 200]
    short_row = [200, 200]
    padded_batch = [long_row, short_row + [tokenizer.pad_token_id]]

    # Each sequence alone, then both together with padding but no mask.
    for id_rows in ([long_row], [short_row], padded_batch):
        print(logits_of(id_rows))

    # Same padded batch, this time with a mask whose 0 sits on the padding position.
    mask_rows = [
        [1, 1, 1],
        [1, 1, 0],
    ]
    masked_output = model(torch.tensor(padded_batch), attention_mask=torch.tensor(mask_rows))
    print(masked_output.logits)

In [None]:
def tutorial_7():
    """Contrast calling the tokenizer directly with the manual tokenize/convert path.

    Prints the ids from the direct tokenizer call and from the manual path,
    the decoded text for each, and then the logits for a padded, truncated
    two-sentence batch.
    """
    checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    sequence = "I've been waiting for a HuggingFace course my whole life."

    # Path 1: call the tokenizer on the raw text.
    direct_ids = tokenizer(sequence)["input_ids"]
    print(direct_ids)

    # Path 2: split into tokens, then map tokens to ids.
    manual_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sequence))
    print(manual_ids)

    for id_list in (direct_ids, manual_ids):
        print(tokenizer.decode(id_list))

    sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]
    encoded_batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
    print(model(**encoded_batch).logits)

In [None]:
def tutorial_8():
    """Take one optimisation step on a toy batch, then load and tokenize GLUE MRPC.

    Steps:
      1. Tokenize two sentences, attach labels, and run a single
         forward/backward/optimizer step on ``bert-base-uncased``.
      2. Load the ``glue``/``mrpc`` dataset and print it, its first training
         example and the training split's features.
      3. Show sentence-pair tokenization and print the resulting tokens.
      4. Tokenize the dataset with ``Dataset.map`` and pad a batch of eight
         samples with ``DataCollatorWithPadding``, printing the lengths before
         padding and the tensor shapes after.

    Returns:
        None. All results are written to stdout.
    """
    checkpoint = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    sequences = [
        "I've been waiting for a HuggingFace course my whole life.",
        "This course is amazing!",
    ]
    batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

    # With labels in the batch the model output exposes a loss to backpropagate.
    batch["labels"] = torch.tensor([1, 1])

    # torch.optim.AdamW replaces the deprecated transformers.AdamW. Note that
    # the PyTorch optimizer's default weight decay is 0.01 rather than 0.
    optimizer = torch.optim.AdamW(model.parameters())
    loss = model(**batch).loss
    loss.backward()
    optimizer.step()

    raw_datasets = load_dataset("glue", "mrpc")
    print(raw_datasets)
    raw_train_dataset = raw_datasets["train"]
    print(raw_train_dataset[0])
    print(raw_train_dataset.features)

    # The tokenizer loaded above is reused; it was created from the same checkpoint.
    # Tokenizing each sentence column on its own; the results are not used
    # further and are kept only to show the call.
    tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
    tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])
    inputs = tokenizer("This is the first sentence.", "This is the second one.")
    print(inputs)

    # Printed explicitly: a bare expression inside a function displays nothing.
    print(tokenizer.convert_ids_to_tokens(inputs["input_ids"]))

    # Tokenizing the whole training split as sentence pairs in one call, padded
    # to a common length; the result is not used further.
    tokenized_dataset = tokenizer(
        raw_datasets["train"]["sentence1"],
        raw_datasets["train"]["sentence2"],
        padding=True,
        truncation=True,
    )

    def tokenize_function(example):
        """Tokenize the sentence pair(s) in ``example`` with truncation, without padding."""
        return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
    print(tokenized_datasets)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    samples = tokenized_datasets["train"][:8]
    # Drop the columns excluded before collation (the index and the raw sentences).
    samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
    print([len(x) for x in samples["input_ids"]])

    batch = data_collator(samples)
    print({k: v.shape for k, v in batch.items()})
    

In [None]:
def tutorial_9():
    """Fine-tune ``bert-base-uncased`` on GLUE MRPC with ``Trainer`` and score it.

    Tokenizes the dataset, trains with default ``TrainingArguments`` (output
    directory ``test-trainer``), predicts on the validation split, and prints
    the prediction/label shapes followed by the GLUE MRPC metrics.

    Returns:
        None. All results are written to stdout.
    """
    raw_datasets = load_dataset("glue", "mrpc")
    checkpoint = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    def tokenize_function(example):
        """Tokenize the sentence pair(s) in ``example`` with truncation, without padding."""
        return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments("test-trainer")
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    trainer.train()

    predictions = trainer.predict(tokenized_datasets["validation"])
    print(predictions.predictions.shape, predictions.label_ids.shape)

    def compute_metrics(eval_preds):
        """Return GLUE MRPC metrics for a ``(logits, labels)`` pair."""
        metric = evaluate.load("glue", "mrpc")
        logits, labels = eval_preds
        predicted_classes = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predicted_classes, references=labels)

    # `trainer.predict` returns a three-field result (predictions, label_ids,
    # metrics); passing it directly would fail when unpacked into two names,
    # so hand over exactly the two arrays `compute_metrics` expects.
    print(compute_metrics((predictions.predictions, predictions.label_ids)))

In [None]:
def tutorial_10():
    """Fine-tune ``bert-base-uncased`` on GLUE MRPC with a hand-written training loop.

    Prepares the dataset for PyTorch, builds train/validation ``DataLoader``s,
    checks one batch through the model, trains for ``num_epochs`` with AdamW
    and a linear learning-rate schedule, then evaluates on the validation split
    and prints the GLUE MRPC metrics.

    Returns:
        None. All results are written to stdout.
    """
    raw_datasets = load_dataset("glue", "mrpc")
    checkpoint = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    def tokenize_function(example):
        """Tokenize the sentence pair(s) in ``example`` with truncation, without padding."""
        return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Drop the raw sentence/index columns, rename the label column to "labels",
    # and make the dataset return torch tensors.
    tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
    tokenized_datasets.set_format("torch")
    print(tokenized_datasets["train"].column_names)

    train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator)
    eval_dataloader = DataLoader(tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator)

    # Grab a single batch to sanity-check the forward pass before training.
    batch = next(iter(train_dataloader))

    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    outputs = model(**batch)
    print(outputs.loss, outputs.logits.shape)

    # torch.optim.AdamW replaces the deprecated transformers.AdamW. Note that
    # the PyTorch optimizer's default weight decay is 0.01 rather than 0.
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

    num_epochs = 1
    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )
    print(num_training_steps)

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)
    print(device)

    progress_bar = tqdm(range(num_training_steps))

    model.train()
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            # Batches come off the loader on CPU; move them next to the model.
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
    progress_bar.close()

    metric = evaluate.load("glue", "mrpc")
    model.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

    print(metric.compute())

In [None]:
def tutorial_11():
    """Use the ``camembert-base`` checkpoint through three different entry points.

    First runs a fill-mask pipeline on a French prompt and prints the result,
    then loads the tokenizer and masked-LM model via the Camembert-specific
    classes, and finally via the ``Auto*`` classes. The loaded objects are not
    used further.
    """
    checkpoint = "camembert-base"

    camembert_fill_mask = pipeline("fill-mask", model=checkpoint)
    print(camembert_fill_mask("Le camembert est <mask> :)"))

    # Architecture-specific classes.
    tokenizer = CamembertTokenizer.from_pretrained(checkpoint)
    model = CamembertForMaskedLM.from_pretrained(checkpoint)

    # Auto classes, which replace the objects loaded just above.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForMaskedLM.from_pretrained(checkpoint)

In [None]:
def tutorial():
    """Run the tutorials in order, leaving out the two that train a model.

    ``tutorial_9`` and ``tutorial_10`` are deliberately excluded because the
    training they perform takes a lot of time; call them directly when needed.
    """
    quick_tutorials = (
        tutorial_1,
        tutorial_2,
        tutorial_3,
        tutorial_4,
        tutorial_5,
        tutorial_6,
        tutorial_7,
        tutorial_8,
        # tutorial_9 and tutorial_10 are skipped here: training takes a lot of time.
        tutorial_11,
    )
    for run_tutorial in quick_tutorials:
        run_tutorial()

#tutorial() # Run all tutorials, takes a lot of time