In [1]:
from datasets import load_dataset
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


## NLP 2  LAB 3: HuggingFace Transformers
Authors:
- Jonathan Poelger
- Ethan Machavoine
- Aurelien Rouxel

In [2]:
"""
Load the data from IMDB , creates a suitable tokenizer and tokenize the dataset.

input: 
    - String: the name of the checkpoint to use for the tokenizer
    
output:
    - Object: The tokenized dataset
    - Object: The tokenizer
    - String: The checkpoint name
"""
def load_data(checkpoint):
    raw_datasets = load_dataset("imdb")
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    tokenized_datasets = raw_datasets.map(lambda example: tokenizer.__call__(example["text"], truncation=True), batched=True)
    return tokenized_datasets, tokenizer, checkpoint

"""
Creates a model using the outputs of load_data.

input:
    - Object: A tokenized dataset
    - Object: A tokenizer
    - String: The checkpoint name
    - Int: Number of epoch to train
    
output:
    - Object: A trainer for the model
    - Object: The tokenized dataset
"""
def create_model(tokenized_datasets, tokenizer, checkpoint ,num_train_epochs=1):
    training_args = TrainingArguments("test-trainer", num_train_epochs=num_train_epochs, report_to="none")
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator= DataCollatorWithPadding(tokenizer=tokenizer),
    tokenizer=tokenizer,
    )
    return trainer, tokenized_datasets

In [3]:
"""
Fine tune the given model checkpoint with the IMDB dataset

input:
    - String: the name of the checkpoint to fine tune

output:
    - Object: The trainer for the model
    - Object: The tokenized dataset used for fine tunig
"""
def fine_tune(checkpoint):
    trainer, tokenized_datasets = create_model(*load_data(checkpoint))
    trainer.train()
    predictions = trainer.predict(tokenized_datasets["test"])
    print(predictions.predictions.shape, predictions.label_ids.shape)
    return trainer,tokenized_datasets


"""
Compute the accuracy of the given predictions

input:
    - Object: A tokenized dataset
    - Object: A trainer for the model to evaluate

output:
    - Object: The trainer
    - Object: the dataset
"""
def compute_metrics(trainer, dataset):
    predictions = trainer.predict(dataset["test"])
    predicted_labels = np.argmax(predictions.predictions, axis=1)
    accuracy = np.mean(predicted_labels == dataset["test"]["label"]) * 100
    print("Accuracy:", accuracy)
    return trainer, dataset


In [None]:
# Create a model from the distilbert-base-uncased checkpoint,
# fine-tune it on the IMDB dataset for one epoch,
# then compute the accuracy of the model on the test split.
# The returned (trainer, dataset) pair is bound to names instead of being
# printed, so the objects' reprs are not dumped into the cell output.
distilbert_trainer, distilbert_dataset = compute_metrics(*fine_tune("distilbert-base-uncased"))  # very long to run

In [None]:
# Checkpoint for the already fine-tuned model
mvw_checkpoint = "mvonwyl/distilbert-base-uncased-imdb"

# Load the fine-tuned model and compute its accuracy.
# `trainer` and `dataset` are reused by the following cell.
trainer, dataset = compute_metrics(*create_model(*load_data(mvw_checkpoint)))

In [None]:
# Show the first 3 wrong predictions as [text, predicted label, actual label]
test_split = dataset["test"]
X = test_split["text"]
actual_labels = np.asarray(test_split["label"])
predicted_labels = np.argmax(trainer.predict(test_split).predictions, axis=1)

# Vectorised comparison: positions of the misclassified reviews, in dataset order
wrong_indices = np.flatnonzero(predicted_labels != actual_labels)
# Plain ints keep the displayed labels free of NumPy scalar wrappers
wrong = [[X[int(i)], int(predicted_labels[i]), int(actual_labels[i])] for i in wrong_indices]
wrong[:3]

## Wrongly classified comments


* Predicted positive, actual negative :\
First off let me say, If you haven't enjoyed a Van Damme movie since bloodsport, you probably will not like this movie. Most of these movies may not have the best plots or best actors but I enjoy these kinds of movies for what they are. This movie is much better than any of the movies the other action guys (Segal and Dolph) have thought about putting out the past few years. Van Damme is good in the movie, the movie is only worth watching to Van Damme fans. It is not as good as Wake of Death (which i highly recommend to anyone of likes Van Damme) or In hell but, in my opinion it's worth watching. It has the same type of feel to it as Nowhere to Run. Good fun stuff!")


To be honest, as a human I thought it was a positive comment when reading it, but it's labeled negative in IMDB. The model most likely failed to classify this comment as negative because the author puts emphasis on the fact that it would be enjoyable to Van Damme fans. The model can be tricked by expressions like:
"Good fun stuff!"
"This movie is much better than any of the movies [...]"
"in my opinion it's worth watching."


* Predicted positive, actual negative :\
The only reason this movie is not given a 1 (awful) vote is that the acting of both Ida Lupino and Robert Ryan is superb. Ida Lupino who is lovely, as usual, becomes increasingly distraught as she tries various means to rid herself of a madman. Robert Ryan is terrifying as the menacing stranger whose character, guided only by his disturbed mind, changes from one minute to the next. Seemingly simple and docile, suddenly he becomes clever and threatening. Ms. Lupino's character was in more danger from that house she lived in and her own stupidity than by anyone who came along. She could not manage to get out of her of her own house: windows didn't open, both front and back doors locked and unlocked from the inside with a key. You could not have designed a worse fire-trap if you tried. She did not take the precaution of having even one extra key. Nor could she figure out how to summon help from nearby neighbors or get out of her own basement while she was locked in and out of sight of her captor. I don't know what war her husband was killed in, but if it was World War II, the furnishings in her house, the styles of the clothes, especially the children and the telephone company repairman's car are clearly anachronistic. I recommend watching this movie just to see what oddities you can find.


The author once again literally says in this comment that they recommend watching the movie. They are clearly laughing at what seems like a B-movie that two good actors save a bit, but the clear compliments like "both Ida Lupino and Robert Ryan is superb", "Ida Lupino who is lovely, as usual" and "I recommend watching this movie" may lead the model to classify it as positive.

* Predicted negative, actual positive :\
Although Bullet In The Brain is, without question, superior amongst short films, it largely seems more like a short piece of writing than a film. And it is a little hard to feel too sorry for the teacher when his smart ass remarks get him shot. But after the bullet enters his brain we begin to understand a little bit about why he became so jaded with life in the first place. There is an awful amount of detail packed into this reasonably short film and this is what makes me feel that it should have been extended a little bit - it seems like there's almost too much to take in at once as the details come flying at you so fast. A slightly more relaxed pace and a less po-faced narrator in the final section would have benefitted this film a little bit. Despite these complaints, there is no denying that Bullet In The Brain is a quite stupendous work compared to many short, and even full length films. The makers should be applauded for trying to make such a basically emotional and literate film in the current climate of quick jokes and Hollywood action.


In this comment the author tries to hint at some ways to make the movie better. They thought that it had good potential, but also that it was a bit wasted. The fact that they use constructive criticism throughout the whole comment may be why the model predicted it as negative.

## Advantages of Transformers:

1. Capturing Long-Term Dependencies: Transformers are good at capturing long-term dependencies in sequential data due to their self-attention mechanism. They can learn dependencies between distant positions in the input, which is good for tasks such as machine translation or document classification.

2. Parallelizable Computation: Transformers can process inputs in parallel, unlike recurrent models that process inputs sequentially. This parallelization makes transformers more efficient during training and inference, resulting in faster processing times.

3. Scalability to Large Datasets: Transformers can handle large datasets effectively. They can be trained on massive amounts of data, taking advantage of distributed computing and GPUs, which helps in achieving better performance on complex tasks.

4. No Sequential Constraints: Transformers do not have sequential constraints, meaning they can process the entire input sequence at once. This makes them more flexible in handling different input lengths and allows for easier implementation and optimization.

## Disadvantages of Transformers:
1. High Computational Requirements: Transformers require significant computational resources due to their large number of parameters and self-attention mechanism. Training and fine-tuning transformer models can be computationally expensive and time-consuming, especially for large-scale models.

2. Complex Architecture and Training: Transformers have a more complex architecture compared to simpler models like naive Bayes. They require careful hyperparameter tuning, large amounts of training data, and longer training times to achieve optimal performance.

3. Lack of Interpretable Features: Transformers learn complex representations of the input data, making it challenging to interpret the model's internal workings and understand the specific features it relies on for predictions. Naive Bayes, on the other hand, provides interpretable features based on conditional probabilities.

4. Data Dependency: Transformers heavily rely on large amounts of labeled data for training. They might not perform well in scenarios where limited labeled data is available, especially in niche domains lacking annotated datasets.

## Comparing with Naive Bayes:
- Advantage: Transformers can capture complex relationships and dependencies in data, whereas Naive Bayes assumes independence between features. Transformers are more suitable for tasks that require understanding the context and capturing long-range dependencies.
- Advantage: Transformers do not have the restrictive assumptions of Naive Bayes, allowing them to handle a wider range of data distributions and complex patterns.
- Disadvantage: Transformers require significantly more computational resources, training data, and training time compared to Naive Bayes, making them less suitable for simple tasks or scenarios with limited resources.

## Comparing with Recurrent Models (RNNs/LSTMs):
- Advantage: Transformers can capture long-term dependencies more effectively than recurrent models. RNNs/LSTMs suffer from vanishing or exploding gradients and struggle with long-range dependencies, while transformers use self-attention to attend to all positions in the input sequence.
- Advantage: Transformers have parallelizable computation, making them more efficient for processing long sequences and larger datasets. RNNs/LSTMs process inputs sequentially, which can be a bottleneck for training and inference speed.
- Disadvantage: Transformers generally require more computational resources and training time compared to recurrent models. Transformers have a higher model complexity and may require larger amounts of data for training.
- Disadvantage: Transformers lack the built-in sequential structure of recurrent models; that sequential nature can make recurrent models better suited for tasks that involve sequential processing or modeling temporal dependencies, such as time series forecasting or text generation.

The choice between using a transformer, naive Bayes, or a recurrent model depends on the specific task, available resources, dataset size, and the nature of the data. It's important to consider these factors when selecting the appropriate model for production use.