In [1]:
!pip install transformers
!pip install datasets
!pip install sacremoses
!pip install accelerate -U

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [1]:
from datasets import load_dataset
from transformers import FlaubertTokenizer, FlaubertForSequenceClassification
from transformers import TrainingArguments, Trainer
import torch

In [2]:
# Load a subset of the IMDB dataset
dataset = load_dataset('imdb', split='train[:1000]')

# Load the FlauBERT tokenizer and model
tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased')
model = FlaubertForSequenceClassification.from_pretrained('flaubert/flaubert_base_cased', num_labels=2)

Some weights of the model checkpoint at flaubert/flaubert_base_cased were not used when initializing FlaubertForSequenceClassification: ['pred_layer.proj.weight', 'pred_layer.proj.bias']
- This IS expected if you are initializing FlaubertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FlaubertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

In [3]:
# Preprocess the data
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, max_length=128, padding='max_length')

# Map the dataset with the preprocessing function
encoded_dataset = dataset.map(preprocess_function, batched=True)

# Define the labels
def compute_labels(examples):
    labels = examples['label']
    if isinstance(labels, list):  # handling both single and batched data
        return {'labels': [0 if label == 'neg' else 1 for label in labels]}
    else:
        return {'labels': [0 if labels == 'neg' else 1]}

# Map the dataset with the compute labels function
encoded_dataset = encoded_dataset.map(compute_labels)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [4]:
encoded_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

In [5]:
# Define the training arguments
training_args = TrainingArguments(
    "test-trainer",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=5,
    disable_tqdm=True
)

In [6]:
# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset,
    eval_dataset=encoded_dataset,
)

# Train the model
trainer.train()



{'eval_loss': 2.2649762065896084e-09, 'eval_runtime': 1.9957, 'eval_samples_per_second': 501.08, 'eval_steps_per_second': 62.635, 'epoch': 1.0}
{'eval_loss': 1.4305113760215704e-09, 'eval_runtime': 1.9858, 'eval_samples_per_second': 503.578, 'eval_steps_per_second': 62.947, 'epoch': 2.0}
{'eval_loss': 9.53674250681047e-10, 'eval_runtime': 1.9545, 'eval_samples_per_second': 511.652, 'eval_steps_per_second': 63.957, 'epoch': 3.0}
{'eval_loss': 5.960463789200787e-10, 'eval_runtime': 1.9971, 'eval_samples_per_second': 500.738, 'eval_steps_per_second': 62.592, 'epoch': 4.0}
{'eval_loss': 5.960463789200787e-10, 'eval_runtime': 1.9865, 'eval_samples_per_second': 503.407, 'eval_steps_per_second': 62.926, 'epoch': 5.0}
{'train_runtime': 41.7744, 'train_samples_per_second': 119.691, 'train_steps_per_second': 7.541, 'train_loss': 0.0031767073131742933, 'epoch': 5.0}


TrainOutput(global_step=315, training_loss=0.0031767073131742933, metrics={'train_runtime': 41.7744, 'train_samples_per_second': 119.691, 'train_steps_per_second': 7.541, 'train_loss': 0.0031767073131742933, 'epoch': 5.0})

In [7]:
import torch

# Let's say you have the following sentences:
sentences = ["I love this movie!"]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# You would preprocess these sentences using the same steps as during training:
inputs = tokenizer(sentences, truncation=True, padding=True, return_tensors="pt")

# Move the inputs tensors to the same device as the model
inputs = {key: value.to(device) for key, value in inputs.items()}

# You can then feed these inputs to the model:
outputs = model(**inputs)

# The model returns the logits, which are the raw output values from the last layer of the model
logits = outputs.logits

# To get the predicted class, you can apply a softmax to the logits and take the argmax:
probs = torch.nn.functional.softmax(logits, dim=-1)
predictions = torch.argmax(probs, dim=-1)

In [8]:
predictions

tensor([1], device='cuda:0')