In [1]:
pip install transformers datasets torch evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Store the W&B API key for the login cell WITHOUT embedding it in the notebook.
# SECURITY: any key that was ever hardcoded in a shared/committed notebook must be
# treated as leaked -- rotate it in the W&B account settings.
import json
import os
from getpass import getpass
from pathlib import Path

# Prefer the WANDB_API_KEY environment variable (works for non-interactive runs);
# otherwise prompt. getpass does not echo the value, so it never lands in the
# saved cell source or output.
_wandb_key = os.environ.get("WANDB_API_KEY") or getpass("W&B API key: ")
if not _wandb_key:
    raise ValueError("No W&B API key provided (set WANDB_API_KEY or enter it at the prompt).")

# Same location the shell version used: ~/.kaggle/secrets.json
_secrets_path = Path.home() / ".kaggle" / "secrets.json"
_secrets_path.parent.mkdir(parents=True, exist_ok=True)  # equivalent of `mkdir -p ~/.kaggle`

# Create the file with owner-only permissions from the start, instead of writing
# it with default permissions and tightening them afterwards.
_fd = os.open(_secrets_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(_fd, "w") as _fh:
    json.dump({"wandb_api_key": _wandb_key}, _fh)
_secrets_path.chmod(0o600)  # the mode passed to os.open is ignored for a pre-existing file

# Do not keep the raw key around in the kernel namespace.
del _wandb_key


In [3]:
import json
from pathlib import Path

import wandb

# Load the W&B API key from the secrets file stored under ~/.kaggle.
# The path is resolved from the current user's home directory so it matches the
# `~/.kaggle` location the file is written to, instead of assuming the kernel
# runs as root.
secrets_path = Path.home() / ".kaggle" / "secrets.json"
with secrets_path.open("r") as f:
    secrets = json.load(f)
wandb_api_key = secrets["wandb_api_key"]

# Log in to W&B (the return value is displayed as the cell output)
wandb.login(key=wandb_api_key)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mtruongminhphuc08102005[0m ([33mtruongminhphuc08102005-hanoi-university-of-science-and-t[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
# Import necessary libraries
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import evaluate

# 1. Pull the MultiNLI dataset (every split) from the Hugging Face Hub
dataset = load_dataset(path="nyu-mll/multi_nli")

# 2. Instantiate the tokenizer belonging to the bert-base-uncased checkpoint
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)

# 3. Tokenization helper
# Padding is deliberately left off here; batches are padded later by the data collator.
def tokenize_function(examples):
    """Encode premise/hypothesis pairs with the module-level `tokenizer`.

    Args:
        examples: Mapping that provides the 'premise' and 'hypothesis' entries
            to encode together as sentence pairs.

    Returns:
        The tokenizer's output for the pairs, truncated to at most 512 tokens
        and left unpadded.
    """
    premises = examples['premise']
    hypotheses = examples['hypothesis']
    encode_options = {
        'truncation': True,
        'max_length': 512,   # BERT's max length
        'padding': False,    # dynamic padding is done by DataCollatorWithPadding
    }
    return tokenizer(premises, hypotheses, **encode_options)


README.md:   0%|          | 0.00/8.89k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

(…)alidation_matched-00000-of-00001.parquet:   0%|          | 0.00/4.94M [00:00<?, ?B/s]

(…)dation_mismatched-00000-of-00001.parquet:   0%|          | 0.00/5.10M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
# Columns that are not needed once tokenization is done; after removal only
# 'input_ids', 'attention_mask', 'token_type_ids', and 'labels' remain.
columns_to_remove = [
    'promptID',
    'pairID',
    'genre',
    'premise',
    'hypothesis',
    'premise_binary_parse',
    'premise_parse',
    'hypothesis_binary_parse',
    'hypothesis_parse',
]

# 4-6. Tokenize every split in batches, rename 'label' to 'labels' (the column
# name the Trainer expects), then drop the leftover raw columns.
tokenized_datasets = (
    dataset
    .map(tokenize_function, batched=True)
    .rename_column('label', 'labels')
    .remove_columns(columns_to_remove)
)

# 7. Pick the splits used for training and for evaluation
train_dataset, eval_dataset = (
    tokenized_datasets[split_name]
    for split_name in ('train', 'validation_matched')
)

Map:   0%|          | 0/392702 [00:00<?, ? examples/s]

Map:   0%|          | 0/9815 [00:00<?, ? examples/s]

Map:   0%|          | 0/9832 [00:00<?, ? examples/s]

In [6]:


# 8. Build a BERT sequence classifier from the pretrained checkpoint, with a
# freshly initialized 3-way classification head
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
    num_labels=3,
)

# 9. Define training arguments
training_args = TrainingArguments(
    output_dir='./results',                  # Directory to save model outputs / checkpoints
    run_name='bert-base-uncased-mnli',       # Explicit experiment-tracker run name; without it the
                                             # run is named after output_dir ("./results")
    num_train_epochs=3,                      # Number of training epochs
    per_device_train_batch_size=16,          # Batch size for training
    per_device_eval_batch_size=16,           # Batch size for evaluation
    warmup_steps=500,                        # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,                       # Weight decay for regularization
    logging_dir='./logs',                    # Directory for training logs
    logging_steps=10,                        # Log every 10 steps
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` -- confirm against the installed version before upgrading.
    evaluation_strategy='epoch',             # Evaluate at the end of each epoch
    save_strategy='epoch',                   # Save a checkpoint at the end of each epoch
    load_best_model_at_end=True,             # Reload the best checkpoint when training finishes
    # "Best" is made explicit here. These are the library defaults, so checkpoint
    # selection is unchanged: the epoch with the lowest validation loss wins, which
    # is not necessarily the epoch with the highest accuracy.
    metric_for_best_model='loss',
    greater_is_better=False,
)

# 10. Accuracy scorer provided by the `evaluate` library
accuracy_metric = evaluate.load("accuracy")

# 11. Metric callback handed to the Trainer
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Object exposing `predictions` (the model logits) and
            `label_ids` (the reference labels).

    Returns:
        The result of `accuracy_metric.compute` for the arg-max class of each
        row of logits against the reference labels.
    """
    predicted_classes = eval_pred.predictions.argmax(axis=-1)
    return accuracy_metric.compute(
        predictions=predicted_classes,
        references=eval_pred.label_ids,
    )



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [7]:
# 12. Assemble the Trainer
from transformers import DataCollatorWithPadding

# Gather the constructor arguments in one place; the collator pads each batch
# dynamically, since the tokenized examples were left unpadded.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)
trainer = Trainer(**trainer_kwargs)

# 13. Run fine-tuning (the returned training summary is shown as the cell output)
trainer.train()

[34m[1mwandb[0m: Tracking run with wandb version 0.19.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250320_082934-cgqemrwc[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33m./results[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/truongminhphuc08102005-hanoi-university-of-science-and-t/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/truongminhphuc08102005-hanoi-university-of-science-and-t/huggingface/runs/cgqemrwc[0m


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4941,0.442611,0.825267
2,0.3838,0.437519,0.83892
3,0.2142,0.552263,0.840041




TrainOutput(global_step=36816, training_loss=0.345966265795478, metrics={'train_runtime': 14427.0772, 'train_samples_per_second': 81.659, 'train_steps_per_second': 2.552, 'total_flos': 5.654354625166968e+16, 'train_loss': 0.345966265795478, 'epoch': 3.0})

In [8]:
# 14. Score the trainer's current model on its configured evaluation dataset.
# Metric keys in the returned dict carry the "eval" prefix.
eval_results = trainer.evaluate(metric_key_prefix="eval")
print("Evaluation results:", eval_results)



Evaluation results: {'eval_loss': 0.43751901388168335, 'eval_accuracy': 0.838920020376974, 'eval_runtime': 38.981, 'eval_samples_per_second': 251.789, 'eval_steps_per_second': 7.876, 'epoch': 3.0}


In [9]:
# Destination folder in the Kaggle working directory for the fine-tuned artifacts
save_directory = "/kaggle/working/BERT-Sentencepairclassi"

# Write the fine-tuned model first, then the tokenizer, into the same folder
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")

Model and tokenizer saved to /kaggle/working/BERT-Sentencepairclassi
