In [9]:
%cd /workspace
from glob import glob

import numpy as np
import torch
from datasets import load_metric
from transformers import (AutoModelForSequenceClassification, 
                          AutoTokenizer, DataCollatorWithPadding, 
                          TrainingArguments, Trainer)

from baseline.evaluate import calculate_score
from baseline.tokenizer import tokenize
from baseline.limesoda import LimeSodaDataset, read_limesoda

# Root directory of the LimeSoda dataset; the training cell builds its split
# paths relative to this location.
LIMESODA_DIR = "/workspace/dataset/LimeSoda"
# NOTE(review): not referenced in the cells visible here — confirm it is still needed.
DELIMITER = ""
# Passed to TrainingArguments as per_device_train_batch_size.
BATCH_SIZE = 24
# Passed to TrainingArguments as num_train_epochs.
EPOCHS = 200
# Passed to TrainingArguments as learning_rate.
LEARNING_RATE = 2e-6
# NOTE(review): presumably the fraction of the training split to use; it is not
# referenced in the cells visible here — confirm against the rest of the notebook.
TRAIN_PERCENTAGE = 1.

/workspace


In [10]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation/prediction pass.

    Args:
        eval_pred: A two-item iterable ``(logits, labels)``. ``logits`` is an
            array whose last axis holds per-class scores; ``labels`` holds the
            reference class ids, one per row of ``logits``.

    Returns:
        dict: ``{"accuracy": <float>}`` — the fraction of rows whose arg-max
        class equals the reference label.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Accuracy is computed directly with numpy instead of through a
    # module-level `metric` object, so this function works on a fresh kernel
    # regardless of which other cells have been executed.
    matches = predictions == np.asarray(labels)
    return {"accuracy": float(np.mean(matches))}

In [12]:
# Output directory handed to TrainingArguments; the final model is saved in
# its "final" sub-directory.
SAVE_DIR = "./results/raw-simcse/200epochs-100percent/"

# Candidate checkpoints to fine-tune, in a deterministic (sorted) order.
models = sorted(glob("weights/raw/*"))

# The accuracy metric does not depend on the checkpoint, so load it once
# instead of on every loop iteration.
metric = load_metric("accuracy")

for model_name in models:
    print(">"*10)
    print(model_name.split("/")[-1])
    print(">"*10)

    # Tokenizer and the three data splits are built per checkpoint because the
    # datasets are constructed with that checkpoint's tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    train_dataset = LimeSodaDataset(f"{LIMESODA_DIR}/../tempLimesoda/train_v1.jsonl", tokenizer)
    val_dataset = LimeSodaDataset(f"{LIMESODA_DIR}/../tempLimesoda/val_v1.jsonl", tokenizer)
    test_dataset = LimeSodaDataset(f"{LIMESODA_DIR}/../tempLimesoda/test_v1.jsonl", tokenizer)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    if torch.cuda.is_available():
        model = model.cuda()

    training_args = TrainingArguments(
        output_dir=SAVE_DIR,
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=16,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        save_steps=5000,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()
    model.save_pretrained(SAVE_DIR + "final")

    # evaluate
    # Reference labels come from the prediction output itself rather than from
    # a separate `dataset` object, which is never defined in this notebook.
    # NOTE(review): assumes LimeSodaDataset items carry labels, so that
    # `label_ids` is populated — confirm against baseline.limesoda.
    val_output = trainer.predict(val_dataset)
    val_result = calculate_score(val_output.label_ids, val_output.predictions.argmax(-1))
    # Drop the "prediction" entry so only the remaining entries are printed.
    val_result.pop("prediction")

    test_output = trainer.predict(test_dataset)
    test_result = calculate_score(test_output.label_ids, test_output.predictions.argmax(-1))
    test_result.pop("prediction")

    print("VALIDATION RESULT")
    print(val_result)

    print("TEST RESULT")
    print(test_result)
    # NOTE(review): this stops after the first checkpoint. SAVE_DIR is shared
    # by every checkpoint, so removing the break also requires a per-model
    # output directory to avoid overwriting results.
    break

Didn't find file weights/raw/wangchanberta-simcse-raw-bs16-epoch1-lr0.0001/added_tokens.json. We won't load it.
loading file weights/raw/wangchanberta-simcse-raw-bs16-epoch1-lr0.0001/sentencepiece.bpe.model
loading file weights/raw/wangchanberta-simcse-raw-bs16-epoch1-lr0.0001/tokenizer.json
loading file None
loading file weights/raw/wangchanberta-simcse-raw-bs16-epoch1-lr0.0001/special_tokens_map.json
loading file weights/raw/wangchanberta-simcse-raw-bs16-epoch1-lr0.0001/tokenizer_config.json


>>>>>>>>>>
wangchanberta-simcse-raw-bs16-epoch1-lr0.0001
>>>>>>>>>>
Loading data...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2698/2698 [00:00<00:00, 5372.75it/s]


Loading data...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:00<00:00, 5691.18it/s]


Loading data...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2765/2765 [00:00<00:00, 9293.96it/s]
loading configuration file weights/raw/wangchanberta-simcse-raw-bs16-epoch1-lr0.0001/config.json
Model config CamembertConfig {
  "_name_or_path": "weights/raw/wangchanberta-simcse-raw-bs16-epoch1-lr0.0001",
  "architectures": [
    "CamembertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "a

Step,Training Loss



KeyboardInterrupt

