In [1]:
#imports
from datasets import load_dataset, load_metric
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from thai2transformers.preprocess import process_transformers
from thai2transformers.metrics import classification_metrics

In [2]:
#parameters
class Args:
    model_name_or_path = "xlm-roberta-base"
    dataset_name_or_path = 'wisesight_sentiment'
    feature_col = 'texts'
    label_col = 'category'
    output_dir = 'wisesight_sentiment_xlm-roberta-base'
    batch_size = 8
    warmup_percent = 0.1
    learning_rate = 1e-5
    num_train_epochs = 5
    weight_decay = 0.01
    metric_for_best_model = 'f1_micro'
    seed = 1412

args = Args()

In [3]:
#load dataset
dataset = load_dataset(args.dataset_name_or_path)
dataset = dataset.map(lambda examples: {'labels': examples[args.label_col]}, batched=True)
num_labels = len(set(dataset['train']['labels']))
dataset

Reusing dataset wisesight_sentiment (/home/admin/.cache/huggingface/datasets/wisesight_sentiment/wisesight_sentiment/1.0.0/4bb1772cff1a0703d72fb9e84dff9348e80f6cdf80b0f6c0f59bcd85fc5a3537)
Loading cached processed dataset at /home/admin/.cache/huggingface/datasets/wisesight_sentiment/wisesight_sentiment/1.0.0/4bb1772cff1a0703d72fb9e84dff9348e80f6cdf80b0f6c0f59bcd85fc5a3537/cache-b1e61619243fa239.arrow
Loading cached processed dataset at /home/admin/.cache/huggingface/datasets/wisesight_sentiment/wisesight_sentiment/1.0.0/4bb1772cff1a0703d72fb9e84dff9348e80f6cdf80b0f6c0f59bcd85fc5a3537/cache-4bb6c5a17b7550bd.arrow
Loading cached processed dataset at /home/admin/.cache/huggingface/datasets/wisesight_sentiment/wisesight_sentiment/1.0.0/4bb1772cff1a0703d72fb9e84dff9348e80f6cdf80b0f6c0f59bcd85fc5a3537/cache-3c55e248148458c8.arrow


DatasetDict({
    train: Dataset({
        features: ['category', 'labels', 'texts'],
        num_rows: 21628
    })
    validation: Dataset({
        features: ['category', 'labels', 'texts'],
        num_rows: 2404
    })
    test: Dataset({
        features: ['category', 'labels', 'texts'],
        num_rows: 2671
    })
})

In [4]:
# #clean dataset
# def clean_function(examples):
#     examples[args.feature_col] = process_transformers(examples[args.feature_col])
#     return examples

# cleaned_dataset = dataset.map(clean_function)
cleaned_dataset = dataset

In [5]:
#encode dataset
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
def encode_function(examples):
    return tokenizer(examples[args.feature_col], truncation=True)
encoded_dataset = dataset.map(encode_function, batched=True)

Loading cached processed dataset at /home/admin/.cache/huggingface/datasets/wisesight_sentiment/wisesight_sentiment/1.0.0/4bb1772cff1a0703d72fb9e84dff9348e80f6cdf80b0f6c0f59bcd85fc5a3537/cache-b34c3a25829a1402.arrow
Loading cached processed dataset at /home/admin/.cache/huggingface/datasets/wisesight_sentiment/wisesight_sentiment/1.0.0/4bb1772cff1a0703d72fb9e84dff9348e80f6cdf80b0f6c0f59bcd85fc5a3537/cache-c53abb5eeacad2b1.arrow
Loading cached processed dataset at /home/admin/.cache/huggingface/datasets/wisesight_sentiment/wisesight_sentiment/1.0.0/4bb1772cff1a0703d72fb9e84dff9348e80f6cdf80b0f6c0f59bcd85fc5a3537/cache-c286deae827fdba7.arrow


In [6]:
#create model
model = AutoModelForSequenceClassification.from_pretrained(args.model_name_or_path, num_labels=num_labels)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

In [7]:
train_args = TrainingArguments(
    output_dir = args.output_dir,
    evaluation_strategy = "epoch",
    learning_rate=args.learning_rate,
    per_device_train_batch_size=args.batch_size,
    per_device_eval_batch_size=args.batch_size,
    num_train_epochs=args.num_train_epochs,
    warmup_steps = int(len(encoded_dataset['train']) * args.num_train_epochs // args.batch_size * args.warmup_percent),
    weight_decay=args.weight_decay,
    load_best_model_at_end=True,
    save_total_limit=1,
    metric_for_best_model=args.metric_for_best_model,
    seed = args.seed
)

#freezing is a bad idea
# for param in model.base_model.parameters():
#     param.requires_grad = False

In [8]:
trainer = Trainer(
    model,
    train_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['validation'],
    tokenizer=tokenizer,
    compute_metrics=classification_metrics
)

In [9]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mcstorm125[0m (use `wandb login --relogin` to force relogin)


wisesight_sentiment_xlm-roberta-base


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,Precision Micro,Recall Micro,F1 Macro,Precision Macro,Recall Macro,Nb Samples
1,0.755465,0.714553,0.709651,0.709651,0.709651,0.709651,0.509633,0.76642,0.49465,2404
2,0.669246,0.680504,0.72213,0.72213,0.72213,0.72213,0.611838,0.618685,0.626453,2404
3,0.567587,0.690913,0.745424,0.745424,0.745424,0.745424,0.633034,0.659459,0.657481,2404
4,0.516233,0.729669,0.743344,0.743344,0.743344,0.743344,0.634781,0.639645,0.633707,2404
5,0.45249,0.763208,0.740433,0.740433,0.740433,0.740433,0.629061,0.635529,0.629316,2404


TrainOutput(global_step=13520, training_loss=0.6317224798117869)

In [10]:
#test
import pandas as pd
preds  = trainer.predict(encoded_dataset['test'])
pd.DataFrame.from_dict(preds[2],orient='index').transpose()

Unnamed: 0,eval_loss,eval_accuracy,eval_f1_micro,eval_precision_micro,eval_recall_micro,eval_f1_macro,eval_precision_macro,eval_recall_macro,eval_nb_samples
0,0.717752,0.730064,0.730064,0.730064,0.730064,0.624195,0.64073,0.65116,2671.0


In [None]:
from datasets import load_dataset
d = load_dataset('prachathai67k')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1499.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042.0, style=ProgressStyle(description…


Downloading and preparing dataset prachathai67k/prachathai67k (download: 242.46 MiB, generated: 1.01 GiB, post-processed: Unknown size, total: 1.25 GiB) to /home/admin/.cache/huggingface/datasets/prachathai67k/prachathai67k/1.1.0/2eeb3bfaf307043e606a58f1f2af8b3d6bbf8a2d0b957d7bfafaf1dc1ef4b5ac...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=254240975.0, style=ProgressStyle(descri…