### DeBERTa Finetuning



#### Mount Google Drive, set up the data folder, and install libraries

In [None]:
# Mount Google Drive inside the Colab runtime so files on it are reachable
# from this notebook.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
# Install the libraries used by this notebook, upgrading any that are already
# present (`-U`).
# NOTE(review): no versions are pinned, so the installed releases depend on
# when this cell is run — consider pinning for reproducible results.
!pip install transformers[torch] datasets scikit-learn pandas accelerate -U

In [None]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

#### chABSA-dataset を使用

In [None]:
import numpy as np
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split, ParameterGrid
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# `load_metric` is not available in recent `datasets` releases. Importing it
# unconditionally would abort this cell with ImportError before any data is
# loaded, so keep the name only when the installed version still provides it.
try:
    from datasets import load_metric
except ImportError:
    load_metric = None

# Load the CSV file; its `text` and `label` columns are used below.
data = pd.read_csv('./chABSA_posneg.csv')

# Split the rows into a training set (80%) and a test set (20%).
# The fixed random_state keeps the split identical between runs.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['text'].tolist(), data['label'].tolist(), test_size=0.2, random_state=42
)

# Rebuild one DataFrame per split.
train_data = pd.DataFrame({'text': train_texts, 'label': train_labels})
test_data = pd.DataFrame({'text': test_texts, 'label': test_labels})

# Convert to Hugging Face `datasets` objects.
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

In [None]:
# Show the training Dataset as this cell's output for a quick inspection.
train_dataset

In [None]:
# Load the tokenizer of the pretrained Japanese DeBERTa-v2 checkpoint.
model_name = "izumi-lab/deberta-v2-base-japanese"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Build the sequence classifier with two output labels and move it to the
# selected device (GPU when available).
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)
model.to(device)

# Preprocessing: tokenize the raw text.
def preprocess_function(examples):
    """Tokenize the `text` field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        examples: A batch with a `text` entry, as passed by `Dataset.map`.

    Returns:
        The tokenizer output for the batch.
    """
    encoded = tokenizer(
        examples['text'],
        truncation=True,
        max_length=128,
        padding='max_length',
    )
    return encoded

# Tokenize both splits in batches, then rename the `label` column to `labels`.
train_dataset = train_dataset.map(preprocess_function, batched=True).rename_column("label", "labels")
test_dataset = test_dataset.map(preprocess_function, batched=True).rename_column("label", "labels")

# Dataset formatting: expose the listed columns in torch format.
_model_columns = ['input_ids', 'attention_mask', 'labels']
for _split in (train_dataset, test_dataset):
    _split.set_format(type='torch', columns=_model_columns)

In [None]:
# Evaluation metric: macro-averaged F1, computed locally with NumPy.
# The previous implementation relied on `datasets.load_metric`, which is not
# available in recent `datasets` releases, so the score is computed here
# without any external metric loader.
def _macro_f1(labels, preds):
    """Return the unweighted mean of the per-class F1 scores.

    The classes are the values that occur in `labels` or `preds`.

    Args:
        labels: Reference class ids.
        preds: Predicted class ids, same length as `labels`.

    Returns:
        The macro-averaged F1 as a float; 0.0 when there are no samples.
    """
    labels = np.asarray(labels).ravel()
    preds = np.asarray(preds).ravel()
    classes = np.union1d(labels, preds)
    if classes.size == 0:
        return 0.0

    scores = []
    for cls in classes:
        predicted_cls = preds == cls
        actual_cls = labels == cls
        true_pos = np.count_nonzero(predicted_cls & actual_cls)
        # F1 = 2*TP / (2*TP + FP + FN), and 2*TP + FP + FN equals
        # (#predicted as cls) + (#actually cls).
        denom = np.count_nonzero(predicted_cls) + np.count_nonzero(actual_cls)
        scores.append(2.0 * true_pos / denom if denom else 0.0)
    return float(np.mean(scores))


def compute_metrics(pred):
    """Compute the macro F1 of a prediction object.

    Args:
        pred: Object with `predictions` (per-class scores, one row per sample,
            or a tuple whose first element holds them) and `label_ids`
            (reference class ids).

    Returns:
        A dict with the single key `f1`.
    """
    logits = pred.predictions
    if isinstance(logits, tuple):
        # Some models return extra outputs; the scores come first.
        logits = logits[0]
    preds = np.argmax(logits, axis=1)
    return {"f1": _macro_f1(pred.label_ids, preds)}

# Search space for the hyper-parameter grid search.
param_grid = dict(
    learning_rate=[3e-5, 2e-5, 5e-5],
    per_device_train_batch_size=[8, 16],
    num_train_epochs=[2, 3],
)

# Best F1 seen so far and the parameter combination that produced it.
best_f1, best_params = 0, {}

In [None]:
# Grid search over `param_grid`, followed by a final training run with the
# best hyper-parameters.
#
# Fixes compared with the previous version of this cell:
#   * Every run starts from a freshly loaded pretrained checkpoint. Before,
#     one model object was shared by all Trainers, so each trial continued
#     from the weights left by the previous trial and the scores were not
#     comparable.
#   * Hyper-parameters are chosen on a validation split held out from the
#     training data; the test set is only used to score the final model.
#   * Search trials do not write checkpoints (they all shared one output
#     directory and were never read back).
import inspect


def _fresh_model():
    """Load a new, not yet fine-tuned classifier from the pretrained checkpoint."""
    return AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


# The evaluation-schedule argument is named `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones; use whichever
# name the installed version accepts.
_EVAL_STRATEGY_KEY = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)


def _build_training_args(params, output_dir, save_strategy):
    """Create TrainingArguments for one hyper-parameter combination.

    Args:
        params: Dict with `learning_rate`, `per_device_train_batch_size` and
            `num_train_epochs`.
        output_dir: Directory the Trainer writes its outputs to.
        save_strategy: Checkpoint schedule ("no" or "epoch").
    """
    return TrainingArguments(
        output_dir=output_dir,
        learning_rate=params['learning_rate'],
        per_device_train_batch_size=params['per_device_train_batch_size'],
        per_device_eval_batch_size=params['per_device_train_batch_size'],
        num_train_epochs=params['num_train_epochs'],
        weight_decay=0.01,
        logging_dir='./DeBERTa/logs_f1',
        logging_steps=10,
        save_strategy=save_strategy,
        **{_EVAL_STRATEGY_KEY: "epoch"},
    )


# Hold out 10% of the training data for model selection (fixed seed so the
# split is the same on every run).
_search_split = train_dataset.train_test_split(test_size=0.1, seed=42)
_search_train = _search_split["train"]
_search_val = _search_split["test"]

# Start from a clean slate so re-running this cell does not reuse stale results.
best_f1 = float("-inf")
best_params = None

# Run the grid search.
for params in ParameterGrid(param_grid):
    training_args = _build_training_args(
        params, "./DeBERTa/results_f1", save_strategy="no"
    )

    # `model_init` makes the Trainer build a new model for this trial instead
    # of reusing weights that were already fine-tuned by an earlier trial.
    trainer = Trainer(
        model_init=_fresh_model,
        args=training_args,
        train_dataset=_search_train,
        eval_dataset=_search_val,
        compute_metrics=compute_metrics,
    )

    # Train, then score on the validation split.
    trainer.train()
    results = trainer.evaluate()
    print(f"Params: {params}, Results: {results}")

    # Keep the best-scoring combination.
    if results['eval_f1'] > best_f1:
        best_f1 = results['eval_f1']
        best_params = params

if best_params is None:
    raise RuntimeError("Grid search did not produce a usable F1 score.")

# Retrain with the best parameters on the full training set, again starting
# from the pretrained checkpoint.
training_args = _build_training_args(
    best_params, "./DeBERTa/best_model_f1", save_strategy="epoch"
)

trainer = Trainer(
    model_init=_fresh_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Run the final training and score the result on the untouched test set.
trainer.train()
final_results = trainer.evaluate()

# Save the model that was actually trained in the final run, plus the tokenizer.
model = trainer.model
model.save_pretrained("./DeBERTa/best_model_f1")
tokenizer.save_pretrained("./DeBERTa/best_model_f1")

print(f"Best Params: {best_params}, Best F1: {best_f1}")
print(f"Final test results: {final_results}")