In [3]:
import pandas

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
TEST_SIZE = 0.2
objectiv_col = "isFake"
#トレーニングデータと検証用のテストデータに分割
def split_data_frame(df:pandas.DataFrame, objectiv_col:str):
  df_train, df_val =train_test_split(df, test_size=TEST_SIZE, random_state=42)
  train_y = df_train[objectiv_col]
  train_x = df_train.drop(objectiv_col, axis=1)

  val_y = df_val[objectiv_col]
  val_x = df_val.drop(objectiv_col, axis=1)
  return [train_x, train_y, val_x, val_y]

def clean_text(text):
  return text.replace(' ', '').replace('　', '').replace('__BR__', '\n').replace('\xa0', '').replace('\r', '').lstrip('\n')

df_hate_train = pandas.read_csv("hate/train.csv", index_col=0)
df_hate_train.drop("source", axis=1)
df_hate_test = pandas.read_csv("hate/test.csv", index_col=0)
df_hate_test.drop("source", axis=1)
df_hate_train['text'] = df_hate_train['text'].apply(clean_text)
df_hate_test['text'] = df_hate_test['text'].apply(clean_text)

train_x, train_y, val_x, val_y = split_data_frame(df_hate_train, "label")

df_tweet = pandas.read_csv("hate/tweet.csv", index_col=0)
df_tweet['text'] = df_tweet['text'].apply(clean_text)

## luke
#難点；計算にGPU必要
import torch
from torch.utils.data import Dataset
from torchinfo import summary
from transformers import (
    AutoTokenizer, Trainer, TrainingArguments,
    LukeTokenizer, LukeForSequenceClassification,
    pipeline
)
from datasets import load_metric
MODEL_NAME = "studio-ousia/luke-japanese-base-lite"
model = LukeForSequenceClassification.from_pretrained(MODEL_NAME)


tokenizer = LukeTokenizer.from_pretrained(MODEL_NAME)

print("tokenize train")
train_vectors = tokenizer( train_x['text'].tolist(), return_tensors='pt', padding="max_length", truncation=True)
print("tokenize val")
val_vectors   = tokenizer(   val_x['text'].tolist(), return_tensors='pt', padding="max_length", truncation=True)
print("tokenize tweet")
tweet_vectors = tokenizer(df_tweet['text'].tolist(), return_tensors='pt', padding="max_length", truncation=True)

class MyDataset(Dataset):
    """トークン入力データセット"""
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, index):
        input = {key: torch.tensor(val[index]) for key, val in self.encodings.items()}
        if self.labels is not None:
            input["labels"] = torch.tensor(self.labels[index])

        return input

#train/valid/testのデータセットをそれぞれ作成する 
train_ds = MyDataset(train_vectors, train_y.tolist())
val_ds   = MyDataset(val_vectors, val_y.tolist())
tweet_ds = MyDataset(tweet_vectors)

metric_name = "roc_auc"
metric = load_metric(metric_name, trust_remote_code=True)
import numpy

def compute_metrics(pred):

    predictions, labels = pred
    predictions = numpy.argmax(predictions, axis=1)

    # 'micro', 'macro', etc. are for multi-label classification. If you are running a binary classification, leave it as default or specify "binary" for average
    roc_auc = metric.compute(prediction_scores=predictions, references=labels, average="binary")
    return {
        "accuracy": accuracy_score(labels, predictions),
        "roc_auc": roc_auc["roc_auc"]
    }

train_args = TrainingArguments(
    output_dir='luke_log',
    overwrite_output_dir        = False, #logを上書きするか
    load_best_model_at_end      = True, #EarlyStoppingを使用するならTrue
    metric_for_best_model       = metric_name, #EarlyStoppingの判断基準。7-1. compute_metricsのものを指定
    save_total_limit            = 1, #output_dirに残すチェックポイントの数
    save_strategy               = "steps", #いつ保存するか？
    evaluation_strategy         = "steps", #いつ評価するか？
    logging_strategy            = "steps", #いつLOGに残すか？
    label_names                 = ['labels'], #分類ラベルのkey名称(デフォルトはlabelsなので注意)
    lr_scheduler_type           = "linear", #学習率の減衰設定(デフォルトlinearなので設定不要)
    learning_rate               = 5e-5, #学習率(デフォルトは5e-5)
    num_train_epochs            = 2, #epoch数
    per_device_train_batch_size = 16, #学習のバッチサイズ
    per_device_eval_batch_size  = 12, #バリデーション/テストのバッチサイズ
    seed                        = 42, #seed
)
trainer = Trainer(
    model=model, #モデル
    args=train_args, #TrainingArguments
    tokenizer=tokenizer, #tokenizer
    train_dataset=train_ds, #学習データセット
    eval_dataset=val_ds, #validデータセット
    compute_metrics = compute_metrics, #compute_metrics
)

trainer.train()

#trainer.predictで評価可能
print("predict")
print("正解率:" + str(accuracy_score(df_tweet["label"], trainer.predict(tweet_ds))))

Some weights of LukeForSequenceClassification were not initialized from the model checkpoint at studio-ousia/luke-japanese-base-lite and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenize train
tokenize val
tokenize tweet


  input = {key: torch.tensor(val[index]) for key, val in self.encodings.items()}
