In [None]:
import pandas as pd

# Relative path of the CSV file holding the raw data
dataset_path = "../data/dataset.csv"
dataset_df = pd.read_csv(filepath_or_buffer=dataset_path)

# Preview the first five rows
dataset_df.head(n=5)

In [None]:
from datasets import Dataset
# Convert the pandas DataFrame into a `datasets.Dataset`
dataset = Dataset.from_pandas(df=dataset_df)

print(dataset)

In [None]:
from transformers import AutoTokenizer
# Map each category name to an integer id (ids follow the sorted category order)
unique_category = sorted(set(dataset["category"]))
label2id = dict(zip(unique_category, range(len(unique_category))))
# Reverse lookup: integer id -> category name
id2label = dict(enumerate(unique_category))

# Tokenizer for the pretrained checkpoint
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the text and convert the label in a single step
def preprocess(example):
    """Tokenize one example and attach its integer label.

    Args:
        example: A record providing a "texts" entry (passed to the tokenizer)
            and a "category" entry (looked up in ``label2id``).

    Returns:
        The tokenizer output with an added "label" entry holding the
        integer id of the example's category.
    """
    # Pad/truncate every example to exactly 128 tokens.
    encoded = tokenizer(
        example["texts"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    category_id = label2id[example["category"]]
    encoded["label"] = category_id
    return encoded

# Apply the preprocessing to every example, then expose only the model inputs
# and the label as PyTorch tensors.
torkenized_dataset = dataset.map(preprocess).with_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)

In [None]:
# Show both label mappings, one per line
print(label2id, id2label, sep="\n")

In [None]:
# Sanity check: display the first preprocessed example.
torkenized_dataset[0]

In [None]:
# Split the data: 90% for training, 10% for evaluation.
# A fixed seed makes the shuffle (and therefore every downstream metric)
# reproducible across kernel restarts. The result is bound to a new name so
# that `torkenized_dataset` remains a Dataset and this cell can be re-run
# without failing.
dataset_splits = torkenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset_splits["train"]
eval_dataset = dataset_splits["test"]

In [None]:
# Load the pretrained checkpoint with a sequence-classification head
from transformers import AutoModelForSequenceClassification

# The label mappings are passed along so they are stored with the model.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    label2id=label2id,
    id2label=id2label,
    num_labels=len(label2id),
)

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    # Optimisation settings
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate, log and save a checkpoint once per epoch
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    # Reload the best checkpoint at the end, judged by accuracy (higher is better)
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

# Evaluation metric (adjust as needed)
import evaluate
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy from an evaluation result.

    Args:
        eval_pred: A pair that unpacks into (logits, labels).

    Returns:
        The result of ``accuracy.compute`` for the predicted class ids
        (argmax over the last axis of the logits) against the labels.
    """
    scores, references = eval_pred
    predicted_ids = scores.argmax(-1)
    return accuracy.compute(references=references, predictions=predicted_ids)



In [None]:
# Wire together the model, training arguments, data splits and metric function.
# NOTE(review): newer transformers releases reportedly deprecate `tokenizer=`
# in favour of `processing_class=` — confirm against the installed version
# before switching.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Fine-tune the model. Saving is left to the dedicated cell that follows, so
# the model and tokenizer are written to "model/" exactly once and the
# (cheap) save step can be re-run without re-training.
trainer.train()

In [None]:
# Save the trainer's model and the tokenizer to "model/".
# (The training arguments set load_best_model_at_end=True.)
trainer.save_model("model")
tokenizer.save_pretrained("model")

In [None]:
# torch is already required by the "torch" dataset format used earlier.
import torch

test_text = "This movie had stunning visual effects and great acting."

# Make sure dropout is disabled for inference.
model.eval()

# Tokenize with the same length limit used during preprocessing (128) and
# move the tensors to the model's device: after training the model may live
# on an accelerator, and CPU inputs would then raise a device-mismatch error.
inputs = tokenizer(
    test_text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=128,
).to(model.device)

# No gradients are needed for prediction; skipping them saves memory and time.
with torch.no_grad():
    outputs = model(**inputs)

predicted_label_id = outputs.logits.argmax(dim=1).item()
predicted_label = id2label[predicted_label_id]
print("Predicted label:", predicted_label)