<a href="https://colab.research.google.com/github/tsdoii/-3/blob/main/Untitled35_ipynb_%E3%81%AE%E3%82%B3%E3%83%94%E3%83%BC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# ライブラリのインストール
!pip install transformers datasets accelerate -U

# 必要なライブラリのインポートとデバイスの設定
import torch
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, pipeline
from datasets import load_dataset
from accelerate import Accelerator

# Create the accelerator and remember the device it selected.
accelerator = Accelerator()
device = accelerator.device

# Load the IMDB dataset.
dataset = load_dataset("imdb")


def _one_percent(split):
    """Shuffle ``split`` with seed 42 and keep its first 1% of rows."""
    keep = int(0.01 * len(split))
    return split.shuffle(seed=42).select(range(keep))


# Sample 1% of each split to keep the run short.
train_sample = _one_percent(dataset['train'])
test_sample = _one_percent(dataset['test'])

# Load the tokenizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the dataset.
def tokenize_function(examples):
    """Run the module-level ``tokenizer`` over the ``"text"`` field of ``examples``.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``; its result is returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# Tokenize both samples in batches.
tokenized_train_sample = train_sample.map(tokenize_function, batched=True)
tokenized_test_sample = test_sample.map(tokenize_function, batched=True)

# Drop the raw text column and expose the remaining columns as torch tensors.
tokenized_train_sample = tokenized_train_sample.remove_columns(["text"])
tokenized_test_sample = tokenized_test_sample.remove_columns(["text"])
tokenized_train_sample.set_format("torch")
tokenized_test_sample.set_format("torch")

# Load the model.
# NOTE: the model is deliberately not passed through `accelerator.prepare`.
# Trainer does its own device placement, and a pre-wrapped model would also be
# handed to `pipeline` below, which expects the plain model object.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Mixed precision and GPU inference are only enabled when CUDA is present, so
# the notebook also runs on a CPU-only runtime instead of failing on fp16.
use_cuda = torch.cuda.is_available()

# Newer transformers releases name this argument `eval_strategy`; older ones
# only know `evaluation_strategy`. Pick whichever the installed version
# declares (the cell above installs with -U, so the version is not pinned).
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

# Training configuration.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,  # larger batch size
    per_device_eval_batch_size=16,
    num_train_epochs=1,  # single epoch to keep training time short
    weight_decay=0.01,
    fp16=use_cuda,  # half-precision training, GPU only
    logging_dir='./logs',
    logging_steps=10,
    **{eval_strategy_key: "epoch"},  # evaluate at the end of every epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_sample,
    eval_dataset=tokenized_test_sample,
)

# Train the model.
trainer.train()

# Evaluate the model.
results = trainer.evaluate()
print(results)

# Build the "chatbot": a text-classification pipeline around the fine-tuned model.
chatbot = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0 if use_cuda else -1)

# Function for talking to the chatbot.
def chat_with_bot(prompt):
    """Classify ``prompt`` with the module-level ``chatbot`` pipeline.

    Args:
        prompt: Text to classify.

    Returns:
        A string of the form ``"Label: <label>, Score: <score>"`` built from
        the first entry of the pipeline result, with the score formatted to
        four decimal places.
    """
    # truncation=True is forwarded to the tokenizer so that a prompt longer
    # than the model's maximum sequence length is cut down instead of making
    # the forward pass fail.
    first = chatbot(prompt, truncation=True)[0]
    return f"Label: {first['label']}, Score: {first['score']:.4f}"

# Example exchanges with the chatbot.
for user_input in (
    "I really enjoyed this movie!",
    "This film was terrible and boring.",
):
    print(chat_with_bot(user_input))



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]