<a href="https://colab.research.google.com/github/y-hiroki-radiotech/100knocks-preprocess/blob/master/gemma2_it_classification_tasks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

drive.mount("/content/drive")
%cd "/content/drive/MyDrive/LLM/2024_大規模言語モデル/05.最終課題/2. Text-Classification LLM"

MessageError: Error: credential propagation was unsuccessful

In [None]:
from google.colab import userdata

HUGGINGFACE_TOKEN = userdata.get('HF_TOKEN_READ')
!huggingface-cli login --token $HUGGINGFACE_TOKEN

### Library install

In [None]:
!pip install datasets bitsandbytes peft trl wandb accelerate

### プロンプトの作成

In [29]:
'''
def generate_prompt(dataset): # Change parameter to questions
    return f"""次の角カッコ(square brankets)で囲まれた問題文を分析して、それが以下の選択肢のどれに分類されるか判断してください。
    必ず選択肢から一つだけ選んでください。選択肢は`/`で区切られています。
    # 選択肢: Explanation-Description/Problem-Solving-Decision-Making/Creative-Expression/Analysis-Inference/Simple Fact-Checking/Calculation-Logic/Information Extraction-Identification/Comparison-Similarity

    # 問題文: [{dataset["questions"]}] = {dataset["labels"]}"""

def generate_prompt_test(dataset): # Change parameter to questions
    return f"""次の角カッコ(square brankets)で囲まれた問題文を分析して、それが以下の選択肢のどれに分類されるか判断してください。
    必ず選択肢から一つだけ選んでください。選択肢は`/`で区切られています。
    # 選択肢: Explanation-Description/Problem-Solving-Decision-Making/Creative-Expression/Analysis-Inference/Simple Fact-Checking/Calculation-Logic/Information Extraction-Identification/Comparison-Similarity

    # 問題文: [{dataset["questions"]}] = """
'''

In [101]:
def generate_prompt(dataset): # Change parameter to questions
    return f"""Analyze the text enclosed in square brackets and determine which one category it belongs to.Return exactly one of these labels: 'Explanation-Description', 'Problem-Solving-Decision-Making', 'Creative-Expression', 'Analysis-Inference', 'Simple Fact-Checking', 'Calculation-Logic', 'Information Extraction-Identification', 'Comparison-Similarity'.
    [{dataset["questions"]}] = {dataset["labels"]}<|endoftext|>""".strip()

def generate_prompt_test(dataset): # Change parameter to questions
    return f"""Analyze the text enclosed in square brackets and determine which one category it belongs to.Return exactly one of these labels: 'Explanation-Description', 'Problem-Solving-Decision-Making', 'Creative-Expression', 'Analysis-Inference', 'Simple Fact-Checking', 'Calculation-Logic', 'Information Extraction-Identification', 'Comparison-Similarity'.
    [{dataset["questions"]}] = """.strip()

### データセットの読み込み

In [102]:
from sklearn.model_selection import train_test_split
import pandas as pd
import datasets
from datasets import Dataset

filename = "create_synthetic_data.csv"
dataset = pd.read_csv(filename)

# 訓練80%, テスト10%, eval:10で分割する
train_data, test_data = train_test_split(dataset, stratify=dataset["labels"], test_size=0.2, random_state=42)
test_data, eval_data = train_test_split(test_data, stratify=test_data["labels"], test_size=0.5, random_state=42)

train_data = pd.DataFrame(train_data.apply(generate_prompt, axis=1), columns=["text"])
eval_data = pd.DataFrame(eval_data.apply(generate_prompt, axis=1), columns=["text"])

y_true = test_data["labels"]
test_data = pd.DataFrame(test_data.apply(generate_prompt_test, axis=1), columns=["text"])

train_data = Dataset.from_pandas(train_data)
# test_data = Dataset.from_pandas(test_data)
eval_data = Dataset.from_pandas(eval_data)


In [38]:
import torch
import os
from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig, AutoPeftModelForCausalLM

def init_model(model_name="google/gemma-2-2b"):
    #use bf16 and FlashAttention if supported
    if torch.cuda.is_bf16_supported():
        os.system('pip install flash_attn')
        compute_dtype = torch.bfloat16
        attn_implementation = 'flash_attention_2'
    else:
        compute_dtype = torch.float16
        attn_implementation = 'sdpa'

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        attn_implementation=attn_implementation
    )

    model.config.use_cache = False
    model.config.pretraining_tp = 1

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    # tokenizer.padding_side = "right"
    return model, tokenizer

## wandbの設定

In [40]:
import os
import wandb
from google.colab import userdata

def setup_wandb(project_name: str, run_name: str, config: str, job_type=None):
    # set up your API key
    try:
        WANDB_KEY = userdata.get('WANDB_API_KEY')
        wandb.login(key=WANDB_KEY)
        os.environ["WANDB_ENTITY"] = "y-hiroki-rad"
    except KeyError:
        raise EnvironmentError("WANDB_API_KEY is not set in the environment variables.")
    except Exception as e:
        print(f"Error logging into WandB: {e}")

    # Optional: Log models
    os.environ["WANDB_LOG_MODEL"] = "checkpoint"
    os.environ["WANDB_WATCH"] = "all"
    os.environ["WANDB_SILENT"] = "true"

    # Initialize the WandB run
    try:
        wandb.init(project=project_name, name=run_name, config=config, job_type=job_type)
        print(f"WandB run initialized: Project - {project_name}, Run - {run_name}")
    except Exception as e:
        print(f"Error initializing WandB run: {e}")

In [42]:
import re
from peft import LoraConfig
from trl import SFTTrainer
from transformers import TrainingArguments

def train(model, tokenizer, train_data, eval_data):


    # モデルのLoRAするための線形層を見つける
    model_modules = str(model.modules)
    pattern = r"\((\w+)\): Linear"
    linear_layer_names = re.findall(pattern, model_modules)
    target_modules = list(set(linear_layer_names))

    peft_config = LoraConfig(
        lora_alpha=64,
        lora_dropout=0.05,
        r=64,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=target_modules
    )

    training_arguments = TrainingArguments(
        output_dir="gemma-logs",
        num_train_epochs=5,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=8,
        optim="paged_adamw_32bit",
        logging_steps=4,
        log_level="debug",
        save_strategy="epoch",
        eval_strategy="epoch",
        learning_rate=2e-5,
        weight_decay=0.001,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        max_grad_norm=0.3,
        max_steps=-1,
        warmup_ratio=0.03,
        # group_by_length=True,
        lr_scheduler_type="linear",
        report_to="wandb",
    )

    trainer = SFTTrainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=eval_data,
        peft_config=peft_config,
        dataset_text_field="text",
        tokenizer=tokenizer,
        args=training_arguments,
        packing=False,
        max_seq_length=64,
    )

    setup_wandb(project_name="Classification-Fine-tuning", run_name="gemma2-2b", config=training_arguments, job_type="Fine-tuning")

    trainer.train()

    # save trained model
    output_dir = "results/gemma2-2b"
    trainer.save_model(output_dir)
    wandb.finish()

### 評価関数を作成する

In [43]:
# mappingの作成
mapping = {}
for i, label in enumerate(dataset["labels"].unique()):
    mapping[label] = i
mapping

{'Problem-Solving-Decision-Making': 0,
 'Creative-Expression': 1,
 'Explanation-Description': 2,
 'Simple Fact-Checking': 3,
 'Analysis-Inference': 4,
 'Information Extraction': 5,
 'Calculation-Logic': 6,
 'Comparison-Similarity': 7}

In [44]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix
)
from sklearn.model_selection import train_test_split

def evaluate(y_true, y_pred):

    def map_func(x):
        return mapping.get(x, 1)

    # labelを数値化する
    y_true = np.vectorize(map_func)(y_true)
    y_pred = np.vectorize(map_func)(y_pred)

    # Accuracy
    accuracy = accuracy_score(y_true, y_pred)
    print(f"Accuracy: {accuracy:.3f}")

    # Generate accuracy report
    unique_labels = set(y_true)

    for label in unique_labels:
        label_indices = [i for i in range(len(y_true)) if y_true[i] == label]
        label_y_true = [y_true[i] for i in label_indices]
        label_y_pred = [y_pred[i] for i in label_indices]
        accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f"Accuracy for label {label}: {accuracy:.3f}")

    # Generate classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print("\nClassification Report:")
    print(class_report)

    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred)
    print("\nConfusion Matrix:")
    print(conf_matrix)


In [45]:
def inference(pipe, prompt):
    result = pipe(prompt)
    answer = result[0]["generated_text"].split("=")[-1].strip()
    return answer

In [46]:
from transformers import pipeline

def predict(X_test, model, tokenizer):
    y_pred = []
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=64,
        temperature=1.0,
    )

    for i in tqdm(range(len(X_test))):
        prompt = X_test.iloc[i]["text"]
        answer = inference(pipe, prompt)

    if answer == "Explanation" or answer == "Description":
        y_pred.append("Explanation-Description")
    elif answer == "Problem-Solving" or answer == "Decision-Making":
        y_pred.append("Problem-Solving-Decision-Making")
    elif answer == "Creative-Expression":
        y_pred.append("Creative-Expression")
    elif answer == "Analysis" or answer == "Inference":
        y_pred.append("Analysis-Inference")
    elif answer == "Simple-Fact-Checking":
        y_pred.append("Simple Fact-Checking")
    elif answer == "Calculation" or answer == "Logic":
        y_pred.append("Calculation-Logic")
    elif answer == "Information-Extraction" or answer == "Identification":
        y_pred.append("Information Extraction-Identification")
    elif answer == "Comparison" or answer == "Similarity":
        y_pred.append("Comparison-Similarity")

    return y_pred

### 訓練中の評価

In [47]:
def validate(X_test, y_true, model, tokenizer):
    y_pred = predict(X_test, model, tokenizer)
    evaluate(y_true, y_pred)
    return y_pred

### ここから訓練

In [48]:
model_name = "google/gemma-2-2b"
model, tokenizer = init_model(model_name)
train(model, tokenizer, train_data, eval_data)

config.json:   0%|          | 0.00/818 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--gemma-2-2b/snapshots/c5ebcd40d208330abc697524c919956e692655cf/config.json
Model config Gemma2Config {
  "_name_or_path": "google/gemma-2-2b",
  "architectures": [
    "Gemma2ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": 50.0,
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": 1,
  "final_logit_softcapping": 30.0,
  "head_dim": 256,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 2304,
  "initializer_range": 0.02,
  "intermediate_size": 9216,
  "max_position_embeddings": 8192,
  "model_type": "gemma2",
  "num_attention_heads": 8,
  "num_hidden_layers": 26,
  "num_key_value_heads": 4,
  "pad_token_id": 0,
  "query_pre_attn_scalar": 256,
  "rms_norm_eps": 1e-06,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "torch_dtype": "float32",
  "transformers_version": 

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--google--gemma-2-2b/snapshots/c5ebcd40d208330abc697524c919956e692655cf/model.safetensors.index.json


Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/481M [00:00<?, ?B/s]

Instantiating Gemma2ForCausalLM model under default dtype torch.float16.
Detected flash_attn version: 2.7.2.post1
Generate config GenerationConfig {
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": 1,
  "pad_token_id": 0
}

Detected flash_attn version: 2.7.2.post1
Detected flash_attn version: 2.7.2.post1
Detected flash_attn version: 2.7.2.post1
Detected flash_attn version: 2.7.2.post1
Detected flash_attn version: 2.7.2.post1
Detected flash_attn version: 2.7.2.post1
Detected flash_attn version: 2.7.2.post1
Detected flash_attn version: 2.7.2.post1
Detected flash_attn version: 2.7.2.post1
Detected flash_attn version: 2.7.2.post1
Detected flash_attn version: 2.7.2.post1
Detected flash_attn version: 2.7.2.post1
Detected flash_attn version: 2.7.2.post1
Detected flash_attn version: 2.7.2.post1
Detected flash_attn version: 2.7.2.post1
Detected flash_attn version: 2.7.2.post1
Detected flash_attn version: 2.7.2.post1
Detected flash_attn version: 2.7.2.post1
Detected fla

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing Gemma2ForCausalLM.

All the weights of Gemma2ForCausalLM were initialized from the model checkpoint at google/gemma-2-2b.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Gemma2ForCausalLM for predictions without further training.


generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--google--gemma-2-2b/snapshots/c5ebcd40d208330abc697524c919956e692655cf/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": 1,
  "pad_token_id": 0
}



tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--google--gemma-2-2b/snapshots/c5ebcd40d208330abc697524c919956e692655cf/tokenizer.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--google--gemma-2-2b/snapshots/c5ebcd40d208330abc697524c919956e692655cf/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--google--gemma-2-2b/snapshots/c5ebcd40d208330abc697524c919956e692655cf/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--google--gemma-2-2b/snapshots/c5ebcd40d208330abc697524c919956e692655cf/tokenizer_config.json
PyTorch: setting up devices

Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
PyTorch: setting up devices
PyTorch: setting up devices


Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Using auto half precision backend
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Currently training with a batch size of: 4
***** Running training *****
  Num examples = 408
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 8
  Total optimization steps = 60
  Number of trainable parameters = 99,598,336
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


WandB run initialized: Project - Classification-Fine-tuning, Run - gemma2-2b


Epoch,Training Loss,Validation Loss
0,1.5098,1.064667
1,0.3078,0.272745



***** Running Evaluation *****
  Num examples = 51
  Batch size = 8
Saving model checkpoint to gemma-jp-logs/checkpoint-12
tokenizer config file saved in gemma-jp-logs/checkpoint-12/tokenizer_config.json
Special tokens file saved in gemma-jp-logs/checkpoint-12/special_tokens_map.json
Logging checkpoint artifacts in checkpoint-12. ...
[34m[1mwandb[0m: Adding directory to artifact (./gemma-jp-logs/checkpoint-12)... Done. 8.5s

***** Running Evaluation *****
  Num examples = 51
  Batch size = 8
Saving model checkpoint to gemma-jp-logs/checkpoint-25
tokenizer config file saved in gemma-jp-logs/checkpoint-25/tokenizer_config.json
Special tokens file saved in gemma-jp-logs/checkpoint-25/special_tokens_map.json
Logging checkpoint artifacts in checkpoint-25. ...
[34m[1mwandb[0m: Adding directory to artifact (./gemma-jp-logs/checkpoint-25)... Done. 6.1s


Epoch,Training Loss,Validation Loss
0,1.5098,1.064667
1,0.3078,0.272745
2,0.2142,0.185188
4,0.0956,0.09509



***** Running Evaluation *****
  Num examples = 51
  Batch size = 8
Saving model checkpoint to gemma-jp-logs/checkpoint-38
tokenizer config file saved in gemma-jp-logs/checkpoint-38/tokenizer_config.json
Special tokens file saved in gemma-jp-logs/checkpoint-38/special_tokens_map.json
Logging checkpoint artifacts in checkpoint-38. ...
[34m[1mwandb[0m: Adding directory to artifact (./gemma-jp-logs/checkpoint-38)... Done. 6.9s

***** Running Evaluation *****
  Num examples = 51
  Batch size = 8
Saving model checkpoint to gemma-jp-logs/checkpoint-51
tokenizer config file saved in gemma-jp-logs/checkpoint-51/tokenizer_config.json
Special tokens file saved in gemma-jp-logs/checkpoint-51/special_tokens_map.json
Logging checkpoint artifacts in checkpoint-51. ...
[34m[1mwandb[0m: Adding directory to artifact (./gemma-jp-logs/checkpoint-51)... Done. 6.1s
Saving model checkpoint to gemma-jp-logs/checkpoint-60
tokenizer config file saved in gemma-jp-logs/checkpoint-60/tokenizer_config.json


ValueError: You have set `args.eval_strategy` to epoch but you didn't pass an `eval_dataset` to `Trainer`. Either set `args.eval_strategy` to `no` or pass an `eval_dataset`. 

### Mergeする

In [49]:
adapter = "./gemma-jp-logs/checkpoint-60"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map={"": 0},
    torch_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = PeftModel.from_pretrained(model, adapter)
model = model.merge_and_unload()

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--gemma-2-2b/snapshots/c5ebcd40d208330abc697524c919956e692655cf/config.json
Model config Gemma2Config {
  "_name_or_path": "google/gemma-2-2b",
  "architectures": [
    "Gemma2ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": 50.0,
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": 1,
  "final_logit_softcapping": 30.0,
  "head_dim": 256,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 2304,
  "initializer_range": 0.02,
  "intermediate_size": 9216,
  "max_position_embeddings": 8192,
  "model_type": "gemma2",
  "num_attention_heads": 8,
  "num_hidden_layers": 26,
  "num_key_value_heads": 4,
  "pad_token_id": 0,
  "query_pre_attn_scalar": 256,
  "rms_norm_eps": 1e-06,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "torch_dtype": "bfloat16",
  "transformers_version":

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing Gemma2ForCausalLM.

All the weights of Gemma2ForCausalLM were initialized from the model checkpoint at google/gemma-2-2b.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Gemma2ForCausalLM for predictions without further training.
loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--google--gemma-2-2b/snapshots/c5ebcd40d208330abc697524c919956e692655cf/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": 1,
  "pad_token_id": 0
}

loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--google--gemma-2-2b/snapshots/c5ebcd40d208330abc697524c919956e692655cf/tokenizer.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--google--gemma-2-2b/snapshots/c5ebcd40d208330abc697524c919956e692655cf/tokenizer.json
l

### huggingfaceへのpush

In [50]:
from google.colab import userdata

HUGGINGFACE_TOKEN = userdata.get('HF_TOKEN_WRITE')
!huggingface-cli login --token $HUGGINGFACE_TOKEN

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
The token `LLM_new_token` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `LLM_new_token`


In [51]:
model.push_to_hub("hiroki-rad/gemma-classification-ft")
tokenizer.push_to_hub("hiroki-rad/gemma-classification-ft")

Configuration saved in /tmp/tmpvl69zvoo/config.json
Configuration saved in /tmp/tmpvl69zvoo/generation_config.json
The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameters has been saved in the index located at /tmp/tmpvl69zvoo/model.safetensors.index.json.
Uploading the following files to hiroki-rad/gemma-classification-ft: model-00001-of-00002.safetensors,model.safetensors.index.json,model-00002-of-00002.safetensors,README.md,generation_config.json,config.json


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

tokenizer config file saved in /tmp/tmpchj4ncni/tokenizer_config.json
Special tokens file saved in /tmp/tmpchj4ncni/special_tokens_map.json
Uploading the following files to hiroki-rad/gemma-classification-ft: tokenizer.json,README.md,special_tokens_map.json,tokenizer.model,tokenizer_config.json


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/34.4M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/hiroki-rad/gemma-classification-ft/commit/3e683bea27c88347069d02460854e7e5745d8a9c', commit_message='Upload tokenizer', commit_description='', oid='3e683bea27c88347069d02460854e7e5745d8a9c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/hiroki-rad/gemma-classification-ft', endpoint='https://huggingface.co', repo_type='model', repo_id='hiroki-rad/gemma-classification-ft'), pr_revision=None, pr_num=None)

### ファインチューニング後の評価

In [88]:
model_name = "hiroki-rad/gemma-classification-ft"
pipe = pipeline("text-generation",
                model=model_name,
                tokenizer=model_name,
                max_new_tokens=64,
                temperature=1.0,
                repetition_penalty=1.1,
                )

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--hiroki-rad--gemma-classification-ft/snapshots/3e683bea27c88347069d02460854e7e5745d8a9c/config.json
Model config Gemma2Config {
  "_name_or_path": "hiroki-rad/gemma-classification-ft",
  "architectures": [
    "Gemma2ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": 50.0,
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": 1,
  "final_logit_softcapping": 30.0,
  "head_dim": 256,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 2304,
  "initializer_range": 0.02,
  "intermediate_size": 9216,
  "max_position_embeddings": 8192,
  "model_type": "gemma2",
  "num_attention_heads": 8,
  "num_hidden_layers": 26,
  "num_key_value_heads": 4,
  "pad_token_id": 0,
  "query_pre_attn_scalar": 256,
  "rms_norm_eps": 1e-06,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "torch_dtype": "bf

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing Gemma2ForCausalLM.

All the weights of Gemma2ForCausalLM were initialized from the model checkpoint at hiroki-rad/gemma-classification-ft.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Gemma2ForCausalLM for predictions without further training.
loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--hiroki-rad--gemma-classification-ft/snapshots/3e683bea27c88347069d02460854e7e5745d8a9c/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": 1,
  "pad_token_id": 0
}

loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--hiroki-rad--gemma-classification-ft/snapshots/3e683bea27c88347069d02460854e7e5745d8a9c/tokenizer.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--hiroki-rad--gemma-classification-ft

In [99]:
test = test_data["text"].iloc[4]
test

"Analyze the text enclosed in square brackets and determine which one category it belongs to.Return exactly one of these labels: 'Explanation-Description', 'Problem-Solving-Decision-Making', 'Creative-Expression', 'Analysis-Inference', 'Simple Fact-Checking', 'Calculation-Logic', 'Information Extraction-Identification', 'Comparison-Similarity'.\n    [次の文を比較し、言い換えの適切性について評価してください。文1: 声帯に負荷をかけない生活習慣が肝要である。文2: 声帯を使ってはいけません。] ="

In [100]:
result = pipe(test)
result

[{'generated_text': "Analyze the text enclosed in square brackets and determine which one category it belongs to.Return exactly one of these labels: 'Explanation-Description', 'Problem-Solving-Decision-Making', 'Creative-Expression', 'Analysis-Inference', 'Simple Fact-Checking', 'Calculation-Logic', 'Information Extraction-Identification', 'Comparison-Similarity'.\n    [次の文を比較し、言い換えの適切性について評価してください。文1: 声帯に負荷をかけない生活習慣が肝要である。文2: 声帯を使ってはいけません。] = Explanation-Description\n    [この問題を解くには、以下の手順に従ってください。まず、正しい答えを選ぶべきです。次に、その理由を説明する必要があります。最後に、あなたの考えを述べなければなりません。] = Problem-Solving-Decision-Making\n    [あなたは、以下のような選択肢からどれ"}]

In [82]:
def extract_first_label(text):
    # Split by first '=' and take the second part
    after_equals = text.split('=', 1)[1]

    # Split by newline and take the first part
    label = after_equals.split('\n', 1)[0]

    # Strip any whitespace
    return label.strip()

In [91]:
推論時に<|endoftext|>どうを設定したらいい？

[{'generated_text': "Analyze the text enclosed in square brackets and determine which one category it belongs to.\n    Return exactly one of these labels: 'Explanation-Description', 'Problem-Solving-Decision-Making', 'Creative-Expression', 'Analysis-Inference', 'Simple Fact-Checking', 'Calculation-Logic', 'Information Extraction-Identification', 'Comparison-Similarity'.\n    [次の文を比較し、言い換えの適切性について評価してください。文1: 声帯に負荷をかけない生活習慣が肝要である。文2: 声帯を使ってはいけません。] = Explanation-Description\n    [あなたはあなたの好きなことをするべきです。] = Creative-Expression\n    [この問題を解くには、いくつかの計算が必要です。] = Calculation-Logic\n    [あなたが正しい答えを選ぶかどうかはあなた次第です。] = Simple Fact-Checking\n    [あなたは自分の考えを"}]

In [92]:
answer = result[0]["generated_text"]
extract_first_label(answer)

'Explanation-Description'

In [77]:
y_true[4]

'Explanation-Description'

In [95]:
tokenizer.

{'thra': 84801,
 '▁Svensk': 214116,
 '▁duo': 31157,
 '▁lupo': 169928,
 '▁tecnolog': 20959,
 'tdown': 109586,
 '▁sapin': 191808,
 'foc': 183469,
 '▁🙂': 33359,
 'だと思う': 119919,
 'ednesdays': 102737,
 'zembro': 65159,
 'SelectedIndex': 76394,
 '▁Amar': 42539,
 '▁JAK': 118316,
 '▁Obama': 25067,
 '}}$\\\\': 206048,
 '▁krep': 212943,
 '͘': 241521,
 '絵本': 150486,
 'าก็': 186584,
 'tC': 203927,
 '▁incapacity': 183736,
 'BEAUT': 151309,
 '▁piezas': 33382,
 'годно': 165673,
 'бака': 186587,
 'ataires': 136787,
 '▁escultura': 138380,
 'щений': 101796,
 '▁SIR': 43712,
 '▁itemId': 122804,
 '🏃': 243303,
 '▁arithmetic': 53888,
 'chestra': 25529,
 '▁Zeichen': 78678,
 '▁آخر': 50201,
 '▁désigne': 207433,
 '𝙃': 250156,
 '▁requestCode': 153447,
 '▁自己': 128341,
 '▁Luftwaffe': 221212,
 '▁[:': 84529,
 '韫': 247407,
 'Reservation': 66194,
 'Español': 196340,
 '▁PROVIDED': 135262,
 'exercise': 60852,
 'дел': 54529,
 'Initializer': 84775,
 'SNMP': 230899,
 'ኪ': 248835,
 '▁inserting': 76095,
 'MNA': 182152,
 'rxj