In [1]:
# Move the kernel's working directory up one level, then display it.
# NOTE(review): %cd .. is relative, so re-running this cell moves up one more level each time.
%cd ..
%pwd

/tmp/working


'/tmp/working'

In [2]:
import os
import sys
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Union

import hydra
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from hydra.core.hydra_config import HydraConfig
from omegaconf import DictConfig, OmegaConf
from transformers import (
    AutoModel,
    AutoModelForMultipleChoice,
    AutoTokenizer,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)
from transformers.tokenization_utils_base import PaddingStrategy, PreTrainedTokenizerBase

import wandb

# Put the parent directory ("..", resolved against the current working directory) on the import path.
sys.path.append(os.pardir)



In [3]:
from hydra import compose, initialize
from omegaconf import OmegaConf

# Compose the Hydra config for experiment "305/000" directly in the notebook
# (no @hydra.main entry point), then expose the experiment sub-config as `cfg`.
with initialize(version_base=None, config_path="../yamls"):
    c = compose(config_name="config", overrides=["exp=305/000"], return_hydra_config=True)
    OmegaConf.resolve(c)  # resolve interpolations such as debug and seed
    cfg = c.exp
    runtime_choices = c.hydra.runtime.choices
    # The first path component comes from the launcher script name in sys.argv[0];
    # inside a notebook kernel that is "ipykernel_launcher" (see output_dir in the printed config).
    exp_name = f"{Path(sys.argv[0]).stem}/{runtime_choices.check}"
    output_path = Path(f"./output/{exp_name}")  # per-experiment output directory
    cfg.training_args.output_dir = str(output_path)
    print(OmegaConf.to_yaml(cfg))

debug: false
seed: 7
early_stopping_patience: 3
training_args:
  fp16: true
  warmup_ratio: 0.8
  learning_rate: 5.0e-06
  weight_decay: 0.01
  dataloader_num_workers: 8
  per_device_train_batch_size: 2
  per_device_eval_batch_size: 2
  num_train_epochs: 30
  logging_strategy: steps
  evaluation_strategy: steps
  save_strategy: steps
  logging_steps: 5000
  save_steps: 5000
  metric_for_best_model: map@3
  save_total_limit: 1
  load_best_model_at_end: true
  report_to: wandb
  output_dir: output/ipykernel_launcher/000
  seed: 7
  gradient_accumulation_steps: 2
  gradient_checkpointing: true
  optim: adamw_bnb_8bit
model_name: tiiuae/falcon-7b
sep_token:
- SEP
max_length: 150
max_length_valid: 300
data0_paths:
- preprocessed/331_retrieve_b/000/data0_0.csv
- preprocessed/331_retrieve_b/000/data0_10000.csv
- preprocessed/331_retrieve_b/000/data0_20000.csv
- preprocessed/331_retrieve_b/000/data0_30000.csv
- preprocessed/331_retrieve_b/000/data0_40000.csv
- preprocessed/331_retrieve_b/000/d

In [5]:
# Display the current working directory; the relative data paths below are resolved against it.
%pwd

'/tmp/working'

In [6]:
# Read the train/validation CSVs and keep only the first five rows of each (.head()),
# which makes this a small smoke-test run rather than a full training run.
# NOTE(review): the validation frame is read from a file named train.csv — confirm this is intended.
df_train = pd.read_csv("./preprocessed/336_retrieve_b_improve/b_bge_10_4_3/data1.csv").head().reset_index(drop=True)
df_valid = pd.read_csv("./preprocessed/336_retrieve_b_improve/b_bge_10_4_3/train.csv").head().reset_index(drop=True)

In [7]:
@dataclass
class DataCollatorForMultipleChoice:
    """Pad multiple-choice features and reshape them to ``(batch, num_choices, seq_len)``.

    Each incoming feature is a mapping whose values (other than the label) are
    lists with one entry per answer choice. The choices of all examples are
    flattened, padded together by ``tokenizer.pad`` and then reshaped back so
    that the second dimension indexes the choice.
    """

    # Tokenizer whose ``pad`` method pads the flattened per-choice encodings.
    tokenizer: PreTrainedTokenizerBase
    # Forwarded unchanged to ``tokenizer.pad``.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate ``features`` into a batch dict of tensors plus a ``labels`` tensor.

        Args:
            features: non-empty list of mappings; each holds a ``"label"`` or
                ``"labels"`` entry and per-choice lists such as ``"input_ids"``.

        Returns:
            dict mapping every key returned by ``tokenizer.pad`` to a tensor
            viewed as ``(batch_size, num_choices, -1)``, plus ``"labels"`` as
            an int64 tensor of length ``batch_size``.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the labels instead of pop()-ing them so the caller's feature
        # dicts are left intact and can be collated again.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One dict per (example, choice) pair, built in a single linear pass
        # (the label entry is skipped because it is not a per-choice list).
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # Undo the flattening: rows are ordered example-major, choice-minor.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch


def clean_text(text):
    """Return ``text`` with straight (") and curly (“ ”) double quotes removed."""
    for quote_char in ('"', "“", "”"):
        text = text.replace(quote_char, "")
    return text

In [8]:
def preprocess_df(df, mode="train", max_length=None):
    """Truncate the context column and fill missing answer options.

    Args:
        df: frame with a ``context`` column and option columns ``A``..``E``.
        mode: ``"train"`` selects ``cfg.max_length``; any other value selects
            ``cfg.max_length_valid`` (a longer context is kept at inference time).
        max_length: optional explicit limit; when given, ``cfg`` is not consulted.
            The limit counts whitespace-separated words, not tokenizer tokens.

    Returns:
        A new DataFrame; the frame passed in is left unmodified.
    """
    if max_length is None:
        max_length = cfg.max_length if mode == "train" else cfg.max_length_valid

    # Work on a copy so re-running the cell never sees an already-processed frame.
    df = df.copy()

    # A missing context would otherwise reach .split() as a float NaN and raise.
    df["context"] = df["context"].fillna("").apply(lambda x: " ".join(x.split()[:max_length]))

    # Fill empty answer options with an empty string.
    options = ["A", "B", "C", "D", "E"]
    for option in options:
        df[option] = df[option].fillna("")
    return df


# Preprocess both splits (validation uses the longer cfg.max_length_valid limit)
# and wrap the resulting frames as Hugging Face datasets.
df_train = preprocess_df(df_train)
df_valid = preprocess_df(df_valid, mode="valid")
dataset_train = Dataset.from_pandas(df_train)
dataset_valid = Dataset.from_pandas(df_valid)

In [9]:
from langchain.prompts import PromptTemplate

# Markdown-style prompt. Each literal \n escape is followed by a physical line break,
# so headers and options are separated by blank lines when rendered.
# The answer section starts with the exact marker "## Answer:" (two hashes); anything
# that needs to locate the start of the answer must search for that exact string.
template = """## Context\n
{context}

## Prompt\n
Answer the following multiple choice question by giving the most appropriate response. Answer should be one among [A, B, C, D, E]

## Question\n
{prompt}\n
A) {a}\n
B) {b}\n
C) {c}\n
D) {d}\n
E) {e}\n

## Answer: {answer}
"""

# Template object with one input variable per placeholder used above.
prompt = PromptTemplate(template=template, input_variables=["context", "prompt", "a", "b", "c", "d", "e", "answer"])

# sample
from IPython.display import Markdown, display

# Render the first training row through the template and show it as Markdown
# to eyeball the final prompt layout.
sample = dataset_train[0]
display(
    Markdown(
        prompt.format(
            context=sample["context"],
            prompt=sample["prompt"],
            a=sample["A"],
            b=sample["B"],
            c=sample["C"],
            d=sample["D"],
            e=sample["E"],
            # "answer" holds the option letter; the same letter is used as the key to fetch the option text.
            answer=f'{sample["answer"]}) {sample[sample["answer"]]}',
        )
    )
)


def format_text(example):
    """Render one dataset row into the full training prompt.

    Args:
        example: mapping with ``"context"``, ``"prompt"``, the option columns
            ``"A"``..``"E"`` and ``"answer"`` (the letter of the correct option).

    Returns:
        ``{"text": rendered_prompt}``, suitable for ``Dataset.map``.
    """
    # Read every field from the row being mapped. Using the module-level
    # `sample` here would give every row the text of the first training example.
    answer_letter = example["answer"]
    text = prompt.format(
        context=example["context"],
        prompt=example["prompt"],
        a=example["A"],
        b=example["B"],
        c=example["C"],
        d=example["D"],
        e=example["E"],
        # Letter plus the text of the option stored under that letter.
        answer=f"{answer_letter}) {example[answer_letter]}",
    )
    return {"text": text}


# Apply format_text to every row of both splits; it returns {"text": ...} per row.
# NOTE(review): despite the variable names, no tokenization happens in this step.
tokenized_dataset_train = dataset_train.map(format_text)
tokenized_dataset_valid = dataset_valid.map(format_text)

## Context

Henry Clay (disambiguation) > Henry Clay (1777–1852) was an American politician from Kentucky. If an internal link led you here, you may wish to change the link to point directly to the intended article. Henry Clay > Many monuments, memorials, and even high schools have been erected and named in honor of Clay. Sixteen counties, one each in Alabama, Florida, Georgia, Illinois, Indiana, Kansas, Kentucky, Minnesota, Mississippi, Missouri, Nebraska, North Carolina, South Dakota, Tennessee, Texas, and West Virginia, are named for Clay. Communities named for Clay include Clay, Kentucky, Claysville, Alabama and Claysville, Pennsylvania. The United States Navy named a submarine, the USS Henry Clay, in his honor. List of things named for Henry Clay > Sixteen Clay counties in the United States, in Alabama, Florida, Georgia, Illinois, Indiana, Kansas, Kentucky, Minnesota, Mississippi, Missouri, Nebraska, North Carolina, South Dakota, Tennessee, Texas, and West Virginia. (Clay County, Iowa is named for

## Prompt

Answer the following multiple choice question by giving the most appropriate response. Answer should be one among [A, B, C, D, E]

## Question

Whose name was Clay County in Kansas named after?

A) Henry Clay, a famous scientist

B) Henry Clay, a renowned painter

C) Henry Clay, a famous explorer

D) Henry Clay, an influential U.S. Senator from Kentucky

E) Henry Clay, a former president of the United States


## Answer: D) Henry Clay, an influential U.S. Senator from Kentucky


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [10]:
# https://www.kaggle.com/code/philippsinger/h2ogpt-perplexity-ranking
def precision_at_k(r, k):
    """Fraction of relevant entries among the first ``k`` items of ``r``.

    ``r`` is a sequence of relevance flags (anything ``int()`` accepts);
    ``k`` must be non-zero and no larger than ``len(r)``.
    """
    assert k <= len(r)
    assert k != 0
    relevant = 0
    for flag in r[:k]:
        relevant += int(flag)
    return relevant / k


def map_k(true_items, predictions, K=3):
    """Mean average precision at ``K`` for single-answer questions.

    Args:
        true_items: sequence with the correct item for each row.
        predictions: sequence with the ranked predictions for each row, best
            first. A row may be a sequence of items or a single
            whitespace-separated string such as ``"A B C D E"``.
        K: number of leading predictions that are scored (default 3).

    Returns:
        The mean over rows of the average precision at ``K``; ``0.0`` when
        ``predictions`` is empty.
    """
    U = len(predictions)
    if U == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    map_at_k = 0.0
    for u in range(U):
        user_preds = predictions[u]
        user_true = true_items[u]
        if isinstance(user_preds, str):
            # A joined string must be split into items first; iterating it
            # directly would treat every character (spaces included) as a rank.
            user_preds = user_preds.split()

        # Precision at each rank is hits-so-far / rank, and it only contributes
        # at ranks where the prediction is correct.
        hits = 0
        for rank, item in enumerate(user_preds[:K], start=1):
            if item == user_true:
                hits += 1
                map_at_k += hits / rank
    return map_at_k / U


def predictions_to_map_output(predictions):
    """Turn a 2-D score array into one space-joined string of option labels per row.

    Labels are ordered from highest to lowest score. Index-to-label conversion
    uses ``index_to_option``, which is not defined in this cell.
    NOTE(review): confirm ``index_to_option`` exists before calling this.
    """
    sorted_answer_indices = np.argsort(-predictions)  # Sorting indices in descending order of score
    top_answer_indices = sorted_answer_indices[:, :]  # Keeps every column; no top-3 cut is applied here
    top_answers = np.vectorize(index_to_option.get)(
        top_answer_indices
    )  # Transforming indices to options - presumably 0 --> A (verify against index_to_option)
    return np.apply_along_axis(lambda row: " ".join(row), 1, top_answers)


def compute_metrics(eval_preds):
    """Score evaluation predictions and report them under the configured metric name.

    ``eval_preds`` unpacks into ``(logits, labels)``. The result is a
    one-entry dict keyed by ``cfg.training_args.metric_for_best_model``.
    """
    logits, labels = eval_preds
    ranked_options = predictions_to_map_output(logits)
    true_options = []
    for label in labels:
        true_options.append(index_to_option[label])
    metric_name = cfg.training_args.metric_for_best_model
    return {metric_name: map_k(true_options, ranked_options)}

In [11]:
# Load the tokenizer for cfg.model_name and reuse its EOS token for padding
# (presumably because this tokenizer ships without a dedicated pad token — TODO confirm).
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
tokenizer.pad_token = tokenizer.eos_token

In [12]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# 4-bit NF4 quantization with double quantization; computation runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

In [15]:
from peft import LoraConfig

# LoRA adapter settings for causal-LM fine-tuning: rank 16, alpha 32, 5% dropout,
# no bias training, applied to the four module names listed in target_modules.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],
    task_type="CAUSAL_LM",
)


# NOTE(review): this repeats, with identical arguments, the BitsAndBytesConfig already
# built in an earlier cell; one of the two definitions can be dropped.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

In [16]:
# Load the causal LM named in cfg.model_name with the 4-bit quantization config.
# trust_remote_code=True executes model code shipped with the checkpoint, so the
# revision is pinned to a specific commit.
base_model = AutoModelForCausalLM.from_pretrained(
    cfg.model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    revision="2f5c3cd4eace6be6c0f12981f377fb35e5bf6ee5",  # Using this version because running the new version gives error
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [17]:
# Short baseline run: 100 optimizer steps, effective batch of 4 x 2 accumulation steps per device.
# NOTE(review): every value here is hard-coded; nothing is read from cfg.training_args,
# and no seed is passed, so cfg.seed does not reach the trainer.
# NOTE(review): fp16=True is combined with a bfloat16 quantization compute dtype — confirm this mix is intended.
training_args = TrainingArguments(
    output_dir="./SFT-Falcon-7b",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    logging_steps=20,
    logging_strategy="steps",
    max_steps=100,
    optim="paged_adamw_8bit",
    fp16=True,
    run_name="baseline-falcon-sft",
)

In [18]:
from trl import DataCollatorForCompletionOnlyLM, SFTTrainer

# Supervised fine-tuning with LoRA adapters on the "text" column.
# The completion-only collator restricts the loss to the tokens that follow the
# response template, so the template must be the exact marker rendered into each
# prompt: "## Answer:" (two hashes). With a marker that never occurs in the text
# the collator cannot locate the answer and the example contributes no loss.
# NOTE(review): if the tokenizer encodes the marker differently when it appears
# mid-text, pass the in-context token ids as response_template instead.
supervised_finetuning_trainer = SFTTrainer(
    base_model,
    train_dataset=tokenized_dataset_train,
    args=training_args,
    tokenizer=tokenizer,
    peft_config=lora_config,
    dataset_text_field="text",
    max_seq_length=2048,
    data_collator=DataCollatorForCompletionOnlyLM(tokenizer=tokenizer, response_template="## Answer:"),
)



Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [19]:
# Start fine-tuning; the captured output below shows the run logging to Weights & Biases.
supervised_finetuning_trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mkami[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
