In [1]:
# Environment setup: replace the preinstalled torch with the pinned CUDA 11.8
# build, then install the fine-tuning stack.
!pip uninstall torch --y
!pip install --upgrade torch==2.0.1 --index-url https://download.pytorch.org/whl/cu118
!pip install -q -U trl transformers==4.35.0 git+https://github.com/huggingface/peft.git
# !pip install -q -U trl git+https://github.com/huggingface/peft.git
!pip install bitsandbytes accelerate evaluate rouge_score
# !pip install -U flash-attn
# Access tokens must never be committed with the notebook. The token is read
# from the HF_TOKEN environment variable (e.g. populated from a notebook secret);
# any token that was previously hard-coded here should be revoked.
!huggingface-cli login --token $HF_TOKEN

Found existing installation: torch 2.0.0
Uninstalling torch-2.0.0:
  Successfully uninstalled torch-2.0.0
Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch==2.0.1
  Downloading https://download.pytorch.org/whl/cu118/torch-2.0.1%2Bcu118-cp310-cp310-linux_x86_64.whl (2267.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 GB[0m [31m487.6 kB/s[0m eta [36m0:00:00[0m
Collecting triton==2.0.0 (from torch==2.0.1)
  Downloading https://download.pytorch.org/whl/triton-2.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (63.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.3/63.3 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmake (from triton==2.0.0->torch==2.0.1)
  Downloading https://download.pytorch.org/whl/cmake-3.25.0-py2.py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m

In [2]:
%%writefile train.py

import torch
from torch import nn
import numpy as np
import pandas as pd
import json
from pathlib import Path
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    BloomForSequenceClassification,
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    BitsAndBytesConfig,
    DataCollatorWithPadding
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    PeftModel
)
from transformers import PreTrainedModel, TrainingArguments, Trainer
from transformers.modeling_outputs import SequenceClassifierOutput

if __name__ == '__main__':
    # Pick the compute device. The CUDA device index is only queried when a GPU
    # is actually present: torch.cuda.current_device() raises on CPU-only hosts,
    # which defeated the CPU fallback computed for `device`.
    if torch.cuda.is_available():
        device = 'cuda'
        print(torch.cuda.current_device())
    else:
        device = 'cpu'
        print('CUDA is not available; falling back to CPU.')

    # Hugging Face Hub id of the checkpoint loaded for the tokenizer and backbone.
    model_path = "bigscience/bloomz-7b1-mt"

    def get_context(row):
        """Build the flat context string for one row of the instructions table.

        Looks up the module-level ``contexts`` mapping by ``str(row['index'])``,
        joins the items of every entry with spaces, joins the entries with
        spaces, and guarantees that the result ends with a period.

        Args:
            row: Mapping-like record exposing an ``'index'`` key (a DataFrame row
                when used through ``DataFrame.apply(..., axis=1)``).

        Returns:
            The joined context text, always terminated by ``'.'``.

        Raises:
            KeyError: If ``contexts`` has no entry for the row's index.
        """
        passages = contexts[str(row['index'])]
        context = ' '.join(' '.join(passage) for passage in passages)
        # endswith() is safe on an empty string, whereas context[-1] raised
        # IndexError when an entry had no text at all.
        if not context.endswith('.'):
            context += '.'
        return context

    # Load the index -> passages lookup consumed by get_context(). The context
    # manager closes the file handle (the bare open() call leaked it) and the
    # explicit encoding keeps the non-ASCII text independent of the locale.
    with open('/kaggle/input/72hour/ft/context_dict.json', 'r', encoding='utf-8') as context_file:
        contexts = json.load(context_file)

    import datasets
    # NOTE(review): datasets.load_metric is deprecated in favour of the separate
    # `evaluate` package — consider migrating once that import is acceptable here.
    accuracy = datasets.load_metric('accuracy')

    def compute_metrics(eval_pred):
        """Compute classification accuracy for the Trainer evaluation loop.

        Args:
            eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds one
                row of class scores per example (or a tuple whose first element
                does); ``labels`` holds the reference class ids.

        Returns:
            A flat ``{'accuracy': value}`` mapping.
        """
        predictions, labels = eval_pred
        # Some models return extra outputs next to the logits; keep the logits only.
        if isinstance(predictions, tuple):
            predictions = predictions[0]
        predicted_ids = np.argmax(predictions, axis=1)
        scores = accuracy.compute(predictions=predicted_ids, references=labels)
        # The metric already returns {'accuracy': value}. Wrapping that whole dict
        # under another 'accuracy' key produced a nested dict, which cannot be
        # tracked as the scalar selected by metric_for_best_model='accuracy'.
        return {'accuracy': scores['accuracy']}

    import os
    from datetime import datetime
    import wandb

    # The W&B API key is a credential and must not live in source control: it is
    # read from the WANDB_API_KEY environment variable instead. Any key that was
    # previously hard-coded here should be revoked.
    wandb_api_key = os.environ.get("WANDB_API_KEY")
    if not wandb_api_key:
        raise RuntimeError(
            "WANDB_API_KEY is not set; export it before running train.py."
        )
    wandb.login(key=wandb_api_key, force=True)

    # Reset the `model` name and release cached GPU memory / garbage before the
    # model is built. `gc` is also used later by BloomClassifier.forward.
    import gc
    with torch.no_grad():
        model = None
        torch.cuda.empty_cache()
        gc.collect()

    class BloomClassifier(PreTrainedModel):
        """Binary classifier: a one-unit linear head on top of a language-model backbone.

        The vocabulary-sized logits the backbone emits at the last sequence
        position are projected to a single logit and trained with
        ``BCEWithLogitsLoss``.

        NOTE(review): ``__init__`` reads ``backbone.lm_head``, so the backbone
        must expose a language-model head — confirm before wiring this class to
        a model that has a different head.
        """

        def __init__(self, backbone):
            """Wrap ``backbone`` and create the classification head.

            Args:
                backbone: Model whose ``config`` initialises this wrapper and
                    whose ``lm_head.out_features`` sizes the linear head's input.
            """
            super().__init__(backbone.config)
            self.backbone = backbone

            self.in_features = self.backbone.lm_head.out_features
            self.linear = nn.Linear(in_features=self.in_features, out_features=1)

        def forward(self,
                input_ids=None,
                attention_mask=None,
                labels=None,
                return_dict=None,
                **kwargs):
            """Run the backbone, pool the last position and score it.

            Args:
                input_ids: Token ids, shaped ``(bs, seq)`` or ``(bs, 1, seq)``.
                attention_mask: Optional mask with the same layout as ``input_ids``.
                labels: Optional binary targets, one per example; any numeric dtype.
                return_dict: Accepted for interface compatibility; not used.
                **kwargs: Accepted for interface compatibility; not used.

            Returns:
                ``SequenceClassifierOutput`` with ``logits`` of shape ``(bs, 1)``
                and ``loss`` set only when ``labels`` is given.
            """
            # Drop the extra per-example axis only when it is really there;
            # an unconditional squeeze(1) would also collapse a genuine
            # length-1 sequence axis and fails on a missing attention mask.
            if input_ids is not None and input_ids.dim() == 3:
                input_ids = input_ids.squeeze(1)
            if attention_mask is not None and attention_mask.dim() == 3:
                attention_mask = attention_mask.squeeze(1)

            out = self.backbone(
                input_ids=input_ids,
                attention_mask=attention_mask).logits

            # Last-position logits for the whole batch in one slice, in fp32:
            # (bs, in_features). copy=True guarantees the result never aliases
            # the full logits tensor, so `del out` below can release it.
            pooled = out[:, -1, :].to(dtype=torch.float32, copy=True)
            del out; gc.collect(); torch.cuda.empty_cache()
            logits = self.linear(pooled) # (bs, 1)

            loss = None
            if labels is not None:
                loss_fn = nn.BCEWithLogitsLoss()
                # BCEWithLogitsLoss needs floating-point targets shaped like the
                # logits; integer labels (e.g. from a padding collator) would
                # otherwise raise a dtype error.
                targets = labels.to(device=logits.device, dtype=logits.dtype).reshape(-1, 1)
                loss = loss_fn(logits, targets)

            return SequenceClassifierOutput(loss=loss, logits=logits)

    # out = torch.rand(4, 2048, 250880)
    # pooled = [] 
    # for jjj in range(len(out)):
    #     att_idx = 2048-1
    #     pooled.append(out[jjj, att_idx, :].float().unsqueeze(0))
    # del out; gc.collect(); torch.cuda.empty_cache()
    # pooled = torch.cat(pooled)

    # linear = nn.Linear(in_features=backbone.lm_head.out_features, out_features=1)
    # logits = linear(pooled)
    # labels = torch.rand(4, dtype=float)
    # loss_fn = nn.BCEWithLogitsLoss()
    # loss_fn(logits, labels.unsqueeze(1))

    # --- Tokenizer: left padding, token id 0 used as the pad token. ---
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    tokenizer.padding_side = "left"
    tokenizer.pad_token_id = 0

    # --- LoRA adapter settings for a sequence-classification task. ---
    lora_config = LoraConfig(
        task_type="SEQ_CLS",
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
    )

    # --- Backbone: 4-bit fp4 weights with double quantisation, fp16 compute. ---
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='fp4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype='float16',
    )
    backbone = BloomForSequenceClassification.from_pretrained(
        model_path,
        quantization_config=quantization_config,
        torch_dtype=torch.float16,
        device_map='auto',
        problem_type='single_label_classification',
    )
    # Prepare the quantised model for training, then attach the LoRA adapters.
    backbone = get_peft_model(prepare_model_for_kbit_training(backbone), lora_config)
    # backbone.to(device)
    backbone.print_trainable_parameters()

    def preprocess(example):
        """Tokenise one example's prompt and carry its label through.

        Args:
            example: Record with ``'prompt'`` (text) and ``'labels'`` entries.

        Returns:
            The tokenizer output for the prompt (no special tokens added,
            truncated and padded to 2048 tokens) with ``'labels'`` set to the
            example's label.
        """
        tokenizer_options = dict(
            max_length=2048,
            padding="max_length",
            truncation=True,
            add_special_tokens=False,
            return_tensors=None,
        )
        encoded = tokenizer(example['prompt'], **tokenizer_options)
        encoded['labels'] = example['labels']
        return encoded

    # Load the graded question/answer records and attach context and target.
    instructions = pd.read_json('/kaggle/input/72hour/dev.json')
    instructions['context'] = instructions.apply(get_context, axis=1)
    # Binary target: 1 when the grade equals 'Đúng', otherwise 0.
    instructions['labels'] = instructions['grade'].eq('Đúng').astype(int)

    # Two fixed few-shot examples placed between the context and the real question.
    few_shot_examples = (
        "Câu hỏi: Bệnh nào khó chữa nhất?\n"
        "Câu trả lời đề xuất: Không có câu trả lời.\n"
        "Điểm:1\n\n"
        "Câu hỏi: Cách nào để phòng ngừa bệnh tim mạch?\n"
        "Câu trả lời đề xuất: Tập thể dục đều đặn.\n"
        "Điểm:1\n\n"
    )
    context_part = 'Bối cảnh: ' + instructions['context'] + '\n\n'
    question_part = 'Câu hỏi: ' + instructions['question'] + '\n'
    answer_part = 'Câu trả lời đề xuất: ' + instructions['answer'] + '.\n'
    instructions['prompt'] = (
        context_part + few_shot_examples + question_part + answer_part + 'Điểm:'
    )

    # 80/20 train/validation split with a fixed seed.
    train_df, valid_df = train_test_split(instructions, random_state=42, test_size=0.2)

    # Everything except the tokenizer output and 'labels' is dropped after mapping.
    remove_columns = ['index', 'question', 'answer', 'grade', 'context', 'prompt', '__index_level_0__',]
    train_dataset, valid_dataset = (
        Dataset.from_pandas(frame).map(preprocess, remove_columns=remove_columns)
        for frame in (train_df, valid_df)
    )
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")
    
    class MyDataset(torch.utils.data.Dataset):
        """Map-style dataset that tokenises one prompt per ``__getitem__`` call."""

        def __init__(self, df, tokenizer):
            """Store the prompt/label columns and the tokenizer.

            Args:
                df: Table with ``'prompt'`` and ``'labels'`` columns.
                tokenizer: Callable used by ``__getitem__`` to encode a prompt.
            """
            self.prompt = df['prompt'].values
            self.labels = df['labels'].values
            self.tokenizer = tokenizer

        def __getitem__(self, idx):
            """Return the encoded prompt at ``idx`` together with its float label.

            Args:
                idx: Position of the example.

            Returns:
                The tokenizer output (PyTorch tensors, no special tokens added,
                truncated and padded to 2048 tokens) with ``'labels'`` set to a
                float tensor holding the example's label.
            """
            # Use the tokenizer handed to the constructor. The previous code
            # called the module-level `tokenizer`, silently ignoring that argument.
            inputs = self.tokenizer(
                self.prompt[idx],
                return_tensors="pt",
                add_special_tokens=False,
                truncation=True,
                padding="max_length",
                max_length=2048
            )
            inputs['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
            return inputs

        def __len__(self):
            """Return the number of examples."""
            return len(self.labels)

    #train_dataset = MyDataset(train_df, tokenizer)
    #valid_dataset = MyDataset(valid_df, tokenizer)

    # Timestamped run name so every launch is distinguishable in the tracker.
    run_timestamp = datetime.now().strftime('%Y-%m-%d-%H-%M')

    training_args = TrainingArguments(
        # Output / bookkeeping.
        output_dir='/kaggle/working/',
        overwrite_output_dir=True,
        report_to="wandb",
        run_name=f"bloomz-{run_timestamp}",
        logging_steps=10,
        # Batching: effective train batch of 8 through gradient accumulation.
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=8,
        num_train_epochs=1,
        # Optimisation.
        optim="adamw_torch",
        learning_rate=2e-5,
        weight_decay=0.01,
        lr_scheduler_type='cosine',
        warmup_ratio=0.1,
        fp16=True,
        # Evaluate and checkpoint every 50 steps.
        evaluation_strategy='steps',
        eval_steps=50,
        save_strategy="steps",
        save_steps=50,
        load_best_model_at_end=False,
        metric_for_best_model='accuracy',
        # Keep every dataset column instead of filtering by the model signature.
        remove_unused_columns=False,
        # label_names=['labels'],
    )

    # Reset the `model` name and release cached GPU memory / garbage before training.
    import gc
    model = None
    with torch.no_grad():
        torch.cuda.empty_cache()
    gc.collect()

    # The PEFT-wrapped backbone is trained directly; the custom BloomClassifier
    # wrapper is not used here.
    trainer_kwargs = dict(
        model=backbone,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    trainer = Trainer(**trainer_kwargs)

    # Fine-tune, then write the resulting model to ./bloomz_binary.
    trainer.train()
    trainer.save_model('bloomz_binary')

Writing train.py


In [3]:
!python train.py

0
Downloading builder script: 3.19kB [00:00, 2.63MB/s]                            
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
Downloading tokenizer_config.json: 100%|███████| 199/199 [00:00<00:00, 1.17MB/s]
Downloading tokenizer.json: 100%|██████████| 14.5M/14.5M [00:01<00:00, 13.8MB/s]
Downloading (…)cial_tokens_map.json: 100%|████| 85.0/85.0 [00:00<00:00, 393kB/s]
Downloading config.json: 100%|█████████████████| 747/747 [00:00<00:00, 4.37MB/s]
Downloading pytorch_model.bin: 100%|███████| 14.1G/14.1G [05:57<00:00, 39.6MB/s]
Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloomz-7b1-mt and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
trainable params: 3,940,352 || all param