In [1]:
import warnings

import flwr as fl
import torch
import torch.nn as nn
import os
import numpy as np

import evaluate
from evaluate import load as load_metric

import datasets
from datasets import load_dataset
from utils.prompter import Prompter
from utils.data_utils import tokenize

from peft import (
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    LoraConfig,
)

import logging
logging.basicConfig(level=logging.INFO, handlers=[logging.StreamHandler()])

import transformers
from transformers import AutoModelForSequenceClassification, AdamW, get_scheduler
from transformers import AutoModelForCausalLM, AutoTokenizer
from arguments import parse_args
from data_loader import load_data

datasets.utils.logging.set_verbosity_error()

transformers.logging.set_verbosity_error()
warnings.filterwarnings("ignore", category=UserWarning)

os.environ['WANDB_DISABLED'] = 'true'
os.environ["TOKENIZERS_PARALLELISM"] = "false"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
net = AutoModelForCausalLM.from_pretrained(
        "google-bert/bert-base-cased",
        trust_remote_code=True
    )

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
tokenizer.pad_token_id = (
    0
)
tokenizer.padding_side = "left"
prompter = Prompter()

print(tokenizer)

BertTokenizerFast(name_or_path='google-bert/bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


In [3]:
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    inference_mode=False, 
    target_modules=["query", "key", "value"],
    # target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "down_proj", "up_proj"],
    r=16, 
    lora_alpha=16, 
    lora_dropout=0.1)

In [4]:
net = get_peft_model(net, peft_config)
net.print_trainable_parameters()

trainable params: 884,736 || all params: 109,225,540 || trainable%: 0.8100


In [6]:
import json

In [7]:
with open('./pubmed/data/training_0.json') as f:
    data = json.load(f)

# Create lists for instructions, inputs, and outputs
instructions = [item['instruction'] for item in data]
inputs = [' '.join(item['input']) for item in data]  # Combining input fields into one string
outputs = [item['output'] for item in data]

# Create a dataset
dataset_dict = {
    'instruction': instructions,
    'input': inputs,
    'output': outputs  # Rename to 'labels' for the target variable
}
dataset = Dataset.from_dict(dataset_dict)

NameError: name 'Dataset' is not defined

: 

In [83]:
def tokenize_function(example):
    return tokenizer(example["instruction"], example["input"], example["output"], truncation=True, padding="max_length")

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Prepare data for Trainer
train_dataset = tokenized_dataset.remove_columns(['instruction', 'input', "output"])
train_dataset.set_format("torch")

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map: 100%|██████████| 450/450 [00:00<00:00, 780.95 examples/s]


In [84]:
train_dataset[1]

{'input_ids': tensor([    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,   101,  2825,
         23971,   140,  3843,  2703,  2918,  2489,  9318,  1107,  4420,  1114,
          6703, 22869,  1116,   136,   

In [86]:
with open('./pubmed/data/eval_0.json') as f:
    data = json.load(f)

# Create lists for instructions, inputs, and outputs
instructions = [item['instruction'] for item in data]
inputs = [' '.join(item['input']) for item in data]  # Combining input fields into one string
outputs = [item['output'] for item in data]

# Create a dataset
dataset_dict = {
    'instruction': instructions,
    'input': inputs,
    'output': outputs  # Rename to 'labels' for the target variable
}
eval_dataset = Dataset.from_dict(dataset_dict)

tokenized_dataset = eval_dataset.map(tokenize_function, batched=True)

# Prepare data for Trainer
eval_dataset = tokenized_dataset.remove_columns(['instruction', 'input', "output"])
eval_dataset.set_format("torch")

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map: 100%|██████████| 50/50 [00:00<00:00, 658.62 examples/s]


In [87]:
eval_dataset[0]

{'input_ids': tensor([    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,   101,  2181,   172, 25669, 26218,
         11745,  1179, 13280, 13601, 13

In [66]:
train_path = os.path.join("./pubmed/data", train_data_name)
eval_path = os.path.join("./pubmed/data", eval_data_name)
test_path = os.path.join("./pubmed/data", test_data_name)

train_data = load_dataset("json", data_files=train_path)
eval_data = load_dataset("json", data_files=eval_path)
test_data = load_dataset("json", data_files=test_path)

In [67]:
train_data

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'ID'],
        num_rows: 450
    })
})

In [54]:
train_path = os.path.join("./pubmed/data", train_data_name)
eval_path = os.path.join("./pubmed/data", eval_data_name)
test_path = os.path.join("./pubmed/data", test_data_name)

train_data = load_dataset("json", data_files=train_path)
eval_data = load_dataset("json", data_files=eval_path)
test_data = load_dataset("json", data_files=test_path)

train_data = train_data["train"].shuffle().map(generate_and_tokenize_prompt)
eval_data = eval_data["train"].shuffle().map(generate_and_tokenize_prompt)
test_data = test_data["train"].shuffle().map(generate_and_tokenize_prompt)

train_data.set_format("torch")
eval_data.set_format("torch")
test_data.set_format("torch")


print("train data is")
print(train_data)
print(len(train_data))

Map: 100%|██████████| 450/450 [00:00<00:00, 615.16 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 597.41 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 430.17 examples/s]

train data is
Dataset({
    features: ['instruction', 'input', 'output', 'ID', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 450
})
450





In [88]:
def build_local_trainer(net,
                        local_train_dataset,
                        local_eval_dataset,
                        optim,
                        tokenizer,
                        local_micro_batch_size,
                        gradient_accumulation_steps,
                        local_num_epochs,
                        local_learning_rate,
                        group_by_length,
                        warmup=0,
                        density=None,
                        lambd=None,
                        reg=None):

    class reg_Trainer(transformers.Trainer):
        def compute_loss(net, inputs, return_outputs=False):
            outputs = net(**inputs)
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
            regularizer = 0
            count = 0
            loss += lambd * regularizer
            return (loss, outputs) if return_outputs else loss

    def compute_metrics(pred):
        pred_ids, labels_ids = pred
        labels_ids = np.where(labels_ids != -100, labels_ids, tokenizer.pad_token_id)
        pred_ids = np.where(pred_ids != -100, pred_ids, tokenizer.pad_token_id)
        pred_ids = np.argmax(pred_ids, axis=-1)
        pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
        rouge = evaluate.load('./evaluate/metrics/rouge/rouge.py')
        rouge_output = rouge.compute(predictions=pred_str, references=label_str, use_aggregator=True)
        return {
            'rouge1': round(rouge_output["rouge1"], 4),
            'rouge2': round(rouge_output["rouge2"], 4),
            'rougeL': round(rouge_output["rougeL"], 4),
            'rougeLsum': round(rouge_output["rougeLsum"], 4),
        }
    
    train_args = transformers.TrainingArguments(
        per_device_train_batch_size=local_micro_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=warmup,
        num_train_epochs=local_num_epochs,
        learning_rate=local_learning_rate,
        do_train=True,
        do_eval=True,
        logging_steps=1,
        optim=optim,
        evaluation_strategy="epoch",
        save_strategy="no",
        output_dir="./outputs",
        ddp_find_unused_parameters=False,
        group_by_length=False,
        dataloader_drop_last=False,
        report_to="none"
    )
    local_trainer = transformers.Trainer(model=net,
                                         train_dataset=local_train_dataset,
                                         eval_dataset=local_eval_dataset,
                                         args=train_args,
                                         data_collator=transformers.DataCollatorForSeq2Seq(
                                             tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
                                         ),
                                         compute_metrics=compute_metrics,
                                        )
    return local_trainer

In [89]:
local_trainer=build_local_trainer(net=net,
                                    local_train_dataset=train_dataset,
                                    local_eval_dataset=eval_dataset,
                                    optim="adamw_torch",
                                    tokenizer=tokenizer,
                                    local_micro_batch_size=4,
                                    gradient_accumulation_steps=32//4,
                                    local_num_epochs=10,
                                    local_learning_rate=1e-5,
                                    group_by_length=False,
                                    warmup=0,
                                    )



In [90]:
result = local_trainer.train()

print(f"trained on {len(train_data)} number of dataset")
print(local_trainer.state.log_history[-2])
print(local_trainer.state.log_history[-1])
print(result.metrics)

{'loss': 24.1684, 'grad_norm': 5.404697418212891, 'learning_rate': 9e-06, 'epoch': 0.5333333333333333}
{'loss': 24.3437, 'grad_norm': 5.508152008056641, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.0666666666666667}
{'loss': 24.2475, 'grad_norm': 5.479374408721924, 'learning_rate': 7e-06, 'epoch': 1.6}
{'loss': 24.1153, 'grad_norm': 5.488589763641357, 'learning_rate': 6e-06, 'epoch': 2.1333333333333333}
{'loss': 24.2509, 'grad_norm': 5.552163124084473, 'learning_rate': 5e-06, 'epoch': 2.6666666666666665}
{'loss': 24.1805, 'grad_norm': 5.482497215270996, 'learning_rate': 4.000000000000001e-06, 'epoch': 3.2}
{'loss': 24.1211, 'grad_norm': 5.541659832000732, 'learning_rate': 3e-06, 'epoch': 3.7333333333333334}
{'loss': 24.0958, 'grad_norm': 5.609340190887451, 'learning_rate': 2.0000000000000003e-06, 'epoch': 4.266666666666667}
{'loss': 24.1912, 'grad_norm': 5.588045120239258, 'learning_rate': 1.0000000000000002e-06, 'epoch': 4.8}
{'loss': 23.909, 'grad_norm': 5.494493007659912, 'lea

In [91]:
def test(net, tokenizer, test_data, epoch, local_micro_batch_size):
    test_args = transformers.TrainingArguments(
        output_dir="./outputs",
        do_train=False,
        do_eval=True,
        # fp16=True,
        per_device_eval_batch_size=local_micro_batch_size,
        dataloader_drop_last=False,
        eval_accumulation_steps=4,
    )

    def compute_metrics(pred):
        pred_ids, labels_ids = pred
        labels_ids = np.where(labels_ids != -100, labels_ids, tokenizer.pad_token_id)
        pred_ids = np.where(pred_ids != -100, pred_ids, tokenizer.pad_token_id)
        pred_ids = np.argmax(pred_ids, axis=-1)
        pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
        from pprint import pprint
        print(pred_str[0])
        print(label_str[0])
        rouge = evaluate.load('./evaluate/metrics/rouge/rouge.py')
        rouge_output = rouge.compute(predictions=pred_str, references=label_str, use_aggregator=True)
        return {
            'rouge1': round(rouge_output["rouge1"], 4),
            'rouge2': round(rouge_output["rouge2"], 4),
            'rougeL': round(rouge_output["rougeL"], 4),
            'rougeLsum': round(rouge_output["rougeLsum"], 4),
        }

    # init trainer
    tester = transformers.Trainer(
        model=net,
        args=test_args,
        data_collator=transformers.DataCollatorForSeq2Seq(
            tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
        ),
        compute_metrics=compute_metrics
    )
    # test_dataset = self.test_data["train"].shuffle().map(self.generate_and_tokenize_prompt)
    # test_dataset = self.local_test_dataset
    eval_dataset = test_data
    # test_results = tester.evaluate(test_dataset)
    eval_results = tester.evaluate(eval_dataset)
    # logging.info('For client ' + str( self.client_id) + ', the test result is:')
    # logging.info(test_results)
    print('For client ' + str(0) + ', the eval result is:')
    print(eval_results)
    return eval_results

In [92]:
net.eval()

eval_results = test(net, tokenizer, eval_dataset, 1, 4)
print(eval_results)

INFO:absl:Using default tokenizer.


. and The The Micro micro The The Testing The Testing o o the....taining inin. Is Select Is for and, or the disease for The micro,,, c Cy Testing micro card for a am of disease disease both cancer both the o., 7 and This micro hem The meta p testing in The both both o endopha, for micro p. is The is For See.. This For o.. C What Is How What -. : :....... anti How c isost.... :. Is cytokeratin immunoreactivity useful in the diagnosis of short - segment Barrett's oesophagus in Korea?. Cytokeratin 7 / 20 staining has been reported to be helpful in diagnosing Barrett's oesophagus and gastric intestinal metaplasia. However, this is still a matter of some controversy. To determine the diagnostic usefulness of cytokeratin 7 / 20 immunostaining for short - segment Barrett's oesophagus in Korea. In patients with Barrett's oesophagus, diagnosed endoscopically, at least two biopsy specimens were taken from just below the squamocolumnar junction. If goblet cells were found histologically with alci

: 

In [None]:
from pprint import pprint
print(train_data)

Dataset({
    features: ['instruction', 'input', 'output', 'ID', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 450
})
