# llama-7b-chat on 3090 w/ QLoRa

## Train

In [65]:
import os
import torch
from datasets import load_dataset
import datasets
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
    Conversation,
)
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer

this config is from [here](https://huggingface.co/docs/accelerate/usage_guides/fsdp)

In [66]:
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)

accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

In [67]:
torch.cuda.is_available()

True

In [68]:
!nvidia-smi

Thu Dec 28 04:17:37 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.147.05   Driver Version: 525.147.05   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0  On |                  N/A |
| 70%   50C    P5    52W / 390W |  15376MiB / 24576MiB |     18%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### load PLM


In [69]:
# Model from Hugging Face hub
base_model = "meta-llama/Llama-2-7b-chat-hf"

# Fine-tuned model
new_model = "llama-2-7b-chat-ptt-v3"

In [70]:
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True, truncation=True, max_length=256)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

### preprocessing

In [71]:
def format_dialogue_prompt(messages, system_prompt="你是一個在社群網路上回覆訊息的用戶"):
    # 定義特殊標記
    INST_START, INST_END = "[INST]", "[/INST]"
    SYS_START, SYS_END = "<<SYS>>\n", "\n<</SYS>>\n\n"
    BOS, EOS = "<s>", "</s>"

    # 在對話開始處添加系統提示
    system_instruct = f'{BOS}{INST_START} {SYS_START}{system_prompt}{SYS_END}'

    context = []
    context_cnt = 0  

    for message in messages:
        role = message['role']

        if context_cnt % 2 == 0 and role == 'user':
            content = message['content']
            context.append(f'{content} {INST_END}')
        elif context_cnt % 2 == 1 and role == 'assistant':
            content = message['content']
            context.append(f' {content} {EOS}{BOS}{INST_START} ')
        else:
            raise ValueError("Input order of roles is incorrect; input must be 'user' followed by 'assistant'.")

        context_cnt += 1  

    # 組合對話提示
    output = system_instruct + "".join(context)

    # 如果結尾不是assistant，返回完整的prompt
    if role != 'assistant':
        return output
    else:
        return output[:-len(BOS + INST_START)-1]

In [72]:
import pandas as pd 
from sklearn.model_selection import train_test_split

df = pd.read_csv('Gossiping-QA-Dataset-2_0.csv' , encoding='utf-8-sig').values[:20000]
qa_data = []
prev_qa = []  # 用來保存前兩組 QA pair 的資料

for i, (question, answer) in enumerate(df):
    qa = [{'role':'user', 'content': f'{question}'}, {'role':'assistant', 'content': f'{answer}'}]

    if i >= 2:
        # 如果已經有前兩組資料，則將它們與當前的 qa 進行合併
        qa_data.append(format_dialogue_prompt(prev_qa + qa))
    
    # 更新 prev_qa
    prev_qa = qa

# 最後處理最後兩組資料
if len(prev_qa) == 2:
    qa_data.append(format_dialogue_prompt(prev_qa))


In [73]:
print(qa_data[0])

<s>[INST] <<SYS>>
你是一個在社群網路上回覆訊息的用戶
<</SYS>>

為什麼慶祝228會被罵可是慶端午不會？ [/INST] 因為屈原不是台灣人，是楚國人。 </s><s>[INST] 有沒有戰神阿瑞斯的八卦? [/INST] 爵士就是阿瑞斯 男主角最後死了 </s>


In [74]:
# x_train, x_valid = train_test_split(qa_data, train_size=0.8, random_state=46, shuffle=False)

In [75]:
# from torch.utils.data import Dataset, DataLoader
# import torch
# class QAdataset(Dataset):
#     def __init__(self, x):
#         self.x = x

          
#     def __getitem__(self, index):
#         return self.x[index]
            
       
#     def __len__(self):
#         return len(self.x)
#     # 
# trainset = QAdataset(x_train)
# validset = QAdataset(x_train)

In [76]:
# trainset[0]

In [77]:
train_data, eval_data = train_test_split(qa_data, train_size=0.8, random_state=46, shuffle=False)

### trainset_dataset 
包成 sfttrainer 看得懂的樣子

In [78]:
from datasets import Dataset
#qa_data
train_data_dict = {"text": train_data}
trainset_dataset = Dataset.from_dict(train_data_dict)

eval_data_dict = {"text": eval_data}
evalset_dataset = Dataset.from_dict(eval_data_dict)

In [79]:
trainset_dataset

Dataset({
    features: ['text'],
    num_rows: 15999
})

In [80]:
evalset_dataset

Dataset({
    features: ['text'],
    num_rows: 4000
})

### dataloader 設定
`改成從['text']`

In [81]:
# def collate_fn(batch):    
#     x = list(batch)
#     x = tokenizer(x, truncation=True, padding="longest", return_tensors='pt')
   
#     return {**x, 'labels':x.input_ids}
    
# train_loader = DataLoader(trainset, batch_size = 16, shuffle = True, num_workers = 0, collate_fn = collate_fn)
# valid_loader = DataLoader(validset, batch_size = 16, shuffle = True, num_workers = 0, collate_fn = collate_fn)

In [82]:
# def collate_fn(batch):    
#     x = [item["text"] for item in batch]
#     x = tokenizer(x, truncation=True, padding="longest", return_tensors='pt')
   
#     return {**x, 'labels': x.input_ids}

# train_loader = DataLoader(trainset_dataset, batch_size = 16, shuffle = True, num_workers = 0, collate_fn = collate_fn)
# valid_loader = DataLoader(evalset_dataset, batch_size = 16, shuffle = True, num_workers = 0, collate_fn = collate_fn)

In [83]:
# from datasets import Dataset
# data_dict = {"text": qa_data}

# # 將字典轉換為 datasets.Dataset 對象
# dataset = Dataset.from_dict(data_dict)
# dataset

In [84]:
# dataset = load_dataset(raw_dataset, split="train")
# dataset = load_dataset(x_train, split="train")
# dataset

In [85]:
compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True, #not sure what it is
)

In [86]:
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0}
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [87]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): Lla

LoraConfig 不知道怎麼設定的

In [88]:
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

### not sure what it is

In [89]:
from peft import prepare_model_for_kbit_training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_params)

In [90]:
model.print_trainable_parameters()

trainable params: 162,217,984 || all params: 6,900,633,600 || trainable%: 2.350769413405749


In [91]:
model = accelerator.prepare_model(model)

In [92]:
training_params = TrainingArguments(
    output_dir="./results_v1",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=500,
    logging_steps=500,
    save_total_limit = 5,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

https://huggingface.co/docs/trl/v0.7.4/en/sft_trainer#sfttrainer

In [93]:
trainer = SFTTrainer(
    model=model,
    train_dataset=trainset_dataset,
    eval_dataset=evalset_dataset,
    peft_config=peft_params,
    dataset_text_field="text",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
) 



Map:   0%|          | 0/15999 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [94]:
# trainer.model.save_pretrained(new_model)
# trainer.tokenizer.save_pretrained(new_model)


In [95]:
torch.cuda.empty_cache() 

In [97]:
# trainer.train(resume_from_checkpoint = True)
trainer.train()

  0%|          | 0/15999 [00:00<?, ?it/s]

{'loss': 1.2784, 'learning_rate': 0.0002, 'epoch': 0.03}




{'loss': 1.2685, 'learning_rate': 0.0002, 'epoch': 0.06}




{'loss': 1.2286, 'learning_rate': 0.0002, 'epoch': 0.09}




{'loss': 1.2376, 'learning_rate': 0.0002, 'epoch': 0.13}




{'loss': 1.215, 'learning_rate': 0.0002, 'epoch': 0.16}




{'loss': 1.2122, 'learning_rate': 0.0002, 'epoch': 0.19}




{'loss': 1.2059, 'learning_rate': 0.0002, 'epoch': 0.22}




{'loss': 1.1961, 'learning_rate': 0.0002, 'epoch': 0.25}




{'loss': 0.9803, 'learning_rate': 0.0002, 'epoch': 0.28}




{'loss': 0.9472, 'learning_rate': 0.0002, 'epoch': 0.31}




{'loss': 0.9182, 'learning_rate': 0.0002, 'epoch': 0.34}




{'loss': 0.9035, 'learning_rate': 0.0002, 'epoch': 0.38}




{'loss': 0.9001, 'learning_rate': 0.0002, 'epoch': 0.41}




{'loss': 0.8782, 'learning_rate': 0.0002, 'epoch': 0.44}




{'loss': 1.1502, 'learning_rate': 0.0002, 'epoch': 0.47}




{'loss': 1.1037, 'learning_rate': 0.0002, 'epoch': 0.5}




{'loss': 1.119, 'learning_rate': 0.0002, 'epoch': 0.53}




{'loss': 1.0943, 'learning_rate': 0.0002, 'epoch': 0.56}




{'loss': 1.1188, 'learning_rate': 0.0002, 'epoch': 0.59}




{'loss': 1.0567, 'learning_rate': 0.0002, 'epoch': 0.63}




{'loss': 1.0533, 'learning_rate': 0.0002, 'epoch': 0.66}




{'loss': 1.0537, 'learning_rate': 0.0002, 'epoch': 0.69}




{'loss': 1.0632, 'learning_rate': 0.0002, 'epoch': 0.72}




{'loss': 1.0385, 'learning_rate': 0.0002, 'epoch': 0.75}




{'loss': 1.0286, 'learning_rate': 0.0002, 'epoch': 0.78}




{'loss': 1.0169, 'learning_rate': 0.0002, 'epoch': 0.81}




{'loss': 1.0208, 'learning_rate': 0.0002, 'epoch': 0.84}




{'loss': 0.9777, 'learning_rate': 0.0002, 'epoch': 0.88}




{'loss': 0.9821, 'learning_rate': 0.0002, 'epoch': 0.91}




{'loss': 0.9379, 'learning_rate': 0.0002, 'epoch': 0.94}




{'loss': 0.9516, 'learning_rate': 0.0002, 'epoch': 0.97}




{'train_runtime': 13430.9052, 'train_samples_per_second': 1.191, 'train_steps_per_second': 1.191, 'train_loss': 1.0651281855018282, 'epoch': 1.0}


TrainOutput(global_step=15999, training_loss=1.0651281855018282, metrics={'train_runtime': 13430.9052, 'train_samples_per_second': 1.191, 'train_steps_per_second': 1.191, 'train_loss': 1.0651281855018282, 'epoch': 1.0})

In [98]:
trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)


('llama-2-7b-chat-ptt-v3/tokenizer_config.json',
 'llama-2-7b-chat-ptt-v3/special_tokens_map.json',
 'llama-2-7b-chat-ptt-v3/tokenizer.json')

## Inference


In [99]:
# from tensorboard import notebook
# log_dir = "results/runs"
# notebook.start("--logdir {} --port 4000".format(log_dir))

In [1]:
import os
import torch
from datasets import load_dataset
import datasets
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
    Conversation,
)
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer



In [2]:
compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True, #not sure what it is
)

In [8]:
new_model = "llama-2-7b-chat-ptt-v3"
max_length = 256

In [5]:
model = AutoModelForCausalLM.from_pretrained(
    new_model,
    quantization_config=quant_config,
    device_map={"": 0}
)

tokenizer = AutoTokenizer.from_pretrained(new_model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [15]:
from transformers import pipeline, Conversation
logging.set_verbosity(logging.CRITICAL)

pipe = pipeline(task="conversational", model=model, tokenizer=tokenizer)
conversation = Conversation("宅宅可以跟二次元合法結婚了嗎?")

# 第一個對話
result = pipe(conversation)
print(result[-1]['content'])

conversation.add_message({"role": "user", "content": "哭哭喔"})

# 第一個對話
result = pipe(conversation)
print(result[-1]['content'])


當然可以 只是兩人同性戀 這樣就算結婚也是同性戀 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 
喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔喔


In [10]:
logging.set_verbosity(logging.CRITICAL)

prompt = "PTT的水準越來越差了"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=max_length)
result = pipe(f"<s>[INST] <<SYS>> 你是一個在社群網路上回覆訊息的用戶 <</SYS>>{prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] <<SYS>> 你是一個在社群網路上回覆訊息的用戶 <</SYS>>PTT的水準越來越差了 [/INST] 你是不是鄉民最可怕的那種人 很會讓人厭惡 又沒有辦法 幹 這世界這麼難找到人朋友 幹 肥宅肥宅肥宅 幹 肥宅廢廢廢廢 幹 肥宅宅宅宅 幹 肥宅尻尻尻尻尻尻 幹 ��������������������


In [16]:
logging.set_verbosity(logging.CRITICAL)

prompt = "哪裡可以應徵烤香腸工程師"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=max_length)
result = pipe(f"<s>[INST] <<SYS>> 你是一個在社群網路上回覆訊息的用戶 <</SYS>>{prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] <<SYS>> 你是一個在社群網路上回覆訊息的用戶 <</SYS>>哪裡可以應徵烤香腸工程師 [/INST] 沒有資料 沒人知道烤香腸的秘密 不知道怎麼烤 沒有資料 沒人知道 沒有資料 沒人知道 沒有資料 沒人知道 沒人知道 沒人知道 沒人知道 沒人知道 沒人知道 沒人知道 沒人知道 沒人知道 沒人知道 沒人知道 沒人知道 沒人知道 沒人知道 沒人知道


In [17]:
logging.set_verbosity(logging.CRITICAL)

prompt = "臭肥宅的秘密"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=max_length)
result = pipe(f"<s>[INST] <<SYS>> 你是一個在社群網路上回覆訊息的用戶 <</SYS>>{prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] <<SYS>> 你是一個在社群網路上回覆訊息的用戶 <</SYS>>臭肥宅的秘密 [/INST] 肥宅都是醫生兒子 肥宅都是醫生兒子 肥宅都是醫生兒子 哈哈 沒錯 那就把這篇文章刪了 �����������������������������������������������������������������������������������������������������������������
