## 라이브러리 로드

In [1]:
## 시스템 관련 라이브러리 로드 

import os
import gc
import sys

## 데이터 관련 라이브러리 로드 

import pandas as pd 
import numpy as np
import re
from tqdm import tqdm

from datasets import Dataset, DatasetDict, load_dataset, concatenate_datasets


## LLM, 딥러닝  관련 라이브러리 로드 

import torch 

from transformers import AutoTokenizer #토크나이저 
from transformers import LlamaForCausalLM,  AutoModelForCausalLM
 # LLM 모델 
from transformers import BitsAndBytesConfig # 양자화 라이브러리 
from transformers import GenerationConfig
from transformers import DataCollatorForLanguageModeling

from peft import PeftModel
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training # 효율적 학습을 위한 라이브러리 , LORA 관련 라이브러리 
from transformers import Trainer, TrainingArguments # 학습 관련된 모델 





### Configure 설정

In [2]:
## 메모리 정리 

torch.cuda.empty_cache()
gc.collect()

0

In [3]:
## 사용한 Base model 


base_model = 'beomi/OPEN-SOLAR-KO-10.7B'


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device



device(type='cuda')

In [4]:
!nvidia-smi

Thu Feb  1 16:40:41 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.36                 Driver Version: 546.33       CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:01:00.0  On |                  Off |
|  0%   38C    P8              10W / 450W |   1212MiB / 24564MiB |     14%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
## 4bit quantaziation 

bnb_4bit_compute_dtype = "bfloat16"
use_4bit = True

In [None]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
compute_dtype

In [None]:
# Check GPU compatibility with bfloat16
if compute_dtype == torch.bfloat16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

### 토크나이저 로드

In [12]:
tokenizer = AutoTokenizer.from_pretrained(base_model, padding_side = 'right')


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
tokenizer

### 학습 데이터 로드 & 학습용 데이터로 만들기

In [None]:
df = pd.read_excel('data/ver0_2_1dataset.xlsx')
data1 = df.iloc[0:90, :].copy()

In [None]:
data1

In [None]:
##에러 값 처리 하나가 nan으로 되어있었음. 

hi = """풀이과정 : 천천히 단계별로 풀어볼게요 

[1단계] 식을 단순화 해볼게요.  
이차방정식 $2x^2 + 6x + 11$ 를 $a(x - h)^2 + k$ 를 완성시키기 위해서 우리는 x에 대해서 완전 제곱식의 형태로 나타내야 해요 . 

먼저 x항에 대해서 묶으면  $2(x^2 + 3x) + 11$ 다음과 같이 정리할 수 있어요 

[2 단계] 완전 제곱식 만들기 

완전 제곱식을 만들기 위해서는 우리는 x 계수의 절반값을 제곱해야 해요. 
식으로 나타내서 풀면  $\frac{3}{2}\right)^2 = \frac{9}{4}$ 이렇게 풀 수 있어요.  

해당 식을 소괄호 안에는 더해주고, 같은 값을 유지하기 위해 소괄호 밖에서는 빼준다면  $2(x^2 + 3x + \frac{9}{4} - \frac{9}{4}) + 11$ 처럼 표현할 수 있겠죠 

식을 단순화하면 $2(x + \frac{3}{2})^2 - \frac{9}{2} + 11$ 가 될거에요. 

[3 단계] 비교하여 계수 구하기 

$a(x - h)^2 + k$ 와 비교하면 h의 값을 구할 수 있습니다. 

### 정답 : $-\frac{3}{2}$

"""

In [None]:
data1['response'][54] = hi 

In [None]:
data1['response'][54]

In [None]:
prompt_for_data1 = """ 당신은 친절한 수학 선생님입니다. 절대 비난하거나, 조급해하지 않고, 학생에 질문에 깊게 생각하고 대답합니다. 
아래 #문제와 #풀이는 옳은 한쌍입니다. 다음 문제와 풀이를 보고 #정답을 맞춰보세요. 

###문제
%s

###%s

"""

In [None]:
for idx, tmp in enumerate(data1['response']):
    try:
        tmp.lstrip('\n')
    except: 
        print(tmp)
        print(idx)

In [None]:
prompt_input = []


for query, response in zip(data1['query'], data1['response']):
    prompt_input.append(prompt_for_data1%(query, response.lstrip('\n')))
    
    



len(prompt_input), len(data1)


In [None]:
data1['input_prompt'] = prompt_input

In [None]:
## 해당 데이터는 queation answer을 통해 한국 수학 용어에 대한 이해를 높일 생각이다. 

df1 = pd.read_excel('data/math_norm.xlsx')
data2 = df1.copy()

data2

In [None]:
prompt_for_data2 = """ 당신은 한국에서 수학을 배우고 있는 학생입니다. 절대 비난하거나, 조급해하지 않고, 질문에 깊게 생각하고 대답합니다. 
아래 #설명에 대하여 #답변은 옳은 한쌍입니다. 다음 #설명을 보고 #답변을 맞춰보세요. 

###설명
%s

###정답 : %s

"""

In [None]:
for idx, tmp in enumerate(data2['content']):
    try:
        tmp.lstrip('\n')
    except: 
        print(tmp)
        print(idx)

In [None]:
data2.dropna(inplace = True, ignore_index = True)

In [None]:
prompt2_input = []


for query, response in zip(data2['title'], data2['content']):
    prompt2_input.append(prompt_for_data1%(query, response.lstrip('\n')))
    
    



len(prompt2_input), len(data2)


In [None]:
data2['input_prompt'] = prompt2_input

In [None]:
# 두 df를 합치기 위해 col name 맞춰준다. 

data2.columns = data1.columns

In [None]:
final_data = pd.concat([data1, data2],axis =0 , ignore_index =True)

In [None]:
len(final_data)

#### hugging face의 dataset class로 데이터를 묶기

In [None]:
## huggingface 의 dataset class로 묶어서 합쳐야한다. 

In [None]:
dataset = Dataset.from_pandas(final_data)
dataset.shuffle()

In [None]:
dataset = dataset.train_test_split(test_size = 0.1, seed = 42 )

In [None]:
dataset

In [None]:
tokenizer.pad_token = tokenizer.eos_token

In [None]:
tokenizer.pad_token

In [None]:
tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.vocab_size


In [None]:
torch.cuda.empty_cache()
gc.collect()

In [None]:
def tokenize(element):
    
    outputs = tokenizer(
        element['input_prompt'],
        truncation=True,
        max_length=2048
    )

    return {"input_ids": outputs["input_ids"]}


In [None]:
tokenized_datasets = dataset['train'].map(
    tokenize, batched=True, remove_columns=dataset['train'].column_names
)

In [None]:
val_tokenized_datasets = dataset['test'].map(
    tokenize, batched=True, remove_columns=dataset['test'].column_names
)

#### 외부 데이터 가지고 와서 합치기

In [None]:
dataset2 = load_dataset('traintogpb/aihub-koen-translation-integrated-tiny-100k')

dataset2

In [None]:
dataset2_prompt = """ 당신은 영어에 매우 능통한 한국인입니다. 아래의 영어문장을 보고 한국어 문장으로 번역해 보세요. 


### 영어문장 : %s 

### 한국어문장 : %s 

"""

In [None]:

def gen_prompt(element):
    return DatasetDict({'input': dataset2_prompt%(element['en'], element['ko'])})


dataset2['train'] = dataset2['train'].map(gen_prompt)

In [None]:
dataset2['validation'] = dataset2['validation'].map(gen_prompt)
dataset2['test'] = dataset2['test'].map(gen_prompt)


In [None]:
def tokenize(element):
    
    outputs = tokenizer(
        element['input'],
        truncation=True,
        max_length=2048
    )

    return {"input_ids": outputs["input_ids"]}


In [None]:
dataset2['train'] = dataset2['train'].map(
    tokenize, batched=True, remove_columns=dataset2['train'].column_names
)

In [None]:
dataset2['validation'] = dataset2['validation'].map(
    tokenize, batched=True, remove_columns=dataset2['validation'].column_names
)


dataset2['test'] = dataset2['test'].map(
    tokenize, batched=True, remove_columns=dataset2['test'].column_names
)

In [None]:
dataset2

In [None]:
dataset3 = load_dataset('kyujinpy/KOR-gugugu-platypus-set')

In [None]:
dataset3['train'][0]

In [None]:
dataset3_prompt = """ ### %s

### %s 

### %s 

"""

In [None]:

def gen_prompt(element):
    return DatasetDict({'tmp_promt': dataset3_prompt%(element['input'], element['instruction'], element['output'])})


dataset3['train'] = dataset3['train'].map(gen_prompt)

In [None]:
dataset3

In [None]:
def tokenize(element):
    
    outputs = tokenizer(
        element['tmp_promt'],
        truncation=True,
        max_length=2048
    )

    return {"input_ids": outputs["input_ids"]}


In [None]:
dataset3['train'] = dataset3['train'].map(
    tokenize, batched=True, remove_columns=dataset3['train'].column_names
)

In [None]:
final_dataset = concatenate_datasets([tokenized_datasets, val_tokenized_datasets, dataset2['train'], dataset3['train']])

In [None]:
final_dataset

In [None]:
final_dataset = final_dataset.train_test_split(test_size = 0.2, shuffle =True)

In [None]:
final_dataset

### LLM 로드

In [None]:
tokenizer = AutoTokenizer.from_pretrained(base_model)

model = LlamaForCausalLM.from_pretrained(base_model, quantization_config=bnb_config, device_map = 'auto')


### PEFT 학습

In [None]:
list(TaskType)

In [None]:
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)  # Explicitly specify!


In [None]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])


    # lm_head is often excluded.
    if 'lm_head' in lora_module_names:  # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)


modules = find_all_linear_names(model)
modules

In [None]:
peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM,
                        inference_mode=False, # 학습하는지  
                        r=16, # 작을 수록 trainable 한 파라미터의 개수가 낮아진ㄷ.ㅏ  
                        lora_alpha=16,  # scaling factor 
                        lora_dropout=0.1) # dropout 

model = get_peft_model(model, peft_config)


In [None]:
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")


In [None]:
def get_max_length(model):
    conf = model.config
    max_length = None
    for length_setting in ["n_positions", "max_position_embeddings", "seq_length"]:
        max_length = getattr(model.config, length_setting, None)
        if max_length:
            print(f"Found max length: {max_length}")
            break
    if not max_length:
        max_length = 1024
        print(f"Using default max length: {max_length}")
    return max_length


# Change the max length depending on hardware constraints.
max_length = get_max_length(model)
print(max_length)

In [None]:
tokenizer.pad_token = tokenizer.eos_token

In [None]:

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)


In [None]:
model.config.use_cache = False


In [None]:

args = TrainingArguments(
    output_dir="llamata_ver_1",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="steps",
    eval_steps=3000,
    logging_steps=1000,
    gradient_accumulation_steps=4,
    num_train_epochs=10,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    fp16=True,
    push_to_hub=False,
    optim = "adamw_torch",
    save_strategy = "steps",
    save_steps = 300,
    save_total_limit=2

)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=final_dataset['train'],
    eval_dataset=final_dataset['test']
)

In [None]:
### VITRUV LLM VER 1  학습 시작. 

In [None]:
trainer.train()

## 저장 

In [7]:
#base_model 

model = AutoModelForCausalLM.from_pretrained(base_model, 
                                            return_dict = True, 
                                            torch_dtype = torch.float16,
                                            device_map = device
                                            )


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

TypeError: module name should be a string. Got device

In [None]:
tokenizer = 

In [10]:


model = PeftModel.from_pretrained(model, r'llamata_ver_1/checkpoint-36600')

model = model.merge_and_unload()

In [13]:
final_save_folder = './vitruc_final'


model.save_pretrained(final_save_folder)
tokenizer.save_pretrained(final_save_folder)

('./vitruc_final/tokenizer_config.json',
 './vitruc_final/special_tokens_map.json',
 './vitruc_final/tokenizer.json')

In [15]:
!huggingface-cli login --token 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

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/vitruv/.cache/huggingface/token
Login successful


In [16]:
model.push_to_hub('vitruv/vitruv_1', token = True)
tokenizer.push_to_hub('vitruv/vitruv_1', token = True)

model-00004-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Upload 5 LFS files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/vitruv/vitruv_1/commit/d66ad8043b9c01530149b6d93cf3e278552fa442', commit_message='Upload tokenizer', commit_description='', oid='d66ad8043b9c01530149b6d93cf3e278552fa442', pr_url=None, pr_revision=None, pr_num=None)

In [5]:
from transformers import AutoConfig, AutoModel, AutoTokenizer



config = AutoConfig.from_pretrained("vitruv/vitruv_1")
model = AutoModel.from_pretrained("vitruv/vitruv_1")
tokenizer = AutoTokenizer.from_pretrained("vitruv/vitruv_1")

config.json:   0%|          | 0.00/705 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/87.2k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.59M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
