In [1]:
!nvidia-smi

Mon Mar 11 10:51:17 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        Off | 00000000:19:00.0 Off |                  Off |
|  0%   29C    P8              32W / 450W |   1744MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090        Off | 00000000:1A:00.0 Off |  

### Library Import

In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import json
import random
import transformers
import torch

from tqdm.auto import notebook_tqdm
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, StoppingCriteria, StoppingCriteriaList
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, PeftModel, PeftConfig
from datasets import Dataset, load_dataset
from trl import SFTTrainer

[2024-03-11 10:51:24,208] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)


### Data Preprocessing

In [3]:
# Flatten the wide training CSV (two question columns and five answer columns
# per row) into one question/answer record per answer, then persist the
# records as JSON.
csv_file = './data/train.csv'
data = pd.read_csv(csv_file)

json_data = []

# `iterrows()` is a generator without a length, so the total is passed
# explicitly; otherwise the progress bar cannot show a percentage or an ETA.
for _, row in notebook_tqdm(data.iterrows(), total=len(data)):
    # Both question columns are merged into one prompt, separated by a newline.
    # Built once per row because it is identical for all five answers.
    question = row['질문_1'] + "\n" + row['질문_2']

    # Columns 답변_1 .. 답변_5 each contribute one record for this question.
    for i in range(1, 6):
        json_data.append({
            "question": question,
            "answer": row[f'답변_{i}']
        })

# Serialize straight into the file; ensure_ascii=False keeps the Korean text
# human-readable instead of \u-escaped.
with open('train_json.json', 'w', encoding='utf-8') as file:
    json.dump(json_data, file, ensure_ascii=False, indent=4)

print("Done!")

0it [00:00, ?it/s]

Done!


### Augmented Data

In [4]:
# Read the question/answer records back from disk as a list of dicts.
with open('./train_json.json', mode='r', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [5]:
def augment_data(data_list, rng=None):
    """Build one synthetic sample by joining two randomly chosen records.

    Two distinct entries are drawn from ``data_list`` and one connective
    phrase is picked at random; that same connective is used to concatenate
    both the two questions and the two answers.

    Args:
        data_list: Sequence of dicts, each holding strings under the
            ``"question"`` and ``"answer"`` keys. It is not modified.
        rng: Optional random source exposing ``sample`` and ``choice``
            (e.g. a seeded ``random.Random``). When ``None`` the module-level
            ``random`` generator is used, which is the original behaviour.

    Returns:
        dict: ``{"question": ..., "answer": ...}`` with the combined texts.

    Raises:
        ValueError: If ``data_list`` has fewer than two entries (raised by
            ``sample``).
    """
    rng = random if rng is None else rng

    # Pick the two records to merge (sampling without replacement).
    first, second = rng.sample(data_list, 2)

    # Pick the connective that glues the two texts together.
    conjunction = rng.choice([" 그리고 ", " 또한 ", " 과 ", " 와 "])

    # Join questions and answers in the same order with the same connective so
    # the combined answer lines up with the combined question.
    return {
        "question": first['question'] + conjunction + second['question'],
        "answer": first['answer'] + conjunction + second['answer'],
    }

In [6]:
# Seed the global RNG so the 500 synthetic samples are identical on every
# Restart & Run All.
random.seed(42)

# 500 synthetic records, each built from two randomly paired originals.
augmented_data = [augment_data(data) for _ in range(500)]

# Training set = original records followed by the synthetic ones.
augmented_full_data = data + augmented_data

In [7]:
# Serialize the original + synthetic records (Korean text kept unescaped) and
# write them to disk.
augmented_json = json.dumps(augmented_full_data, ensure_ascii=False, indent=4)
with open('./augmented_data.json', 'w', encoding='utf-8') as out_file:
    out_file.write(augmented_json)

### Model Finetuning

In [None]:
# Hub identifier of the base checkpoint that is fine-tuned below.
model_id = "beomi/llama-2-koen-13b"

# Quantization settings: load weights in 4-bit NF4 with double quantization,
# using float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# The EOS token is overridden with "<|endoftext|>", the same marker the
# training prompt template in this notebook appends to every sample.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    eos_token="<|endoftext|>"
    )

# Load the causal LM with the quantization config above, letting
# device_map="auto" decide weight placement, and with dynamic RoPE scaling
# (factor 2).
# NOTE(review): trust_remote_code=True permits code shipped with the
# checkpoint repository to run locally — confirm it is actually required.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    rope_scaling = {"type": "dynamic", "factor": 2}
)

# Config overrides applied before training.
# presumably the generation cache is disabled because it is not needed while
# training — TODO confirm.
model.config.use_cache=False
model.config.pretraining_tp=1

Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
def trans(x):
    """Render one question/answer record into the training prompt.

    Args:
        x: Mapping with ``'question'`` and ``'answer'`` entries.

    Returns:
        dict: ``{'text': prompt}`` where the prompt is the instruction line,
        the question line and the answer line joined by newlines, with the
        ``<|endoftext|>`` marker appended directly after the answer.
    """
    instruction = "당신은 질문에 답변하는 역할을 하는 챗봇입니다. 사용자의 질문에 올바른 답변을 하세요."
    prompt_lines = [
        instruction,
        f"### 질문: {x['question']}",
        f"### 답변: {x['answer']}<|endoftext|>",
    ]
    return {'text': "\n".join(prompt_lines)}
    
# Load the augmented records and add the rendered prompt as a 'text' column.
data = Dataset.from_json('augmented_data.json')
train_data = data.map(trans)

In [None]:
# Put the model in training mode, turn on gradient checkpointing, then run it
# through PEFT's k-bit training preparation helper (the returned model
# replaces `model`).
model.train()
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# Directory that receives the training checkpoints and the final adapter.
output_path = "./result/llama2-13b/"

# LoRA adapter settings: rank 128, alpha 256, dropout 0.05, attached to the
# listed attention and MLP projection modules; bias terms are not trained.
config = LoraConfig(
    lora_alpha=256,
    lora_dropout=0.05,
    r=128,
    target_modules=['v_proj', 'up_proj', 'down_proj', 'k_proj', 'o_proj', 'q_proj', 'gate_proj'],
    bias="none",
    task_type="CAUSAL_LM"
)

# Trainer hyperparameters: 3 epochs, batch size 2 per device without gradient
# accumulation, a checkpoint saved every epoch, paged 8-bit AdamW, learning
# rate 1e-4 on a cosine schedule with a 10% warmup ratio, gradient-norm
# clipping at 0.3, fp16 mixed precision, logging every 100 steps, fixed seed.
train_params = TrainingArguments(
    output_dir=output_path,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    save_strategy="epoch", 
    optim="paged_adamw_8bit",
    learning_rate=1e-4,
    logging_steps=100,
    weight_decay=0.01,
    max_grad_norm=0.3,
    warmup_ratio=0.1,
    fp16=True,
    lr_scheduler_type="cosine",
    seed=42
)

In [None]:
# Collator for causal language modelling (masked-LM objective switched off).
lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Supervised fine-tuning on the 'text' column with the LoRA configuration.
trainer = SFTTrainer(
    args=train_params,
    model=model,
    tokenizer=tokenizer,
    peft_config=config,
    train_dataset=train_data,
    dataset_text_field='text',
    data_collator=lm_collator,
)

trainer.train()

# Persist the trained model held by the trainer into the output directory.
trainer.model.save_pretrained(output_path)

### Inference

In [None]:
# Base checkpoint and the directory the fine-tuned adapter was saved to.
model_id = "beomi/llama-2-koen-13b"
peft_path = './result/llama2-13b/'
# NOTE(review): `config` is loaded here but not referenced again in the
# visible cells; it also rebinds the name used for the LoRA config earlier.
config = PeftConfig.from_pretrained(peft_path)

# 4-bit NF4 quantization with double quantization.
# NOTE(review): the compute dtype is bfloat16 here while the fine-tuning cell
# uses float16 — confirm the mismatch is intended.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Tokenizer and base model are loaded with the same EOS override, device map
# and RoPE scaling arguments as in the fine-tuning cell.
tokenizer = AutoTokenizer.from_pretrained(model_id,
                                         eos_token="<|endoftext|>")
model = AutoModelForCausalLM.from_pretrained(model_id,
                                            quantization_config=bnb_config,
                                            device_map="auto",
                                            trust_remote_code=True,
                                            rope_scaling = {"type": "dynamic", "factor": 2})
# Attach the saved adapter weights to the base model and switch to eval mode.
model = PeftModel.from_pretrained(model, peft_path)
model.eval()

In [None]:
class LocalStoppingCriteria(StoppingCriteria):
    """Stopping criterion that ends generation once a stop word was emitted.

    On every generation step the first sequence of the batch is decoded and
    generation stops when the decoded text ends with one of ``stop_words``.
    Only ``input_ids[0]`` is inspected, so with batched or beam-search
    generation the decision is based on that row alone.

    Args:
        tokenizer: Tokenizer used to encode the stop words and to decode the
            running sequence.
        stop_words: Iterable of stop strings; ``None`` (default) means no
            stop words. Empty strings are dropped because every text "ends
            with" the empty string, which would halt generation immediately.
    """

    def __init__(self, tokenizer, stop_words=None):
        super().__init__()

        # Build a fresh list per instance instead of sharing a mutable
        # default argument between instances.
        self.stop_words = [word for word in (stop_words or []) if word]
        self.tokenizer = tokenizer

        # One 1-D id tensor per stop word. Indexing with [0] (rather than
        # squeeze()) keeps single-token stop words 1-D so they stay
        # comparable in _compare_token. The tensors stay on the tokenizer's
        # device and are moved to the input's device at comparison time, so
        # construction does not require CUDA.
        self.stops = [
            tokenizer(stop_word, return_tensors='pt', add_special_tokens=False)['input_ids'][0]
            for stop_word in self.stop_words
        ]
        print('stop_words', self.stop_words)
        print('stop_words_ids', self.stops)

    def _compare_token(self, input_ids):
        """Return True if the first sequence ends with a stop word's token ids."""
        sequence = input_ids[0]
        for stop in self.stops:
            stop_len = stop.numel()
            # A sequence shorter than the stop word cannot end with it, and
            # slicing it would yield tensors of mismatched shape.
            if stop_len == 0 or stop_len > sequence.numel():
                continue
            if torch.equal(sequence[-stop_len:], stop.to(sequence.device)):
                return True

        return False

    def _compare_decode(self, input_ids):
        """Return True if the decoded first sequence ends with a stop word."""
        input_str = self.tokenizer.decode(input_ids[0])
        return any(input_str.endswith(stop_word) for stop_word in self.stop_words)

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
        """Decide whether generation should stop at the current step.

        Args:
            input_ids: Token ids generated so far; only row 0 is examined.
            scores: Prediction scores for the step (unused).
            **kwargs: Extra arguments the generation loop may pass; ignored.

        Returns:
            bool: True when the decoded text ends with a stop word.
        """
        # String comparison is used so a stop word is detected regardless of
        # how it happens to be split into tokens.
        return self._compare_decode(input_ids)

In [None]:
# Strings that terminate generation when they appear at the end of the output.
stop_words = ["<|endoftext|>", "###", "</s>"]

# Wrap the custom criterion in the list container that is handed to generate().
local_criteria = LocalStoppingCriteria(tokenizer=tokenizer, stop_words=stop_words)
stopping_criteria = StoppingCriteriaList([local_criteria])

In [None]:
def gen(x):
    """Generate, print and return the model's answer to one question.

    Relies on the notebook-level ``model``, ``tokenizer`` and
    ``stopping_criteria`` objects.

    Args:
        x: The question text to insert into the prompt template.

    Returns:
        str: The generated answer with the prompt, any stop marker and the
        surrounding whitespace removed.
    """
    q = f"당신은 질문에 답변하는 역할을 하는 챗봇입니다. 사용자의 질문에 올바른 답변을 하세요.\n### 질문: {x}\n### 답변:"

    # The tokenizer returns CPU tensors; move them to the device the model
    # reports so generate() does not receive inputs on a different device.
    inputs = tokenizer(q, return_tensors='pt').to(model.device)
    prompt_len = inputs['input_ids'].shape[1]

    # NOTE(review): sampling, beam-search and penalty_alpha options are all
    # set at once — confirm which decoding strategy is intended.
    gened = model.generate(
        **inputs,
        do_sample=True,
        temperature=1.0,
        num_beams=3,
        top_p=1.0,
        top_k=20,
        epsilon_cutoff=9e-4,
        eta_cutoff=2e-3,
        penalty_alpha=1.0,
        max_new_tokens=1024,
        early_stopping=True,
        eos_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.2,
        stopping_criteria=stopping_criteria
    )

    # Decode only the newly generated tokens. Splitting the full text on the
    # "### 답변:" marker would pick the wrong segment whenever the question
    # itself contains that marker.
    answer = tokenizer.decode(gened[0][prompt_len:], skip_special_tokens=True)

    # Generation halts right after a stop word has been produced, so the raw
    # text can still end with it (e.g. "###"); cut at the first occurrence.
    for stop_word in ("<|endoftext|>", "###", "</s>"):
        answer = answer.split(stop_word)[0]
    answer = answer.strip()

    print('-------------------------------------------------------------------------------')
    print("Question:",x)
    print('-------------------------------------------------------------------------------')
    print("Answer:",answer)
    print("")
    return answer

In [None]:
# Answer every test question with the fine-tuned model and store the raw
# answer texts, one per row, in answer.csv.
test_df = pd.read_csv('./data/test.csv')

# `notebook_tqdm` is the progress-bar class imported at the top of the
# notebook. Iterating the column directly gives it a known length.
temp_list = [gen(question) for question in notebook_tqdm(test_df['질문'])]

temp_df = pd.DataFrame({'answer': temp_list})
temp_df.to_csv('answer.csv', index=False, encoding='utf-8')

### Embedding

In [None]:
# Embed every generated answer and write the vectors into the submission
# template (first column is kept, the remaining columns receive the vector).
test_data = pd.read_csv('answer.csv', encoding='utf-8')
submission_df = pd.read_csv('./data/sample_submission.csv')

# An empty answer is stored as an empty CSV field and read back as NaN (a
# float), which the encoder cannot process; map such rows to ''.
answers = test_data['answer'].fillna('').astype(str).tolist()
assert len(answers) == len(submission_df), (
    f"answer.csv has {len(answers)} rows but the submission template has {len(submission_df)}"
)

model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# Encode all answers in one batched call; the result has one row per answer.
pred_embed = model.encode(answers, show_progress_bar=True)

# Fill every column after the first with the embedding components.
embed_cols = list(submission_df.columns[1:])
submission_df[embed_cols] = pred_embed

submission_df.to_csv('submission.csv', index=False)