In [None]:
# Things to check before running model prediction

# Adjust batch_size and the `numb` value of new_prompt_1 to match the GPU's capacity

In [None]:
import subprocess
import os
import tqdm as notebook_tqdm

from sklearn.model_selection import train_test_split
import datasets
import pickle

import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, pipeline, logging, Trainer

from peft import LoraConfig
from trl import SFTTrainer

# After requesting access to the LLaMA model on Hugging Face, supply the token of
# the approved account. Prefer exporting it as the HF_TOKEN environment variable so
# the secret is never saved inside the notebook; the "[your token]" placeholder is
# kept only as a fallback for the original edit-in-place workflow.
hf_token = os.environ.get("HF_TOKEN", "[your token]")
# Best-effort login: the return code is intentionally not checked, as before.
subprocess.run(["huggingface-cli", "login", "--token", hf_token])

# Load the pickled prompt data consumed by the rest of the notebook.
# NOTE(review): pickle.load can execute arbitrary code while deserializing —
# only load this file if it comes from a trusted source.
with open(file='/data/log-data-2024/prompt_data.pickle', mode='rb') as f:
    data = pickle.load(f)
    
# Model
# float16 is used both as the 4-bit compute dtype and as the dtype handed to
# from_pretrained below.
torch_dtype = torch.float16

model_ckpt = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# NF4 4-bit quantization through bitsandbytes.
# load_in_4bit=True is the documented switch that enables 4-bit loading; without it
# the bnb_4bit_* options below may not take effect, depending on the transformers
# version in use.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=False
)

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# Reuse the EOS token for padding so that padding="max_length" tokenization works
# (presumably the checkpoint's tokenizer defines no pad token — TODO confirm).
tokenizer.pad_token = tokenizer.eos_token

# Sanity check: after the assignment above both lines should print the same token/id.
print(tokenizer.pad_token, tokenizer.pad_token_id)
print(tokenizer.eos_token, tokenizer.eos_token_id)

model = AutoModelForCausalLM.from_pretrained(
    model_ckpt,
    low_cpu_mem_usage=True,
    quantization_config=bnb_config,
    torch_dtype=torch_dtype,
    device_map='auto'
)

# NOTE(review): use_cache=False is commonly set for training; if this notebook is
# only used for prediction, confirm that disabling the cache is intended.
model.config.use_cache = False

# Dataset
def _records_to_dataset(records):
    """Turn a list of same-keyed records into a column-oriented datasets.Dataset.

    The column names are taken from the first record; every record is indexed
    with each of those keys.
    """
    columns = {}
    for column_name in records[0]:
        columns[column_name] = [record[column_name] for record in records]
    return datasets.Dataset.from_dict(columns)


# Fixed 80/20 split; random_state pins the shuffle so the split is reproducible.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

train_dataset = _records_to_dataset(train_data)
test_dataset = _records_to_dataset(test_data)

# Tokenizing

def tokenize_function(examples, max_prompt_length=200, max_completion_length=20):
    """Tokenize a batch of prompt/completion pairs to fixed lengths.

    Args:
        examples: Batch mapping with "prompt" and "completion" entries, as passed
            by ``Dataset.map(..., batched=True)``.
        max_prompt_length: Length every prompt is padded/truncated to
            (default 200, the previously hard-coded value).
        max_completion_length: Length every completion is padded/truncated to
            (default 20, the previously hard-coded value).

    Returns:
        The tokenizer output for the prompts, with an added "labels" entry that
        holds the completions' input ids.
    """
    inputs = tokenizer(
        examples["prompt"],
        padding="max_length",
        truncation=True,
        max_length=max_prompt_length,
    )
    outputs = tokenizer(
        examples["completion"],
        padding="max_length",
        truncation=True,
        max_length=max_completion_length,
    )
    # NOTE(review): "labels" is max_completion_length long while "input_ids" is
    # max_prompt_length long, and padded label positions carry the pad token id
    # rather than an ignore index. A causal-LM loss normally expects labels aligned
    # with input_ids — confirm how the consumer of these datasets uses "labels".
    inputs["labels"] = outputs["input_ids"]
    return inputs

# Tokenize both splits in batched mode; each split is replaced by the mapped
# dataset returned from Dataset.map.
_map_options = {"batched": True}
train_dataset = train_dataset.map(tokenize_function, **_map_options)
test_dataset = test_dataset.map(tokenize_function, **_map_options)

In [None]:
import matplotlib.pyplot as plt

def _token_lengths(examples, field, max_length):
    """Return the token count of ``example[field]`` for every example.

    Each text is tokenized on its own with truncation, so no returned length
    exceeds ``max_length``.
    """
    return [
        len(tokenizer(example[field], truncation=True, max_length=max_length)["input_ids"])
        for example in examples
    ]


def _plot_length_histograms(split_name, input_lengths, output_lengths):
    """Show side-by-side input/output token-length histograms for one split."""
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    panels = zip(axes, (input_lengths, output_lengths), ("Input", "Output"))
    for ax, lengths, kind in panels:
        ax.hist(lengths, bins=30, edgecolor='black')
        ax.set_title(f'{split_name} Dataset {kind} Lengths')
        ax.set_xlabel('Token Length')
        ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()


# Calculate token lengths for each dataset.
# Lengths are capped at 500 (prompt) / 50 (completion) tokens, so longer texts pile
# up in the last bin. These caps differ from the 200/20 used by tokenize_function.
train_input_lengths = _token_lengths(train_data, 'prompt', 500)
train_output_lengths = _token_lengths(train_data, 'completion', 50)

test_input_lengths = _token_lengths(test_data, 'prompt', 500)
test_output_lengths = _token_lengths(test_data, 'completion', 50)

# Both figures go through the same helper, so the test figure now gets the y-label,
# tight_layout() and show() calls it was previously missing.
_plot_length_histograms('Train', train_input_lengths, train_output_lengths)
_plot_length_histograms('Test', test_input_lengths, test_output_lengths)

In [None]:
# Run prediction.py in a shell subprocess, using whichever `python` is on PATH.
# NOTE(review): prediction.py is not visible from this notebook; presumably it is
# where batch_size is configured — confirm before running on a different GPU.
!python prediction.py