In [1]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer published with the Ko-Llama3-Luxia-8B checkpoint.
tokenizer_checkpoint = "saltlux/Ko-Llama3-Luxia-8B"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Target device; the model is moved onto it right after it is loaded.
device_name = "cpu"
device = torch.device(device_name)

In [3]:
# Read the training CSV and preview its first rows.
train_csv_path = "./data/train.csv"
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,text,label
0,"self.convs1 = nn.ModuleList([nn.Conv2d(1, Co, ...",코드2
1,현재 이미지를 여러개 업로드 하기 위해 자바스크립트로 동적으로 폼 여러개 생성하는데...,웹
2,glob.glob(PATH) 를 사용할 때 질문입니다.\n\nPATH에 [ ] 가 ...,코드2
3,"tmpp = tmp.groupby(by = 'Addr1', as_index=Fals...",코드2
4,filename = TEST_IMAGE + str(round(frame_sec)) ...,코드2


In [4]:
# Category name -> integer class id used as the training target.
label_dict = {
    '코드1': 0,
    '코드2': 1,
    '웹': 2,
    '이론': 3,
    '시스템 운영': 4,
    '원격': 5
}

# Series.map() turns every value that is missing from the mapping into NaN.
# Re-running this cell on an already-encoded column would therefore wipe all
# labels, so the encoding is applied only while the column is still non-numeric.
if not pd.api.types.is_numeric_dtype(df['label']):
    # Fail loudly on category names the mapping does not know, instead of
    # letting them become NaN targets silently.
    unknown_labels = set(df['label'].dropna().unique()) - set(label_dict)
    if unknown_labels:
        raise ValueError(f"labels missing from label_dict: {sorted(unknown_labels)}")
    df['label'] = df['label'].map(label_dict)

df.head()

Unnamed: 0,text,label
0,"self.convs1 = nn.ModuleList([nn.Conv2d(1, Co, ...",1
1,현재 이미지를 여러개 업로드 하기 위해 자바스크립트로 동적으로 폼 여러개 생성하는데...,2
2,glob.glob(PATH) 를 사용할 때 질문입니다.\n\nPATH에 [ ] 가 ...,1
3,"tmpp = tmp.groupby(by = 'Addr1', as_index=Fals...",1
4,filename = TEST_IMAGE + str(round(frame_sec)) ...,1


In [5]:
import re
import emoji
from soynlp.normalizer import repeat_normalize

# Matches runs of characters that are NOT in the allowed set: space, the listed
# punctuation (. , ? ! / @ $ % ~ ％ · ∼ ( )), the ASCII range \x00-\x7F,
# Hangul jamo (ㄱ-ㅣ) and Hangul syllables (가-힣).
pattern = re.compile(f'[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-ㅣ가-힣]+')
# Matches http:// and https:// URLs (optional "www." prefix) together with any
# trailing path / query characters.
url_pattern = re.compile(
    r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')

def clean(x):
    """Normalise one raw text sample.

    Steps, in order: replace every run of characters matched by ``pattern``
    with a single space, remove emoji, delete URLs matched by ``url_pattern``,
    trim surrounding whitespace, and pass the result through
    ``repeat_normalize`` with ``num_repeats=2``.

    Args:
        x: Text to clean. ``None`` and float NaN (the value pandas uses for an
            empty CSV cell) are treated as missing text.

    Returns:
        The cleaned string, or ``''`` when the text is missing.
    """
    # A missing cell is not a str; pattern.sub() would raise TypeError on it.
    if x is None or (isinstance(x, float) and x != x):
        return ''
    x = pattern.sub(' ', x)
    x = emoji.replace_emoji(x, replace='')  # remove emoji
    x = url_pattern.sub('', x)
    x = x.strip()
    x = repeat_normalize(x, num_repeats=2)
    return x

In [6]:
# Clean every raw text; keep the result both as a list and as a new column.
x_train = list(map(clean, df['text'].values))
df['remove_special_sent'] = x_train

df.head()

Unnamed: 0,text,label,remove_special_sent
0,"self.convs1 = nn.ModuleList([nn.Conv2d(1, Co, ...",1,"self.convs1 = nn.ModuleList([nn.Conv2d(1, Co, ..."
1,현재 이미지를 여러개 업로드 하기 위해 자바스크립트로 동적으로 폼 여러개 생성하는데...,2,현재 이미지를 여러개 업로드 하기 위해 자바스크립트로 동적으로 폼 여러개 생성하는데...
2,glob.glob(PATH) 를 사용할 때 질문입니다.\n\nPATH에 [ ] 가 ...,1,glob.glob(PATH) 를 사용할 때 질문입니다. PATH에 [ ] 가 포함되...
3,"tmpp = tmp.groupby(by = 'Addr1', as_index=Fals...",1,"tmpp = tmp.groupby(by = 'Addr1', as_index=Fals..."
4,filename = TEST_IMAGE + str(round(frame_sec)) ...,1,filename = TEST_IMAGE + str(round(frame_sec)) ...


In [7]:
# Split the cleaned data into training and validation sets
from sklearn.model_selection import train_test_split


# Keep only the cleaned text and its encoded label.
data = df.loc[:, ['remove_special_sent', 'label']]
dataset_train, dataset_valid = train_test_split(
    data,
    test_size=0.2,
    random_state=0,
)

# Show the first row of each split
for split_frame in (dataset_train, dataset_valid):
    print(split_frame.iloc[0])

remove_special_sent    new로 새로운 객체를 만들고 그 새로 만들어진 객체에 값을 넣기위해 Person함...
label                                                                  2
Name: 2287, dtype: object
remove_special_sent    실습 피드백 요청합니다. 손해 부분의 경우, 자사 서비스를 사용했을 때 발생하는 손...
label                                                                  3
Name: 1032, dtype: object


In [8]:
# Report the number of rows in the training split, then the validation split.
for split_frame in (dataset_train, dataset_valid):
    print(len(split_frame))

2964
742


In [9]:
# Run configuration.
# max_len is passed to the tokenizer as max_length; the other values are not
# referenced in the cells shown here (presumably consumed by the training
# setup further down — confirm).

# Tokenization / batching
max_len = 150
batch_size = 32

# Training schedule
num_epochs = 10
learning_rate = 5e-6
warmup_ratio = 0.1
max_grad_norm = 1

# Reporting
log_interval = 200

In [10]:
# Tokenize the training texts

train_texts = dataset_train['remove_special_sent'].tolist()
encoded_train = tokenizer(
    train_texts,
    add_special_tokens=True,
    truncation=True,
    max_length=max_len,
    padding=True,
    return_tensors='pt',
)

# Inspect the first encoded training sample
first_train_encoding = encoded_train[0]
print(first_train_encoding.tokens)
print(first_train_encoding.ids)
print(first_train_encoding.attention_mask)
print()
print('디코딩 :', tokenizer.decode(first_train_encoding.ids))

['new', 'ë¡ľ', 'ĠìĥĪë¡ľìļ´', 'Ġê°Ŀì²´', 'ë¥¼', 'Ġë§Įëĵ¤', 'ê³ł', 'Ġê·¸', 'ĠìĥĪë¡ľ', 'Ġë§Įëĵ¤ìĸ´', 'ì§Ħ', 'Ġê°Ŀì²´', 'ìĹĲ', 'Ġê°ĴìĿĦ', 'ĠëĦ£', 'ê¸°', 'ìľĦ', 'íķ´', 'ĠPerson', 'íķ¨', 'ìĪĺ', 'ìĹĲìĦľ', 'Ġthis', 'ë¥¼', 'ĠìĿ´ìļ©íķľ', 'ê²Į', 'Ġë§ŀ', 'ëĤĺìļĶ', '?', 'Ġê·¸ëŁ¼', 'ĠPerson', 'ìĿ´ëĿ¼ëĬĶ', 'Ġíķ¨ìĪĺ', 'ëĬĶ', 'Ġê°ĴìĿĦ', 'ĠëĦ£', 'ê¸°', 'ìľĦ', 'íķľ', 'Ġë³´ì¡°', 'ìĹŃ', 'íķł', 'ìĿ´ëĿ¼ê³ł', 'ĠìĥĿê°ģ', 'íķ´', 'ëıĦ', 'ĠëĲłê¹ĮìļĶ', '?', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>

In [11]:
# Tokenize the validation texts (same tokenizer arguments as the training cell)

encoded_valid = tokenizer(
    dataset_valid['remove_special_sent'].tolist(),
    return_tensors='pt',
    max_length=max_len,
    padding=True,
    truncation=True,
    add_special_tokens=True
)

# Inspect the first encoded validation sample
print( encoded_valid[0].tokens )
print( encoded_valid[0].ids )
print( encoded_valid[0].attention_mask )
print()
# Decode the same sample whose tokens/ids were printed above. This previously
# decoded index -1 (the last sample), so the text did not match the ids shown.
print('디코딩 :',tokenizer.decode(encoded_valid[0].ids))

['ìĭ¤', 'ìĬµ', 'ĠíĶ¼ëĵľë°±', 'ĠìļĶì²Ń', 'íķ©ëĭĪëĭ¤', '.', 'ĠìĨĲíķ´', 'Ġë¶Ģë¶Ħ', 'ìĿĺ', 'Ġê²½ìļ°', ',', 'ĠìŀĲìĤ¬', 'ĠìĦľë¹ĦìĬ¤', 'ë¥¼', 'ĠìĤ¬ìļ©', 'íĸĪ', 'ìĿĦ', 'ĠëķĮ', 'Ġë°ľìĥĿ', 'íķĺëĬĶ', 'ĠìĨĲíķ´', 'ìĹĲ', 'ĠëĮĢíķ´', 'ĠìŀĳìĦ±', 'íķĺëĬĶ', 'Ġê²ĥ', 'ìĿ¸ê°Ģ', 'ìļĶ', '?', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of_text|>', '<|end_of

In [12]:
from torch.utils.data import Dataset, DataLoader

# Dataset class
class ReviewDataset(Dataset):
  """Map-style dataset pairing tokenizer output with one label per sample.

  Args:
    encodings: Mapping of field name -> indexable batch of values. The notebook
      passes the tokenizer output created with ``return_tensors='pt'``.
    labels: Indexable sequence holding one label per sample.
  """

  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  @staticmethod
  def _copy_as_tensor(value):
    """Return ``value`` as a tensor that does not share memory with its source."""
    if isinstance(value, torch.Tensor):
      # torch.tensor(<tensor>) emits a UserWarning on every call;
      # clone().detach() is the recommended way to copy an existing tensor.
      return value.clone().detach()
    return torch.tensor(value)

  def __getitem__(self, idx):
    """Return a dict with every encoding field at ``idx`` plus a ``'labels'`` entry."""
    item = { key: self._copy_as_tensor(val[idx]) for key, val in self.encodings.items() }
    item['labels'] = self._copy_as_tensor(self.labels[idx])
    return item

  def __len__(self):
    """Number of samples, taken from the label sequence."""
    return len(self.labels)


# Build the datasets: each tokenized split is paired with its label array.
train_labels = dataset_train['label'].values
valid_labels = dataset_valid['label'].values
train_dataset = ReviewDataset(encoded_train, train_labels)
valid_dataset = ReviewDataset(encoded_valid, valid_labels)

In [13]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Load the pretrained checkpoint from the local Hugging Face cache (downloading
# only when it is not cached yet). The earlier `force_download=True` re-fetched
# all 4 weight shards on every run, which is what the interrupted download
# below this cell shows.
# NOTE(review): the datasets built above carry one class id per text, but this
# loads a causal-LM head — confirm whether a sequence-classification head
# (AutoModelForSequenceClassification with num_labels=len(label_dict)) was intended.
model = AutoModelForCausalLM.from_pretrained("saltlux/Ko-Llama3-Luxia-8B")
model.to(device)

Downloading shards:   0%|          | 0/4 [03:07<?, ?it/s]


KeyboardInterrupt: 