In [1]:
import urllib.request
import pandas as pd
from tqdm.auto import tqdm
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [2]:
# 1. 데이터셋 다운로드
#urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="datas/ratings_train.txt")
#urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="datas/ratings_test.txt")

In [3]:
# 1. 데이터셋 로드
train_data = pd.read_csv("datas/ratings_train.txt", names=["id", "document", "label"], skiprows=1, sep="\t")
test_data = pd.read_csv("datas/ratings_test.txt", names=["id", "document", "label"], skiprows=1, sep="\t")

train_data = train_data.dropna(axis=0)
test_data = test_data.dropna(axis=0)

train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

In [4]:
# 2. 모델 및 토크나이저 로드
#model_name = "skt/kogpt2-base-v2"
model_name = 'bert-base-multilingual-cased' 
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = AutoModelForSequenceClassification.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# 3. 데이터셋 전처리
def preprocess_function(examples):
    texts = examples["document"]
    encodings = tokenizer(texts, padding="max_length", truncation=True, max_length=512)
    encodings["labels"] = [float(label) for label in examples["label"]] 
    return encodings

encoded_train_dataset = train_dataset.map(preprocess_function, batched=True)
encoded_test_dataset = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/149995 [00:00<?, ? examples/s]

Map:   0%|          | 0/49997 [00:00<?, ? examples/s]

In [6]:
# 4. 토크나이저와 데이터셋 병합
model.resize_token_embeddings(len(tokenizer))

Embedding(119547, 768, padding_idx=0)

In [7]:
# 5. 모델 fine-tuning
training_args = TrainingArguments(
    output_dir="models",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="logs",
    logging_steps=10,
)

In [8]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_test_dataset,
)

In [9]:
progress_bar = tqdm(range(training_args.num_train_epochs))
for epoch in progress_bar:
    trainer.train()
    metrics = trainer.evaluate()
    loss = metrics["eval_loss"]
    progress_bar.set_description(f"Epoch {epoch+1}/{training_args.num_train_epochs}, Loss: {loss:.4f}")

  0%|          | 0/3 [00:00<?, ?it/s]

ValueError: Target size (torch.Size([16])) must be the same as input size (torch.Size([16, 2]))