<a href="https://colab.research.google.com/github/ysys143/ml2024/blob/main/training_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive. The checkpoint directory configured
# later in this notebook (downstream_model_dir) lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install -q pip==23.3.1

In [None]:
!pip install -q torch pytorch-lightning ratsnlp

In [None]:
import torch
from ratsnlp.nlpbook.classification import ClassificationTrainArguments


In [None]:
# Training configuration: pretrained model, corpus, checkpoint directory and
# hyperparameters. Batch size and TPU core count both depend on whether CUDA
# is available, so the check is evaluated once and reused.
# NOTE(review): any runtime without CUDA is configured with 8 TPU cores; a
# CPU-only runtime would get the same setting — confirm this is intended.
cuda_available = torch.cuda.is_available()

args = ClassificationTrainArguments(
    pretrained_model_name="beomi/kcbert-base",
    downstream_corpus_name="nsmc",
    downstream_model_dir="/content/drive/MyDrive/nlpbook/checkpoint-doccls",
    batch_size=32 if cuda_available else 4,
    learning_rate=5e-5,
    max_seq_length=128,
    epochs=3,
    tpu_cores=0 if cuda_available else 8,
    seed=7,
)

In [None]:
# Fix the random seed using the settings in `args` (seed=7 is configured above).
from ratsnlp import nlpbook
nlpbook.set_seed(args)

# Set up the logger from the same `args` object.
nlpbook.set_logger(args)

In [None]:
# Download the downstream corpus named in `args` into the corpus root directory
# that `args` provides.
from Korpora import Korpora

# force_download=True is passed unconditionally, i.e. on every run of this cell.
fetch_options = {
    "corpus_name": args.downstream_corpus_name,
    "root_dir": args.downstream_corpus_root_dir,
    "force_download": True,
}
Korpora.fetch(**fetch_options)

In [None]:
# Load the tokenizer published with the pretrained model named in `args`.
# Lower-casing is explicitly disabled (do_lower_case=False).
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_name, do_lower_case=False)

In [None]:
# Build the training dataset. The data loader created afterwards pulls
# individual instances from this dataset when it assembles batches.
from ratsnlp.nlpbook.classification import NsmcCorpus, ClassificationDataset

corpus = NsmcCorpus()
train_dataset = ClassificationDataset(args=args, corpus=corpus, tokenizer=tokenizer, mode="train")

In [None]:
# Build the training data loader.
from torch.utils.data import DataLoader, RandomSampler

# Instances are drawn in random order without replacement.
train_sampler = RandomSampler(train_dataset, replacement=False)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=args.batch_size,
    sampler=train_sampler,
    collate_fn=nlpbook.data_collator,  # turns the selected instances into one batch
    drop_last=False,
    num_workers=args.cpu_workers,
)

In [None]:
# Build the evaluation dataset from the corpus's "test" split.
# SequentialSampler is imported here for the evaluation data loader: unlike
# training, evaluation does not need randomly composed batches.
from torch.utils.data import SequentialSampler

val_dataset = ClassificationDataset(args=args, corpus=corpus, tokenizer=tokenizer, mode="test")

In [None]:
# Evaluation data loader: walks the evaluation dataset in its stored order and
# keeps the final partial batch (drop_last=False).
val_sampler = SequentialSampler(val_dataset)

val_dataloader = DataLoader(
    val_dataset,
    batch_size=args.batch_size,
    sampler=val_sampler,
    collate_fn=nlpbook.data_collator,
    drop_last=False,
    num_workers=args.cpu_workers,
)

### 모델 불러오기

In [None]:
# Load the pretrained model's configuration with the number of labels taken
# from the corpus, then instantiate the sequence-classification model from the
# same pretrained checkpoint using that configuration.
from transformers import BertConfig, BertForSequenceClassification

pretrained_model_config = BertConfig.from_pretrained(args.pretrained_model_name, num_labels=corpus.num_labels)

model = BertForSequenceClassification.from_pretrained(
    config=pretrained_model_config,
    pretrained_model_name_or_path=args.pretrained_model_name,
)

### 모델 학습시키기

In [None]:
# Wrap the model and the training arguments in a ClassificationTask, then
# obtain a trainer configured from the same arguments.
from ratsnlp.nlpbook.classification import ClassificationTask
task = ClassificationTask(model, args)
trainer = nlpbook.get_trainer(args) # per the original author's note: handles GPU/TPU setup, logging, checkpointing and similar boilerplate

In [None]:
# Start training: fit the task on the training loader and validate on the
# evaluation loader.
trainer.fit(task, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)