In [3]:
!pip install transformers
!pip install pytorch_lightning
!pip install adamp

Collecting pytorch_lightning
  Obtaining dependency information for pytorch_lightning from https://files.pythonhosted.org/packages/60/eb/f29a4511a2675c9c14ca31cde4562f7676cf70396cf9e599210dca2f1e66/pytorch_lightning-2.1.2-py3-none-any.whl.metadata
  Downloading pytorch_lightning-2.1.2-py3-none-any.whl.metadata (21 kB)
Collecting torch>=1.12.0 (from pytorch_lightning)
  Obtaining dependency information for torch>=1.12.0 from https://files.pythonhosted.org/packages/d6/a8/43e5033f9b2f727c158456e0720f870030ad3685c46f41ca3ca901b54922/torch-2.1.1-cp311-cp311-win_amd64.whl.metadata
  Downloading torch-2.1.1-cp311-cp311-win_amd64.whl.metadata (26 kB)
Collecting torchmetrics>=0.7.0 (from pytorch_lightning)
  Obtaining dependency information for torchmetrics>=0.7.0 from https://files.pythonhosted.org/packages/a3/88/cc27059747ddecff744826e38014822023cbfff4ca079a6ee9a96602dd0b/torchmetrics-1.2.0-py3-none-any.whl.metadata
  Downloading torchmetrics-1.2.0-py3-none-any.whl.metadata (21 kB)
Collecting

In [1]:
import os
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
import random

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ExponentialLR, CosineAnnealingWarmRestarts
from torch.optim import AdamW
from adamp import AdamP

from sklearn.model_selection import StratifiedKFold

import pytorch_lightning as pl
from pytorch_lightning import LightningModule

import transformers
from transformers import AutoModel, AutoTokenizer
transformers.logging.set_verbosity_error()

import argparse
parser = argparse.ArgumentParser(description="Classifier")
parser.add_argument('--pretrained_model', default='kykim/funnel-kor-base', type=str)
parser.add_argument('--pretrained_tokenizer', default='', type=str)
parser.add_argument('--batch_size', default=16, type=int)
parser.add_argument('--lr', default=4e-5, type=float)
parser.add_argument('--epochs', default=5, type=int)
parser.add_argument('--max_length', default=312, type=int)
parser.add_argument('--train_data_path', default='./train.csv', type=str)
parser.add_argument('--val_data_path', default='', type=str)
parser.add_argument('--optimizer', default='AdamW')
parser.add_argument('--lr_scheduler', default='none')
parser.add_argument('--device', default=torch.device('cuda'), type=int)
parser.add_argument('--mixed_precision', default=16, type=int)
parser.add_argument('--cpu_workers', default=os.cpu_count(), type=int)
parser.add_argument('--seed', default=1102, type=int)
parser.add_argument('--date', default=1029, type=int)
args = parser.parse_args('')

def set_seeds(seed=args.seed):
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    pl.seed_everything(seed)

set_seeds()
os.chdir("C:/Users/82107/Documents/GitHub/hello")
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
submission_id = f"{parser.description}_{args.date}"

print("Using PyTorch Ver", torch.__version__)
print("Using Lightning Ver", pl.__version__)
print("Fix Seed:", args.seed)
print("Submission ID:", submission_id)

Seed set to 1102


Using PyTorch Ver 2.1.1+cpu
Using Lightning Ver 2.1.2
Fix Seed: 1102
Submission ID: Classifier_1029


In [2]:
from sklearn.model_selection import train_test_split
import pandas as pd
data2 = pd.read_csv("C:/Users/82107/Documents/GitHub/train_맞춤법검사.csv")
traindata, testdata = train_test_split(data2, test_size=0.3, random_state=42)
traindata = traindata.iloc[:,:2]
testdata = testdata.iloc[:,:1]
print(traindata.head(),testdata.head())
train_df = traindata
test_df = testdata

                                                  text  label
84   아침일찍 가서 그런지 품절이 자주 되는 소금빵은 있었으나 샌드위치 등은 부족했음. ...      1
243  "성싱담 빵집의 위치가 좋아서 자주 방문하게 되는데, 주변 환경도 조용하고 아늑합니...      0
92   ��🥐🥖 너무나도 다양하고 예쁜 빵이 많아서 눈이 돌아가는 줄 알았어요!!🥪 이번에...      1
195  "성싱담에서 구운 신선한 빵은 항상 기대 이상이에요. 밤 빵은 밤의 고소한 맛과 부...      0
126  명물인 튀김소보로 종류 다 맛있고요,,,,새롭게 나온 빵들도 맛있어요:) 무엇보다 ...      1                                                   text
30   빵 종류도 많고 가격이 타 메이커 보다 상대적으로 저렴합니다계산대도 많아서 줄이 금...
116                        출장길에들름튀김소보로 초코소보로 맛있게 먹었습니다
79        빵 천국. 여전히 튀김소보로는 최고이고애플파이 고로케 반미 우유생크림 다 맛있음
127  말할 것도 없이 최고예요 지하철역 바로 앞이고 손님 많은데도 직원분들 친절하시고 빵...
190  "대전의 성싱담 빵집을 방문한 건 정말 행운이었어요. 각 빵은 그 자체로 예술작품 ...


In [3]:
X = train_df["text"].values
y = train_df["label"].values
X_test = test_df["text"].values

X.shape, y.shape, X_test.shape

((188,), (188,), (81,))

In [4]:
class CustomDataset(Dataset):
    def __init__(self, sentence, label):
        self.sentence = sentence
        self.label = label
        self.tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model)

    def __len__(self):
        return len(self.sentence)

    def __getitem__(self, idx):
        sentence = self.sentence[idx]
        encoded = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            return_token_type_ids=True,
            return_attention_mask=True,
        )
        
        input_ids = encoded["input_ids"][0]
        token_type_ids = encoded["token_type_ids"][0]
        attention_masks = encoded["attention_mask"][0]
        
        if self.label is not None:
            label = self.label[idx]
            return [input_ids, token_type_ids, attention_masks], label
        else:
            return [input_ids, token_type_ids, attention_masks]

In [5]:
latent_dim= 384

class SwiGLU(nn.Module):
    def forward(self, x):
        x, gate = x.chunk(2, dim=-1)
        return F.silu(gate) * x

class Backbone(nn.Module):
    def __init__(self, latent_dim=latent_dim):
        super().__init__()
        self.txt_model = AutoModel.from_pretrained(args.pretrained_model)
        self.classifier = nn.Sequential(
            SwiGLU(),
            nn.Linear(latent_dim, 1, bias=False)
        )

    def forward(self, x):
        input_ids = x[0]
        token_type_ids = x[1]
        attention_mask = x[2]
        
        txt_side = self.txt_model(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
        )
        
        txt_feature = txt_side.last_hidden_state[:, 0, :]
        
        output = self.classifier(txt_feature)
        
        return output

In [6]:
class Model(LightningModule):
    def __init__(self, backbone, args):
        super().__init__()
        self.backbone = backbone
        
    def forward(self, **kwargs):
        return self.backbone(**kwargs)

    def step(self, batch):
        x, y = batch
        y_hat = self.backbone(x)
        loss = nn.BCEWithLogitsLoss()(y_hat.squeeze(), y.float())
        return loss, y, y_hat

    def training_step(self, batch):
        loss, y, y_hat = self.step(batch)
        pred = (y_hat > 0).float()
        accuracy = (pred.squeeze() == y).float().mean()
        self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("train_acc", accuracy, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch):
        loss, y, y_hat = self.step(batch)
        pred = (y_hat > 0).float()
        accuracy = (pred.squeeze() == y).float().mean()
        self.log("val_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("val_acc", accuracy, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def test_step(self, batch):
        loss, y, y_hat = self.step(batch)
        pred = (y_hat > 0).float()
        accuracy = (pred.squeeze() == y).float().mean()
        self.log("test_acc:", accuracy)

    def predict_step(self, batch, dataloader_idx=0):
        y_hat = self.backbone(batch)
        return y_hat
    
    def configure_optimizers(self):
        if args.optimizer == 'AdamW':
            optimizer = AdamW(self.parameters(), lr=args.lr)
        if args.optimizer == 'AdamP':
            optimizer = AdamP(self.parameters(), lr=args.lr)
        if args.lr_scheduler == "none":
            return [optimizer]
        
        if args.lr_scheduler == 'cos':
            scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=1, T_mult=2)
        if args.lr_scheduler == 'exp':
            scheduler = ExponentialLR(optimizer, gamma=0.5)
        
        return [optimizer], [scheduler]

In [7]:
val_acc_list = []
preds_list = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=args.seed)

for i, (train_index, val_index) in enumerate(skf.split(X, y)):

    X_train = X[train_index]
    X_val = X[val_index]

    y_train = y[train_index]
    y_val = y[val_index]
    
    train_ds = CustomDataset(X_train, y_train)
    val_ds = CustomDataset(X_val, y_val)
    test_ds = CustomDataset(X_test, None)

    train_dataloader = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True, num_workers=args.cpu_workers)
    val_dataloader = DataLoader(val_ds, batch_size=args.batch_size, shuffle=False, num_workers=args.cpu_workers)
    test_dataloader = DataLoader(test_ds, batch_size=args.batch_size, shuffle=False, num_workers=args.cpu_workers)

    #
    model = Model(Backbone(), args)

    callbacks = [
        pl.callbacks.ModelCheckpoint(
            dirpath="saved/", filename=f"{args.pretrained_model}_{i}",
            monitor="val_acc", mode="max"
        ),
    ]
    
    trainer = pl.Trainer(
        num_sanity_val_steps=0,
        max_epochs=args.epochs, accelerator="auto", callbacks=callbacks,
        precision=args.mixed_precision,
        devices=1
    )

    trainer.fit(model, train_dataloader, val_dataloader)
    
    #
    ckpt = torch.load(f"saved/{args.pretrained_model}_{i}.ckpt")
    model.load_state_dict(ckpt['state_dict'])

    eval_dict = trainer.validate(model, dataloaders=val_dataloader)[0]
    val_acc_list.append(eval_dict["val_acc"])
    
    y_preds = trainer.predict(model, dataloaders=test_dataloader)
    preds_list.append(np.vstack(y_preds))
    
val_acc_mean = np.mean(val_acc_list)

print(f"VAL FOLD MEAN: {val_acc_mean}")

c:\Users\82107\anaconda3\Lib\site-packages\lightning_fabric\connector.py:565: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
c:\Users\82107\anaconda3\Lib\site-packages\pytorch_lightning\trainer\connectors\accelerator_connector.py:557: You passed `Trainer(accelerator='cpu', precision='16-mixed')` but AMP with fp16 is not supported on CPU. Using `precision='bf16-mixed'` instead.
Using bfloat16 Automatic Mixed Precision (AMP)
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
c:\Users\82107\anaconda3\Lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\logger_connector.py:67: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger

In [53]:
args.device

device(type='cuda')