<a href="https://colab.research.google.com/github/ttogle918/AI_practice/blob/main/TOPIC/%ED%86%A0%ED%94%BD%EB%B6%84%EB%A5%98_TfidVectorizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets

In [None]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from tqdm.auto import tqdm

import warnings
warnings.filterwarnings(action='ignore') 

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Hyper-parameters shared by the cells below.
CFG = dict(
    EPOCHS=10,            # number of passes over the training set
    LEARNING_RATE=1e-4,   # initial learning rate handed to the optimizer
    BATCH_SIZE=256,       # batch size for both data loaders
    SEED=41,              # value passed to seed_everything
)

In [None]:
from datasets import load_dataset
# Load the 'ynat' configuration of the 'klue' dataset.
dataset_tc = load_dataset('klue', 'ynat')
# Notebook display: the dataset object and the first training example.
dataset_tc, dataset_tc['train'][0]



  0%|          | 0/2 [00:00<?, ?it/s]

(DatasetDict({
     train: Dataset({
         features: ['guid', 'title', 'label', 'url', 'date'],
         num_rows: 45678
     })
     validation: Dataset({
         features: ['guid', 'title', 'label', 'url', 'date'],
         num_rows: 9107
     })
 }),
 {'guid': 'ynat-v1_train_00000',
  'title': '유튜브 내달 2일까지 크리에이터 지원 공간 운영',
  'label': 3,
  'url': 'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=105&sid2=227&oid=001&aid=0008508947',
  'date': '2016.06.30. 오전 10:36'})

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune (and vary) its algorithm choice from
    # run to run, which defeats the deterministic flag above, so keep it off.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # Fix the random seeds for reproducibility

In [None]:
# Read the two needed columns directly from the train split instead of
# indexing it row by row (each row access decodes every field of that row).
title = list(dataset_tc['train']['title'])
label = list(dataset_tc['train']['label'])
len(title), len(label)

(45678, 45678)

In [None]:
# Collect the training titles and labels into a two-column DataFrame.
trainset = pd.DataFrame({'title' : title, 'label' : label})
trainset.head()

Unnamed: 0,title,label
0,유튜브 내달 2일까지 크리에이터 지원 공간 운영,3
1,어버이날 맑다가 흐려져…남부지방 옅은 황사,3
2,내년부터 국가RD 평가 때 논문건수는 반영 않는다,2
3,김명자 신임 과총 회장 원로와 젊은 과학자 지혜 모을 것,2
4,회색인간 작가 김동식 양심고백 등 새 소설집 2권 출간,3


In [None]:
# Read the two needed columns directly from the validation split instead of
# indexing it row by row (each row access decodes every field of that row).
title = list(dataset_tc['validation']['title'])
label = list(dataset_tc['validation']['label'])
len(title), len(label)

(9107, 9107)

In [None]:
# Collect the validation titles and labels into a two-column DataFrame.
validset = pd.DataFrame({'title' : title, 'label' : label})
validset.head()

Unnamed: 0,title,label
0,5억원 무이자 융자는 되고 7천만원 이사비는 안된다,2
1,왜 수소충전소만 더 멀리 떨어져야 하나 한경연 규제개혁 건의,2
2,항응고제 성분 코로나19에 효과…세포실험서 확인,0
3,실거래가 가장 비싼 역세권은 신반포역…3.3㎡당 1억 육박,1
4,기자회견 하는 성 소수자 단체,2


In [None]:
# Distinct label values in each split. Only the last expression of a cell is
# displayed, so the train result on the next line is computed but not shown.
trainset['label'].unique()
validset['label'].unique()

array([2, 0, 1, 3, 6, 4, 5])

# pre-processing

In [None]:
# 1. Vectorize the sentences (text) with TfidfVectorizer
# Word-level unigrams and bigrams; min_df=4 is the minimum document frequency.
vectorizer = TfidfVectorizer(min_df = 4, analyzer = 'word', ngram_range=(1, 2))
# Fit on the training titles only; the validation titles are only transformed.
vectorizer.fit(np.array(trainset["title"]))

train_vec = vectorizer.transform(trainset["title"])
val_vec = vectorizer.transform(validset["title"])

print(train_vec.shape, val_vec.shape)

(45678, 16343) (9107, 16343)


# train

In [None]:
# Label columns that are paired with the TF-IDF matrices in the datasets below.
train_labels = trainset['label']
valid_labels = validset['label']

# dataset

In [None]:
class CustomDataset(Dataset):
    """Dataset pairing the rows of a sparse feature matrix with their labels.

    Args:
        st_vec: Row-indexable matrix whose row slices expose ``toarray()``
            and which has a ``shape`` attribute (e.g. the output of
            ``TfidfVectorizer.transform``).
        st_labels: Labels indexable by the same integer positions as the
            rows of ``st_vec``.
    """

    def __init__(self, st_vec, st_labels):
        self.st_vec = st_vec
        self.st_labels = st_labels

    def __getitem__(self, index):
        """Return ``(dense float feature vector, label)`` for row ``index``."""
        # Densify only the requested row; squeeze drops the leading 1-row axis.
        st_vector = torch.FloatTensor(self.st_vec[index].toarray()).squeeze(0)
        return st_vector, self.st_labels[index]

    def __len__(self):
        """Return the number of rows in the feature matrix."""
        # Read the row count from the shape rather than densifying the whole
        # matrix, which allocated rows x features values on every call.
        return self.st_vec.shape[0]

In [None]:
# Training batches are reshuffled on every pass over the loader.
train_dataset = CustomDataset(train_vec, train_labels)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

# Validation batches keep their original order.
val_dataset = CustomDataset(val_vec, valid_labels)
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

# model define

In [None]:
class BaseModel(nn.Module):
    """Fully connected classifier over fixed-size feature vectors.

    Three Linear -> BatchNorm1d -> LeakyReLU stages (1024, 1024, 512 units)
    followed by dropout and a linear layer that emits one logit per class.

    Args:
        input_dim: Length of each input feature vector.
        num_classes: Number of output logits. Defaults to 7, the value that
            was previously hard-coded.
    """

    def __init__(self, input_dim=9351, num_classes=7):
        super().__init__()
        self.feature_extract = nn.Sequential(
            nn.Linear(in_features=input_dim, out_features=1024),
            nn.BatchNorm1d(1024),
            nn.LeakyReLU(),

            nn.Linear(in_features=1024, out_features=1024),
            nn.BatchNorm1d(1024),
            nn.LeakyReLU(),

            nn.Linear(in_features=1024, out_features=512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(),
        )
        self.classifier = nn.Sequential(
            nn.Dropout(p=0.3),
            nn.Linear(in_features=512, out_features=num_classes)
        )

    def forward(self, x):
        """Return raw (un-normalised) class logits for a batch ``x``."""
        x = self.feature_extract(x)
        output = self.classifier(x)

        return output

# train

In [None]:
def train(model, optimizer, train_loader, val_loader, scheduler, device):
    """Train ``model`` and return it carrying its best-validation weights.

    Runs ``CFG['EPOCHS']`` epochs of cross-entropy training, validates after
    every epoch, and remembers the parameters of the epoch with the lowest
    validation loss.

    Args:
        model: Network to optimise; moved to ``device`` in place.
        optimizer: Optimizer built over ``model``'s parameters.
        train_loader: Iterable of ``(sentence, label)`` training batches.
        val_loader: Iterable of ``(sentence, label)`` validation batches.
        scheduler: Optional scheduler stepped with the validation loss after
            each epoch; ``None`` disables scheduling.
        device: Device on which batches and the model are placed.

    Returns:
        ``model`` with the best epoch's weights loaded, or ``None`` if no
        epoch produced a validation loss below the initial threshold.
    """
    import copy  # local import keeps this cell self-contained

    model.to(device)

    criterion = nn.CrossEntropyLoss().to(device)

    best_loss = 999999
    best_state = None

    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        for sentence, label in tqdm(iter(train_loader)):
            sentence = sentence.to(device)
            label = label.to(device)

            optimizer.zero_grad()

            logit = model(sentence)

            loss = criterion(logit, label)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        val_loss, val_f1 = validation(model, val_loader, criterion, device)
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}] F1 : [{val_f1:.5f}]')

        if scheduler is not None:
            scheduler.step(val_loss)

        if best_loss > val_loss:
            best_loss = val_loss
            # Snapshot the weights: keeping only a reference to `model` would
            # be overwritten by later epochs and yield the last-epoch weights.
            best_state = copy.deepcopy(model.state_dict())

    if best_state is None:
        return None

    # Roll the model back to the best checkpoint before handing it back.
    model.load_state_dict(best_state)
    return model

In [None]:
def validation(model, val_loader, criterion, device):
    """Evaluate ``model`` on ``val_loader``.

    Args:
        model: Network to evaluate; switched to eval mode by this call.
        val_loader: Iterable of ``(sentence, label)`` validation batches.
        criterion: Loss function applied to ``(logits, labels)``.
        device: Device on which the batches are placed.

    Returns:
        Tuple ``(mean batch loss, micro-averaged F1 score)``.
    """
    model.eval()
    val_loss = []

    preds = []
    labels = []

    with torch.no_grad():
        for sentence, label in tqdm(iter(val_loader)):
            sentence = sentence.to(device)
            label = label.to(device)

            logit = model(sentence)

            loss = criterion(logit, label)

            val_loss.append(loss.item())

            # Predicted class = index of the largest logit in each row.
            preds += logit.argmax(1).detach().cpu().numpy().tolist()
            labels += label.detach().cpu().numpy().tolist()

    # 'mic' is not an accepted `average` value and makes f1_score raise
    # ValueError; the option is spelled 'micro'.
    f1 = f1_score(labels, preds, average='micro')

    return np.mean(val_loss), f1

# run

In [None]:
# Size the input layer to the TF-IDF vocabulary (number of feature columns).
model = BaseModel(train_vec.shape[1])
# NOTE(review): train() calls model.train() at the start of every epoch, so
# this eval() call has no effect on training — consider removing it.
model.eval()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
# Halve the learning rate when the value passed to scheduler.step() has not
# improved for more than `patience` epochs, never going below min_lr.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2,threshold_mode='abs',min_lr=1e-8, verbose=True)

infer_model = train(model, optimizer, train_loader, val_loader, scheduler, device)

  0%|          | 0/179 [00:00<?, ?it/s]

  0%|          | 0/36 [00:00<?, ?it/s]

Epoch : [1] Train Loss : [0.94053] Val Loss : [0.92979] F1 : [0.66803]


  0%|          | 0/179 [00:00<?, ?it/s]

  0%|          | 0/36 [00:00<?, ?it/s]

Epoch : [2] Train Loss : [0.23159] Val Loss : [0.98669] F1 : [0.65831]


  0%|          | 0/179 [00:00<?, ?it/s]

  0%|          | 0/36 [00:00<?, ?it/s]

Epoch : [3] Train Loss : [0.07759] Val Loss : [1.04436] F1 : [0.66517]


  0%|          | 0/179 [00:00<?, ?it/s]

  0%|          | 0/36 [00:00<?, ?it/s]

Epoch : [4] Train Loss : [0.04358] Val Loss : [1.08586] F1 : [0.66454]
Epoch 00004: reducing learning rate of group 0 to 5.0000e-05.


  0%|          | 0/179 [00:00<?, ?it/s]

  0%|          | 0/36 [00:00<?, ?it/s]

Epoch : [5] Train Loss : [0.02882] Val Loss : [1.11247] F1 : [0.66513]


  0%|          | 0/179 [00:00<?, ?it/s]

  0%|          | 0/36 [00:00<?, ?it/s]

Epoch : [6] Train Loss : [0.02291] Val Loss : [1.15035] F1 : [0.66022]


  0%|          | 0/179 [00:00<?, ?it/s]

  0%|          | 0/36 [00:00<?, ?it/s]

Epoch : [7] Train Loss : [0.02165] Val Loss : [1.16948] F1 : [0.66234]
Epoch 00007: reducing learning rate of group 0 to 2.5000e-05.


  0%|          | 0/179 [00:00<?, ?it/s]

  0%|          | 0/36 [00:00<?, ?it/s]

Epoch : [8] Train Loss : [0.01899] Val Loss : [1.18699] F1 : [0.66072]


  0%|          | 0/179 [00:00<?, ?it/s]

  0%|          | 0/36 [00:00<?, ?it/s]

Epoch : [9] Train Loss : [0.01869] Val Loss : [1.17976] F1 : [0.66734]


  0%|          | 0/179 [00:00<?, ?it/s]

  0%|          | 0/36 [00:00<?, ?it/s]

Epoch : [10] Train Loss : [0.01802] Val Loss : [1.20938] F1 : [0.66138]
Epoch 00010: reducing learning rate of group 0 to 1.2500e-05.
