In [1]:
!nvidia-smi

Mon Jun 27 17:51:37 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.33.01    Driver Version: 440.33.01    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-PCIE...  On   | 00000000:00:06.0 Off |                    0 |
| N/A   26C    P0    41W / 250W |  17040MiB / 32510MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE...  On   | 00000000:00:07.0 Off |                    0 |
| N/A   25C    P0    40W / 250W |   2375MiB / 32510MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-------

In [2]:
import random
import pandas as pd
import numpy as np
import os
import re
from glob import glob

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

#from transformers import BertTokenizer
from transformers import RobertaTokenizerFast, RobertaModel

from tqdm import tqdm

from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

import warnings
warnings.filterwarnings(action='ignore') 

import logging
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)

In [3]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"  # Arrange GPU devices starting from 0
os.environ["CUDA_VISIBLE_DEVICES"]= "0, 1"  # Set the GPU 2 to use, 멀티 gpu

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
#GPU 체크 및 할당
if torch.cuda.is_available():    
    #device = torch.device("cuda:0")
    print('Device:', device)
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')


Device: cuda
There are 2 GPU(s) available.
We will use the GPU: Tesla V100-PCIE-32GB


In [5]:
CFG = {
    'MAX_LEN':140,
    'MODEL_NAME':'roberta-base',
    'EPOCHS':20,
    'LEARNING_RATE':1e-5,
    'BATCH_SIZE':8,
    'SEED':41
}

In [6]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

In [7]:
tokenizer = RobertaTokenizerFast.from_pretrained(CFG['MODEL_NAME'])

In [8]:
## main 및 test 로드
#csv 형식의 training 데이터를 로드합니다.
import pandas as pd 

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

#데이터 살펴보기 위해 데이터 최상단의 5줄을 표시합니다.
train.head() 
print('train shape:',train.shape)
print('test shape:',test.shape)

train shape: (25000, 3)
test shape: (25000, 2)


In [9]:
test_total = pd.read_csv('data/test_total.csv')
test_total.head()

Unnamed: 0,id,reviews,target
0,0,채소가 약간 시들어 있어요,2
1,1,발톱 두껍고 단단한 분들 써도 소용없어요 이 테이프 물렁거리고 힘이없어서 들어 올리...,1
2,2,부들부들 좋네요 입어보고 시원하면 또 살게요,5
3,3,이런 1. 8 골드 주라니깐 파란개 오네 회사전화걸어도 받지도 않고 머하자는거임?,1
4,4,검수도 없이 보내구 불량 배송비 5000원 청구하네요 완전별로 별하나도 아까워요,1


In [10]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       25000 non-null  int64 
 1   reviews  25000 non-null  object
 2   target   25000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 586.1+ KB


In [11]:
train['target'][train['target'] == 1] = 0 ## label : 10-1 -> 10
train['target'][train['target'] == 2] = 1 ## Label : 10-2 -> 0
train['target'][train['target'] == 4] = 2 ## Label : 10-2 -> 0
train['target'][train['target'] == 5] = 3 ## Label : 10-2 -> 0
test_total['target'][test_total['target'] == 1] = 0 ## label : 10-1 -> 10
test_total['target'][test_total['target'] == 2] = 1 ## Label : 10-2 -> 0
test_total['target'][test_total['target'] == 4] = 2 ## Label : 10-2 -> 0
test_total['target'][test_total['target'] == 5] = 3 ## Label : 10-2 -> 0

In [12]:
def preprocessing(text_list):
    resample_list = []
    for text in text_list:
        text = text.lower()
        text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        text = re.sub('\s+', ' ', text)
        resample_list.append(text)
    return resample_list

In [13]:
train['reviews'] = preprocessing(train['reviews'].values)
test['reviews'] = preprocessing(test['reviews'].values)

In [14]:
def get_encode_data(code_list):
    input_ids = []
    attention_masks = []
    for sent in tqdm(code_list):
        encoded_dict = tokenizer.encode_plus(
                            text = sent,                     # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            max_length = CFG['MAX_LEN'],           # Pad & truncate all sentences.
                            pad_to_max_length = True,
                            return_attention_mask = True,   # Construct attn. masks.
                            return_tensors = 'pt',     # Return pytorch tensors.
                        )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
        
    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    
    return input_ids, attention_masks

def get_data(dataframe):
    code, mask = get_encode_data(dataframe['reviews'].values)
    return code, mask

In [15]:
train_code_ids, train_code_masks = get_data(train)
test_code_ids, test_code_masks = get_data(test)

100%|██████████| 25000/25000 [00:08<00:00, 2946.09it/s]
100%|██████████| 25000/25000 [00:08<00:00, 2997.50it/s]


In [16]:
class CustomDataset(Dataset):
    def __init__(self, code_ids, code_masks, labels, train_mode):
        self.code_ids = code_ids
        self.code_masks = code_masks
        self.labels = labels
        self.train_mode = train_mode
        
    def __getitem__(self, index):
        code_input_ids = self.code_ids[index]
        code_input_masks = self.code_masks[index]
        
        if self.train_mode:
            label = self.labels[index]
            return code_input_ids, code_input_masks, label
        else:
            return code_input_ids, code_input_masks

    def __len__(self):
        return len(self.code_ids)

In [17]:
train_dataset = CustomDataset(train_code_ids, train_code_masks, train['target'].values, True)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=8)

test_dataset = CustomDataset(test_code_ids, test_code_masks, test_total['target'].values, True)
test_loader = DataLoader(test_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=8)

In [18]:
class RobertaSimilarModel(nn.Module):
    def __init__(self):
        super(RobertaSimilarModel, self).__init__()
        self.roberta = RobertaModel.from_pretrained(CFG['MODEL_NAME'])
        self.classifier = nn.Linear(768, 4)

    def forward(self, code_input_id, code_mask):
        _, feature = self.roberta(input_ids= code_input_id, attention_mask=code_mask, return_dict=False)
        
        output = self.classifier(feature)
        return output

In [19]:
import torch.optim as optim # 최적화 알고리즘들이 포함힘

model = RobertaSimilarModel()
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr= CFG["LEARNING_RATE"] )#0.001
scheduler = None

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
#import gc
#gc.collect()
#torch.cuda.empty_cache()

In [21]:
def train(model, optimizer, train_loader, scheduler, device): 
    #model.to(device)
    #model = nn.DataParallel(model)
    NGPU = torch.cuda.device_count()
    if NGPU > 1:
        model = torch.nn.DataParallel(model, device_ids=list(range(NGPU)))
    #torch.multiprocessing.set_start_method('spawn')
    model.to(device)
    
    n = len(train_loader)
    
    #Loss Function 정의
    #criterion = nn.CrossEntropyLoss().to(device)
    best_acc = 0
    
    for epoch in range(1,CFG["EPOCHS"]+1): #에포크 설정
        model.train() #모델 학습
        running_loss = 0.0
            
        for code_input_id, code_input_mask, label in tqdm(iter(train_loader)):
            code_input_id, code_input_mask = code_input_id.to(device), code_input_mask.to(device) #배치 데이터
            label = label.to(device)
            optimizer.zero_grad() #배치마다 optimizer 초기화
        
            # Data -> Model -> Output
            logit = model(code_input_id, code_input_mask) #예측값 산출
            #print('model :', logit)
            #print('label :', label)
            
            loss = criterion(logit, label) #손실함수 계산
            
            #print('loss :', loss)
            
            # 역전파
            loss.backward() #손실함수 기준 역전파 
            optimizer.step() #가중치 최적화
            running_loss += loss.item()
            
        #print('len :', len(train_loader))
            #print('loss :', running_loss)
            
        print('[%d] Train loss: %.10f' %(epoch, running_loss / len(train_loader)))
        
        if scheduler is not None:
            scheduler.step()
            
        #Validation set 평가
        model.eval() #evaluation 과정에서 사용하지 않아야 하는 layer들을 알아서 off 시키도록 하는 함수
        vali_loss = 0.0
        correct = 0
        with torch.no_grad(): #파라미터 업데이트 안하기 때문에 no_grad 사용
            for code_input_id, code_input_mask, label in tqdm(iter(test_loader)):
                code_input_id, code_input_mask = code_input_id.to(device), code_input_mask.to(device)
                label = label.to(device)
                logit = model(code_input_id, code_input_mask)
                vali_loss += criterion(logit, label)
                pred = logit.argmax(dim=1, keepdim=True)  #4개의 class중 가장 값이 높은 것을 예측 label로 추출
                correct += pred.eq(label.view_as(pred)).sum().item() #예측값과 실제값이 맞으면 1 아니면 0으로 합산
        vali_acc = 100 * correct / len(test_loader.dataset)
        print('Vail set: Loss: {:.4f}, Accuracy: {}/{} ( {:.0f}%)\n'.format(vali_loss / len(test_loader), correct, len(test_loader.dataset), 100 * correct / len(test_loader.dataset)))
        
        #베스트 모델 저장
        if best_acc < vali_acc:
            best_acc = vali_acc
            torch.save(model.state_dict(), './saved/best_model.pth') #이 디렉토리에 best_model.pth을 저장
            print('Model Saved.')

In [None]:
train(model, optimizer, train_loader, scheduler, device)

100%|██████████| 3125/3125 [08:42<00:00,  5.98it/s]

[1] Train loss: 1.0805524924



 58%|█████▊    | 1806/3125 [02:07<01:31, 14.39it/s]