In [1]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import time, datetime

In [2]:
data = pd.read_parquet("../data/dataset_grammar.parquet")

In [3]:
data

Unnamed: 0,Rating,Review
0,1,숙성 돼지고기 전문점입니다. 건물 모양 때문에 매장 모양도 좀 특이하지만 쾌적한 편...
1,1,고기가 정말 맛있었어요! 육즙이 가득 있어서 너무 좋았아요 일하시는 분들 너무 친절...
2,1,"잡내 없고 깔끔, 담백한 맛의 순댓국이 순댓국을 안 좋아하는 사람들에게도 술술 넘어..."
3,1,고기 양이 푸짐해서 특 순대국밥을 시킨 기분이 듭니다 맛도 좋습니다 다만 양념장이 ...
4,1,순댓국 자체는 제가 먹어본 순대국밥집 중에서 Top5 안에는 들어요. 그러나 밥 양...
...,...,...
641980,2,"요즘, 핫하게,,, 떠오르고 있는 중국집. , 맥주의 여 파루 속이 안 좋지만 와봄..."
641981,0,원래 글 안 쓰는데 이거는 정말 다른 분들 위해서 써야 할 것 같네요 방금 포장 주...
641982,1,"우리 팀 단골집, 술 먹고 다음 날 가면 푸짐하게 배불리 해장할 수 있는 곳, 주말..."
641983,2,"원래는 평택에 있었는데, 연남동에도 최근에 생겨서 방문했는데.. 진짜 줄이 어마어마..."


In [3]:
data = data.drop(index=data.index[data.Rating==2])
data = data.reset_index(drop=True)

In [5]:
data

Unnamed: 0,Rating,Review
0,1,숙성 돼지고기 전문점입니다. 건물 모양 때문에 매장 모양도 좀 특이하지만 쾌적한 편...
1,1,고기가 정말 맛있었어요! 육즙이 가득 있어서 너무 좋았아요 일하시는 분들 너무 친절...
2,1,"잡내 없고 깔끔, 담백한 맛의 순댓국이 순댓국을 안 좋아하는 사람들에게도 술술 넘어..."
3,1,고기 양이 푸짐해서 특 순대국밥을 시킨 기분이 듭니다 맛도 좋습니다 다만 양념장이 ...
4,1,순댓국 자체는 제가 먹어본 순대국밥집 중에서 Top5 안에는 들어요. 그러나 밥 양...
...,...,...
459202,0,"731 배달 시켜 먹었고요, 거리상 1.8km입니다. 배민에서 시켰고 정확히 58만..."
459203,1,송탄 미군부대 근처에 위치한 곳 원래 로컬 맛 집으로 되게 유명했는데 삼대 천왕에 ...
459204,1,집에서 40킬로 정도 떨어져 있는 곳인데도 몇 달에 한 번은 이거 먹으러 일부러 갑...
459205,0,원래 글 안 쓰는데 이거는 정말 다른 분들 위해서 써야 할 것 같네요 방금 포장 주...


In [6]:
data.Rating.value_counts()

1    388281
0     70926
Name: Rating, dtype: int64

In [7]:
data.Review.str.len().describe()

count    459207.000000
mean        138.518812
std         188.789379
min           0.000000
25%          39.000000
50%          76.000000
75%         164.000000
max        3971.000000
Name: Review, dtype: float64

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased')

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

In [9]:
tokenizer

PreTrainedTokenizer(name_or_path='bert-base-multilingual-cased', vocab_size=119547, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [5]:
train_data, valid_data = train_test_split(data, shuffle=True, stratify=data.Rating, random_state=217, test_size=0.2)

In [6]:
valid_data, test_data = train_test_split(valid_data, stratify=valid_data.Rating, random_state=217, test_size=0.5)

In [12]:
train_data.shape, valid_data.shape, test_data.shape

((367365, 2), (45921, 2), (45921, 2))

In [13]:
np.bincount(train_data.Rating), np.bincount(valid_data.Rating), np.bincount(test_data.Rating)

(array([ 56741, 310624]), array([ 7092, 38829]), array([ 7093, 38828]))

In [7]:
def bert_tokenizer(sent, MAX_LEN):
    encoded_dict = tokenizer.encode_plus(text = sent, add_special_tokens=True, max_length=MAX_LEN, padding='max_length', return_attention_mask=True, truncation=True)
    
    input_ids = encoded_dict['input_ids']
    attention_masks = encoded_dict['attention_mask']
    token_type_ids = encoded_dict['token_type_ids']
    
    return input_ids, attention_masks, token_type_ids

In [8]:
def make_dataset(dataset, MAX_LEN):
    input_ids, attention_masks, token_type_ids = [], [], []
    for each_review in tqdm(dataset['Review']):
        input_id, attention_mask, token_type_id = bert_tokenizer(each_review, MAX_LEN)

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
    
    tensordataset = TensorDataset(torch.tensor(input_ids), torch.tensor(attention_masks), torch.tensor(token_type_ids), torch.LongTensor(dataset['Rating'].values).unsqueeze(dim=1))
    return tensordataset

In [9]:
train_dataset = make_dataset(train_data, 512)

  0%|          | 0/367365 [00:00<?, ?it/s]

In [10]:
valid_dataset = make_dataset(valid_data, 512)
test_dataset = make_dataset(test_data, 512)

  0%|          | 0/45921 [00:00<?, ?it/s]

  0%|          | 0/45921 [00:00<?, ?it/s]

## 동적 MAX_LEN, collate_fn에 따라 dynamic padding을 구현할 수도 있을텐데,, 실력부족

In [None]:
train_dataset[0]

(tensor([   101,   8847,  18622,  12092,   9254, 119192,  11664,   9294,  12092,
           9254,  11664,   9665,  70915,  12092,   9254, 119192,  11664,    102,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,    

In [11]:
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=4)
valid_dataloader = DataLoader(valid_dataset) # default: batch_size=1, shuffle=False
test_dataloader = DataLoader(test_dataset)

In [12]:
class EarlyStopping:
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint_testing.pt'):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        if self.verbose:
            print("")
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [13]:
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [14]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001, weight_decay=5e-6)
#loss_fn = torch.nn.CrossEntropyLoss(reduction='sum')
epochs = 1000
early_stopping = EarlyStopping(patience = 10, verbose = True, path='./bert_best.pt')

In [None]:
sum(p.numel() for p in model.parameters() if p.requires_grad)

177854978

In [15]:
device = torch.device("cuda")

In [16]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [17]:
train_loss_list, train_accuracy_list, valid_loss_list, valid_accuracy_list = [], [], [], []
optimizer.zero_grad()
for epoch in range(epochs):
    t0 = time.time()
    
    train_loss, train_accuracy, valid_loss, valid_accuracy = 0.0, 0.0, 0.0, 0.0
    epoch_accuracy, epoch_loss = 0.0, 0.0
    model.train()
    optimizer.zero_grad()
    for batch in train_dataloader:
        b_input_ids, b_attention_masks, b_token_type_ids, b_labels = tuple(t.to(device) for t in batch)
        out = model(b_input_ids, b_attention_masks, b_token_type_ids, labels=b_labels)
        loss, logits = out.loss, out.logits
        epoch_loss += loss.item()        

        loss.backward()
        optimizer.step()
        pred = torch.argmax(F.softmax(logits, dim=0), dim=1).unsqueeze(dim=1)
        epoch_accuracy += (pred==b_labels).cpu().numpy().mean()
        
        optimizer.zero_grad()
        
    train_loss = float(epoch_loss / len(train_dataloader))
    train_accuracy = float(epoch_accuracy / len(train_dataloader))
    
    train_loss_list.append(train_loss)
    train_accuracy_list.append(train_accuracy)
    
    epoch_accuracy, epoch_loss = 0.0, 0.0
    with torch.no_grad():
        model.eval()
        for batch in valid_dataloader:
            b_input_ids, b_attention_masks, b_token_type_ids, b_labels = tuple(t.to(device) for t in batch)
            out = model(b_input_ids, b_attention_masks, b_token_type_ids, labels=b_labels)
            loss, logits = out.loss, out.logits
            epoch_loss += loss.item()

            pred = torch.argmax(F.softmax(logits, dim=0)).unsqueeze(dim=0)
            epoch_accuracy += (pred==b_labels).cpu().numpy()
               
        valid_loss = float(epoch_loss / len(valid_dataloader))
        valid_accuracy = float(epoch_accuracy / len(valid_dataloader))
        
        valid_loss_list.append(valid_loss)
        valid_accuracy_list.append(valid_accuracy)

        print(f"EPOCH: {epoch}  ||  Elapsed: {format_time(time.time()-t0)}.")
        print(f"   Train_acc: {train_accuracy:.4f} | Train_loss: {train_loss:.4f}  ||  Valid_acc: {valid_accuracy:.4f} | Valid_loss: {valid_loss:.4f}")
        
        early_stopping(valid_loss, model)
        print("")
        if early_stopping.early_stop:
            print("Early stopping")
            break

  pred = torch.argmax(F.softmax(logits)).unsqueeze(dim=0)


EPOCH: 0  ||  Elapsed: 1:03:32.
   Train_acc: 0.8335 | Train_loss: 0.1379  ||  Valid_acc: 0.9582 | Valid_loss: 0.1111

Validation loss decreased (inf --> 0.111131).  Saving model ...

EPOCH: 1  ||  Elapsed: 1:03:29.
   Train_acc: 0.8849 | Train_loss: 0.0997  ||  Valid_acc: 0.9601 | Valid_loss: 0.1129
EarlyStopping counter: 1 out of 10

EPOCH: 2  ||  Elapsed: 1:03:32.
   Train_acc: 0.9064 | Train_loss: 0.0812  ||  Valid_acc: 0.9603 | Valid_loss: 0.1142
EarlyStopping counter: 2 out of 10

EPOCH: 3  ||  Elapsed: 1:03:28.
   Train_acc: 0.9234 | Train_loss: 0.0658  ||  Valid_acc: 0.9602 | Valid_loss: 0.1118
EarlyStopping counter: 3 out of 10

EPOCH: 4  ||  Elapsed: 1:03:28.
   Train_acc: 0.9363 | Train_loss: 0.0537  ||  Valid_acc: 0.9614 | Valid_loss: 0.1218
EarlyStopping counter: 4 out of 10

EPOCH: 5  ||  Elapsed: 1:03:29.
   Train_acc: 0.9445 | Train_loss: 0.0449  ||  Valid_acc: 0.9603 | Valid_loss: 0.1273
EarlyStopping counter: 5 out of 10

EPOCH: 6  ||  Elapsed: 1:03:34.
   Train_acc: 

In [None]:
    epoch_accuracy, epoch_loss = 0.0, 0.0
    with torch.no_grad():
        model.eval()
        for batch in valid_dataloader:
            b_input_ids, b_attention_masks, b_token_type_ids, b_labels = tuple(t.to(device) for t in batch)
            out = model(b_input_ids, b_attention_masks, b_token_type_ids, labels=b_labels)
            loss, logits = out.loss, out.logits
            epoch_loss += loss.item()

            pred = torch.argmax(F.softmax(logits, dim=0)).unsqueeze(dim=0)
            epoch_accuracy += (pred==b_labels).cpu().numpy()
               
        valid_loss = float(epoch_loss / len(valid_dataloader))
        valid_accuracy = float(epoch_accuracy / len(valid_dataloader))
        
        valid_loss_list.append(valid_loss)
        valid_accuracy_list.append(valid_accuracy)

In [24]:
model.load_state_dict(torch.load('./bert_best.pt'))
test_accuracy, epoch_accuracy = 0.0, 0.0
with torch.no_grad():
    model.eval()
    for batch in test_dataloader:
        b_input_ids, b_attention_masks, b_token_type_ids, b_labels = tuple(t.to(device) for t in batch)
        out = model(b_input_ids, b_attention_masks, b_token_type_ids, labels=b_labels)
        loss, logits = out.loss, out.logits

        pred = torch.argmax(F.softmax(logits, dim=0)).unsqueeze(dim=0)
        epoch_accuracy += (pred==b_labels).cpu().numpy()
            
    test_accuracy = float(epoch_accuracy / len(test_dataloader))

In [29]:
test_accuracy

0.15446092201824874

예준님 코멘트

5e-5, 3e-5, 2e-5

Epoch: 2,3,4

Batch size: 16, 32


train 시 vram 17889MiB 소요