<a href="https://colab.research.google.com/github/sunny0103/DeepLearning_nlp_projects/blob/main/Naver_shopping_reviews/Naver_shopping_reviews_ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive (forcing a remount if it is already mounted)
# so the data files used by later cells are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# Change the working directory to the data folder; later cells use './'-relative paths.
cd "/content/drive/MyDrive/Data/shopping_reviews"

/content/drive/MyDrive/Data/shopping_reviews


In [3]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc

from tqdm import tqdm, tqdm_notebook
# from glob import glob

# Suppress every Python warning from here on and switch matplotlib to the ggplot style.
warnings.filterwarnings('ignore')
plt.style.use('ggplot')

In [4]:
# Install `transformers` (with its torch extra) and `datasets` into the runtime.
# NOTE(review): versions are not pinned, so a later run may resolve different releases.
!pip install transformers[torch] datasets



In [5]:
# Upgrade `accelerate` to the newest release pip can resolve (-U).
!pip install accelerate -U



In [6]:
from datasets import (load_dataset,
                      DatasetDict)

from transformers import (AutoTokenizer,
                          AdamW,
                          AutoModelForSequenceClassification
                          )

import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn as nn

from sklearn.metrics import accuracy_score

import random
import os
from tqdm import tqdm, tqdm_notebook

In [7]:
def seed_everything(seed):
  """Seed every random number generator this notebook draws from.

  Covers Python's `random`, the PYTHONHASHSEED environment variable, NumPy's
  legacy global RNG and PyTorch (CPU and all CUDA devices). PyTorch has to be
  seeded as well because the shuffled DataLoader and the freshly initialised
  classification head both use torch's generator.

  Args:
    seed: integer seed applied to all generators.
  """
  random.seed(seed)
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  # Silently ignored when CUDA is not available.
  torch.cuda.manual_seed_all(seed)

SEED = 42
seed_everything(SEED)

In [8]:
# Load the training reviews from the working directory.
df=pd.read_csv('./cleaned_train.csv')
df.head()  # not rendered: a notebook cell only displays its final expression
df.shape

(25000, 3)

In [9]:
# Drop the `id` column. Rebinding instead of mutating in place, together with
# errors='ignore', keeps this cell safe to re-run after the column is already gone.
df = df.drop(columns='id', errors='ignore')
df.head()

Unnamed: 0,reviews,target
0,조아요 처음구입 싸게햇어요,2
1,생각보다 잘 안돼요 매지 바른지 하루밖에 안됐는데ㅠㅠ 25천원가량 주고 사기 너무 ...,1
2,디자인은괜찮은데 상품이 금이가서 교환했는데 두번째받은상품도 까져있고 안쪽에 금이가져...,2
3,기전에 이 제품말고 이마트 트레이더스에서만 팔던 프리미엄 제품을 사용했었습니다. 샘...,2
4,튼튼하고 손목을 잘 받쳐주네요~,5


In [10]:
# Distinct rating values in ascending order; a value's position becomes its class index.
targets = sorted(df.target.unique())

# rating value -> contiguous class index (0, 1, 2, ...)
label_dict = {label: idx for idx, label in enumerate(targets)}
label_dict

{1: 0, 2: 1, 4: 2, 5: 3}

In [11]:
# Encode each rating as its class index; every rating is a key of label_dict
# because the dict was built from this column's unique values.
df['label'] = df['target'].map(label_dict)

In [12]:
# Preview the frame now that it carries the encoded `label` column.
df.head()

Unnamed: 0,reviews,target,label
0,조아요 처음구입 싸게햇어요,2,1
1,생각보다 잘 안돼요 매지 바른지 하루밖에 안됐는데ㅠㅠ 25천원가량 주고 사기 너무 ...,1,0
2,디자인은괜찮은데 상품이 금이가서 교환했는데 두번째받은상품도 까져있고 안쪽에 금이가져...,2,1
3,기전에 이 제품말고 이마트 트레이더스에서만 팔던 프리미엄 제품을 사용했었습니다. 샘...,2,1
4,튼튼하고 손목을 잘 받쳐주네요~,5,3


In [13]:
from sklearn.model_selection import train_test_split

# Split the row indices (rather than the rows) into train and validation parts.
# The labels are imbalanced, so the split is stratified on the label.
X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    df.label.values,
    test_size=0.2,
    random_state=SEED,
    stratify=df.label.values,
)

In [14]:
# Start every row as 'not_set', then tag rows by the split their index landed in.
df['data_type'] = 'not_set'

for split_name, split_index in (('train', X_train), ('valid', X_val)):
  df.loc[split_index, 'data_type'] = split_name

In [15]:
# Non-null row counts per (label, data_type) pair, to inspect how each class was split.
df.groupby(['label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,reviews,target
label,data_type,Unnamed: 2_level_1,Unnamed: 3_level_1
0,train,3600,3600
0,valid,900,900
1,train,6400,6400
1,valid,1600,1600
2,train,2000,2000
2,valid,500,500
3,train,8000,8000
3,valid,2000,2000


In [16]:
# Preview the frame with the `data_type` assignment.
df.head()

Unnamed: 0,reviews,target,label,data_type
0,조아요 처음구입 싸게햇어요,2,1,train
1,생각보다 잘 안돼요 매지 바른지 하루밖에 안됐는데ㅠㅠ 25천원가량 주고 사기 너무 ...,1,0,train
2,디자인은괜찮은데 상품이 금이가서 교환했는데 두번째받은상품도 까져있고 안쪽에 금이가져...,2,1,train
3,기전에 이 제품말고 이마트 트레이더스에서만 팔던 프리미엄 제품을 사용했었습니다. 샘...,2,1,valid
4,튼튼하고 손목을 잘 받쳐주네요~,5,3,train


In [17]:
# Full dataset: one frame per split, selected by the data_type tag.
train_set = df.loc[df['data_type'] == 'train']
valid_set = df.loc[df['data_type'] == 'valid']

In [18]:
# # sample dataset
# train_set = df[df.data_type =='train'].sample(n=2000)
# valid_set = df[df.data_type =='valid'].sample(n=500)

In [18]:
# Display name -> pretrained checkpoint identifier passed to `from_pretrained`.
MODEL_NAME = dict(
    Bert='kykim/bert-kor-base',
    Electra='kykim/electra-kor-base',
)

In [20]:
max_len = 512
# The tokenizer is handed plain Python lists of strings, so convert the NumPy arrays.
reviews_train = train_set.reviews.values.tolist()
reviews_valid = valid_set.reviews.values.tolist()


def _encode_reviews(tokenizer, texts, max_length):
  """Tokenize `texts` into PyTorch tensors padded/truncated to `max_length` tokens.

  Uses the tokenizer's `__call__` with `padding='max_length'`, the documented
  replacement for the deprecated `pad_to_max_length=True` argument.
  """
  return tokenizer(
      texts,
      add_special_tokens=True,
      return_attention_mask=True,
      padding='max_length',
      truncation=True,
      max_length=max_length,
      return_tensors='pt',
  )


def Dataset(model_type):
  """Build the train and validation TensorDatasets for one pretrained checkpoint.

  NOTE(review): this function's name shadows `torch.utils.data.Dataset`, which the
  notebook imports earlier; the name is kept because the training loop calls it.

  Args:
    model_type: checkpoint identifier given to `AutoTokenizer.from_pretrained`.

  Returns:
    (train_dataset, valid_dataset), each a TensorDataset of
    (input_ids, attention_mask, label) built from the module-level
    `train_set` / `valid_set` frames.
  """
  tokenizer = AutoTokenizer.from_pretrained(model_type)

  encoded_train = _encode_reviews(tokenizer, reviews_train, max_len)
  encoded_valid = _encode_reviews(tokenizer, reviews_valid, max_len)

  train_dataset = TensorDataset(encoded_train['input_ids'],
                                encoded_train['attention_mask'],
                                torch.tensor(train_set.label.values))
  valid_dataset = TensorDataset(encoded_valid['input_ids'],
                                encoded_valid['attention_mask'],
                                torch.tensor(valid_set.label.values))
  return train_dataset, valid_dataset

In [21]:
# Training configuration.
# NOTE(review): in the cells visible here only BATCH_SIZE is read; the tokenizer
# uses `max_len` and the optimizer uses `LEARNING_RATES`, and WEIGHT_DECAY is not
# passed to the optimizer.
BATCH_SIZE = 16        # samples per DataLoader batch
MAX_LEN = 512          # token length
LEARNING_RATE = 2e-05
WEIGHT_DECAY = 1e-2

In [22]:
def accuracy_metrics(pred, label):
  """Return the fraction of positions where `pred` equals `label`.

  Args:
    pred: predicted class indices (NumPy array or any sequence).
    label: true class indices, same shape as `pred`.

  Returns:
    Accuracy as a Python float in [0, 1]; 0.0 when there are no samples.

  Raises:
    ValueError: if `pred` and `label` have different shapes.
  """
  pred = np.asarray(pred)
  label = np.asarray(label)
  if pred.shape != label.shape:
    raise ValueError(
        'pred and label must have the same shape, got {} and {}'.format(pred.shape, label.shape))
  if label.size == 0:
    # No samples: avoid dividing by zero.
    return 0.0
  # Vectorised comparison instead of Python's built-in sum over array elements.
  return float(np.mean(pred == label))

In [23]:
class Trainer():
  """Fine-tunes a classification model and checkpoints it while validation improves.

  Every batch from the loaders is expected to unpack as
  (input_ids, attention_mask, labels). The model is called as
  `model(input_ids, attention_mask)` and the first element of its output is
  used as the per-class scores.
  """

  def __init__(self, trainloader, validloader, model, optimizer, criterion,device, model_name=None):
    """Store the training components.

    Args:
      trainloader: iterable of training batches.
      validloader: iterable of validation batches.
      model: model to train; it is expected to already live on `device`.
      optimizer: optimizer built over `model`'s parameters.
      criterion: loss called as `criterion(scores, labels)`.
      device: device every batch is moved to.
      model_name: prefix of the checkpoint file names; 'model' when omitted.
    """
    self.trainloader = trainloader
    self.validloader = validloader
    self.model = model
    self.optimizer = optimizer
    self.device = device
    self.criterion = criterion
    # Kept on the instance so checkpoint names do not depend on a notebook global.
    self.model_name = model_name if model_name is not None else 'model'

  def train(self, epochs =1):
    """Train for up to `epochs` epochs, validating and checkpointing after each one.

    A checkpoint './{model_name}_epoch_{epoch}.model' is written after the first
    epoch and after every later epoch whose validation accuracy or validation
    loss is at least as good as the best value seen so far. Training stops at
    the first epoch where neither metric qualifies.
    """
    best_acc, best_loss = None, None
    for epoch in range(1, epochs+1):
      print('======   Epoch:{:1d}/{:1d} Running   ========='.format(epoch, epochs))
      # validate() switches the model to eval mode, so train mode has to be
      # restored at the start of every epoch, not only before the first one.
      self.model.train()
      running_loss = 0.0
      for input_ids, attention_mask, y in tqdm_notebook(self.trainloader):
        input_ids = input_ids.to(self.device)
        attention_mask = attention_mask.to(self.device)
        y = y.to(self.device)

        self.optimizer.zero_grad()
        outputs = self.model(input_ids, attention_mask)[0]
        loss = self.criterion(outputs, y)
        loss.backward()
        self.optimizer.step()
        running_loss += loss.item()
      train_loss = running_loss/len(self.trainloader)
      valid_loss, predictions, labels = self.validate()
      accuracy = accuracy_metrics(predictions, labels)
      tqdm.write('Epoch:{:1d}, Train loss: {:.3f}, Validation loss: {:.3f}, Accuracy: {:.3f}'.format(epoch, train_loss, valid_loss, accuracy))

      is_first_epoch = best_acc is None
      if not is_first_epoch and accuracy < best_acc and valid_loss > best_loss:
        # Neither metric matched or beat its best value: stop early.
        break
      # Track each best value independently so an epoch that improves only one
      # metric cannot overwrite the other metric's best with a worse number.
      best_acc = accuracy if is_first_epoch else max(best_acc, accuracy)
      best_loss = valid_loss if is_first_epoch else min(best_loss, valid_loss)
      torch.save(self.model.state_dict(), f'./{self.model_name}_epoch_{epoch}.model')

  def validate(self):
    """Evaluate the model on the validation loader without tracking gradients.

    Returns:
      (valid_loss, predictions, labels): the mean per-batch loss, and NumPy
      arrays of the argmax class index and the true label for every sample.
      The model is left in eval mode.
    """
    self.model.eval()
    predictions, labels = [], []
    running_loss = 0.0
    with torch.no_grad():
      for input_ids, attention_mask, y in tqdm_notebook(self.validloader):
        input_ids = input_ids.to(self.device)
        attention_mask = attention_mask.to(self.device)
        y = y.to(self.device)

        outputs = self.model(input_ids, attention_mask)[0]
        loss = self.criterion(outputs, y)
        running_loss += loss.item()

        predictions.append(outputs.detach().cpu().numpy().argmax(-1))
        labels.append(y.detach().cpu().numpy())

    valid_loss = running_loss/len(self.validloader)
    predictions =  np.concatenate(predictions, axis=0)
    labels = np.concatenate(labels, axis=0)
    return valid_loss, predictions, labels

In [24]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

LEARNING_RATES = 2e-5  # learning rate given to the optimizer
epochs = 10            # upper bound on training epochs

In [25]:
# Fine-tune every ensemble member in turn. Checkpoints are written by Trainer.train.
for model_name, model_type in MODEL_NAME.items():
  print("====================   Model: {}    ====================".format(model_name))
  train_dataset, valid_dataset = Dataset(model_type)
  trainloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
  validloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
  model = AutoModelForSequenceClassification.from_pretrained(model_type,
                                                             num_labels=len(label_dict),
                                                             output_attentions=False,
                                                             output_hidden_states=False)
  model = model.to(device)
  optimizer = AdamW(model.parameters(), lr =LEARNING_RATES)
  criterion =  nn.CrossEntropyLoss()

  trainer = Trainer(trainloader= trainloader,
                  validloader = validloader,
                  model = model,
                  optimizer = optimizer,
                  criterion = criterion,
                  device = device,
                  model_name = model_name)
  trainer.train(epochs=epochs)

  # Drop every reference to this model and its optimizer state before the next
  # model is loaded; otherwise both stay on the device at the same time, and the
  # last one stays resident after the loop ends.
  del trainer, optimizer, model
  gc.collect()
  torch.cuda.empty_cache()



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at kykim/bert-kor-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

Epoch:1, Train loss: 0.753, Validation loss: 0.695, Accuracy: 0.701


  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

Epoch:2, Train loss: 0.641, Validation loss: 0.703, Accuracy: 0.711


  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

Epoch:3, Train loss: 0.504, Validation loss: 0.781, Accuracy: 0.694


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at kykim/electra-kor-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

Epoch:1, Train loss: 0.763, Validation loss: 0.694, Accuracy: 0.697


  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

Epoch:2, Train loss: 0.642, Validation loss: 0.705, Accuracy: 0.710


  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

Epoch:3, Train loss: 0.532, Validation loss: 0.758, Accuracy: 0.699


In [26]:
# Load the test reviews, using the CSV's first column as the index.
test = pd.read_csv('./cleaned_test.csv', index_col=0)
test

Unnamed: 0_level_0,reviews
id,Unnamed: 1_level_1
0,채소가 약간 시들어 있어요
1,발톱 두껍고 단단한 분들 써도 소용없어요 이 테이프 물렁거리고 힘이없어서 들어 올리...
2,부들부들 좋네요 입어보고 시원하면 또 살게요
3,이런 1. 8 골드 주라니깐 파란개 오네 회사전화걸어도 받지도 않고 머하자는거임?
4,검수도 없이 보내구 불량 배송비 5000원 청구하네요 완전별로 별하나도 아까워요
...,...
24995,사용해보니 좋아요~^^
24996,저렴한가격에. 질좋고. 핏좋고. 너무. 이쁘게. 입고다녀요..
24997,세트상품이라고 써있어서 그런줄 알고 구매했더니 단품이었네요 낚인 느낌도 들고 그러네...
24998,역시 로네펠트!! 좋아요.


In [27]:
max_len = 512

def Testset(model_type):
  """Tokenize the test reviews for one pretrained checkpoint.

  Args:
    model_type: checkpoint identifier given to `AutoTokenizer.from_pretrained`.

  Returns:
    TensorDataset of (input_ids, attention_mask) built from the module-level
    `test` frame, in the frame's row order.
  """
  reviews_test = test.reviews.values.tolist()

  tokenizer = AutoTokenizer.from_pretrained(model_type)

  # `padding='max_length'` is the documented replacement for the deprecated
  # `pad_to_max_length=True`; each review is padded/truncated to max_len tokens.
  encoded_data_test = tokenizer(
      reviews_test,
      add_special_tokens=True,
      return_attention_mask=True,
      padding='max_length',
      truncation=True,
      max_length=max_len,
      return_tensors='pt',
  )

  return TensorDataset(encoded_data_test['input_ids'],
                       encoded_data_test['attention_mask'])

In [28]:
# List each ensemble member's display name next to its checkpoint identifier.
for display_name, checkpoint in MODEL_NAME.items():
  print(display_name, checkpoint)

Bert kykim/bert-kor-base
Electra kykim/electra-kor-base


In [29]:
# Tokenize the test set once per model. The checkpoint ids are read from
# MODEL_NAME so they cannot drift from the ones used for training.
bert_test = Testset(MODEL_NAME['Bert'])
electra_test = Testset(MODEL_NAME['Electra'])

In [30]:
# One loader per tokenized test set; shuffle=False keeps batches in dataset order.
bert_testloader = DataLoader(bert_test, batch_size=BATCH_SIZE, shuffle=False)
electra_testloader = DataLoader(electra_test, batch_size=BATCH_SIZE, shuffle=False)

In [31]:
def predict(model_type, model_load, testloader):
  """Score every batch of `testloader` with a saved fine-tuned checkpoint.

  Args:
    model_type: key of MODEL_NAME selecting the pretrained architecture.
    model_load: path of the state_dict file to load into the model.
    testloader: iterable yielding (input_ids, attention_mask) batches.

  Returns:
    NumPy array with the model's outputs for all batches stacked in loader
    order, or None when the loader yields no batches. Despite the `probs`
    naming used by callers, no softmax is applied here: the values are the
    first element of the model output as-is.
  """
  model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME[model_type],
                                                           num_labels=len(label_dict),
                                                           output_attentions=False,
                                                           output_hidden_states=False).to(device)

  # NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
  model.load_state_dict(torch.load(model_load, map_location=device))
  model.eval()

  # Collect per-batch arrays and concatenate once at the end; concatenating inside
  # the loop would re-copy all earlier results on every batch.
  batch_outputs = []
  with torch.no_grad():
    for input_ids, attention_mask in tqdm_notebook(testloader):
      input_ids = input_ids.to(device)
      attention_mask = attention_mask.to(device)

      outputs = model(input_ids, attention_mask)[0]
      batch_outputs.append(outputs.cpu().numpy())

  if not batch_outputs:
    return None
  return np.concatenate(batch_outputs)

In [32]:
# Score the test set with each model's saved weights. The file names follow the
# './{model_name}_epoch_{epoch}.model' pattern written by Trainer.train; epoch 2 is
# the last epoch that was checkpointed for both models in the training log.
probs1 = predict('Bert','./Bert_epoch_2.model', bert_testloader)
probs2 = predict('Electra','./Electra_epoch_2.model', electra_testloader)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at kykim/bert-kor-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1563 [00:00<?, ?it/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at kykim/electra-kor-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1563 [00:00<?, ?it/s]

In [38]:
 probs = probs1 + probs2
_, preds = torch.max(torch.tensor(probs), dim=1)

In [39]:
# Predicted class index for each test row.
preds

tensor([1, 0, 3,  ..., 1, 3, 1])

In [40]:
# Load the submission template from the working directory and preview it.
submission = pd.read_csv('./sample_submission.csv')
submission.head()

Unnamed: 0,id,target
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [41]:
# label dictionary: {1: 0, 2: 1, 4: 2, 5: 3}
# Invert the training-time encoding (rating -> class index) instead of retyping
# it by hand, so the decoding cannot drift from label_dict.
index_to_target = {idx: target for target, idx in label_dict.items()}

# Convert the predictions to a NumPy-backed Series aligned with the template
# rows; a length mismatch raises here instead of writing a misaligned column.
predicted_index = pd.Series(np.asarray(preds), index=submission.index)
submission['target'] = predicted_index.map(index_to_target)
submission.head()

Unnamed: 0,id,target
0,0,2
1,1,1
2,2,5
3,3,1
4,4,1


In [42]:
# Write the ensemble predictions to CSV without the DataFrame index column.
submission.to_csv('./submission_ensemble.csv', index=False)