<a href="https://colab.research.google.com/github/sunny0103/DeepLearning_nlp_projects/blob/main/Naver_shopping_reviews/Naver_shopping_reviews_ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the Colab runtime so the data directory under
# /content/drive is reachable from this notebook.
from google.colab import drive
# force_remount=True remounts even if Drive is already mounted in this runtime.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
cd "/content/drive/MyDrive/Data/shopping_reviews"

/content/drive/MyDrive/Data/shopping_reviews


In [3]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc

from tqdm import tqdm, tqdm_notebook
# from glob import glob

# Silence every Python warning for the rest of the session (this also hides
# deprecation notices emitted by the libraries used below).
warnings.filterwarnings('ignore')
# Use matplotlib's built-in 'ggplot' style sheet for any figures.
plt.style.use('ggplot')

In [14]:
!pip install transformers[torch] datasets



In [15]:
!pip install accelerate -U



In [16]:
from datasets import (load_dataset,
                      DatasetDict)

from transformers import (AutoTokenizer,
                          AdamW,
                          AutoModelForSequenceClassification
                          )

import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn as nn

from sklearn.metrics import accuracy_score

import random
import os, gc
from tqdm import tqdm, tqdm_notebook

In [17]:
def seed_everything(seed):
  """Seed every random number generator this notebook relies on.

  Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch,
  and exports ``PYTHONHASHSEED``. cuDNN is also switched to deterministic
  kernels with autotuning disabled, because seeding alone does not make GPU
  convolution results repeatable between runs.

  Args:
    seed: Integer seed shared by all generators.
  """
  random.seed(seed)
  # Read by Python interpreters started after this point (e.g. worker
  # processes); it cannot change hashing in the already-running kernel.
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)
  # torch.manual_seed seeds the generators of all devices, CPU and CUDA alike.
  torch.manual_seed(seed)
  # Trade some speed for run-to-run reproducibility on GPU.
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

# Single definition of the seed: used to seed the RNGs here and reused as
# train_test_split's random_state further down.
SEED = 42
seed_everything(SEED)

In [18]:
# Load the cleaned training reviews. Only the last expression of a cell is
# displayed, so the cell ends with the one value we want to see: (rows, cols).
df = pd.read_csv('./cleaned_train.csv')
df.shape

(25000, 3)

In [19]:
# Drop the `id` column, which is not used as a model input. Rebinding the
# result (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it no longer raises KeyError once `id` is gone.
df = df.drop(columns='id', errors='ignore')
df.head()

Unnamed: 0,reviews,target
0,조아요 처음구입 싸게햇어요,2
1,생각보다 잘 안돼요 매지 바른지 하루밖에 안됐는데ㅠㅠ 25천원가량 주고 사기 너무 ...,1
2,디자인은괜찮은데 상품이 금이가서 교환했는데 두번째받은상품도 까져있고 안쪽에 금이가져...,2
3,기전에 이 제품말고 이마트 트레이더스에서만 팔던 프리미엄 제품을 사용했었습니다. 샘...,2
4,튼튼하고 손목을 잘 받쳐주네요~,5


In [20]:
# Distinct ratings found in `target`, in ascending order; each rating's
# position in that order becomes its class index.
targets = sorted(df['target'].unique())
label_dict = {rating: position for position, rating in enumerate(targets)}
label_dict

{1: 0, 2: 1, 4: 2, 5: 3}

In [21]:
# Encode the raw rating in `target` as a contiguous class index via label_dict
# (built from the unique values of this same column, so every rating has an
# entry). `replace` leaves any value without an entry unchanged.
df['label'] = df.target.replace(label_dict)

In [22]:
# Spot-check that the new `label` column lines up with the original `target`.
df.head()

Unnamed: 0,reviews,target,label
0,조아요 처음구입 싸게햇어요,2,1
1,생각보다 잘 안돼요 매지 바른지 하루밖에 안됐는데ㅠㅠ 25천원가량 주고 사기 너무 ...,1,0
2,디자인은괜찮은데 상품이 금이가서 교환했는데 두번째받은상품도 까져있고 안쪽에 금이가져...,2,1
3,기전에 이 제품말고 이마트 트레이더스에서만 팔던 프리미엄 제품을 사용했었습니다. 샘...,2,1
4,튼튼하고 손목을 잘 받쳐주네요~,5,3


In [23]:
from sklearn.model_selection import train_test_split

# Split the row indices (not the rows themselves) into train and validation.
# The labels are imbalanced, so stratify on them to keep the class
# proportions the same in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    df['label'].values,
    test_size=0.2,
    random_state=SEED,
    stratify=df['label'].values,
)

In [24]:
# Tag every row with the split it belongs to. 'not_set' is the placeholder
# and would survive only on rows that appear in neither index array.
df['data_type'] = 'not_set'

df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'valid'

In [25]:
# Non-null counts per (label, data_type) pair: verifies that the stratified
# split kept the same train/valid ratio inside every class.
df.groupby(['label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,reviews,target
label,data_type,Unnamed: 2_level_1,Unnamed: 3_level_1
0,train,3600,3600
0,valid,900,900
1,train,6400,6400
1,valid,1600,1600
2,train,2000,2000
2,valid,500,500
3,train,8000,8000
3,valid,2000,2000


In [26]:
# Spot-check the `data_type` assignment on the first few rows.
df.head()

Unnamed: 0,reviews,target,label,data_type
0,조아요 처음구입 싸게햇어요,2,1,train
1,생각보다 잘 안돼요 매지 바른지 하루밖에 안됐는데ㅠㅠ 25천원가량 주고 사기 너무 ...,1,0,train
2,디자인은괜찮은데 상품이 금이가서 교환했는데 두번째받은상품도 까져있고 안쪽에 금이가져...,2,1,train
3,기전에 이 제품말고 이마트 트레이더스에서만 팔던 프리미엄 제품을 사용했었습니다. 샘...,2,1,valid
4,튼튼하고 손목을 잘 받쳐주네요~,5,3,train


In [30]:
# One tokenizer per ensemble member, loaded from the same Hub checkpoint name
# that is later passed to predict() for that member. The generator is consumed
# left to right, so the loading order matches the names on the left.
bert_tokenizer, electra_tokenizer, roberta_tokenizer, funnel_tokenizer = (
    AutoTokenizer.from_pretrained(checkpoint)
    for checkpoint in (
        'jason9693/SoongsilBERT-base-beep',
        'kykim/electra-kor-base',
        'jason9693/klue-roberta-small-apeach',
        'kykim/funnel-kor-base',
    )
)

In [31]:
# Default number of tokens every review is padded/truncated to.
max_len = 512


class CustomDataset(Dataset):
  """Map-style dataset that tokenizes one review per item.

  Args:
    dataset: DataFrame with a 'reviews' column, plus a 'label' column when
      ``train_mode`` is True. Its index may be anything: rows are addressed
      by position.
    tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) returning a
      mapping with 'input_ids' and 'attention_mask' tensors of shape
      (1, length) when called with ``return_tensors='pt'``.
    train_mode: When True, items also carry the label tensor.
    max_length: Optional per-dataset token length; falls back to the
      module-level ``max_len`` when omitted.
  """

  def __init__(self, dataset, tokenizer, train_mode=True, max_length=None):
    self.dataset = dataset
    self.train_mode = train_mode
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __getitem__(self, index):
    """Return (input_ids, attention_mask[, label]) for the row at position ``index``."""
    # Samplers hand out positions 0..len-1, so use positional access. Label
    # based `.loc` only works when the frame happens to carry a 0..n-1 index
    # (it breaks, for example, on a filtered train/valid subset).
    text = self.dataset['reviews'].iloc[index]
    if not isinstance(text, str):
      # An empty review is read back from CSV as NaN; tokenizers reject
      # non-string input, so treat missing text as an empty string.
      text = '' if pd.isna(text) else str(text)

    length = max_len if self.max_length is None else self.max_length

    # Calling the tokenizer directly replaces the deprecated encode_plus().
    encoded_data = self.tokenizer(
        text,
        add_special_tokens=True,
        max_length=length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # return_tensors='pt' adds a leading batch axis of size 1; drop it so the
    # DataLoader's collate step can stack items into (batch, length).
    input_ids = encoded_data['input_ids'][0]
    attention_masks = encoded_data['attention_mask'][0]

    if self.train_mode:
      labels = torch.tensor(self.dataset['label'].iloc[index])
      return input_ids, attention_masks, labels
    return input_ids, attention_masks

  def __len__(self):
    """Number of rows in the wrapped DataFrame."""
    return len(self.dataset)

In [33]:
# Number of reviews per batch; passed as batch_size to every test DataLoader.
BATCH_SIZE = 16

In [35]:
# Release unused cached GPU memory held by PyTorch's allocator, then run
# Python's garbage collector. The number shown as the cell output is
# gc.collect()'s return value.
torch.cuda.empty_cache()
gc.collect()

63

In [37]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


In [39]:
def predict(model_name, model_load, testloader):
  """Score every batch of ``testloader`` with one saved classifier.

  Builds the sequence-classification architecture for ``model_name`` with
  ``len(label_dict)`` output classes, overwrites its weights with the state
  dict stored at ``model_load`` and runs inference without gradients.

  Args:
    model_name: Hub identifier of the base architecture.
    model_load: Path to a state-dict file saved with ``torch.save``.
    testloader: Iterable yielding ``(input_ids, attention_mask)`` batches.

  Returns:
    Float array of shape (n_samples, len(label_dict)) holding softmax class
    probabilities, in the order the loader yielded the samples. An array with
    zero rows is returned for an empty loader.
  """
  # tqdm_notebook is deprecated; tqdm.auto selects the notebook or console bar.
  from tqdm.auto import tqdm as progress_bar

  num_labels = len(label_dict)
  model = AutoModelForSequenceClassification.from_pretrained(
      model_name,
      num_labels=num_labels,
      output_attentions=False,
      output_hidden_states=False,
      # The Hub checkpoints may ship a head with a different class count; the
      # re-initialised head is overwritten by load_state_dict just below.
      ignore_mismatched_sizes=True,
  ).to(device)
  model.load_state_dict(torch.load(model_load, map_location=device))
  model.eval()

  batch_probs = []
  with torch.no_grad():
    for input_ids, attention_mask in progress_bar(testloader):
      logits = model(
          input_ids=input_ids.to(device),
          attention_mask=attention_mask.to(device),
      )[0]
      # Normalise to probabilities so that models whose logits live on
      # different scales contribute equally when their outputs are summed.
      batch_probs.append(torch.softmax(logits, dim=1).cpu().numpy())

  if not batch_probs:
    return np.empty((0, num_labels), dtype=np.float32)
  # Concatenate once at the end instead of re-copying the growing array on
  # every batch.
  return np.concatenate(batch_probs)

In [40]:
# Load the cleaned test reviews; index_col=0 turns the first CSV column into
# the DataFrame index (shown as `id` in the output).
test_set = pd.read_csv('./cleaned_test.csv', index_col=0)
test_set

Unnamed: 0_level_0,reviews
id,Unnamed: 1_level_1
0,채소가 약간 시들어 있어요
1,발톱 두껍고 단단한 분들 써도 소용없어요 이 테이프 물렁거리고 힘이없어서 들어 올리...
2,부들부들 좋네요 입어보고 시원하면 또 살게요
3,이런 1. 8 골드 주라니깐 파란개 오네 회사전화걸어도 받지도 않고 머하자는거임?
4,검수도 없이 보내구 불량 배송비 5000원 청구하네요 완전별로 별하나도 아까워요
...,...
24995,사용해보니 좋아요~^^
24996,저렴한가격에. 질좋고. 핏좋고. 너무. 이쁘게. 입고다녀요..
24997,세트상품이라고 써있어서 그런줄 알고 구매했더니 단품이었네요 낚인 느낌도 들고 그러네...
24998,역시 로네펠트!! 좋아요.


In [41]:
# One dataset per ensemble member over the same test frame, each paired with
# that member's own tokenizer. train_mode=False: items carry no label.
bert_dataset = CustomDataset(test_set, bert_tokenizer, train_mode=False)
electra_dataset = CustomDataset(test_set, electra_tokenizer, train_mode=False)
roberta_dataset = CustomDataset(test_set, roberta_tokenizer, train_mode=False)
funnel_dataset = CustomDataset(test_set, funnel_tokenizer, train_mode=False)

# shuffle=False keeps every loader in test_set row order, so the per-model
# outputs line up row by row when they are summed.
bert_testloader = DataLoader(bert_dataset, batch_size=BATCH_SIZE, shuffle=False)
# Alias for the original misspelled name, which the prediction cell still uses.
bart_testloader = bert_testloader
electra_testloader = DataLoader(electra_dataset, batch_size=BATCH_SIZE, shuffle=False)
roberta_testloader = DataLoader(roberta_dataset, batch_size=BATCH_SIZE, shuffle=False)
funnel_testloader = DataLoader(funnel_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [42]:
# State-dict files that predict() loads for each ensemble member (presumably
# the fine-tuned weights; they are produced outside this notebook).
bert_load = './BERT.model'
electra_load = './Electra.model'
roberta_load = './Roberta.model'
funnel_load = './Funnel.model'

# Score the test set with each member in turn. Every call returns one array
# with a row per test review and a column per class.
probs1 = predict('jason9693/SoongsilBERT-base-beep', bert_load, bart_testloader)
probs2 = predict('kykim/electra-kor-base', electra_load, electra_testloader)
probs3 = predict('jason9693/klue-roberta-small-apeach', roberta_load, roberta_testloader)
probs4 = predict('kykim/funnel-kor-base', funnel_load, funnel_testloader)

Downloading pytorch_model.bin:   0%|          | 0.00/393M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at jason9693/SoongsilBERT-base-beep and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1563 [00:00<?, ?it/s]

Downloading pytorch_model.bin:   0%|          | 0.00/473M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at kykim/electra-kor-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1563 [00:00<?, ?it/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/272M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at jason9693/klue-roberta-small-apeach and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1563 [00:00<?, ?it/s]

Downloading pytorch_model.bin:   0%|          | 0.00/744M [00:00<?, ?B/s]

Some weights of FunnelForSequenceClassification were not initialized from the model checkpoint at kykim/funnel-kor-base and are newly initialized: ['classifier.linear_hidden.weight', 'classifier.linear_hidden.bias', 'classifier.linear_out.bias', 'classifier.linear_out.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1563 [00:00<?, ?it/s]

In [43]:
# Soft voting: add the four models' per-class scores element-wise, then pick
# the class with the highest combined score for every review.
probs = probs1 + probs2 + probs3 + probs4
# argmax returns the same indices as the second value of torch.max(..., dim=1)
# without binding `_`; assigning to `_` stops IPython from updating its
# last-output variable.
preds = torch.argmax(torch.tensor(probs), dim=1)

In [44]:
# Load the submission template; its `target` column holds placeholder values
# that are overwritten with the ensemble predictions below.
submission = pd.read_csv('./sample_submission.csv')
submission.head()

Unnamed: 0,id,target
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [45]:
# label_dict maps rating -> class index; invert it here rather than repeating
# the mapping by hand, so the two can never drift apart.
index_to_target = {class_index: rating for rating, class_index in label_dict.items()}

assert len(preds) == len(submission), (
    f'{len(preds)} predictions for {len(submission)} submission rows')

# np.asarray accepts both a CPU tensor and an ndarray. Values are assigned
# positionally, so submission rows must be in the same order as test_set.
predicted_targets = pd.Series(np.asarray(preds)).map(index_to_target)
assert not predicted_targets.isna().any(), 'prediction outside the known class indices'
submission['target'] = predicted_targets.to_numpy()
submission.head()

Unnamed: 0,id,target
0,0,2
1,1,1
2,2,5
3,3,1
4,4,1


In [46]:
# index=False leaves the DataFrame index out, so the file contains only the
# submission's own columns.
submission.to_csv('./submission_ensemble.csv', index=False)