<a href="https://colab.research.google.com/github/sunny0103/DeepLearning_nlp_projects/blob/main/korea_news_topics/korea_news_topics_classification_ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Attach Google Drive so the data files and fine-tuned checkpoints are reachable.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [2]:
# Use the explicit line magic: a bare `cd` only works while IPython automagic is on.
# All relative paths below ('./topic_dict.csv', './Bert.model', ...) resolve from here.
%cd "/content/drive/MyDrive/Data/KorNews_topic"

/content/drive/MyDrive/Data/KorNews_topic


In [3]:
# %pip installs into the running kernel's environment (plain !pip may target another one).
# Pinned to the release this notebook was executed with; quoting keeps the shell
# from treating the [extras] brackets as a glob pattern.
%pip install -q "transformers[sentencepiece]==4.34.0"

Collecting transformers[sentencepiece]
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[sentencepiece])
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers[sentencepiece])
  Downloading tokenizers-0.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m71.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[sentencepiece])
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import random, os, gc
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence

from transformers import (AutoTokenizer,
                          AdamW,
                          AutoModelForSequenceClassification)

# Apply the ggplot look to every matplotlib figure drawn in this notebook.
plt.style.use(style='ggplot')

In [5]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to Python's `random`, NumPy, and PyTorch
            (CPU and all CUDA devices). It is also exported as the
            PYTHONHASHSEED environment variable.

    Side effects:
        Switches cuDNN to deterministic kernels and disables its autotuner so
        repeated GPU runs pick the same algorithms.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Safe to call without a GPU: the call is ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # Without these two flags cuDNN may select different (non-deterministic)
    # kernels from run to run even when every generator is seeded.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# Single source of truth for the seed: define it first, then apply it.
SEED = 42
seed_everything(SEED)

In [6]:
# Lookup table pairing each topic name with its integer class id.
topic = pd.read_csv(filepath_or_buffer='./topic_dict.csv')
display(topic)

Unnamed: 0,topic,topic_idx
0,IT과학,0
1,경제,1
2,사회,2
3,생활문화,3
4,세계,4
5,스포츠,5
6,정치,6


In [7]:
# Build the id -> name map from the explicit `topic_idx` column instead of the
# DataFrame's row index, so the mapping stays correct even if the CSV rows are
# reordered. `.tolist()` yields plain Python ints/strs for the model config.
id2label = dict(zip(topic['topic_idx'].tolist(), topic['topic'].tolist()))
# Inverse map (name -> id); `idx` avoids shadowing the builtin `id`.
label2id = {label: idx for idx, label in id2label.items()}
print(id2label)
print(label2id)

{0: 'IT과학', 1: '경제', 2: '사회', 3: '생활문화', 4: '세계', 5: '스포츠', 6: '정치'}
{'IT과학': 0, '경제': 1, '사회': 2, '생활문화': 3, '세계': 4, '스포츠': 5, '정치': 6}


In [8]:
# Unlabelled headlines to classify; preview the first rows.
test_set = pd.read_csv(filepath_or_buffer='./test_data.csv')
test_set.head(n=5)

Unnamed: 0,index,title
0,45654,유튜브 내달 2일까지 크리에이터 지원 공간 운영
1,45655,어버이날 맑다가 흐려져…남부지방 옅은 황사
2,45656,내년부터 국가RD 평가 때 논문건수는 반영 않는다
3,45657,김명자 신임 과총 회장 원로와 젊은 과학자 지혜 모을 것
4,45658,회색인간 작가 김동식 양심고백 등 새 소설집 2권 출간


In [9]:
# Default token budget per title; used when a dataset is built without `max_length`.
max_len = 64

class CustomDataset(Dataset):
  """Torch dataset that tokenizes the `title` column of a DataFrame on access.

  Args:
    dataset: DataFrame with a `title` column and, when `train_mode` is True,
      a `topic_idx` column holding the label.
    tokenizer: Object exposing `encode_plus(text, ...)` that returns a mapping
      with `input_ids` and `attention_mask` batch tensors (e.g. a Hugging Face
      tokenizer).
    train_mode: When True, `__getitem__` also returns the label tensor.
    max_length: Optional per-instance token budget. When None (default) the
      module-level `max_len` is read at access time, as before.
  """

  def __init__(self, dataset, tokenizer, train_mode=True, max_length=None):
    self.dataset = dataset
    self.train_mode = train_mode
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __getitem__(self, index):
    """Return `(input_ids, attention_mask[, label])` for the row at position `index`."""
    # DataLoader samplers hand out positions 0..len-1, so look rows up by
    # position (.iloc). Label-based .loc breaks on any frame whose index is
    # not the default 0..n-1 range (e.g. after a train/validation split).
    text = self.dataset['title'].iloc[index]
    token_budget = max_len if self.max_length is None else self.max_length

    encoded_data = self.tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=token_budget,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    # The tokenizer returns a batch of one; take its only row.
    input_ids = encoded_data['input_ids'][0]
    attention_masks = encoded_data['attention_mask'][0]

    if self.train_mode:
      labels = torch.tensor(self.dataset['topic_idx'].iloc[index])
      return input_ids, attention_masks, labels
    return input_ids, attention_masks

  def __len__(self):
    """Number of rows in the wrapped DataFrame."""
    return len(self.dataset)

In [10]:
def predict(model_type, model_load, testloader):
  """Run one fine-tuned classifier over `testloader` and collect its outputs.

  Args:
    model_type: Hub identifier passed to
      `AutoModelForSequenceClassification.from_pretrained` to build the
      architecture (its classification head is resized to `len(id2label)`).
    model_load: Path to the saved state dict that overwrites the freshly
      initialised weights.
    testloader: Iterable yielding `(input_ids, attention_mask)` batches.

  Returns:
    A NumPy array with one row per input, stacked in loader order, holding the
    first element of the model output (the raw logits for Hugging Face
    sequence-classification heads - not softmax probabilities). Returns None
    when the loader yields no batches.

  Relies on the notebook globals `device`, `id2label` and `label2id`.
  """
  model = AutoModelForSequenceClassification.from_pretrained(
      model_type,
      num_labels=len(id2label),
      id2label=id2label,
      label2id=label2id,
      ignore_mismatched_sizes=True).to(device)

  # NOTE(review): torch.load unpickles the file, which can execute arbitrary
  # code - only load checkpoints you produced yourself (or pass
  # weights_only=True on torch versions that support it).
  model.load_state_dict(torch.load(model_load, map_location=device))
  model.eval()

  # Gather per-batch arrays and concatenate once at the end; concatenating
  # inside the loop re-copies everything accumulated so far on every step.
  batch_outputs = []
  with torch.no_grad():
    for input_ids, attention_mask in tqdm(testloader):
      outputs = model(input_ids=input_ids.to(device),
                      attention_mask=attention_mask.to(device))[0]
      batch_outputs.append(outputs.cpu().numpy())

  if not batch_outputs:
    return None
  return np.concatenate(batch_outputs)

In [11]:
# Inference batch size shared by all three test loaders.
BATCH_SIZE = 16

# Run on the GPU when one is present, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

# Start from a clean slate: release cached CUDA blocks and unreachable objects.
torch.cuda.empty_cache()
gc.collect()

140

In [12]:
# Hub identifiers of the three backbones that make up the ensemble.
(bert_model_type,
 roberta_model_type,
 electra_model_type) = (
    'jason9693/SoongsilBERT-base-beep',
    'Huffon/klue-roberta-base-nli',
    'monologg/koelectra-base-v3-discriminator',
)

In [14]:
# One tokenizer per backbone, loaded in the same order as the model ids.
# NOTE: `electra_tokenizert` is misspelled, but the name is kept as-is because
# a later cell refers to it.
bert_tokenizer, roberta_tokenizer, electra_tokenizert = (
    AutoTokenizer.from_pretrained(checkpoint_name)
    for checkpoint_name in (bert_model_type, roberta_model_type, electra_model_type)
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/78.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/347k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/245k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/499 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/926 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/263k [00:00<?, ?B/s]

In [16]:
# Wrap the same test frame once per tokenizer; no labels are returned in inference mode.
bert_test, roberta_test, electra_test = (
    CustomDataset(test_set, model_tokenizer, train_mode=False)
    for model_tokenizer in (bert_tokenizer, roberta_tokenizer, electra_tokenizert)
)

In [17]:
# shuffle=False keeps every loader in test-set row order, so the three models'
# outputs line up row-for-row when they are summed.
bert_testloader, roberta_testloader, electra_testloader = (
    DataLoader(model_dataset, batch_size=BATCH_SIZE, shuffle=False)
    for model_dataset in (bert_test, roberta_test, electra_test)
)

In [18]:
# Score the test set with each fine-tuned checkpoint, one model at a time.
probs1, probs2, probs3 = (
    predict(backbone_name, weights_path, loader)
    for backbone_name, weights_path, loader in (
        (bert_model_type, './Bert.model', bert_testloader),
        (roberta_model_type, './Roberta.model', roberta_testloader),
        (electra_model_type, './koelectra.model', electra_testloader),
    )
)

Downloading pytorch_model.bin:   0%|          | 0.00/393M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at jason9693/SoongsilBERT-base-beep and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/571 [00:00<?, ?it/s]

Downloading pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at Huffon/klue-roberta-base-nli and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/571 [00:00<?, ?it/s]

Downloading pytorch_model.bin:   0%|          | 0.00/452M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/571 [00:00<?, ?it/s]

In [19]:
# Sum the three models' per-class scores and pick the highest-scoring class per row.
# Guard against a silent mismatch between the model outputs before adding them.
assert probs1.shape == probs2.shape == probs3.shape, 'model outputs must share one shape'
probs = probs1 + probs2 + probs3
# argmax replaces `_, preds = torch.max(...)`: binding `_` would shadow
# IPython's last-output variable for the rest of the session.
preds = torch.tensor(probs).argmax(dim=1)

In [20]:
# Eyeball the predicted class ids, then show how many rows were predicted.
print(preds)
preds.shape[0]

tensor([2, 3, 2,  ..., 2, 2, 2])


9131

In [21]:
# Submission template; its `topic_idx` column is a placeholder until predictions are filled in.
submission = pd.read_csv(filepath_or_buffer='./sample_submission.csv')
submission.head(n=5)

Unnamed: 0,index,topic_idx
0,45654,0
1,45655,0
2,45656,0
3,45657,0
4,45658,0


In [22]:
# Fail loudly if the prediction count does not match the template's row count.
assert len(preds) == len(submission), 'prediction count must match submission rows'
# Convert explicitly to a NumPy array so pandas stores plain integers instead
# of relying on implicit coercion of a torch tensor.
submission['topic_idx'] = np.asarray(preds)
submission.head()

Unnamed: 0,index,topic_idx
0,45654,2
1,45655,3
2,45656,2
3,45657,2
4,45658,3


In [23]:
# Write the ensemble predictions next to the data files; index=False leaves out the row index.
submission.to_csv(path_or_buf='./ensemble_submit.csv', index=False)