# Data-Centric NLP 대회: 주제 분류 프로젝트

## Load Libraries

In [1]:
# !pip install evaluate

In [2]:
import os
import random
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader

import evaluate
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

from sklearn.model_selection import train_test_split

from tokenization_kobert import KoBertTokenizer

## Set Hyperparameters

In [3]:
# One seed shared by every random number generator this notebook relies on.
SEED = 456
for _seed_rng in (
    random.seed,                 # Python's built-in generator
    np.random.seed,              # NumPy's global generator
    torch.manual_seed,           # PyTorch default generator
    torch.cuda.manual_seed,      # current CUDA device
    torch.cuda.manual_seed_all,  # every CUDA device
):
    _seed_rng(SEED)

In [4]:
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE  # bare expression so the notebook displays the chosen device

device(type='cuda')

In [5]:
# All locations are expressed relative to the directory the notebook runs in.
BASE_DIR = os.getcwd()
DATA_DIR, OUTPUT_DIR = (
    os.path.join(BASE_DIR, relative) for relative in ('../data', '../output')
)

## Load Tokenizer and Model

In [6]:
# Checkpoint identifier; the tokenizer is loaded from it here and the k-fold
# cell further down loads a classifier from the same name for every fold.
model_name = 'monologg/kobert'
tokenizer = KoBertTokenizer.from_pretrained(model_name)
# The classifier is intentionally not created in this cell (left for reference):
# model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7).to(DEVICE)

## Define Dataset

In [7]:
# Class names ordered by their integer id; the reverse lookup is derived from
# the list so the two mappings cannot drift apart.
int2label = ["IT과학", "경제", "사회", "생활문화", "세계", "스포츠", "정치"]
label2int = {name: class_id for class_id, name in enumerate(int2label)}

In [19]:
# Load the training table, then remove rows that have no text.
data = pd.read_csv("../data/g2p_clean_train.csv")
# Show the rows about to be removed. ``isnull()`` already returns a boolean
# mask, so it is used directly instead of being compared with ``True``.
display(data[data["text"].isnull()].head())
# dropna removes exactly the rows whose "text" is missing. The remaining rows
# keep their original index labels (nothing is renumbered).
data = data.dropna(subset=["text"])
# The same filter should now display an empty frame.
display(data[data["text"].isnull()].head())
# Earlier alternatives, kept for reference:
# data = data.drop(data.index[30042]) # remove the NaN row
# data.loc[30042, "text"] = "함께 아리랑"

Unnamed: 0,ID,text,target,url,date
30042,ynat-v1_train_30042,,5,https://news.naver.com/main/read.nhn?mode=LS2D...,2018.12.21. 오후 4:14


Unnamed: 0,ID,text,target,url,date


In [10]:
# Inspect the row whose index label is 30042 (label-based lookup, not positional).
# NOTE(review): the loading cell above drops rows with missing text, and its
# displayed output lists 30042 as such a row, so running this afterwards would
# presumably raise KeyError -- confirm the intended execution order.
data.loc[30042]

ID                                      ynat-v1_train_30042
text                                                 함께 아리랑
target                                                    5
url       https://news.naver.com/main/read.nhn?mode=LS2D...
date                                    2018.12.21. 오후 4:14
Name: 30042, dtype: object

In [20]:
class BERTDataset(Dataset):
    """Dataset that tokenizes every text once, at construction time.

    Args:
        data: Mapping-like table (a pandas DataFrame in this notebook) whose
            ``'text'`` and ``'target'`` entries are iterated in lockstep.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')``.
            Its result must be indexable by ``'input_ids'`` and
            ``'attention_mask'``.
        max_length: Sequence length requested from the tokenizer. Defaults to
            40, the value that used to be hard-coded.
    """

    def __init__(self, data, tokenizer, max_length=40):
        self.max_length = max_length
        self.inputs = []
        self.labels = []
        # Never filled in; kept only so the attribute still exists for any
        # code that reads it.
        self.ids = []
        for text, label in zip(data['text'], data['target']):
            encoded = tokenizer(
                text,
                max_length=max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt',
            )
            self.inputs.append(encoded)
            self.labels.append(torch.tensor(label))

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``labels`` of sample ``idx``."""
        encoded = self.inputs[idx]
        return {
            # squeeze(0) removes a leading size-1 axis, which the tokenizer
            # presumably adds when asked for 'pt' tensors of a single text.
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': self.labels[idx].squeeze(0),
        }

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.labels)

In [21]:
# Plain arrays of texts and integer targets, in the row order of ``data``.
raw_texts = data["text"].values
labels = data["target"].values
num_classes = len(set(labels))
# Report how many distinct targets exist and which values they take.
print(
    f"This dataset has {num_classes} classes.",
    f"Classes: {set(labels)}",
    sep="\n",
)

This dataset has 7 classes.
Classes: {0, 1, 2, 3, 4, 5, 6}


In [None]:
# data_train = BERTDataset(dataset_train, tokenizer)
# data_valid = BERTDataset(dataset_valid, tokenizer)

In [23]:
# Collator handed to every Trainer in this notebook, built around the shared tokenizer.
# NOTE(review): BERTDataset already requests padding='max_length', so the
# collator's own padding presumably has nothing left to do -- confirm.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Define Metric

In [24]:
f1 = evaluate.load('f1')


def compute_metrics(eval_pred):
    """Score one evaluation pass with macro-averaged F1.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds one
            row of per-class scores per sample; the predicted class is the
            arg-max along axis 1.

    Returns:
        Whatever ``f1.compute`` returns for the arg-max predictions and the
        reference labels with ``average='macro'``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return f1.compute(
        predictions=predicted_classes,
        references=references,
        average='macro',
    )


## Stratified KFold

In [25]:
# One TrainingArguments object is shared by every fold's Trainer.
training_args = TrainingArguments(
    # --- output location and run modes ---
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    report_to='none',
    seed=SEED,
    # --- logging / evaluation / checkpointing ---
    # Logging and saving are switched off and evaluation runs once per epoch.
    # The *_steps values below are, per the transformers documentation, only
    # consulted when the matching strategy is 'steps'.
    logging_strategy='no',
    logging_steps=100,
    evaluation_strategy='epoch',
    eval_steps=500,
    save_strategy='no',
    save_steps=100,
    save_total_limit=2,
    # --- best-model bookkeeping (loading the best model is disabled) ---
    load_best_model_at_end=False,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    # --- optimisation ---
    num_train_epochs=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=2e-05,
    lr_scheduler_type='linear',
    weight_decay=0.01,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-08,
    # fp16=True
)

In [33]:
# Sanity check of positional row selection followed by conversion to a plain
# list of strings -- the same access pattern the k-fold cell uses to collect
# its validation texts.
data.iloc[[2,1]]["text"].values.tolist()

['내년부터 국가RD 평가 때 논문건수는 반영 않는다', '어버이날 맑다가 흐려져 ... 남부지방 옅은 황사']

In [None]:
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold

# Out-of-fold inference: each fold trains a fresh classifier on the training
# part and scores only the held-out part. Results are collected as
#   indices         -- one array of validation positions per fold
#   pred_probs      -- one (batch, classes) probability array per inference batch
#   text_embeddings -- one array per inference batch, taken from the first
#                      position of the last hidden state
pred_probs = []
text_embeddings = []
indices = []
N_SPLITS = 10
BATCH_SIZE = 2048  # inference batch size; constant, so set once outside the loop
skfold = StratifiedKFold(n_splits=N_SPLITS)  # e.g. 5 splits -> 20% validation per fold
for cnt, (train_index, valid_index) in enumerate(skfold.split(raw_texts, labels), start=1):
    print(f"*** kfold: {cnt}/{N_SPLITS} ***")
    # The split yields positions, hence iloc.
    data_train = BERTDataset(data.iloc[train_index], tokenizer)
    data_valid = BERTDataset(data.iloc[valid_index], tokenizer)
    # Start every fold from the same checkpoint so folds do not leak into each other.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7).to(DEVICE)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=data_train,
        eval_dataset=data_valid,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    model.eval()
    valid_text = data.iloc[valid_index]["text"].values.tolist()
    indices.append(valid_index)
    for batch in tqdm(DataLoader(valid_text, batch_size=BATCH_SIZE)):
        # truncation=True makes max_length take effect, so inference inputs
        # are cut the same way as the training inputs built by BERTDataset.
        inputs = tokenizer(
            batch,
            max_length=40,
            padding=True,
            truncation=True,
            return_tensors="pt",
        ).to(DEVICE)
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)
            logits = outputs.logits
            probs = torch.nn.Softmax(dim=1)(logits).cpu().numpy()
            pred_probs.append(probs)
            # First position of the final hidden layer, one vector per text.
            emb = outputs.hidden_states[-1][:, 0, :].cpu().numpy()
            text_embeddings.append(emb)

In [67]:
# Inspect the validation positions collected by the k-fold cell (one array per
# fold; they are concatenated in the next cell).
indices

[array([   0,    1,    2, ..., 4975, 4979, 4984]),
 array([ 4017,  4019,  4023, ..., 10028, 10034, 10035]),
 array([ 8199,  8204,  8206, ..., 15188, 15193, 15204]),
 array([12121, 12130, 12140, ..., 20089, 20092, 20093]),
 array([16389, 16391, 16397, ..., 25072, 25077, 25082]),
 array([20587, 20599, 20601, ..., 29694, 29695, 29703]),
 array([24437, 24462, 24469, ..., 35330, 35341, 35347]),
 array([28638, 28649, 28658, ..., 39037, 39044, 39048]),
 array([32736, 32749, 32757, ..., 42495, 42498, 42508]),
 array([37589, 37611, 37668, ..., 45674, 45675, 45676])]

In [68]:
# Flatten the per-fold / per-batch pieces into one array each. All three were
# appended in the same order, so their rows stay aligned with one another.
indices, pred_probs, text_embeddings = map(
    np.concatenate, (indices, pred_probs, text_embeddings)
)

In [69]:
# Check that probabilities, embeddings and indices cover the same number of rows.
# print(pred_probs.shape, indices.shape)
print(pred_probs.shape, text_embeddings.shape, indices.shape)

(45677, 7) (45677, 768) (45677,)


## Get text embeddings from the pretrained model (not fine-tuned)

In [56]:
# Load a KoBERT encoder through kobert_transformers and move it to DEVICE; the
# next cell uses it to embed every text with one shared model.
from kobert_transformers import get_kobert_model
transformer = get_kobert_model().to(DEVICE)

loading configuration file https://huggingface.co/monologg/kobert/resolve/main/config.json from cache at /opt/ml/.cache/huggingface/transformers/31dc8da633439f22ed80bede01f337996bc709eb8429f86f2b24e2103558b039.89a06cdfd16840fd89cc5c2493ef63cd0b6068e85f70ac988a3673e2722cab2e
Model config BertConfig {
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 8002
}

loading weights file https://huggingface.co/monologg/kobert/resolve/main/pytorch_model.bin from cache at /opt/ml/.cache/huggingface/transforme

In [58]:
# Embed every text with the single pretrained encoder, visiting rows in the
# order given by ``indices`` so the result lines up with ``pred_probs``.
BATCH_SIZE = 2048
transformer.eval()
text_embeddings = []
for batch_indices in tqdm(DataLoader(indices, batch_size=BATCH_SIZE)):
    # ``indices`` holds positions, hence iloc.
    batch = data.iloc[batch_indices]["text"].values.tolist()
    with torch.no_grad():
        # truncation=True makes max_length take effect. The locals are named
        # ``encoded`` / ``pooled`` so the builtin ``input`` is not shadowed.
        encoded = tokenizer(
            batch,
            max_length=40,
            padding=True,
            truncation=True,
            return_tensors="pt",
        ).to(DEVICE)
        pooled = transformer(**encoded).pooler_output.cpu().numpy()
    text_embeddings.append(pooled)

text_embeddings = np.concatenate(text_embeddings)
print(text_embeddings.shape)

100%|██████████| 12/12 [00:26<00:00,  2.18s/it]

(45677, 768)





## cleanlab

In [70]:
from cleanlab import Datalab
# Texts and labels are taken in ``indices`` order (positional), i.e. the same
# order in which ``pred_probs`` was collected, so row k of every input refers
# to the same sample.
data_dict = {"texts": data.iloc[indices]["text"], "labels": data.iloc[indices]["target"]}
lab = Datalab(data_dict, label_name="labels")
# Run the audit using the out-of-fold probabilities and the text embeddings.
lab.find_issues(pred_probs=pred_probs, features=text_embeddings)

Finding label issues ...
Finding outlier issues ...
Fitting OOD estimator based on provided features ...
Finding near_duplicate issues ...
Audit complete. 7630 issues found in the dataset.


In [71]:
label_issues = lab.get_issues("label")

# Datalab was fed the rows in cross-validation order (``data.iloc[indices]``),
# so row k of ``label_issues`` describes ``data.iloc[indices[k]]`` -- not
# ``raw_texts[k]``. Undo that permutation before combining the columns;
# otherwise each text is paired with another sample's suggestion and score.
if len(indices) != len(raw_texts):
    raise ValueError(
        "indices must be the concatenated 1-D array of validation positions "
        "covering every row exactly once"
    )
restore_order = np.argsort(indices)
label_issues = label_issues.iloc[restore_order].reset_index(drop=True)

# One row per sample, in the positional order of ``data`` / ``raw_texts``.
# NOTE(review): the resulting 0..N-1 index is positional; it is not the same as
# ``data``'s index labels once rows have been dropped from ``data``.
res_df = pd.DataFrame(
    {
        "text": raw_texts,
        "target": labels,
        "suggested": label_issues["predicted_label"],
        "given_label": [int2label[x] for x in labels],
        "suggested_label": label_issues["predicted_label"].apply(lambda x: int2label[x]),
        "label_score": label_issues["label_score"],
        "error": label_issues["is_label_issue"],
    }
)

< 0.01 1245
< 0.02 1777


In [61]:
# Persist the audit table in the current working directory.
res_df.to_csv("cleanlab_result_finetune2.csv") # cv=10; text embeddings from the pretrained model (previously taken from each fold's separate fine-tuned model); fixed the loc -> iloc error

In [83]:
# Reload a previously saved audit table.
# NOTE(review): this reads from the parent directory, while the save cell
# writes to the current directory -- confirm which file is intended.
res_df = pd.read_csv("../cleanlab_result_finetune2.csv")

## error_df

In [84]:
# Rows flagged as label issues, lowest label score first.
error_df = res_df.loc[res_df["error"] == True].sort_values(by="label_score")
# Count how many flagged rows fall below each score threshold.
print("< 0.01", int((error_df["label_score"] < 0.01).sum()))
print("< 0.02", int((error_df["label_score"] < 0.02).sum()))

< 0.01 1245
< 0.02 1764


In [63]:
# Let the notebook render up to 10000 rows so long result tables are not cut off.
pd.options.display.max_rows = 10000

In [None]:
# Show the first 100 rows of error_df for manual review.
error_df.head(100)

In [None]:
# Show up to 1000 flagged rows whose label score is below 0.02 (the trailing
# comment is a disabled per-suggested-label count).
error_df[error_df["label_score"] < 0.02].head(1000)#.groupby("suggested_label")["text"].count()

In [None]:
# Index labels of the flagged rows scoring below 0.02, and the rows themselves.
index = error_df.index[(error_df["label_score"] < 0.02).to_numpy()]
to_change = error_df.loc[index]

In [None]:
### for wandb setting
# os.environ['WANDB_DISABLED'] = 'true'

In [None]:
# Stand-alone Trainer built from whatever `model`, `data_train` and
# `data_valid` currently hold.
# NOTE(review): in this file those names are only assigned inside the k-fold
# loop, so this presumably reuses the last fold's objects -- confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data_train,
    eval_dataset=data_valid,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run training with the Trainer configured in the previous cell.
trainer.train()