In [None]:
#!pip install 'cleanlab[datalab]'

In [None]:
# !pip install transformers

In [None]:
# !pip install kobert-transformers

In [None]:
# !pip install scikit-learn
# !pip install cleanlab

In [None]:
import torch, gc
import re 
import string 
import numpy as np
import pandas as pd 
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_predict 
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from transformers import AutoModel
from kobert_transformers import get_tokenizer, get_kobert_model
from tqdm import tqdm
from torch.utils.data import DataLoader

from cleanlab import Datalab

In [None]:
# pd.set_option('display.max_rows', 10000)

In [None]:
# Human-readable class names; a sample's integer target is used as the index
# into this list when results are rendered further down.
int2label = ["ITSci", "Economic", "Society","Life", "World", "Sports", "Politics"]
# Korean equivalents of the same seven names, in the same order:
# int2label = ["IT과학", "경제", "사회", "생활문화", "세계", "스포츠", "정치"]

In [None]:
# Load the training set and discard rows whose "text" is missing.
# reset_index gives the frame a contiguous 0..n-1 index again, so row labels
# coincide with the positions used by everything derived from it later
# (raw_texts, labels, the embeddings and the cleanlab result table).
data = (
    pd.read_csv("data/train.csv")
    .dropna(subset=["text"])
    .reset_index(drop=True)
)

# Text embeddings (KoBERT pooled [CLS] sentence embedding)

In [None]:
# Pull the sentences and their targets out of the frame as NumPy arrays.
raw_texts = data["text"].values
labels = data["target"].values

# Report how many distinct target values occur, and which ones.
unique_labels = set(labels)
num_classes = len(unique_labels)
print(f"This dataset has {num_classes} classes.")
print(f"Classes: {unique_labels}")

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE

In [None]:
# KoBERT tokenizer plus the KoBERT encoder, with the encoder moved onto DEVICE.
tokenizer = get_tokenizer()
transformer = get_kobert_model().to(DEVICE)

In [None]:
# Embed every sentence with the encoder's pooled output, one mini-batch at a time.
BATCH_SIZE = 4096  # 2048 was the alternative tried earlier
transformer.eval()  # inference mode: disables dropout

embedding_batches = []
with torch.no_grad():  # no gradients are needed for feature extraction
    for batch in tqdm(DataLoader(raw_texts, batch_size=BATCH_SIZE)):
        # `encoded` rather than `input`, so the builtin is not shadowed for
        # the rest of the notebook session.
        encoded = tokenizer(
            batch,
            padding=True,
            # Cap sequences at the encoder's position-embedding limit so an
            # unusually long text cannot overflow it.
            # NOTE(review): assumes the model exposes the usual Hugging Face
            # `config.max_position_embeddings` — confirm.
            truncation=True,
            max_length=transformer.config.max_position_embeddings,
            return_tensors="pt",
        ).to(DEVICE)
        pooled = transformer(**encoded).pooler_output
        # Move each batch off the accelerator right away instead of holding
        # every batch in device memory until the final concatenation.
        embedding_batches.append(pooled.cpu())

text_embeddings = torch.cat(embedding_batches).numpy()
print(text_embeddings.shape)

In [None]:
from datasets import Dataset

In [None]:
from skorch import NeuralNetClassifier, NeuralNet
from transformers import AutoModelForSequenceClassification

# Wrap a KoBERT sequence-classification model in a skorch estimator so it can
# be handed to scikit-learn utilities.
model_name = 'monologg/kobert'
# tokenizer = KoBertTokenizer.from_pretrained(model_name) # transformer
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7).to(DEVICE)
# `nn` is only imported in a later cell, so the loss is reached through
# `torch.nn` to keep this cell runnable on a fresh kernel.
# device=DEVICE: skorch moves the module and the data to its own `device`
# setting (documented default 'cpu'), which would otherwise undo `.to(DEVICE)`.
model_skorch = NeuralNet(model, criterion=torch.nn.CrossEntropyLoss, device=DEVICE)

In [None]:
import torch.utils.data

class BERTDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes every text once, at construction time.

    Args:
        data: Mapping-like object with a ``'text'`` and a ``'target'`` entry
            that can be iterated in parallel.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding='max_length', truncation=True, return_tensors='pt')``;
            its result must provide ``'input_ids'`` and ``'attention_mask'`` tensors.
    """

    def __init__(self, data, tokenizer):
        # Pair texts with targets first so both lists stop at the shorter input.
        pairs = list(zip(data['text'], data['target']))
        self.inputs = [
            tokenizer(text, padding='max_length', truncation=True, return_tensors='pt')
            for text, _ in pairs
        ]
        self.labels = [torch.tensor(label) for _, label in pairs]

    def __len__(self):
        """Number of stored samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors with dimension 0 squeezed out."""
        encoded = self.inputs[idx]
        item = {
            key: encoded[key].squeeze(0)
            for key in ('input_ids', 'attention_mask')
        }
        item['labels'] = self.labels[idx].squeeze(0)
        return item

In [None]:
# Tokenize the whole training frame up front (the constructor does all the work).
dataset = BERTDataset(data=data, tokenizer=tokenizer)

In [None]:
# Show the dataset object as the cell output.
dataset

In [None]:
# Out-of-fold class probabilities from 5-fold cross-validation of the
# skorch-wrapped KoBERT model.
# NOTE(review): assumes `dataset` can be sliced by scikit-learn's fold
# indices — confirm, a plain torch Dataset may need an adapter.
pred_probs = cross_val_predict(
    estimator=model_skorch,
    X=dataset,
    y=labels,
    cv=5,
    method="predict_proba",
    verbose=1,
)

# The most probable class per row is compared against the given labels.
predicted_labels = np.argmax(pred_probs, axis=1)
acc = accuracy_score(labels, predicted_labels)
print(f"Cross-validated estimate of accuracy on held-out data: {acc}")

In [None]:
# Scratch check: a softmax over dim=1 turns each row of a random 2x3 tensor
# into a probability distribution.
a = torch.randn(2, 3)
# `nn` is not imported until a later cell, so go through `torch.nn` here to
# keep this cell runnable on a fresh kernel.
m = torch.nn.Softmax(dim=1)
print(a)
m(a)

In [None]:
from torch import nn

class ClassifierModule(nn.Module):
    """Dropout + linear classification head over fixed-size feature vectors.

    ``forward`` returns softmax probabilities, not raw logits.

    Args:
        hidden_size: Width of the input feature vectors (default 768).
        num_labels: Number of output classes (default 7).
        dropout_prob: Dropout probability applied to the input (default 0.1).
    """

    def __init__(self, hidden_size=768, num_labels=7, dropout_prob=0.1):
        super().__init__()
        self.num_labels = num_labels

        self.dropout = nn.Dropout(dropout_prob)
        self.classifier = nn.Linear(hidden_size, num_labels)
        # Experiment notes kept from the original cell:
        #   nn.Softmax(dim=1)    -> acc = 0.577643015084178
        #   nn.LogSoftmax(dim=1) -> acc = 0.04608446263984062
        self.softmax = nn.Softmax(dim=1)

        # self.post_init()

    def forward(self, X):
        """Return class probabilities for ``X``.

        Args:
            X: Float tensor whose last dimension is ``hidden_size``; expected
                to be 2-D (batch, hidden_size) since the softmax runs over dim 1.

        Returns:
            Tensor of probabilities with ``num_labels`` entries along dim 1.
        """
        logits = self.classifier(self.dropout(X))
        return self.softmax(logits)

In [None]:
# Earlier variant with an explicit CrossEntropyLoss, kept for reference:
# model_skorch = NeuralNetClassifier(ClassifierModule, criterion=nn.CrossEntropyLoss) # acc =  0.3926264859776255
# Current variant: skorch's default criterion, trained for 20 epochs.
model_skorch = NeuralNetClassifier(module=ClassifierModule, max_epochs=20)  # recorded note: cross entropy = 0.3926264859776255

In [None]:
# 5-fold cross-validation of the skorch classifier head on the precomputed
# sentence embeddings; every row gets probabilities from a held-out fold.
pred_probs = cross_val_predict(
    estimator=model_skorch,
    X=text_embeddings,
    y=labels,
    cv=5,
    method="predict_proba",
    verbose=1,
)

# Accuracy of the arg-max class against the given labels.
predicted_labels = np.argmax(pred_probs, axis=1)
acc = accuracy_score(y_true=labels, y_pred=predicted_labels)
print(f"Cross-validated estimate of accuracy on held-out data: {acc}")

In [None]:
# Peek at the first row of the predicted probabilities.
pred_probs[0]

# LogisticRegression classifier

In [None]:
# Baseline: logistic regression on the sentence embeddings.
# cross_val_predict returns held-out class probabilities for every row.
model = LogisticRegression(max_iter=400, verbose=1)
pred_probs = cross_val_predict(
    estimator=model,
    X=text_embeddings,
    y=labels,
    method="predict_proba",
    verbose=1,
)

In [None]:
from sklearn.metrics import accuracy_score

# Score the arg-max class of the held-out probabilities against the given labels.
predicted_labels = np.argmax(pred_probs, axis=1)
acc = accuracy_score(y_true=labels, y_pred=predicted_labels)
print(f"Cross-validated estimate of accuracy on held-out data: {acc}")

# Find label issues with cleanlab Datalab

In [None]:
# Bundle the texts and their labels in the dict that is handed to Datalab.
data_dict = dict(texts=raw_texts, labels=labels)

In [None]:
# Audit the dataset with cleanlab, using both the held-out class
# probabilities and the sentence embeddings.
lab = Datalab(data=data_dict, label_name="labels")
lab.find_issues(features=text_embeddings, pred_probs=pred_probs)

In [None]:
# Per-example results for the "label" issue type; preview the first rows.
label_issues = lab.get_issues(issue_name="label")
label_issues.head()

In [None]:
# One row per example: the text, the given target, cleanlab's suggested
# target, both rendered as class names, the label score and the issue flag.
suggested = label_issues["predicted_label"]
res_df = pd.DataFrame({
    "text": raw_texts,
    "target": labels,
    "suggested": suggested,
    "given_label": [int2label[x] for x in labels],
    "suggested_label": suggested.map(lambda class_id: int2label[class_id]),
    "label_score": label_issues["label_score"],
    "error": label_issues["is_label_issue"],
})

In [None]:
# res_df.to_csv("cleanlab_result.csv")

In [None]:
# Number of rows where the suggested target differs from the given one
# ("error") and where the two agree ("right").
disagrees = res_df["target"] != res_df["suggested"]
print("error", int(disagrees.sum()))
print("right", int((~disagrees).sum()))

In [None]:
# Keep only the rows whose given and suggested class names differ, ordered
# by ascending label_score.
names_differ = res_df["given_label"].ne(res_df["suggested_label"])
error_df = res_df.loc[names_differ].sort_values(by="label_score")

In [None]:
# How many mismatching rows fall below each candidate label_score cutoff.
for cutoff in (0.01, 0.02):
    print(f"< {cutoff}", int((error_df["label_score"] < cutoff).sum()))

In [None]:
# Choose the cutoff: rows with a label_score below 0.02 are the ones to fix.
below_cutoff = error_df["label_score"] < 0.02
to_change = error_df.loc[below_cutoff]
to_change.head()

In [None]:
# Bar chart: number of selected rows per suggested class name.
to_change.groupby("suggested_label")["text"].count().plot.bar()

In [None]:
# Bar chart: number of selected rows per given class name.
to_change.groupby("given_label")["text"].count().plot.bar()

In [None]:
# Bar chart: number of selected rows per (given, suggested) class-name pair.
to_change.groupby(["given_label", "suggested_label"])["text"].count().plot.bar()

In [None]:
# Option A: delete the rows selected as label errors.
# The original cell read an `index` variable that no cell defines; derive the
# rows from `to_change` instead.
# NOTE(review): assumes `to_change`'s index holds row positions within
# `raw_texts` / `data` (it comes from the cleanlab result table) — confirm.
# Positions are translated into `data`'s own index labels, which may have
# gaps after the null-text rows were dropped.
rows_to_drop = data.index[to_change.index]
new_train = data.drop(rows_to_drop)

In [None]:
# Option B: keep every row but overwrite the selected targets with the
# suggested class.
# The original cell read `train_df` and `index`, neither of which any cell
# defines; `data` is the frame the labels were taken from, and the rows come
# from `to_change`.
# NOTE(review): assumes `to_change`'s index holds row positions within
# `raw_texts` / `data` — confirm. Positions are translated into `data`'s own
# index labels, which may have gaps after the null-text rows were dropped.
new_train = data.copy()
rows_to_fix = data.index[to_change.index]
# One vectorised assignment replaces the per-row .loc loop; to_numpy() keeps
# the assignment positional rather than index-aligned.
new_train.loc[rows_to_fix, "target"] = to_change["suggested"].to_numpy()

In [None]:
# new_train.to_csv("new_train.csv")

In [None]:
# Read the test split and a fresh, unfiltered copy of the train split.
test_data, train_data = map(pd.read_csv, ("data/test.csv", "data/train.csv"))

In [None]:
# Longest tokenized length (number of input ids) among the cleaned training rows.
max_len = 0
for row in tqdm(data.itertuples()):
    n_ids = len(tokenizer(row.text).input_ids)
    if n_ids > max_len:
        max_len = n_ids

In [None]:
# Extend the running maximum token length with the test split.
# Missing texts are skipped because the tokenizer cannot take a NaN; dropna
# is a no-op when the column has none. Iterating the Series (rather than
# itertuples) also lets tqdm show a total.
for text in tqdm(test_data["text"].dropna()):
    max_len = max(max_len, len(tokenizer(text).input_ids))
print(max_len)

In [None]:
# Extend the running maximum token length with the raw train split.
# `train_data` is re-read from disk without the null-text filter applied to
# `data`, so missing texts are skipped here; the tokenizer cannot take a NaN.
# Iterating the Series (rather than itertuples) also lets tqdm show a total.
for text in tqdm(train_data["text"].dropna()):
    max_len = max(max_len, len(tokenizer(text).input_ids))
print(max_len)

In [None]:
# Character length of every raw training text, then the texts shorter than
# 10 characters, shortest first.
# .str.len() yields NaN for a missing text, where len(x) would raise on the
# unfiltered frame; NaN rows sort last and never pass the `< 10` filter.
train_data["len"] = train_data["text"].str.len()
sen = train_data[["text", "len"]].sort_values("len")
sen[sen["len"] < 10]

In [None]:
# Character length of every test text, then the texts shorter than 10
# characters, shortest first.
# .str.len() yields NaN for a missing text instead of raising the way len(x)
# would; NaN rows sort last and never pass the `< 10` filter.
test_data["len"] = test_data["text"].str.len()
sen = test_data[["text", "len"]].sort_values("len")
sen[sen["len"] < 10]