In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModel,AutoModelForSequenceClassification
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from collections import OrderedDict

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

train_data = pd.read_csv("./Dataset/unsmile_train_v1.0.tsv", sep="\t")
valid_data = pd.read_csv("./Dataset/unsmile_valid_v1.0.tsv", sep="\t")

In [None]:
MODEL_NAME = "beomi/KcELECTRA-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
MAX_LEN = 128


In [None]:
train_data.shape[0] , valid_data.shape[0]

In [None]:
class MultiHeadDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe["문장"]
        self.binary_label = dataframe["clean"].values
        self.multi_label = dataframe[[ '여성/가족', '남성', '성소수자', '인종/국적', '연령', '지역', '종교', '기타 혐오', '악플/욕설', '개인지칭']].values
        self.max_len = max_len

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'attention_mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'binary_label': torch.tensor(self.binary_label[index], dtype=torch.float),
            'multi_label': torch.tensor(self.multi_label[index], dtype=torch.float)
        }
    


training_set = MultiHeadDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiHeadDataset(valid_data, tokenizer, MAX_LEN)

train_params = {'batch_size': 8,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': 16,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
train_steps = len(training_loader.dataset) // train_params['batch_size']
val_steps = len(testing_loader.dataset) // test_params['batch_size']

In [None]:
class MultiTaskNet(torch.nn.Module):
    def __init__(self,net):
        super(MultiTaskNet, self).__init__()
        self.net = net
        # "beomi/KcELECTRA-base"모델 마지막 레이어의 output개수 768 -> 이 부분 하드 코딩으로 self.n_features = 768 이렇게 말고 다르게 바꿔 넣어야함
        self.n_features = 768
        self.activation = torch.tanh
        
        # fc1 : binary-classfication layer , fc2: multi-classfication layer
        self.net.fc1 = nn.Sequential(OrderedDict([('linear', nn.Linear(self.n_features,self.n_features)),('dropout', nn.Dropout(p=0.1)),('final', nn.Linear(self.n_features, 1))]))
        self.net.fc2 = nn.Sequential(OrderedDict([('linear', nn.Linear(self.n_features,self.n_features)),('dropout', nn.Dropout(p=0.1)),('final', nn.Linear(self.n_features, 10))]))
        
        
        
        

    def forward(self, input_ids, attention_mask,token_type_ids):
        outputs = self.net(input_ids=input_ids, attention_mask=attention_mask,token_type_ids=token_type_ids)
        outputs_cls = outputs.last_hidden_state[:,0].contiguous()
        
        # binary_head는 sigmoid 거쳐서 나옴
        # multi_head는 sigmoid 안 거쳐서 나옴 
        binary_head = torch.sigmoid(self.net.fc1(outputs_cls).squeeze())
        multi_head = self.net.fc2(outputs_cls)
        
        return binary_head,multi_head


In [7]:
net = AutoModel.from_pretrained("beomi/KcELECTRA-base")
model = MultiTaskNet(net)
model.to(device=device)


#BCEWithLogitsLoss -> multi-label classfication 할 때 사용하는 loss , multi-label != multi-class -> 우리 데이터는 멑티 분류할 때 한 데이터에 라벨이 여러개 있는 경우도 있어서 ㅇㅇ  
binary_loss = nn.BCELoss() 
multi_loss = nn.BCEWithLogitsLoss()

LEARNING_RATE = 5e-5
optimizer = torch.optim.AdamW(params =  model.parameters(), lr=LEARNING_RATE)

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraModel: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
model

MultiTaskNet(
  (net): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(50135, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

In [10]:
for epoch in range(10):
    model.train()

    total_training_loss = 0
    total_validation_loss = 0
    training_binary_loss = 0
    training_multi_loss = 0

    validation_binary_loss = 0
    validation_multi_loss = 0

    for i, data in tqdm(enumerate(training_loader)):
        
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        binary_label = data['binary_label'].to(device, dtype = torch.float)
        multi_label = data['multi_label'].to(device, dtype = torch.float)
        
    
        optimizer.zero_grad()
        
        binary_output, multi_output = model(ids, mask, token_type_ids)
    
        loss_1 = binary_loss(binary_output, binary_label)
        loss_2 = multi_loss(multi_output, multi_label)
        
        loss = loss_1 + loss_2 

        loss.backward()
        optimizer.step()
        total_training_loss += loss
        
        training_binary_loss += loss_1.item()
        training_multi_loss += loss_2.item()

    print('EPOCH ', epoch+1)
    print("Training Losses: binary: {}, multi: {}".format(loss_1, loss_2))

    with torch.no_grad():
        model.eval()

        for i, data in tqdm(enumerate(testing_loader)):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            binary_label = data['binary_label'].to(device, dtype = torch.float)
            multi_label = data['multi_label'].to(device, dtype = torch.float)
            
            binary_output, multi_output = model(ids, mask, token_type_ids)
            

            loss_1 = binary_loss(binary_output, binary_label)
            loss_2 = multi_loss(multi_output, multi_label)

            loss = loss_1 + loss_2 
            total_validation_loss += loss

            validation_binary_loss += loss_1.item()
            validation_multi_loss += loss_2.item()
            

            
            
        
        print("Validation Losses: binary: {}, multi: {}".format(loss_1, loss_2))

    avgTrainLoss = total_training_loss / train_steps
    avgValLoss = total_validation_loss / val_steps
    
    print('Average Losses — Training: {} | Validation {}'.format(avgTrainLoss, avgValLoss))
    print() 
    

1876it [02:32, 12.28it/s]


EPOCH  1
Training Losses: binary: 0.11253851652145386, multi: 0.015868904069066048


234it [00:09, 25.82it/s]


Validation Losses: binary: 0.971618115901947, multi: 0.15979619324207306
Average Losses — Training: 0.06437462568283081 | Validation 0.8668431043624878



1876it [02:32, 12.27it/s]


EPOCH  2
Training Losses: binary: 0.001460848841816187, multi: 0.0066737886518239975


234it [00:09, 25.83it/s]


Validation Losses: binary: 0.18912534415721893, multi: 0.08738290518522263
Average Losses — Training: 0.04505448415875435 | Validation 0.8606873750686646



1876it [02:33, 12.25it/s]


EPOCH  3
Training Losses: binary: 0.00039386420394293964, multi: 0.004001200199127197


234it [00:09, 25.78it/s]


Validation Losses: binary: 0.0009473111131228507, multi: 0.07337018102407455
Average Losses — Training: 0.05182472616434097 | Validation 1.0112286806106567



1876it [02:32, 12.28it/s]


EPOCH  4
Training Losses: binary: 0.0012508764630183578, multi: 0.007190556265413761


234it [00:09, 25.70it/s]


Validation Losses: binary: 2.6112937927246094, multi: 0.30537134408950806
Average Losses — Training: 0.04232030361890793 | Validation 0.9199234843254089



1876it [02:33, 12.25it/s]


EPOCH  5
Training Losses: binary: 0.01021643728017807, multi: 0.0013544183457270265


234it [00:09, 25.76it/s]


Validation Losses: binary: 1.0906221866607666, multi: 0.09722980111837387
Average Losses — Training: 0.03724443539977074 | Validation 0.8601778149604797



1876it [02:32, 12.27it/s]


EPOCH  6
Training Losses: binary: 0.006300763692706823, multi: 0.0031862477771937847


234it [00:09, 25.76it/s]


Validation Losses: binary: 0.22826586663722992, multi: 0.08817727863788605
Average Losses — Training: 0.04015176743268967 | Validation 0.9456781148910522



1876it [02:33, 12.21it/s]


EPOCH  7
Training Losses: binary: 0.6700863242149353, multi: 0.08748984336853027


234it [00:09, 25.74it/s]


Validation Losses: binary: 1.432080626487732, multi: 0.3040362000465393
Average Losses — Training: 0.03709809109568596 | Validation 0.8064575791358948



1876it [02:33, 12.24it/s]


EPOCH  8
Training Losses: binary: 0.000338098470820114, multi: 0.001991614932194352


234it [00:09, 25.87it/s]


Validation Losses: binary: 0.4333277940750122, multi: 0.028164368122816086
Average Losses — Training: 0.04220062121748924 | Validation 0.8799358010292053



1876it [02:33, 12.23it/s]


EPOCH  9
Training Losses: binary: 0.0016015124274417758, multi: 0.001310021267272532


234it [00:09, 25.72it/s]


Validation Losses: binary: 0.9457406997680664, multi: 0.09609341621398926
Average Losses — Training: 0.0326753631234169 | Validation 0.9261738657951355



1876it [02:33, 12.25it/s]


EPOCH  10
Training Losses: binary: 0.00028030769317410886, multi: 0.00873784814029932


234it [00:09, 25.78it/s]


Validation Losses: binary: 0.0002108744956785813, multi: 0.1858549863100052
Average Losses — Training: 0.03874655440449715 | Validation 1.0534192323684692



In [75]:
total_validation_acc = 0
validation_binary_acc = 0
validation_multi_acc = 0

total_validation_loss
validation_binary_loss = 0
validation_multi_loss= 0 
with torch.no_grad():
        model.eval()

        for i, data in tqdm(enumerate(testing_loader)):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            binary_label = data['binary_label'].to(device, dtype = torch.float)
            multi_label = data['multi_label'].to(device, dtype = torch.float)
            
            binary_output, multi_output = model(ids, mask, token_type_ids)
            

            loss_1 = binary_loss(binary_output, binary_label)
            loss_2 = multi_loss(multi_output, multi_label)

            loss = loss_1 + loss_2 
            total_validation_loss += loss

            validation_binary_loss += loss_1.item()
            validation_multi_loss += loss_2.item()
            
            
            
            validation_binary_acc += ((binary_output > 0.5).float() == binary_label).float().sum()
            validation_multi_acc += (((multi_output > 0).float() == multi_label).float().sum(axis=1)== 10).float().sum()
        
        validation_binary_acc = 100 * validation_binary_acc / len(testing_set)
        validation_multi_acc = 100 * validation_multi_acc / len(testing_set)
            
        
        print("Validation Losses: binary: {}, multi: {}".format(loss_1, loss_2))
        print("Validation acc: binary {}, multi: {}".format(validation_binary_acc,validation_multi_acc))

234it [00:09, 25.38it/s]

Validation Losses: binary: 1.1268572807312012, multi: 0.2429865151643753
Validation acc: binary 86.96815490722656, multi: 71.04629516601562





In [78]:
def sentence_classification(sentence):
    
    tokenized_sent = tokenizer(
        sentence,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_LEN,
        add_special_tokens=True,
    )
    
    tokenized_sent.to(device)
    
    model.eval()
    with torch.no_grad():
        binary_output,multi_output = model(**tokenized_sent)
    
    print(binary_output)
    if binary_output > 0.5:
        print("Clean")
    else:
        print("No Clean")
    
    print(multi_output)
    multi_output = multi_output.detach().cpu().numpy()
    final_outputs = np.array(multi_output) >= 0 
    
    print(pd.DataFrame(final_outputs,columns=[[ '여성/가족', '남성', '성소수자', '인종/국적', '연령', '지역', '종교', '기타 혐오', '악플/욕설', '개인지칭']]))
    return 
    
sentence_classification("나는 쓰레기같은 너가 좋지 않아")

tensor(5.2116e-05, device='cuda:0')
No Clean
tensor([[ -7.4159, -10.3318,  -9.4131,  -8.7833,  -8.9303,  -6.6012, -10.7025,
          -9.9619,   6.7083,  -9.7401]], device='cuda:0')
   여성/가족     남성   성소수자  인종/국적     연령     지역     종교  기타 혐오 악플/욕설   개인지칭
0  False  False  False  False  False  False  False  False  True  False


In [87]:
tokenizer.save_pretrained('./multi-task-model')
net.save_pretrained('./multi-task-model')

In [8]:
net = AutoModel.from_pretrained("./multi-task-model")
model2 = MultiTaskNet(net)
model2.to(device=device)


Some weights of the model checkpoint at ./multi-task-model were not used when initializing ElectraModel: ['fc1.linear.weight', 'fc1.linear.bias', 'fc2.linear.bias', 'fc2.linear.weight', 'fc2.final.weight', 'fc1.final.bias', 'fc2.final.bias', 'fc1.final.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


MultiTaskNet(
  (net): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(50135, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem