In [None]:
import pandas as pd
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from transformers import ElectraModel, ElectraTokenizer
from tqdm.notebook import tqdm
import torch.nn as nn
import torch.nn.init as init
from sklearn.metrics import f1_score
data = pd.read_csv('/content/dialogues_data.csv')
data.reset_index(drop=True, inplace=True)

In [None]:
data.head()

Unnamed: 0,감정,문장
0,anger,일은 왜 해도 해도 끝이 없을까? 화가 난다.그냥 내가 해결하는 게 나아. 남들한테...
1,anger,이번 달에 또 급여가 깎였어! 물가는 오르는데 월급만 자꾸 깎이니까 너무 화가 나....
2,anger,회사에 신입이 들어왔는데 말투가 거슬려. 그런 애를 매일 봐야 한다고 생각하니까 스...
3,anger,직장에서 막내라는 이유로 나에게만 온갖 심부름을 시켜. 일도 많은 데 정말 분하고 ...
4,anger,얼마 전 입사한 신입사원이 나를 무시하는 것 같아서 너무 화가 나.상사인 나에게 먼...


In [None]:
data['문장'].nunique(), data['감정'].nunique()

(101011, 4)

In [None]:
data.drop_duplicates(subset=['문장'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [None]:
data.loc[(data['감정'] == "anger"), '감정'] = 0
data.loc[(data['감정'] == "happy"), '감정'] = 1
data.loc[(data['감정'] == "panic"), '감정'] = 2
data.loc[(data['감정'] == "sadness"), '감정'] = 3

In [None]:
data

Unnamed: 0,감정,문장
0,0,일은 왜 해도 해도 끝이 없을까? 화가 난다.그냥 내가 해결하는 게 나아. 남들한테...
1,0,이번 달에 또 급여가 깎였어! 물가는 오르는데 월급만 자꾸 깎이니까 너무 화가 나....
2,0,회사에 신입이 들어왔는데 말투가 거슬려. 그런 애를 매일 봐야 한다고 생각하니까 스...
3,0,직장에서 막내라는 이유로 나에게만 온갖 심부름을 시켜. 일도 많은 데 정말 분하고 ...
4,0,얼마 전 입사한 신입사원이 나를 무시하는 것 같아서 너무 화가 나.상사인 나에게 먼...
...,...,...
101006,2,오마이갓 믿을 수가 없어
101007,2,하느님 이거 너무 힘들어요
101008,2,엄청난 쇼
101009,2,오머나 정말 놀랐습니다


In [None]:
# GPU 사용
#device = torch.device("cuda")

In [None]:
class CsixDataset(Dataset):

  def __init__(self, data):
    self.dataset = data
    # 중복제거
    self.dataset.drop_duplicates(subset=['문장'], inplace=True)
    self.tokenizer =  AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")
    #"monologg/koelectra-small-v3-discriminator" small data
    #"monologg/koelectra-base-v3-discriminator" big data

    print(self.dataset.describe())

  def __len__(self):
    return len(self.dataset)

  def __getitem__(self, idx):
    row = self.dataset.iloc[idx, 0:2].values
    text = row[1]
    y = row[0]

    inputs = self.tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=75,
        pad_to_max_length=True,
        add_special_tokens=True
        )

    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask, y

In [None]:
# 데이터셋을 트레이닝과 테스트로 나눔
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# 트레이닝 데이터셋과 테스트 데이터셋 클래스 인스턴스 생성
train_dataset = CsixDataset(train_data)
test_dataset = CsixDataset(test_data)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/263k [00:00<?, ?B/s]

           감정                                                 문장
count   80808                                              80808
unique      4                                              80808
top         1  할머니가 엄마를 들들 볶는 것을 볼 때마다 구역질이 나.할머니는 엄마에게 쌀밥도 못...
freq    22660                                                  1
           감정                   문장
count   20203                20203
unique      4                20203
top         3  내 인생은 행복으로 가득 차 있어요
freq     5591                    1


In [None]:
num_classes = 4  # 클래스 수에 맞게 설정
model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator", num_labels=num_classes)

pytorch_model.bin:   0%|          | 0.00/452M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
text, attention_mask, y = train_dataset[0]
model(text.unsqueeze(0), attention_mask=attention_mask.unsqueeze(0))



SequenceClassifierOutput(loss=None, logits=tensor([[ 0.0523,  0.0228, -0.0807,  0.1499]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
# 모델 레이어 보기
# model = model.to(device)

In [None]:
epochs = 4
batch_size = 16

In [None]:
optimizer = AdamW(model.parameters(), lr=5e-6)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)



In [None]:
train_losses = []
train_accuracies = []
test_losses = []
test_accuracies = []

for epoch in range(epochs):
    # Training
    model.train()
    train_loss = 0.0
    correct_train = 0
    total_train = 0

    for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
        optimizer.zero_grad()
        # y_batch = y_batch.to(device)
        y_pred = model(input_ids_batch, attention_mask=attention_masks_batch)[0]
        loss = F.cross_entropy(y_pred, y_batch)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        _, predicted = torch.max(y_pred, 1)
        correct_train += (predicted == y_batch).sum().item()
        total_train += len(y_batch)

        # Print batch accuracy
        batch_accuracy = correct_train / total_train
        print(f"Epoch {epoch+1}/{epochs}, Batch Accuracy: {batch_accuracy:.4f}")

    train_losses.append(train_loss / len(train_loader))
    train_accuracy = correct_train / total_train
    train_accuracies.append(train_accuracy)

    # Testing
    model.eval()
    test_loss = 0.0
    correct_test = 0
    total_test = 0

    with torch.no_grad():
        for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
            # y_batch = y_batch.to(device)
            y_pred = model(input_ids_batch, attention_mask=attention_masks_batch)[0]
            loss = F.cross_entropy(y_pred, y_batch)

            test_loss += loss.item()

            _, predicted = torch.max(y_pred, 1)
            correct_test += (predicted == y_batch).sum().item()
            total_test += len(y_batch)

    test_losses.append(test_loss / len(test_loader))
    test_accuracy = correct_test / total_test
    test_accuracies.append(test_accuracy)

    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_losses[-1]:.4f}, Train Accuracy: {train_accuracy:.4f}, Test Loss: {test_losses[-1]:.4f}, Test Accuracy: {test_accuracy:.4f}")

    # Calculate F1 score

    predicted_labels = []
    true_labels = []

    with torch.no_grad():
        for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
            # y_batch = y_batch.to(device)
            y_pred = model(input_ids_batch, attention_mask=attention_masks_batch)[0]
            _, predicted = torch.max(y_pred, 1)

            predicted_labels.extend(predicted.cpu().numpy())
            true_labels.extend(y_batch.cpu().numpy())

    f1 = f1_score(true_labels, predicted_labels, average='weighted')
    print("F1 Score:", f1)

  0%|          | 0/5051 [00:00<?, ?it/s]



[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
Epoch 1/4, Batch Accuracy: 0.2993
Epoch 1/4, Batch Accuracy: 0.3019
Epoch 1/4, Batch Accuracy: 0.3009
Epoch 1/4, Batch Accuracy: 0.2977
Epoch 1/4, Batch Accuracy: 0.2969
Epoch 1/4, Batch Accuracy: 0.2971
Epoch 1/4, Batch Accuracy: 0.2996
Epoch 1/4, Batch Accuracy: 0.3030
Epoch 1/4, Batch Accuracy: 0.3031
Epoch 1/4, Batch Accuracy: 0.3043
Epoch 1/4, Batch Accuracy: 0.3054
Epoch 1/4, Batch Accuracy: 0.3065
Epoch 1/4, Batch Accuracy: 0.3066
Epoch 1/4, Batch Accuracy: 0.3077
Epoch 1/4, Batch Accuracy: 0.3087
Epoch 1/4, Batch Accuracy: 0.3134
Epoch 1/4, Batch Accuracy: 0.3153
Epoch 1/4, Batch Accuracy: 0.3188
Epoch 1/4, Batch Accuracy: 0.3196
Epoch 1/4, Batch Accuracy: 0.3213
Epoch 1/4, Batch Accuracy: 0.3212
Epoch 1/4, Batch Accuracy: 0.3219
Epoch 1/4, Batch Accuracy: 0.3193
Epoch 1/4, Batch Accuracy: 0.3200
Epoch 1/4, Batch Accuracy: 0.3232
Epoch 1/4, Batch Accuracy: 0.3263
Epoch 1/4, Batch Accuracy: 0.3285
Epoch 1/4, Batch Accuracy: 0.331

  0%|          | 0/1263 [00:00<?, ?it/s]

Epoch 1/4, Train Loss: 0.6163, Train Accuracy: 0.7640, Test Loss: 0.4689, Test Accuracy: 0.8262


  0%|          | 0/1263 [00:00<?, ?it/s]

F1 Score: 0.8259103717205686


  0%|          | 0/5051 [00:00<?, ?it/s]



[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
Epoch 2/4, Batch Accuracy: 0.8125
Epoch 2/4, Batch Accuracy: 0.8137
Epoch 2/4, Batch Accuracy: 0.8148
Epoch 2/4, Batch Accuracy: 0.8182
Epoch 2/4, Batch Accuracy: 0.8192
Epoch 2/4, Batch Accuracy: 0.8169
Epoch 2/4, Batch Accuracy: 0.8179
Epoch 2/4, Batch Accuracy: 0.8178
Epoch 2/4, Batch Accuracy: 0.8177
Epoch 2/4, Batch Accuracy: 0.8197
Epoch 2/4, Batch Accuracy: 0.8175
Epoch 2/4, Batch Accuracy: 0.8175
Epoch 2/4, Batch Accuracy: 0.8154
Epoch 2/4, Batch Accuracy: 0.8163
Epoch 2/4, Batch Accuracy: 0.8153
Epoch 2/4, Batch Accuracy: 0.8153
Epoch 2/4, Batch Accuracy: 0.8143
Epoch 2/4, Batch Accuracy: 0.8125
Epoch 2/4, Batch Accuracy: 0.8107
Epoch 2/4, Batch Accuracy: 0.8116
Epoch 2/4, Batch Accuracy: 0.8125
Epoch 2/4, Batch Accuracy: 0.8125
Epoch 2/4, Batch Accuracy: 0.8142
Epoch 2/4, Batch Accuracy: 0.8142
Epoch 2/4, Batch Accuracy: 0.8166
Epoch 2/4, Batch Accuracy: 0.8166
Epoch 2/4, Batch Accuracy: 0.8173
Epoch 2/4, Batch Accuracy: 0.819

  0%|          | 0/1263 [00:00<?, ?it/s]

Epoch 2/4, Train Loss: 0.4534, Train Accuracy: 0.8296, Test Loss: 0.4506, Test Accuracy: 0.8339


  0%|          | 0/1263 [00:00<?, ?it/s]

F1 Score: 0.8338941757955769


  0%|          | 0/5051 [00:00<?, ?it/s]



[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
Epoch 3/4, Batch Accuracy: 0.8365
Epoch 3/4, Batch Accuracy: 0.8373
Epoch 3/4, Batch Accuracy: 0.8356
Epoch 3/4, Batch Accuracy: 0.8375
Epoch 3/4, Batch Accuracy: 0.8382
Epoch 3/4, Batch Accuracy: 0.8410
Epoch 3/4, Batch Accuracy: 0.8394
Epoch 3/4, Batch Accuracy: 0.8411
Epoch 3/4, Batch Accuracy: 0.8417
Epoch 3/4, Batch Accuracy: 0.8422
Epoch 3/4, Batch Accuracy: 0.8438
Epoch 3/4, Batch Accuracy: 0.8442
Epoch 3/4, Batch Accuracy: 0.8467
Epoch 3/4, Batch Accuracy: 0.8481
Epoch 3/4, Batch Accuracy: 0.8475
Epoch 3/4, Batch Accuracy: 0.8498
Epoch 3/4, Batch Accuracy: 0.8511
Epoch 3/4, Batch Accuracy: 0.8524
Epoch 3/4, Batch Accuracy: 0.8509
Epoch 3/4, Batch Accuracy: 0.8512
Epoch 3/4, Batch Accuracy: 0.8481
Epoch 3/4, Batch Accuracy: 0.8476
Epoch 3/4, Batch Accuracy: 0.8463
Epoch 3/4, Batch Accuracy: 0.8442
Epoch 3/4, Batch Accuracy: 0.8438
Epoch 3/4, Batch Accuracy: 0.8442
Epoch 3/4, Batch Accuracy: 0.8454
Epoch 3/4, Batch Accuracy: 0.847

  0%|          | 0/1263 [00:00<?, ?it/s]

Epoch 3/4, Train Loss: 0.3949, Train Accuracy: 0.8533, Test Loss: 0.4258, Test Accuracy: 0.8442


  0%|          | 0/1263 [00:00<?, ?it/s]

F1 Score: 0.8441720291336103


  0%|          | 0/5051 [00:00<?, ?it/s]



[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
Epoch 4/4, Batch Accuracy: 0.8714
Epoch 4/4, Batch Accuracy: 0.8715
Epoch 4/4, Batch Accuracy: 0.8692
Epoch 4/4, Batch Accuracy: 0.8670
Epoch 4/4, Batch Accuracy: 0.8672
Epoch 4/4, Batch Accuracy: 0.8651
Epoch 4/4, Batch Accuracy: 0.8653
Epoch 4/4, Batch Accuracy: 0.8665
Epoch 4/4, Batch Accuracy: 0.8677
Epoch 4/4, Batch Accuracy: 0.8668
Epoch 4/4, Batch Accuracy: 0.8679
Epoch 4/4, Batch Accuracy: 0.8671
Epoch 4/4, Batch Accuracy: 0.8633
Epoch 4/4, Batch Accuracy: 0.8606
Epoch 4/4, Batch Accuracy: 0.8617
Epoch 4/4, Batch Accuracy: 0.8610
Epoch 4/4, Batch Accuracy: 0.8585
Epoch 4/4, Batch Accuracy: 0.8596
Epoch 4/4, Batch Accuracy: 0.8580
Epoch 4/4, Batch Accuracy: 0.8592
Epoch 4/4, Batch Accuracy: 0.8585
Epoch 4/4, Batch Accuracy: 0.8587
Epoch 4/4, Batch Accuracy: 0.8581
Epoch 4/4, Batch Accuracy: 0.8583
Epoch 4/4, Batch Accuracy: 0.8594
Epoch 4/4, Batch Accuracy: 0.8596
Epoch 4/4, Batch Accuracy: 0.8614
Epoch 4/4, Batch Accuracy: 0.860

  0%|          | 0/1263 [00:00<?, ?it/s]

Epoch 4/4, Train Loss: 0.3549, Train Accuracy: 0.8687, Test Loss: 0.4369, Test Accuracy: 0.8419


  0%|          | 0/1263 [00:00<?, ?it/s]

F1 Score: 0.8419686054521724


In [None]:
model.eval()

test_correct = 0
test_total = 0

for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
  #y_batch = y_batch.to(device)
  y_pred = model(input_ids_batch, attention_mask=attention_masks_batch)[0]
  _, predicted = torch.max(y_pred, 1)
  test_correct += (predicted == y_batch).sum()
  test_total += len(y_batch)

print("Accuracy:", test_correct.float() / test_total)

  0%|          | 0/1263 [00:00<?, ?it/s]



Accuracy: tensor(0.8419)


In [None]:
# 모델 저장하기
torch.save(model.state_dict(), "modelfinal.pt")

In [None]:
# 토크나이저 설정
tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

In [None]:
koelectra_tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

result = koelectra_tokenizer.tokenize("너는 내년 대선 때 투표할 수 있어?")
print(result)
print([koelectra_tokenizer.encode(token) for token in result])

['너', '##는', '내년', '대선', '때', '투표', '##할', '수', '있', '##어', '?']
[[2, 2267, 3], [2, 7, 7, 2331, 3], [2, 6821, 3], [2, 7167, 3], [2, 2468, 3], [2, 7070, 3], [2, 7, 7, 3758, 3], [2, 2967, 3], [2, 3249, 3], [2, 7, 7, 3114, 3], [2, 35, 3]]


In [None]:
# 감정 예측 함수
def predict_emotion(input_sentence):
    inputs = tokenizer(input_sentence, return_tensors="pt", max_length=75, padding=True, truncation=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    with torch.no_grad():
        output = model(input_ids, attention_mask=attention_mask)
        logits = output.logits

    predicted_label = torch.argmax(logits, dim=1).item()
    emotions = ["anger", "happy", "panic", "sadness"]
    predicted_emotion = emotions[predicted_label]

    return predicted_emotion

In [None]:
class_names = ["anger", "happy", "panic", "sadness"]
def predict_with_prob(input_sentence):
    inputs = tokenizer(input_sentence, return_tensors="pt", max_length=75, padding=True, truncation=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # 모델을 사용하여 감정 예측
    outputs = model(input_ids, attention_mask)
    logits = outputs.logits
    probs = torch.softmax(logits, dim=1)

    # 예측된 감정
    predicted_class_idx = torch.argmax(logits, dim=1).item()
    predicted_emotion = class_names[predicted_class_idx]

    # 각 클래스(감정)별 확률
    class_probs = probs.squeeze().tolist()

    # 결과 반환
    return predicted_emotion, class_probs


In [None]:
# 질문 무한반복하기! 0 입력시 종료
end = 1
while end == 1:
    sentence = input("하고싶은 말을 입력해주세요 : ")
    if sentence == "0":
        break
    print("입력한 문장:", sentence)

    # 감정 예측 및 각 클래스(감정)별 확률 출력
    predicted_emotion, class_probs = predict_with_prob(sentence)
    print("예측된 감정:", predicted_emotion)
    print("각 클래스별 확률:")
    for class_name, prob in zip(class_names, class_probs):
        print(f"{class_name}: {prob*100:.2f}%")
    print("\n")



하고싶은 말을 입력해주세요 : 드디어 끝났다
입력한 문장: 드디어 끝났다
예측된 감정: happy
각 클래스별 확률:
anger: 0.48%
happy: 95.49%
panic: 3.78%
sadness: 0.26%


하고싶은 말을 입력해주세요 : 0


In [None]:
# 모델 저장 경로
model_save_path = '/content/model.pt'

# 모델 저장
torch.save(model.state_dict(), model_save_path)