# 1. KoBERT 모델 구현

## 라이브러리 설치

In [None]:
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers==3.0.2
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting mxnet
  Downloading mxnet-1.9.1-py3-none-manylinux2014_x86_64.whl (49.1 MB)
[K     |████████████████████████████████| 49.1 MB 1.2 MB/s 
Collecting graphviz<0.9.0,>=0.8.1
  Downloading graphviz-0.8.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: graphviz, mxnet
  Attempting uninstall: graphviz
    Found existing installation: graphviz 0.10.1
    Uninstalling graphviz-0.10.1:
      Successfully uninstalled graphviz-0.10.1
Successfully installed graphviz-0.8.4 mxnet-1.9.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gluonnlp
  Downloading gluonnlp-0.10.0.tar.gz (344 kB)
[K     |████████████████████████████████| 344 kB 4.5 MB/s 
Building wheels for collected packages: gluonnlp
  Building wheel for gluonnlp (setup.py) ... [?25l[?25hdone
  Created wheel for gluonnlp: filename=gluonnlp-0.10.0-cp38-cp38-

In [None]:
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://****@github.com/SKTBrain/KoBERT.git@master
  Cloning https://****@github.com/SKTBrain/KoBERT.git (to revision master) to /tmp/pip-req-build-4qjt5u53
  Running command git clone -q 'https://****@github.com/SKTBrain/KoBERT.git' /tmp/pip-req-build-4qjt5u53
Collecting boto3<=1.15.18
  Downloading boto3-1.15.18-py2.py3-none-any.whl (129 kB)
[K     |████████████████████████████████| 129 kB 5.3 MB/s 
Collecting mxnet<=1.7.0.post2,>=1.4.0
  Downloading mxnet-1.7.0.post2-py2.py3-none-manylinux2014_x86_64.whl (54.7 MB)
[K     |████████████████████████████████| 54.7 MB 15 kB/s 
[?25hCollecting onnxruntime<=1.8.0,==1.8.0
  Downloading onnxruntime-1.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.5 MB)
[K     |████████████████████████████████| 4.5 MB 60.9 MB/s 
[?25hCollecting sentencepiece<=0.1.96,>=0.1.6
  Downloading sentencepiece-0.1.96-cp38-cp38-manylin

In [None]:
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.7 MB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16179 sha256=709f395d351a3d07587135dadcfdd83cba2b20cdc05e0ee743011366b68451ea
  Stored in directory: /root/.cache/pip/wheels/ad/5c/ba/05fa33fa5855777b7d686e843ec07452f22a66a138e290e732
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook

In [None]:
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

In [None]:
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [None]:
# Use the first CUDA device when one is available; otherwise fall back to the
# CPU so that later `.to(device)` calls still work on GPU-less runtimes.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
bertmodel, vocab = get_pytorch_kobert_model()

## 데이터 불러오기 & 전처리

In [None]:
# #구글드라이브 연동
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
chatbot_data = pd.read_csv('/content/drive/MyDrive/텍스트이해/emotin_train_dataset_all.csv')


In [None]:
len(chatbot_data)

In [None]:
chatbot_data.sample(n=10)

In [None]:
# Replace each emotion name in the 'tag' column with its integer class index.
# Rows whose tag is not listed here are left as they are.
emotion_to_index = {
    "기쁨": 0,
    "신뢰": 1,
    "기대": 2,
    "중립": 3,
    "분노": 4,
    "슬픔": 5,
    "혐오": 6,
}
for emotion_name, class_index in emotion_to_index.items():
    chatbot_data.loc[chatbot_data['tag'] == emotion_name, 'tag'] = class_index

In [None]:
# One [sentence, label-as-string] pair per row of the training frame.
data_list = [
    [sentence, str(tag)]
    for sentence, tag in zip(chatbot_data['sentence'], chatbot_data['tag'])
]

In [None]:
print(len(data_list))
print(data_list[0])
print(data_list[1000])
print(data_list[2000])
print(data_list[3000])
print(data_list[4000])
print(data_list[5000])
print(data_list[-1])

In [None]:
#train & test 데이터로 나누기
from sklearn.model_selection import train_test_split

dataset_train, dataset_test = train_test_split(data_list, test_size=0.2, random_state=0)

In [None]:
print(len(dataset_train))
print(len(dataset_test))
print(dataset_train[0])
print(dataset_test[0])

## 학습모델

In [None]:
class BERTDataset(Dataset):
    """Dataset of BERT-transformed sentences paired with integer labels.

    Every record of ``dataset`` is indexed with ``sent_idx`` to get the text
    and with ``label_idx`` to get the label. The text is wrapped in a
    one-element list and passed through ``nlp.data.BERTSentenceTransform``;
    the label is converted with ``np.int32``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [to_bert_input([record[sent_idx]]) for record in dataset]
        self.labels = [np.int32(record[label_idx]) for record in dataset]

    def __getitem__(self, i):
        # Append the label to the transformed-sentence tuple.
        features = self.sentences[i]
        target = (self.labels[i], )
        return features + target

    def __len__(self):
        return len(self.labels)

In [None]:
## Setting parameters
# Maximum sequence length handed to BERTSentenceTransform (max_seq_length).
max_len = 64  
# Batch size used by the DataLoaders.
batch_size = 64    
# Fraction of the total training steps used for scheduler warm-up.
warmup_ratio = 0.1
# Number of passes over the training DataLoader.
num_epochs = 50
# Maximum gradient norm passed to clip_grad_norm_.
max_grad_norm = 1
# Training loss/accuracy is printed every `log_interval` batches.
log_interval = 200
# Learning rate given to AdamW.
learning_rate =  5e-5

In [None]:
# Tokenization
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

In [None]:
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)

In [None]:
data_train[0]

In [None]:
data_test[1]

In [None]:
# Batch the transformed train/test datasets; each loader uses 5 worker processes.
# NOTE(review): no `shuffle` argument is passed, so the training loader
# presumably yields batches in the same order every epoch — confirm this is intended.
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

In [None]:
class BERTClassifier(nn.Module):
    """BERT encoder followed by optional dropout and a linear classifier.

    Args:
        bert: Encoder called as ``bert(input_ids=..., token_type_ids=...,
            attention_mask=...)``. It must return a pair whose second element
            is the pooled output fed to the classifier.
        hidden_size: Feature size of the pooled output.
        num_classes: Number of output classes (emotions).
        dr_rate: Dropout probability applied to the pooled output. Dropout is
            skipped when this is ``None`` or ``0``.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=7, # number of emotions
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids``.

        Row ``i`` is 1 for its first ``valid_length[i]`` positions and 0 elsewhere.
        """
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the classifier output computed from the encoder's pooled output."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        # Without dropout the pooled output goes straight to the classifier, so
        # `out` is always bound regardless of `dr_rate`.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [None]:
model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)

In [None]:
# Prepare optimizer and schedule (linear warmup and decay)
# Split the model parameters into two optimizer groups: parameters whose name
# contains one of the `no_decay` markers get weight_decay 0.0, all others 0.01.
no_decay = ['bias', 'LayerNorm.weight']
decayed_params = []
undecayed_params = []
for param_name, param in model.named_parameters():
    if any(marker in param_name for marker in no_decay):
        undecayed_params.append(param)
    else:
        decayed_params.append(param)

optimizer_grouped_parameters = [
    {'params': decayed_params, 'weight_decay': 0.01},
    {'params': undecayed_params, 'weight_decay': 0.0},
]

In [None]:
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

In [None]:
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

In [None]:
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [None]:
def calc_accuracy(X,Y):
    """Return the fraction of rows of ``X`` whose arg-max column equals ``Y``."""
    _, predicted = X.max(dim=1)
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    n_rows = predicted.size()[0]
    return n_correct / n_rows

In [None]:
train_dataloader

## 학습시키기

In [None]:
# F1 스코어 계산을 위해 사용
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

import numpy as np

# Class index -> emotion name. This must match the encoding applied to the
# training labels (기쁨=0, 신뢰=1, 기대=2, 중립=3, 분노=4, 슬픔=5, 혐오=6).
index_to_tag = {0:'기쁨', 1:'신뢰', 2:'기대', 3:'중립', 4:'분노', 5:'슬픔', 6:'혐오'}


def sequences_to_tag(sequences):
    """Convert sequences of class indices into sequences of emotion names.

    Args:
        sequences: Iterable of sequences whose elements expose ``.item()``
            (e.g. 1-D tensors of predicted or gold class indices).

    Returns:
        A list with one list of emotion names per input sequence.

    Raises:
        KeyError: If an index is not present in ``index_to_tag``.
    """
    return [
        [index_to_tag[pred.item()] for pred in sequence]
        for sequence in sequences
    ]

In [None]:
# 모델 저장코드 추가1
from copy import deepcopy
import gc
# from cpython.Lib import copy
# Train for `num_epochs` epochs. After each epoch the model is evaluated on the
# test loader, and whenever the F1 score improves the model is snapshotted and
# written to disk.
best_f1 = 0
best_model = None

for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0

    # ---- training pass ----
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)

        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))

        gc.collect()
        torch.cuda.empty_cache()
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    # ---- evaluation pass ----
    model.eval()

    out_all = []
    label_all = []

    # No gradients are needed for evaluation; disabling them keeps the stored
    # predictions from holding on to autograd graphs.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)

            out = model(token_ids, valid_length, segment_ids)
            label_all.append(label)
            test_acc += calc_accuracy(out, label)
            max_vals, max_indices = torch.max(out, 1)
            out_all.append(max_indices)

    # Convert per-batch class indices to emotion names for the report.
    pred_tags = sequences_to_tag(out_all)  # predictions
    label_tags = sequences_to_tag(label_all)  # ground truth

    # F1 evaluation
    print(classification_report(label_tags, pred_tags))
    test_f1 = f1_score(label_tags, pred_tags)
    print("F1-score: {:.1%}".format(test_f1))

    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

    # Keep and persist the best model seen so far.
    if test_f1 > best_f1:
        best_f1 = test_f1
        best_model = deepcopy(model)

        torch.save(model, 'best-model-add-50_s7.pt')
        torch.save(best_model.state_dict(),'best-model-params-add-50_s7.pt')
        torch.save({'model': model.state_dict(),'optimizer': optimizer.state_dict()}, 'best-model-add-50_s7' + '.tar')

        gc.collect()
        torch.cuda.empty_cache()

    del out_all
    del label_all

    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# import os
# os.chdir('/content/drive/MyDrive/텍스트이해/models/')
# os.getcwd()


# torch.save(best_model, 'best-model-50_s7.pt')
# torch.save(best_model.state_dict(), 'best-model-params-50_s7.pt')
# torch.save({'model': best_model.state_dict(),'optimizer': optimizer.state_dict()},'best-model-50_s7' + '.tar')    

## 모델 불러오기

In [None]:
#구글드라이브 연동
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

# Switch to the directory holding the saved checkpoints, then restore the model
# from each of the three saved artifacts in turn.
os.chdir('/content/drive/MyDrive/텍스트이해/models/bestmodel_부정추가/')

model1 = torch.load('best-model-add-50_s7_neg.pt')  # load the whole pickled model; the class declaration must be available
model1.load_state_dict(torch.load('best-model-params-add-50_s7_neg.pt'))  # load the saved state_dict and apply it to the model
checkpoint = torch.load('best-model-add-50_s7_neg.tar')   # load the checkpoint dict
model1.load_state_dict(checkpoint['model'])
# Requires `optimizer` to already exist from an earlier cell.
optimizer.load_state_dict(checkpoint['optimizer'])

## 예측하기

In [None]:
#토큰화
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

In [None]:
def predict(predict_sentence):
    """Classify one sentence with ``model1`` and return its emotion name.

    Args:
        predict_sentence: The raw sentence to classify.

    Returns:
        The emotion name for the arg-max class of the first (only) row of
        model output: one of 기쁨, 신뢰, 기대, 중립, 분노, 슬픔, 혐오.

    Raises:
        IndexError: If the arg-max class index is outside the seven known
            emotions.
    """
    emotion_names = ("기쁨", "신뢰", "기대", "중립", "분노", "슬픔", "혐오")

    # '0' is a placeholder label; BERTDataset requires one but it is never read here.
    another_test = BERTDataset([[predict_sentence, '0']], 0, 1, tok, max_len, True, False)
    # A single sentence is loaded in-process; worker processes would only add
    # start-up overhead on every call.
    test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=0)

    model1.eval()

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _ in test_dataloader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)

            out = model1(token_ids, valid_length, segment_ids)

            logits = out[0].detach().cpu().numpy()
            return emotion_names[int(np.argmax(logits))]

In [None]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/텍스트이해/categories/뷰티.csv')

test_sentence = df['sentence']

In [None]:
predict_emo = []
for i in test_sentence:
  predict_emo.append(predict(i))
print(predict_emo)

In [None]:
import pandas as pd
df2 = pd.DataFrame(predict_emo, columns = ['emotion'])
df2

In [None]:
df_merge = pd.merge(df, df2, left_index=True, right_index = True, how = 'left')
df_merge

In [None]:
df_merge.to_csv('/content/drive/MyDrive/텍스트이해/categories/뷰티_result.csv')

# 2. 부정 레이블 예측 -> 학습 데이터로 활용
 - base model에서 성능이 낮은 부정 레이블의 학습 데이터를 추가하기 위함

In [None]:
#토큰화
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

In [None]:
def predict(dataset_another, threshold=0.9):
    """Classify one sentence with ``model1``, rejecting low-confidence results.

    Args:
        dataset_another: The raw sentence to classify.
        threshold: Minimum softmax probability the top class must reach to be
            accepted.

    Returns:
        A list with one entry per row of model output: the arg-max class index
        when its probability is at least ``threshold``, otherwise ``99``.
    """
    # '0' is a placeholder label; BERTDataset requires one but it is never read here.
    another_test = BERTDataset([[dataset_another, '0']], 0, 1, tok, max_len, True, False)
    # A single sentence is loaded in-process; worker processes would only add
    # start-up overhead on every call.
    test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=0)

    model1.eval()

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _ in test_dataloader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)

            out = model1(token_ids, valid_length, segment_ids)

            # Normalise every row of logits over the class dimension.
            probs = torch.softmax(out, dim=1).detach().cpu().numpy()

            test_eval = []
            for row in probs:
                if row.max() >= threshold:
                    test_eval.append(np.argmax(row))
                else:
                    test_eval.append(99)

            return test_eval


In [None]:
import pandas as pd
predict_raw = pd.read_csv('/content/drive/MyDrive/emotion_analysis/부정_예측대상.csv')
# Keep only rows labelled with a negative emotion and renumber them from 0.
negative_emotions = ['분노', '슬픔', '혐오']
is_negative = predict_raw['Emotion'].isin(negative_emotions)
dataset_predict = predict_raw[is_negative].reset_index(drop=True)

In [None]:
# Run the thresholded predictor on every candidate sentence.
results = [predict(sentence) for sentence in dataset_predict['Sentence']]
            
results

In [None]:
# Translate the first class index of every prediction into its emotion name;
# 99 marks a rejected (low-confidence) prediction. Unknown indices are skipped.
index_to_emotion = {
    0: "기쁨",
    1: "신뢰",
    2: "기대",
    3: "중립",
    4: "분노",
    5: "슬픔",
    6: "혐오",
    99: "제외",
}

test_eval = []
for prediction in results:
    idx = prediction[0]
    if idx in index_to_emotion:
        test_eval.append(index_to_emotion[idx])

print(test_eval)

In [None]:
# Attach the predicted emotion to each candidate sentence by row index, drop
# the rejected rows, and rename the columns to 'sentence' / 'tag'.
df = pd.DataFrame(test_eval, columns = ['new_emotion'])
labelled = pd.merge(dataset_predict, df, left_index=True, right_index=True, how='left')
accepted = labelled['new_emotion'] != '제외'
new_negative = labelled.loc[accepted, ['Sentence', 'new_emotion']]
new_negative = new_negative.rename(columns = {'Sentence':'sentence', 'new_emotion':'tag'})

In [None]:
# concat - 해당 데이터를 학습에 사용
chatbot_data = pd.concat([chatbot_data, new_negative])
chatbot_data

# 3. 긍정레이블 문장임베딩 & 군집화
 - 수작업으로 생성한 긍정 데이터들이 레이블별로 군집을 잘 형성하는지 확인하기 위함

In [None]:
!pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 3.1 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 24.9 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 62.9 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 77.0 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 69.9 MB/s 
Building wheels for collected pa

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('jhgan/ko-sroberta-multitask')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
#구글드라이브 연동
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
chatbot_data = pd.read_csv('/content/drive/MyDrive/텍스트이해/emotin_train_dataset_all.csv')

In [None]:
# Keep only the rows tagged with a positive or neutral emotion.
positive_tags = ['기쁨', '기대', '신뢰', '중립']
chatbot_data = chatbot_data[chatbot_data['tag'].isin(positive_tags)]

In [None]:
chatbot_data

Unnamed: 0,Sentence,tag
0,덕분에 주방에서 요리하는게 넘 좋아졌어요,기쁨
1,활동혜택으로 오션월드 입장권 2매까지 주는 폴인러버!,기쁨
2,원호 형님 형님이 올해 우리팀으로 온다고 해서 얼마나 기뻤는지 몰라요,기쁨
3,요리가 차~암 쉽쥬~~~~^^,기쁨
4,제일처럼 기쁘고 좋은데....,기쁨
...,...,...
6995,사람들은 죽어서도 자손들에게 대우 받고 싶은 인간의 본성이 제사라는 문화로 나타났다,중립
6996,서장훈:도대체 이게 무슨 의미가 있습니까~,중립
6997,박병호 보다 강정호가 한수 위다,중립
6998,아닙니다. 빨리갑니다ㅋㅋㅋㅋㅋㅋㅋ같이출발한 우리 숙모 저랑1시간차이났습니다ㅋㅋㅋㅋㅋㅋㅋㅋ,중립


In [None]:
sentences = list(chatbot_data['Sentence'])

In [None]:
embeddings = model.encode(sentences)

In [None]:
len(embeddings)

4000

In [None]:
# clustering
from sklearn.cluster import KMeans

# Fit KMeans with `num_clusters` clusters on the sentence embeddings and keep
# the per-sentence cluster labels.
# NOTE(review): no `random_state` is passed, so cluster ids presumably differ
# between runs — confirm whether reproducibility matters here.
num_clusters = 4
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(embeddings)
cluster_assignment = clustering_model.labels_

In [None]:
cluster_assignment

array([0, 2, 0, ..., 2, 2, 2], dtype=int32)

In [None]:
# Group the sentences by the cluster each one was assigned to.
clustered_sentences = [[] for _ in range(num_clusters)]

for sentence, cluster_id in zip(sentences, cluster_assignment):
    clustered_sentences[cluster_id].append(sentence)

In [None]:
for i, cluster in enumerate(clustered_sentences):
  print('Cluster', i+1)
  print(cluster)
  print("")

Cluster 1
['덕분에 주방에서 요리하는게 넘 좋아졌어요', '원호 형님 형님이 올해 우리팀으로 온다고 해서 얼마나 기뻤는지 몰라요', '하루에 2만명 넘고 너무 좋겠네요!!', '오빠들덕에 재미있는 크리스마스이브가 이렇게 마무리되서 기쁘고 오빠들도 메리크리스마스!!!', '연아양 본인이 만족하고 행복해하니 팬 입장으로서도 뿌듯하네요.', '신바람 나는 LG!!!', '광희가 잘되서 기분좋네!', '이런 저질프로가 폐지되서 정말행복합니다. 대한민국만세', '진짜 골키퍼 하드케리로 승점5점 땄네 ㅋㅋ', '봐라 결국 돌아오잔니ㅋㅋㅋㅋㅋ', '오랜만에 시원하다~', '이재용이 구속안된게 내가 기분이 좋네 ~^^', '속이 다 시원합니다', '비가 촉촉히 와서 좋네요', '동토의 나라가 이렇게 반갑다니.', '기부를 하는 본인이 더 행복하고 마음이 따뜻해진다는걸~~~^^', '찬성하는 정책이다!', '제가 하고싶었던걸 해주시니 가슴이 뻥 뚤리는 기분이네요', '앗싸 꽃피는 춘삼월 좋은소식 대한민국 방방곡곡 울려퍼지겠네', '태극기 걸은 게 맘에 듭니다 !!!!', '생활속의 작은 기쁨 ㅎㅎ0', '그 여자애가 연예인을 좋아해서 전에 생일때도 앨범을 사줬는데 좋아하더라구요.', '귀가따뜻핼.ㅎㅎ', '꽃다발 하나 선물해 보세요^^~~', '인맥도 환영합니다^^', '오늘은 치즈김밥 줘서덕분에 먹으면서 아침 맞이했다.', '그래도 캐럿이라는 이름이 있는것만으로 만족합니다!', '제대해서 좋아~~~~', '오늘 대박 세일가로 스틱 분유 사가지고 왔어요.', '메리크리스마스!!!!~ ㅋ', '오빠의 행동하나하나가우리에게 즐거움을 주네요.', '무조건 증정이니 해볼만 한 이벤트같네요~', '서용빈선수가 1루로복귀한다니 참으로 기쁜소식이군요.', '나오신다길래 조아서 히히히 다보구 이름을 쳐봤는데 펜카페가나오드라구요^^!', '유재석 오라버니 하늘이 파랗게 있으니 날씨가 좋더라고요', '아 오늘 생일이네요 ㅋㅋ', '세봉이들!오늘 드디어 데뷔600일~~~', '덕을 보네

In [None]:
# [sentence, tag] pairs for every remaining row.
all_list = [
    [sentence, tag]
    for sentence, tag in zip(list(chatbot_data.Sentence), list(chatbot_data.tag))
]

In [None]:
tag_dict = dict(all_list)

In [None]:
# For each cluster, look up the gold tag of every member sentence. The
# accumulated result is printed after each cluster.
clusters_tag = []
for cluster_no, cluster in enumerate(clustered_sentences, start=1):
    print('Cluster', cluster_no)
    member_tags = [tag_dict[str(sentence)] for sentence in cluster]
    clusters_tag.append(member_tags)
    print("")
    print(clusters_tag)

Cluster 1

[['기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨', '기쁨

In [None]:
print(len(clusters_tag[0]))
print(len(clusters_tag[1]))
print(len(clusters_tag[2]))
print(len(clusters_tag[3]))

796
577
1465
1162


In [None]:
# Cluster 1
pd.DataFrame(clusters_tag[0]).value_counts()

신뢰    402
기쁨    224
기대    131
중립     39
dtype: int64

In [None]:
# Cluster 2
pd.DataFrame(clusters_tag[1]).value_counts()

기쁨    423
신뢰    149
기대      4
중립      1
dtype: int64

In [None]:
# Cluster 3
pd.DataFrame(clusters_tag[2]).value_counts()

중립    931
기대    220
신뢰    159
기쁨    155
dtype: int64

In [None]:
# Cluster 4
pd.DataFrame(clusters_tag[3]).value_counts()

기대    645
신뢰    290
기쁨    198
중립     29
dtype: int64