# 감정 분석

In [None]:
!pip install gluonnlp pandas tqdm
!pip install mxnet
!pip install sentencepiece

!pip install transformers
!pip install torch
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

Collecting gluonnlp
  Downloading gluonnlp-0.10.0.tar.gz (344 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m344.5/344.5 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gluonnlp
  Building wheel for gluonnlp (setup.py) ... [?25l[?25hdone
  Created wheel for gluonnlp: filename=gluonnlp-0.10.0-cp310-cp310-linux_x86_64.whl size=661771 sha256=b5664197ec7e534d3421530afb5886f5d83db7f39fceb99f33c67a25ded47d20
  Stored in directory: /root/.cache/pip/wheels/1a/1e/0d/99f55911d90f2b95b9f7c176d5813ef3622894a4b30fde6bd3
Successfully built gluonnlp
Installing collected packages: gluonnlp
Successfully installed gluonnlp-0.10.0
Collecting mxnet
  Downloading mxnet-1.9.1-py3-none-manylinux2014_x86_64.whl (49.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.1/49.1 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Collecting graphviz<0.9.0,>=0.8.1 (from mxnet)
  Dow

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook
import pandas as pd
from sklearn.model_selection import train_test_split
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel,AutoConfig
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print(device)

cuda:0


## Setting Parameters

In [None]:
# Setting parameters
max_len = 100
batch_size = 64
warmup_ratio = 0.1
num_epochs = 20
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [None]:
from tqdm import tqdm, tqdm_notebook
from glob import glob
import gc
import os

## BERT Dataset

In [None]:
#gluonnlp의 BERTSentenceTransform을 사용하여 입력 문장을 BERT 입력 형식에 맞게 변환 -transform 메서드를 사용하여 각 문장을 BERT 입력 형식에 맞게 변환
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer,vocab, max_len,
                 pad, pair):

        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len,vocab=vocab, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

## BERT Classifier

In [None]:
config=AutoConfig.from_pretrained('skt/kobert-base-v1')
config

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/535 [00:00<?, ?B/s]

BertConfig {
  "_name_or_path": "skt/kobert-base-v1",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "author": "Heewon Jeon(madjakarta@gmail.com)",
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "kobert_version": 1.0,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.35.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 8002
}

In [None]:
#입력 데이터의 패딩 부분을 제외하고, 실제 입력에 대한 어텐션 마스크를 생성하는 함수

class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=5,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier =  nn.Sequential(
            nn.Dropout(p=0.2),
            nn.Linear(in_features=hidden_size, out_features=512),
            nn.Linear(in_features=512, out_features=num_classes)
        ) #nn.Linear(hidden_size , num_classes)

        #정규화 레이어 추가 (Layernormalization)
        self.layer_norm = nn.LayerNorm(768)

        #드롭아웃
        self.dropout = nn.Dropout(p=dr_rate)


    def gen_attention_mask(self, token_ids, valid_length): #token_ids는 입력 문장을 토큰화한 결과
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length) #gen_attention_mask 메서드를 사용하여 어텐션 마스크를 생성

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device)) #BERT 모델에 입력을 전달하여 출력을 계산

        pooled_output = self.dropout(pooler)
        normalized_output = self.layer_norm(pooled_output)  # Layer Normalization 적용
        out=self.classifier(normalized_output)

        return out


## Tokenizer & Model 정의

In [None]:
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1') #skt/kobert-base-v1 모델의 사전 학습된 가중치 로드
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]') ##BERTVocab 객체에 tokenizer.vocab_file 사전(vocab) 로드
tok = tokenizer.tokenize

tokenizer_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/371k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


pytorch_model.bin:   0%|          | 0.00/369M [00:00<?, ?B/s]

In [None]:
#모델 구조 확인
BERTClassifier(bertmodel, dr_rate=0.5)

BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

## 데이터 로드 및 전처리
* 데이터셋 출처 : https://aihub.or.kr/mypage/reqst/datareqst/view.do?currMenu=133&topMenu=105&dataReqstSn=378435

In [None]:
dd=pd.read_csv('/content/drive/MyDrive/FineTune_KoBert/Data/감성대화말뭉치(최종데이터)_Validation.csv')
dd.head()

Unnamed: 0.1,Unnamed: 0,연령,성별,상황키워드,신체질환,감정_대분류,감정_소분류,사람문장1,시스템문장1,사람문장2,시스템문장2,사람문장3,시스템문장3
0,1,청년,남성,"진로,취업,직장",해당없음,불안,두려운,이번 프로젝트에서 발표를 하는데 내가 실수하는 바람에 우리 팀이 감점을 받았어. 너...,실수하시다니 정말 미안한 마음이 크겠어요.,내 능력이 부족한 거 같은데 그만 다녀야 될거같아.,능력을 올리려면 어떤 방법이 있을까요?,퇴근 후 여가에 회사 일을 더 열심히 해서 피해가 가지 않도록 해야겠어.,꼭 좋은 결과 있길 바라요.
1,2,청년,남성,"진로,취업,직장",해당없음,불안,두려운,회사에서 중요한 프로젝트를 혼자 하게 됐는데 솔직히 두렵고 무서워.,큰 프로젝트를 혼자 하셔서 고민이 많겠네요.,나에게 너무 크게 느껴지는 중요한 프로젝트라 버거운 느낌이 들어.,프로젝트를 잘하시기 위해서 어떤 걸 할 수 있나요?,동료 직원에게 도움을 요청해서 같이 해결해야겠어.,동료 직원에게 도움을 요청하기로 하셨군요.
2,3,청년,남성,"진로,취업,직장",해당없음,불안,두려운,상사가 너무 무섭게 생겨서 친해지는 게 너무 두려워.,직장 상사가 무섭게 생기셔서 친해지는 게 어렵군요.,무섭게 생겼는데도 업무를 보려면 친해져야 할 것 같단말이야.,상사분과 친해질 수 있는 방법은 무엇이 있을까요?,먼저 다가가서 말을 걸어볼게.,직장 상사와 친해지시면 좋겠네요.
3,4,청년,남성,"진로,취업,직장",해당없음,불안,두려운,이번에 힘들게 들어간 첫 직장이거든. 첫 직장이라서 그런지 너무 긴장된다.,첫 직장이라서 정말 떨리시고 긴장되실 것 같아요.,첫 직장이어서 잘 적응을 할 수 있을지 모르겠어.,잘 적응 하시려면 무엇을 할 수 있을까요?,직장 동료와 상사들이랑 친하게 지내야겠어.,직장에 잘 적응하시길 바라요.
4,5,청년,남성,"진로,취업,직장",해당없음,불안,두려운,직장에서 동료들이랑 관계가 안 좋아질까 봐 걱정돼.,직장 사람들 관계에 대해서 고민이시군요.,내가 낯가림이 심해서 친해질 수 있을지 모르겠어.,직장 사람들과 친해지려면 무슨 방법이 있을까요?,서로 같은 취미를 공유하고 얘기를 나누다 보면 친해질 수 있을 거 같아.,직장 사람들과 좋은 관계를 가지시길 응원해요.


In [None]:
train_set = pd.read_csv('/content/drive/MyDrive/FineTune_KoBert/Data/감성대화말뭉치(최종데이터)_Training.csv')
validation_set = pd.read_csv('/content/drive/MyDrive/FineTune_KoBert/Data/감성대화말뭉치(최종데이터)_Validation.csv')
train_set = train_set.loc[:, ['감정_대분류', '사람문장1']]
validation_set = validation_set.loc[:, ['감정_대분류', '사람문장1']]


train_set.dropna(inplace=True)
validation_set.dropna(inplace=True)
train_set.columns = ['label', 'data']
validation_set.columns = ['label', 'data']

## 상처와 슬픔을 하나의 레이블로 설정

#분노, 불안, 상처, 기쁨, 슬픔, 당황
train_set.loc[(train_set['label'] == '불안'), 'label'] = 0
train_set.loc[(train_set['label'] == '분노'), 'label'] = 1
train_set.loc[(train_set['label'] == '상처'), 'label'] = 2
train_set.loc[(train_set['label'] == '슬픔'), 'label'] = 2
train_set.loc[(train_set['label'] == '당황'), 'label'] = 3
train_set.loc[(train_set['label'] == '기쁨'), 'label'] = 4
train_set['label']=train_set['label'].astype(int)


validation_set.loc[(validation_set['label'] == '불안'), 'label'] = 0
validation_set.loc[(validation_set['label'] == '분노'), 'label'] = 1
validation_set.loc[(validation_set['label'] == '상처'), 'label'] = 2
validation_set.loc[(validation_set['label'] == '슬픔'), 'label'] = 2
validation_set.loc[(validation_set['label'] == '당황'), 'label'] = 3
validation_set.loc[(validation_set['label'] == '기쁨'), 'label'] = 4
validation_set['label']=validation_set['label'].astype(int)

df=pd.concat([train_set,validation_set])
df.dropna(inplace=True)

train_set_data = [[i, str(j)] for i, j in zip(df['data'], df['label'])]

## validation_set_data = [[i, str(j)] for i, j in zip(validation_set['data'], validation_set['label'])]

train_set_data, test_set_data = train_test_split(train_set_data, test_size = 0.2, random_state=4)
train_set_data = BERTDataset(train_set_data, 0, 1, tok, vocab, max_len, True, False)
test_set_data = BERTDataset(test_set_data, 0, 1, tok, vocab, max_len, True, False)
train_dataloader = torch.utils.data.DataLoader(train_set_data, batch_size=batch_size, num_workers=2)
test_dataloader = torch.utils.data.DataLoader(test_set_data, batch_size=batch_size, num_workers=2)


In [None]:
df['label'].value_counts() #클래스 불균형 -> 평가지표 : f1_score

2    20278
0    10433
1    10417
3     9804
4     7339
Name: label, dtype: int64

## 모델 학습

In [None]:
from sklearn.metrics import f1_score
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    #acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    f1score=f1_score(Y.data.cpu().numpy(),max_indices.data.cpu().numpy(),average='macro') #macro 평균은 클래스별 f1-점수에 가중치를 주지 않음. 클래스 크기에 상관없이 모든 클래스를 같은 비중으로 다룬다.
    return f1score

def predict(sentence):
    dataset = [[sentence, '0']]
    test = BERTDataset(dataset, 0, 1, tok, vocab, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size, num_workers=2)
    model.eval()
    answer = 0
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        for logits in out:
            logits = logits.detach().cpu().numpy()
            answer = np.argmax(logits)
    return answer

In [None]:
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight'] # 가중치 감쇠(weight decay)를 적용하여 과적합을 방지
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
] # 가중치 감쇠를 적용하지 않는 파라미터와 적용하는 파라미터를 구분하여 옵티마이저를 설정

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate) #AdamW 옵티마이저를 사용하여 학습을 진행
loss_fn = nn.CrossEntropyLoss()

t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train f1score {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train f1score {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test f1score {}".format(e+1, test_acc / (batch_id+1)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 1.6346133947372437 train f1score 0.20182011098450273
epoch 1 batch id 201 loss 1.5030759572982788 train f1score 0.14849068641358035
epoch 1 batch id 401 loss 1.1466549634933472 train f1score 0.24464554158617724
epoch 1 batch id 601 loss 1.1193196773529053 train f1score 0.3481131643525007
epoch 1 train f1score 0.3883116688306538


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/183 [00:00<?, ?it/s]

epoch 1 test f1score 0.6179307746907626


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 1.0250678062438965 train f1score 0.6477142857142857
epoch 2 batch id 201 loss 0.8105107545852661 train f1score 0.5993205673047098
epoch 2 batch id 401 loss 0.9252757430076599 train f1score 0.6031306636523326
epoch 2 batch id 601 loss 1.1548537015914917 train f1score 0.6080757504551355
epoch 2 train f1score 0.6103654550639966


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/183 [00:00<?, ?it/s]

epoch 2 test f1score 0.6309213665994773


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.8890087604522705 train f1score 0.6358225108225108
epoch 3 batch id 201 loss 0.7557751536369324 train f1score 0.6349149461653961
epoch 3 batch id 401 loss 0.8267492651939392 train f1score 0.6401250884309545
epoch 3 batch id 601 loss 0.9719328284263611 train f1score 0.6442419574961861
epoch 3 train f1score 0.6474926492652388


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/183 [00:00<?, ?it/s]

epoch 3 test f1score 0.6399906293358117


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.7436447739601135 train f1score 0.7040979808010592
epoch 4 batch id 201 loss 0.6149474382400513 train f1score 0.6821494890016734
epoch 4 batch id 401 loss 0.7838191390037537 train f1score 0.6876780306614042
epoch 4 batch id 601 loss 0.8158119320869446 train f1score 0.6951820110156004
epoch 4 train f1score 0.6998961701916279


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/183 [00:00<?, ?it/s]

epoch 4 test f1score 0.6313183399934146


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.6609554886817932 train f1score 0.7985054945054946
epoch 5 batch id 201 loss 0.5111013650894165 train f1score 0.7345275120669662
epoch 5 batch id 401 loss 0.7895407676696777 train f1score 0.7415141819932418
epoch 5 batch id 601 loss 0.647636353969574 train f1score 0.7495236441791676
epoch 5 train f1score 0.7534598849674663


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/183 [00:00<?, ?it/s]

epoch 5 test f1score 0.6276212795074988


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 6 batch id 1 loss 0.6342167258262634 train f1score 0.8171428571428571
epoch 6 batch id 201 loss 0.5323421359062195 train f1score 0.7842004920899163
epoch 6 batch id 401 loss 0.5884804129600525 train f1score 0.7912320113315899
epoch 6 batch id 601 loss 0.5304477214813232 train f1score 0.7986627278161629
epoch 6 train f1score 0.8022901189913263


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/183 [00:00<?, ?it/s]

epoch 6 test f1score 0.6239929676081081


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 7 batch id 1 loss 0.41794347763061523 train f1score 0.8640952380952381
epoch 7 batch id 201 loss 0.3940953314304352 train f1score 0.8322920908543331
epoch 7 batch id 401 loss 0.5663133263587952 train f1score 0.8328307827689997
epoch 7 batch id 601 loss 0.417910099029541 train f1score 0.8377631515084446
epoch 7 train f1score 0.8427321676079174


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/183 [00:00<?, ?it/s]

epoch 7 test f1score 0.627422156140278


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 8 batch id 1 loss 0.37124747037887573 train f1score 0.8291838553603259
epoch 8 batch id 201 loss 0.3424805700778961 train f1score 0.8605215550919686
epoch 8 batch id 401 loss 0.31543266773223877 train f1score 0.8612236231311573
epoch 8 batch id 601 loss 0.43666815757751465 train f1score 0.8657673180792359
epoch 8 train f1score 0.8695978634801179


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/183 [00:00<?, ?it/s]

epoch 8 test f1score 0.6341476394405242


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 9 batch id 1 loss 0.30411937832832336 train f1score 0.9127951933124347
epoch 9 batch id 201 loss 0.2619492709636688 train f1score 0.8863467449234328
epoch 9 batch id 401 loss 0.26216524839401245 train f1score 0.887251152518529
epoch 9 batch id 601 loss 0.2782294750213623 train f1score 0.890473248808252
epoch 9 train f1score 0.8947570788994073


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/183 [00:00<?, ?it/s]

epoch 9 test f1score 0.6241310087389688


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 10 batch id 1 loss 0.2476763129234314 train f1score 0.9222463768115942
epoch 10 batch id 201 loss 0.17379534244537354 train f1score 0.9092924806477418
epoch 10 batch id 401 loss 0.1840427815914154 train f1score 0.9099016774907868
epoch 10 batch id 601 loss 0.25825801491737366 train f1score 0.9120326127138415
epoch 10 train f1score 0.9150487650900695


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/183 [00:00<?, ?it/s]

epoch 10 test f1score 0.6224931273391615


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 11 batch id 1 loss 0.15869784355163574 train f1score 0.9513002364066192
epoch 11 batch id 201 loss 0.16354750096797943 train f1score 0.9295638322075929
epoch 11 batch id 401 loss 0.19856584072113037 train f1score 0.9297206988272287
epoch 11 batch id 601 loss 0.2237483263015747 train f1score 0.931606041819872
epoch 11 train f1score 0.933493555505984


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/183 [00:00<?, ?it/s]

epoch 11 test f1score 0.6219071565138595


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 12 batch id 1 loss 0.08399531245231628 train f1score 0.9696942606829966
epoch 12 batch id 201 loss 0.2470591515302658 train f1score 0.9448275463748778
epoch 12 batch id 401 loss 0.09251856803894043 train f1score 0.9453609673182716
epoch 12 batch id 601 loss 0.11477933079004288 train f1score 0.945400344452934
epoch 12 train f1score 0.9463618688943236


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/183 [00:00<?, ?it/s]

epoch 12 test f1score 0.6252670370530929


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 13 batch id 1 loss 0.03477591276168823 train f1score 0.9868599033816425
epoch 13 batch id 201 loss 0.11015105992555618 train f1score 0.9609639425753971
epoch 13 batch id 401 loss 0.06591081619262695 train f1score 0.9595096455843831
epoch 13 batch id 601 loss 0.14163999259471893 train f1score 0.9604904195893231
epoch 13 train f1score 0.9603855205647096


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/183 [00:00<?, ?it/s]

epoch 13 test f1score 0.6209387849312357


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 14 batch id 1 loss 0.04912715405225754 train f1score 0.9663043478260869
epoch 14 batch id 201 loss 0.13770630955696106 train f1score 0.968496083052564
epoch 14 batch id 401 loss 0.044335849583148956 train f1score 0.9682068780180827
epoch 14 batch id 601 loss 0.07676240801811218 train f1score 0.9691096876644681
epoch 14 train f1score 0.9692225902332241


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/183 [00:00<?, ?it/s]

epoch 14 test f1score 0.623250043156868


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 15 batch id 1 loss 0.03556622564792633 train f1score 0.9837908496732026
epoch 15 batch id 201 loss 0.020595375448465347 train f1score 0.9751207588834028
epoch 15 batch id 401 loss 0.010940310545265675 train f1score 0.976038184157705
epoch 15 batch id 601 loss 0.009319262579083443 train f1score 0.9760415747969661
epoch 15 train f1score 0.9757916900420973


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/183 [00:00<?, ?it/s]

epoch 15 test f1score 0.6220766611677349


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 16 batch id 1 loss 0.010560811497271061 train f1score 1.0
epoch 16 batch id 201 loss 0.022594131529331207 train f1score 0.9812317042347213
epoch 16 batch id 401 loss 0.00138821208383888 train f1score 0.9805555121938341
epoch 16 batch id 601 loss 0.054530318826436996 train f1score 0.9804146938641162
epoch 16 train f1score 0.9807725400336987


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/183 [00:00<?, ?it/s]

epoch 16 test f1score 0.6206022210283698


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 17 batch id 1 loss 0.03993107005953789 train f1score 0.9824113475177304
epoch 17 batch id 201 loss 0.06885547935962677 train f1score 0.985690554530412
epoch 17 batch id 401 loss 0.0012903979513794184 train f1score 0.9851761037373635
epoch 17 batch id 601 loss 0.01474657841026783 train f1score 0.9843967905733185
epoch 17 train f1score 0.9842200870111191


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/183 [00:00<?, ?it/s]

epoch 17 test f1score 0.6238691451688522


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 18 batch id 1 loss 0.0013552942546084523 train f1score 1.0
epoch 18 batch id 201 loss 0.0025356682017445564 train f1score 0.9881072366352727
epoch 18 batch id 401 loss 0.0009844973683357239 train f1score 0.9881460586054938
epoch 18 batch id 601 loss 0.09103548526763916 train f1score 0.9870576538373937
epoch 18 train f1score 0.987134988650327


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/183 [00:00<?, ?it/s]

epoch 18 test f1score 0.624469874807017


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 19 batch id 1 loss 0.001991989789530635 train f1score 1.0
epoch 19 batch id 201 loss 0.003179092425853014 train f1score 0.989945211334286
epoch 19 batch id 401 loss 0.0009549492970108986 train f1score 0.9904352016690289
epoch 19 batch id 601 loss 0.002220721449702978 train f1score 0.9901419467079232
epoch 19 train f1score 0.9898348675253814


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/183 [00:00<?, ?it/s]

epoch 19 test f1score 0.628514485368137


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 20 batch id 1 loss 0.0011594787938520312 train f1score 1.0
epoch 20 batch id 201 loss 0.0025520918425172567 train f1score 0.9913176561399838
epoch 20 batch id 401 loss 0.0008264869684353471 train f1score 0.9909138948073479
epoch 20 batch id 601 loss 0.021691715344786644 train f1score 0.9906144724180043
epoch 20 train f1score 0.9903433376833997


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/183 [00:00<?, ?it/s]

epoch 20 test f1score 0.6267548125214074


In [None]:
#모델 저장
#torch.save(model, '/content/drive/MyDrive/Colab Notebooks/CustomKoBERTWithLayNorm_epoch20_F1.pth')

#모델 저장
#torch.save(model, '/content/drive/MyDrive/Colab Notebooks/CustomKoBERTWithLayNorm_epoch20.pth')

#모델 불러오기
#모델 호출
#my_model_mark4 = torch.load('/content/drive/MyDrive/FineTune_KoBert/Model/CustomKoBERTWithLayNorm_epoch20_F1.pth')

## Test

In [None]:
model = BERTClassifier(bertmodel, dr_rate=0.5)
model=torch.load('/content/drive/MyDrive/FineTune_KoBert/Model/CustomKoBERTWithLayNorm_epoch20_F1.pth')

In [None]:
def inference(sentence,model_path='/content/drive/MyDrive/FineTune_KoBert/Model/CustomKoBERTWithLayNorm_epoch20_F1.pth'):
    #bert_model, vocab = get_pytorch_kobert_model()
    tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1') #skt/kobert-base-v1 모델의 사전 학습된 가중치 로드
    bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
    vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]') ##BERTVocab 객체에 tokenizer.vocab_file 사전(vocab) 로드
    tok = tokenizer.tokenize
    model = BERTClassifier(bertmodel, dr_rate=0.5)
    model=torch.load(model_path)
    model.eval()

    token = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    dataset = [[sentence, '0']]
    test = BERTDataset(dataset, 0, 1, tok, vocab, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size, num_workers=2)

    output = []
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
      token_ids = token_ids.long().to(device)
      segment_ids = segment_ids.long().to(device)
      valid_length= valid_length
      label = label.long().to(device)
      output.append(model(token_ids, valid_length, segment_ids))
    softmax = torch.nn.Softmax(dim=1)
    output = softmax(output[0])[0].tolist()
    return output

def predict_prob(sentence):
    dataset = [[sentence, '0']]
    test = BERTDataset(dataset, 0, 1, tok, vocab, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size, num_workers=2)
    model.eval()

    output = []
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
      token_ids = token_ids.long().to(device)
      segment_ids = segment_ids.long().to(device)
      valid_length= valid_length
      label = label.long().to(device)
      output.append(model(token_ids, valid_length, segment_ids))
    softmax = torch.nn.Softmax(dim=1)
    output = softmax(output[0])[0].tolist()
    return output


In [None]:
diary2='''
오늘은 예상치 못한 일들로 인해 마음이 혼란스러운 날이었다.
아침에 시작된 하루는 어딘가에서부터 조금 이상했다. 일상의 흐름이 달라진 것 같았고, 그 변화에 따라 마음도 어느새 혼란스러워졌다.
시간이 흘러가는 게 이상하게 느껴졌고, 계획한 것과 실제로 이루어지는 것 사이에 어긋남이 있었다. 일상의 일들이 마치 뒤섞이는 듯한 기분이었다.
정신을 가다듬고 일상을 되짚어보려고 했지만, 마음이 급속도로 얽혀져만 갔다.하루 중간에는 내가 기억하고 있던 것들과 현실이 서로 다르게 느껴졌다.
이상한 일들이 계속해서 일어나면서 마음은 점점 더 혼란스러워져만 갔다. 마치 하루가 이상한 꿈을 꾸고 있는 것 같은 기분이었다.
그런데도 시간은 계속 흘러가고, 날이 저물어가는 게 느껴졌다. 마음의 혼란은 여전하지만, 내일이라는 새로운 시작에 대한 희망이 간신히 남아있다.
이런 날도 가끔 있는 법이지. 내일은 분명 더 나은 하루가 될 거야.
'''
emotion_arr=['불안','분노','슬픔','당황','기쁨']
print(emotion_arr[predict(diary2)])
predict_prob(diary2)



불안


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

[0.999995231628418,
 2.2298355872862885e-07,
 8.501834827256971e-07,
 3.5438099530438194e-06,
 1.0978612863254966e-07]

## Music Sentiment Analysis

In [None]:
emotion_arr=['불안','분노','슬픔','당황','기쁨']
df_user_sentiment=pd.DataFrame([predict_prob(diary2)], columns=emotion_arr)
df_user_sentiment

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,불안,분노,슬픔,당황,기쁨
0,0.999995,2.229836e-07,8.501835e-07,4e-06,1.097861e-07


In [None]:
df_music=pd.read_csv("/content/drive/MyDrive/FineTune_KoBert/Data/df_allsongs.csv")
print(len(df_music))
df_music.head()

2962


Unnamed: 0,Title,SongID,Artist,Date,Genre,Lyric,preprocess_Lyric,preprocess_Lyric_list
0,그대만 있다면 (여름날 우리 X 너드커넥션 (Nerd Connection)),36699489,너드커넥션 (Nerd Connection),2023.08.15,발라드,"<div class=""lyric"" id=""d_video_summary""><!-- h...",날 사랑해서 떠난다며 눈물짓던 그대의 말을 믿을 수 없죠 하지만 나의 전부였던 그대...,"['날 사랑해서 떠난다며', '눈물짓던 그대의 말을 믿을 수 없죠', '하지만 나의..."
1,헤어지자 말해요,36382580,박재정,2023.04.20,발라드,"<div class=""lyric"" id=""d_video_summary""><!-- h...",헤어지자고 말하려 오늘 너에게 가다가 우리 추억 생각해 봤어 처음 본 네 얼굴 마주...,"['헤어지자고 말하려 오늘', '너에게 가다가 우리 추억 생각해 봤어', '처음 본..."
2,인사,34451383,범진,2021.12.24,"발라드, 인디음악","<div class=""lyric"" id=""d_video_summary""><!-- h...",돌아서는 너를 보며 난 아무 말도 할 수 없었고 슬퍼하기엔 짧았던 나의 해는 저물어...,"['돌아서는 너를 보며', '난 아무 말도 할 수 없었고', '슬퍼하기엔 짧았던',..."
3,첫 눈,4352438,EXO,2013.12.09,발라드,"<div class=""lyric"" id=""d_video_summary""><!-- h...",첫눈 오는 이런 오후에 너에게 전화를 걸 수만 있다면 기쁠텐데 벌써 일년이 지났는데...,"['첫눈 오는 이런 오후에', '너에게 전화를 걸 수만', '있다면 기쁠텐데', '..."
4,너의 모든 순간,4446485,성시경,2014.02.12,"발라드, 국내드라마","<div class=""lyric"" id=""d_video_summary""><!-- h...",이윽고 내가 한눈에 너를 알아봤을 때 모든 건 분명 달라지고 있었어 내 세상은 널 ...,"['이윽고 내가 한눈에', '너를 알아봤을 때', '모든 건 분명 달라지고 있었어'..."


In [None]:
final=pd.DataFrame(columns=[emotion_arr])
for i in range(len(df_music[:4])):
  emotion=predict_prob(df_music.loc[i,'preprocess_Lyric'])
  dd=pd.DataFrame([[df_music.loc[i,'Title']]+[df_music.loc[i,'Artist']]+[df_music.loc[i,'Genre']]+emotion],columns=[['Title','Artist','Genre']+emotion_arr])
  final=pd.concat([dd,final],axis=0)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
final = pd.concat([
    pd.DataFrame([
        [df_music.loc[i, 'Title'], df_music.loc[i, 'Artist'], df_music.loc[i, 'Genre']] + predict_prob(df_music.loc[i, 'preprocess_Lyric'])
    ], columns=['Title', 'Artist', 'Genre'] + emotion_arr)
    for i in range(len(df_music[:4]))
], axis=0, ignore_index=True)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

                                        Title                   Artist  \
0  그대만 있다면 (여름날 우리 X 너드커넥션 (Nerd Connection))  너드커넥션 (Nerd Connection)   
1                                    헤어지자 말해요                      박재정   
2                                          인사                       범진   
3                                         첫 눈                      EXO   

       Genre            불안        분노            슬픔            당황            기쁨  
0        발라드  5.628538e-07  0.999763  2.176232e-04  4.016572e-06  1.466946e-05  
1        발라드  3.556124e-02  0.000030  2.052079e-03  9.623545e-01  1.973345e-06  
2  발라드, 인디음악  1.012112e-02  0.000055  1.109477e-07  3.690328e-06  9.898201e-01  
3        발라드  8.598781e-07  0.000164  9.998344e-01  7.495815e-07  2.390063e-07  


In [None]:
final

Unnamed: 0,Title,Artist,Genre,불안,분노,슬픔,당황,기쁨
0,그대만 있다면 (여름날 우리 X 너드커넥션 (Nerd Connection)),너드커넥션 (Nerd Connection),발라드,5.628538e-07,0.999763,0.0002176232,4.016572e-06,1.466946e-05
1,헤어지자 말해요,박재정,발라드,0.03556124,3e-05,0.002052079,0.9623545,1.973345e-06
2,인사,범진,"발라드, 인디음악",0.01012112,5.5e-05,1.109477e-07,3.690328e-06,0.9898201
3,첫 눈,EXO,발라드,8.598781e-07,0.000164,0.9998344,7.495815e-07,2.390063e-07


In [None]:
final=pd.DataFrame(columns=[emotion_arr])
for i in range(len(df_music[:4])):
  emotion=inference(df_music.loc[i,'preprocess_Lyric'])
  dd=pd.DataFrame([[df_music.loc[i,'Title']]+[df_music.loc[i,'Artist']]+[df_music.loc[i,'Genre']]+emotion],columns=[['Title','Artist','Genre']+emotion_arr])
  final=pd.concat([dd,final],axis=0)
final

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,Title,Artist,Genre,불안,분노,슬픔,당황,기쁨
0,첫 눈,EXO,발라드,8.598781e-07,0.000164,0.9998344,7.495815e-07,2.390063e-07
0,인사,범진,"발라드, 인디음악",0.01012112,5.5e-05,1.109477e-07,3.690328e-06,0.9898201
0,헤어지자 말해요,박재정,발라드,0.03556124,3e-05,0.002052079,0.9623545,1.973345e-06
0,그대만 있다면 (여름날 우리 X 너드커넥션 (Nerd Connection)),너드커넥션 (Nerd Connection),발라드,5.628538e-07,0.999763,0.0002176232,4.016572e-06,1.466946e-05


In [None]:
'''final=pd.DataFrame(columns=[emotion_arr])
for i in range(len(df_music)):
  emotion=inference(df_music.loc[i,'preprocess_Lyric'])
  dd=pd.DataFrame([[df_music.loc[i,'Title']]+[df_music.loc[i,'Artist']]+[df_music.loc[i,'Genre']]+emotion],columns=[['Title','Artist','Genre']+emotion_arr])
  final=pd.concat([dd,final],axis=0)

len(final)'''


final_emotion= pd.concat([
    pd.DataFrame([
        [df_music.loc[i, 'Title'], df_music.loc[i, 'Artist'], df_music.loc[i, 'Genre']] + predict_prob(df_music.loc[i, 'preprocess_Lyric'])
    ], columns=['Title', 'Artist', 'Genre'] + emotion_arr)
    for i in range(len(df_music))
], axis=0, ignore_index=True)


final_emotion.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

4


Unnamed: 0,Title,Artist,Genre,불안,분노,슬픔,당황,기쁨
0,그대만 있다면 (여름날 우리 X 너드커넥션 (Nerd Connection)),너드커넥션 (Nerd Connection),발라드,5.628538e-07,0.9997631,0.0002176232,4.016572e-06,1.466946e-05
1,헤어지자 말해요,박재정,발라드,0.03556124,3.032179e-05,0.002052079,0.9623545,1.973345e-06
2,인사,범진,"발라드, 인디음악",0.01012112,5.505212e-05,1.109477e-07,3.690328e-06,0.9898201
3,첫 눈,EXO,발라드,8.598781e-07,0.0001637204,0.9998344,7.495815e-07,2.390063e-07
4,너의 모든 순간,성시경,"발라드, 국내드라마",3.938197e-07,2.040685e-07,8.019197e-08,1.172084e-06,0.9999981


In [None]:
print(len(final_emotion))

2962


In [None]:
final_emotion.to_csv('/content/drive/MyDrive/FineTune_KoBert/Data/music_emotion.csv',index=False)

## Music Remmend

In [None]:
#유저 감정 분석 결과
df_user_sentiment

Unnamed: 0,불안,분노,슬픔,당황,기쁨
0,0.999995,2.229836e-07,8.501835e-07,4e-06,1.097861e-07


In [None]:
#노래 감정 분석 결과
final_emotion

Unnamed: 0,Title,Artist,Genre,불안,분노,슬픔,당황,기쁨
0,그대만 있다면 (여름날 우리 X 너드커넥션 (Nerd Connection)),너드커넥션 (Nerd Connection),발라드,5.628538e-07,9.997631e-01,2.176232e-04,4.016572e-06,1.466946e-05
1,헤어지자 말해요,박재정,발라드,3.556124e-02,3.032179e-05,2.052079e-03,9.623545e-01,1.973345e-06
2,인사,범진,"발라드, 인디음악",1.012112e-02,5.505212e-05,1.109477e-07,3.690328e-06,9.898201e-01
3,첫 눈,EXO,발라드,8.598781e-07,1.637204e-04,9.998344e-01,7.495815e-07,2.390063e-07
4,너의 모든 순간,성시경,"발라드, 국내드라마",3.938197e-07,2.040685e-07,8.019197e-08,1.172084e-06,9.999981e-01
...,...,...,...,...,...,...,...,...
2957,I'm Sorry,CNBLUE (씨엔블루),록/메탈,1.881501e-05,1.828733e-05,1.759351e-01,8.240229e-01,4.998634e-06
2958,Wind,FTISLAND (FT아일랜드),록/메탈,1.151260e-04,9.681188e-06,1.123642e-05,8.512194e-06,9.998554e-01
2959,인사 (Prod. by 남혜승),김경희,"록/메탈, 국내드라마",2.294169e-03,6.526361e-02,6.063820e-06,1.274381e-01,8.049981e-01
2960,심연,WOODZ,록/메탈,8.363325e-03,2.790830e-02,2.884534e-05,6.819587e-01,2.817409e-01


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 각 행의 감정 데이터를 문자열로 변환
user_emotion_str = df_user_sentiment.apply(lambda x: ' '.join(map(str, x)), axis=1)
music_emotion_str = final_emotion[emotion_arr].apply(lambda x: ' '.join(map(str, x)), axis=1)

# TF-IDF 벡터화
tfidf = TfidfVectorizer()
user_tfidf_matrix = tfidf.fit_transform(user_emotion_str)
music_tfidf_matrix = tfidf.transform(music_emotion_str)

# 코사인 유사도 계산
cosine_sim = cosine_similarity(user_tfidf_matrix, music_tfidf_matrix)
cosine_sim

array([[0.73029674, 0.25819889, 0.73029674, ..., 0.25819889, 0.        ,
        0.25819889]])

In [None]:
# 가장 유사한 음악 선택
most_similar_song_index = cosine_sim.argmax()
most_similar_song_info = final_emotion.iloc[most_similar_song_index]

# 선택된 음악과 유사한 음악 4곡 더 추천
num_additional_recommendations = 4
similar_songs_indices = cosine_sim.argsort()[0][-num_additional_recommendations-1:-1][::-1]
similar_songs_info = final_emotion.iloc[similar_songs_indices]

print(f"오늘 하루 당신의 감정은❓")
print(emotion_arr[predict(diary2)])
print(f"\n🍀당신의 감정을 달래줄, 음악을 추천합니다.🍀")
print(most_similar_song_info[['Title', 'Genre', 'Artist']])
print("\n🎶유사한 음악🎶")
print(similar_songs_info[['Title', 'Genre', 'Artist']])

오늘 하루 당신의 감정은❓
불안

🍀당신의 감정을 달래줄, 음악을 추천합니다.🍀
Title     미친 것처럼
Genre        발라드
Artist     V.O.S
Name: 367, dtype: object

🎶유사한 음악🎶
                     Title        Genre          Artist
1625      Like We Just Met     R&B/Soul       NCT DREAM
2626                   Go!  록/메탈, 국내드라마  도겸 (SEVENTEEN)
1571  갖고놀래 (Feat. 다이나믹 듀오)     R&B/Soul              범키
426                      벗          발라드        닐로(Nilo)
