### Mount Google Drive & move to the project directory

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
# Work from the project folder on the mounted Drive; later cells use paths
# relative to it (e.g. './data/...', './weights/').
os.chdir('/content/drive/MyDrive/Colab Notebooks/dailywave/SentimentAnalysis')
# Prints "current path: <cwd>" as a sanity check.
print("현재 경로:", os.getcwd())

현재 경로: /content/drive/MyDrive/Colab Notebooks/dailywave/SentimentAnalysis


### pip

In [None]:
# 필요한 패키지 설치
!pip install emoji==2.2.0 \
matplotlib==3.6.2 \
numpy==1.23.5 \
pandas==1.5.2 \
soynlp==0.0.493 \
torch==1.13.0 --extra-index-url https://download.pytorch.org/whl/cu116 \
transformers==4.25.1 \
jupyter==1.0.0 \
ipykernel==6.19.2 \
datasets==2.7.1 \
tqdm==4.64.1 \
scikit-learn==1.2.0 \
emoji \
soynlp \
datasets

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu116
Collecting emoji==2.2.0
  Downloading emoji-2.2.0.tar.gz (240 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.9/240.9 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting matplotlib==3.6.2
  Downloading matplotlib-3.6.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting numpy==1.23.5
  Downloading numpy-1.23.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Collecting pandas==1.5.2
  Downloading pandas-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting soynlp==0.0.493
  Downloading soynlp-0.0.493-py3-none-any.whl.metadata (24 kB)
Collecting torch==1.13.0
  Downloading https://download.pytorch.org/whl/cu116/torch-1.13.0%2Bcu116-cp310-cp310-linux_x86_64.whl (1983.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# import emoji
# import matplotlib
# import numpy
# import pandas
# import soynlp
# import torch
# import transformers
# import jupyter
# import ipykernel
# import datasets
# import tqdm
# import sklearn

# print("emoji 버전:", emoji.__version__)
# print("matplotlib 버전:", matplotlib.__version__)
# print("numpy 버전:", numpy.__version__)
# print("pandas 버전:", pandas.__version__)
# print("soynlp 버전:", soynlp.__version__)
# print("torch 버전:", torch.__version__)
# print("transformers 버전:", transformers.__version__)
# print("jupyter 버전:", jupyter.__version__)
# print("ipykernel 버전:", ipykernel.__version__)
# print("datasets 버전:", datasets.__version__)
# print("tqdm 버전:", tqdm.__version__)
# print("scikit-learn 버전:", sklearn.__version__)


### utils

In [None]:
import os
import re
import emoji
import torch
import numpy as np
from datetime import datetime
from soynlp.normalizer import repeat_normalize
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, precision_score, recall_score
from transformers import EvalPrediction

def test_util():
  """Print a fixed marker line; used as a smoke check that the utils are loaded."""
  marker = "utils avilable"
  print(marker)

def clean(text):
    """Normalize one raw sentence before tokenization.

    Steps, applied in this order:
      1. every run of characters outside the allowed set (listed punctuation,
         the \\x00-\\x7F range, Hangul jamo/syllables) becomes a single space;
      2. emoji are removed via ``emoji.replace_emoji``;
      3. http/https URLs are removed;
      4. leading/trailing whitespace is stripped;
      5. the result is passed through soynlp's ``repeat_normalize`` with
         ``num_repeats=2``.

    Args:
        text: the sentence to clean (str).

    Returns:
        The cleaned sentence (str).
    """
    disallowed_chars = re.compile(f'[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-ㅣ가-힣]+')
    url = re.compile(
        r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')

    spaced = disallowed_chars.sub(' ', text)
    no_emoji = emoji.replace_emoji(spaced, replace='')  # drop emoji
    no_urls = url.sub('', no_emoji)

    return repeat_normalize(no_urls.strip(), num_repeats=2)

def make_current_datetime_dir(path):
    """Create a sub-directory of ``path`` named after the current local time.

    The directory name has the form ``YYYYMMDDTHH-MM-SS``. Missing parent
    directories (including ``path`` itself) are created, and calling this twice
    within the same second returns the already-created directory instead of
    raising.

    Args:
        path: parent directory under which the timestamped directory is made.

    Returns:
        The path of the timestamped directory (str).
    """
    now = datetime.now().strftime(r'%Y%m%dT%H-%M-%S')
    make_dir_path = os.path.join(path, now)
    # makedirs + exist_ok: os.mkdir failed when `path` did not exist yet and
    # when the same-second directory had already been created.
    os.makedirs(make_dir_path, exist_ok=True)

    return make_dir_path


def preprocess_data(examples, tokenizer, labels):
    """Tokenize a batch of documents and attach a per-label target matrix.

    Args:
        examples: a batch mapping column name -> list of values; must contain
            'document' and one column per entry of ``labels``.
        tokenizer: callable tokenizer, invoked as ``tokenizer(texts, truncation=True)``.
        labels: ordered label names; fixes the column order of the target matrix.

    Returns:
        The tokenizer output with an added 'labels' entry: a list of rows, one
        per sentence, each holding one float per label.
    """
    # Apply the same cleaning that was used for KcELECTRA pre-training.
    cleaned = list(map(clean, examples['document']))

    # Only truncation is requested here. Padding / max_length are left out on
    # purpose: the Trainer's data collator pads every mini-batch to the longest
    # sequence in that batch.
    batch = tokenizer(cleaned, truncation=True)

    # Keep only the columns that are label names.
    label_columns = {name: examples[name] for name in examples.keys() if name in labels}

    # Matrix of shape (batch_size, num_labels), filled one label column at a time.
    targets = np.zeros((len(cleaned), len(labels)))
    for column, name in enumerate(labels):
        targets[:, column] = label_columns[name]

    batch['labels'] = targets.tolist()
    return batch


# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-averaged multi-label metrics from raw logits.

    Args:
        predictions: logits of shape (batch_size, num_labels); array-like or tensor.
        labels: ground-truth 0/1 indicators with the same shape.
        threshold: a label counts as predicted when its sigmoid probability is
            greater than or equal to this value.

    Returns:
        dict with keys 'pre', 'rec', 'f1', 'roc_auc' and 'accuracy'.
    """
    # Sigmoid maps every logit to an independent per-label probability.
    logits = torch.as_tensor(predictions, dtype=torch.float32)
    probs = torch.sigmoid(logits).detach().cpu().numpy()

    # Threshold the probabilities into hard 0/1 predictions.
    y_pred = (probs >= threshold).astype(float)
    y_true = np.asarray(labels)

    metrics = {
        'pre': precision_score(y_true=y_true, y_pred=y_pred, average='micro'),
        'rec': recall_score(y_true=y_true, y_pred=y_pred, average='micro'),
        'f1': f1_score(y_true=y_true, y_pred=y_pred, average='micro'),
        # ROC AUC measures ranking quality, so it is scored on the continuous
        # probabilities. Feeding it the thresholded 0/1 values (as before)
        # collapses the curve to a single operating point.
        'roc_auc': roc_auc_score(y_true, probs, average='micro'),
        'accuracy': accuracy_score(y_true, y_pred),
    }

    return metrics


def compute_metrics(p: EvalPrediction):
    """Metric callback handed to the Trainer.

    Prints a progress line, then delegates to ``multi_label_metrics`` with the
    prediction array and ``p.label_ids``. When ``p.predictions`` is a tuple,
    only its first element is used.

    Returns:
        The metrics dict produced by ``multi_label_metrics``.
    """
    print("Calculating metrics...")
    raw_predictions = p.predictions
    if isinstance(raw_predictions, tuple):
        raw_predictions = raw_predictions[0]
    return multi_label_metrics(predictions=raw_predictions, labels=p.label_ids)


# In a notebook kernel __name__ is '__main__', so this prints "utils" every
# time the cell is executed.
if __name__ == '__main__':
  print("utils")

2024-11-05 08:19:58.245930: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-05 08:19:58.281717: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-05 08:19:58.305265: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-05 08:19:58.396330: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


utils


### constants

In [None]:
# Active label set: EDC (six classes).
# Both dicts map class index -> label name; the Korean and English dicts use the
# same indices for the same emotion.
ID2LABEL_KOR = {0: '기쁨',  # joy
        1: '당황',  # embarrassed
        2: '분노',  # anger
        3: '불안',  # nervous
        4: '상처',  # hurt
        5: '슬픔',  # sad
        }

ID2LABEL_EN = {0: 'joy',
                1: 'embarrassed',
                2: 'anger',
                3: 'nervous',
                4: 'hurt',
                5: 'sad',
                }
#------------------------------
# # DVforEC(5)
# ID2LABEL_KOR = {0: '분노',
#         1: '역겨움',
#         2: '공포',
#         3: '행복',
#         4: '중립',
#         5: '슬픔',
#         6: '놀람',
#         }

# ID2LABEL_EN = {0: 'Angry',
#                 1: 'Disgust',
#                 2: 'Fear',
#                 3: 'Happiness',
#                 4: 'Neutral',
#                 5: 'Sadness',
#                 6: 'Surpise'
#                 }
#------------------------------
# # DVforEC(4_8l)
# ID2LABEL_KOR = {0: '분노',
#         1: '슬픔',
#         2: '불안',
#         3: '상처',
#         4: '당황',
#         5: '기쁨',
#         6: '감사',
#         7: '평온'
#         }

# ID2LABEL_EN = {0: 'Angry',
#                 1: 'Sadness',
#                 2: 'Fear',
#                 3: 'Hurt',
#                 4: 'embarrassed',
#                 5: 'Happiness',
#                 6: 'Thankful',
#                 7 : 'Peaceful'
#                 }


## train

In [None]:
import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks/dailywave/SentimentAnalysis')

import os
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer, logging

current_path = os.getcwd()
# Prints "current path: <cwd>".
print("현재 경로:", current_path)
# Smoke checks: test_util and ID2LABEL_EN are defined in earlier cells, so these
# two lines fail immediately if those cells have not been run.
test_util()
print(ID2LABEL_EN)

현재 경로: /content/drive/MyDrive/Colab Notebooks/dailywave/SentimentAnalysis
utils avilable
{0: 'joy', 1: 'embarrassed', 2: 'anger', 3: 'nervous', 4: 'hurt', 5: 'sad'}


In [None]:
import torch

# Device selection: use the GPU when CUDA is available, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(f'Using device: {device}')


Using device: cuda


In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Process-wide switches set before any training code runs.
os.environ["WANDB_DISABLED"] = "true"  # presumably turns off Weights & Biases reporting in the Trainer
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # presumably avoids tokenizer fork warnings with dataloader workers
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
# `logging` is transformers.logging (imported in an earlier cell): only errors are shown.
logging.set_verbosity_error()


def train(opt):
    """Fine-tune a sequence-classification model on the train/val CSV files.

    The label set is taken from the module-level ``ID2LABEL_EN``. Both CSVs must
    contain a 'document' column and one column per label name.

    Args:
        opt: dict of settings.
            Required keys: 'pretrained_tokenizer', 'pretrained_model',
            'problem_type', 'train_dataset_path', 'val_dataset_path',
            'output_dir', 'evaluation_strategy', 'save_strategy',
            'learning_rate', 'per_device_train_batch_size',
            'per_device_eval_batch_size', 'num_train_epochs', 'weight_decay',
            'load_best_model_at_end', 'metric_for_best_model', 'seed',
            'dataloader_num_workers', 'no_cuda'.
            Optional key: 'eval_steps' (forwarded to TrainingArguments; when
            absent the library default applies).

    Returns:
        None. The Trainer writes its output under a timestamped sub-directory
        of ``opt['output_dir']``.
    """
    tokenizer = AutoTokenizer.from_pretrained(opt['pretrained_tokenizer'])
    id2label = ID2LABEL_EN
    label2id = {v: k for k, v in id2label.items()}
    labels = list(label2id.keys())

    # Load the local CSV files with pandas, reading every column as a string.
    train_df = pd.read_csv(opt['train_dataset_path'], dtype=str)
    val_df = pd.read_csv(opt['val_dataset_path'], dtype=str)

    # Debug output: first training row (previously mislabelled as 'val_df')
    # and the column dtypes of both frames.
    print('train_df', train_df[0:1])
    print(train_df.dtypes)
    print(val_df.dtypes)

    # Wrap both frames as Hugging Face datasets under one DatasetDict.
    dataset = DatasetDict({
        'train': Dataset.from_pandas(train_df),
        'val': Dataset.from_pandas(val_df)
    })

    # Clean + tokenize, and replace the raw columns with model inputs/labels.
    dataset = dataset.map(
        preprocess_data,
        batched=True,
        remove_columns=dataset['train'].column_names,
        fn_kwargs={'tokenizer': tokenizer, 'labels': labels}
    )
    dataset.set_format('torch')

    model = AutoModelForSequenceClassification.from_pretrained(opt['pretrained_model'],
                                                            problem_type=opt['problem_type'],
                                                            num_labels=len(labels),
                                                            id2label=id2label,
                                                            label2id=label2id)
    args = TrainingArguments(output_dir=make_current_datetime_dir(opt['output_dir']),
                            evaluation_strategy=opt['evaluation_strategy'],
                            save_strategy=opt['save_strategy'],
                            # Previously opt['eval_steps'] was silently ignored.
                            eval_steps=opt.get('eval_steps'),
                            learning_rate=opt['learning_rate'],
                            per_device_train_batch_size=opt['per_device_train_batch_size'],
                            per_device_eval_batch_size=opt['per_device_eval_batch_size'],
                            num_train_epochs=opt['num_train_epochs'],
                            weight_decay=opt['weight_decay'],
                            load_best_model_at_end=opt['load_best_model_at_end'],
                            metric_for_best_model=opt['metric_for_best_model'],
                            seed=opt['seed'],
                            dataloader_num_workers=opt['dataloader_num_workers'],
                            no_cuda=opt['no_cuda']
                            )
    trainer = Trainer(args=args,
                      model=model,
                      tokenizer=tokenizer,
                      train_dataset=dataset['train'],
                      eval_dataset=dataset['val'],
                      compute_metrics=compute_metrics,
                      # Pads each mini-batch to its longest sequence.
                      data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
                      )

    trainer.train()


# Training entry point: build the settings dict and run train().
# All paths are relative to the current working directory.
if __name__ == '__main__':
    opt = {'pretrained_model': 'beomi/KcELECTRA-base-v2022',
           'pretrained_tokenizer': 'beomi/KcELECTRA-base-v2022',
           'problem_type': 'multi_label_classification',
           'train_dataset_path': './data/preprocess/EDC_train.csv', #train data
           'val_dataset_path': './data/preprocess/EDC_val.csv', # val data
           'output_dir': './weights/',  # a timestamped sub-directory is created per run
           'metric_for_best_model': 'f1',  # 'f1' is one of the keys returned by multi_label_metrics
           'evaluation_strategy': 'steps',
           'save_strategy': 'steps',
           'eval_steps': 500,
           'seed': 1031,
           'no_cuda': False,
           'learning_rate': 1e-5,
           'per_device_train_batch_size': 16,
           'per_device_eval_batch_size': 16,
           'num_train_epochs': 10,
           'weight_decay': 0.01,
           'dataloader_num_workers': 4,
           'load_best_model_at_end': False,
           }

    train(opt)
    print("train complete")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


val_df   label                                           document joy embarrassed  \
0     5  내가 몸이 아파보니까 이렇게 힘든데 다른 사람들이 아팠을 땐 왜 신경을 써주지 못했...   0           0   

  anger nervous hurt sad  
0     0       0    0   1  
label          object
document       object
joy            object
embarrassed    object
anger          object
nervous        object
hurt           object
sad            object
dtype: object
label          object
document       object
joy            object
embarrassed    object
anger          object
nervous        object
hurt           object
sad            object
dtype: object


  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

***** Running training *****
  Num examples = 4648
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2910
  Number of trainable parameters = 127781382
***** Running Evaluation *****
  Num examples = 996
  Batch size = 16


{'loss': 0.4104, 'learning_rate': 8.281786941580758e-06, 'epoch': 1.72}


Saving model checkpoint to ./weights/20241105T08-20-36/checkpoint-500
Configuration saved in ./weights/20241105T08-20-36/checkpoint-500/config.json


Calculating metrics...
{'eval_loss': 0.31987741589546204, 'eval_pre': 0.8292079207920792, 'eval_rec': 0.33634538152610444, 'eval_f1': 0.47857142857142854, 'eval_roc_auc': 0.6612449799196787, 'eval_accuracy': 0.33634538152610444, 'eval_runtime': 2.1874, 'eval_samples_per_second': 455.333, 'eval_steps_per_second': 28.801, 'epoch': 1.72}


Model weights saved in ./weights/20241105T08-20-36/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./weights/20241105T08-20-36/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./weights/20241105T08-20-36/checkpoint-500/special_tokens_map.json
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7e52f9deb2e0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1466, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1430, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/lib/python3.10/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
  File "/usr/lib/python3.10/multiprocessing/popen_fork.py", line 40, in wait
    if not wait([self.sentinel], timeout):
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 931, i

KeyboardInterrupt: 

## test

In [None]:
import os
import torch
import numpy as np
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import DataCollatorWithPadding, logging
from tqdm import tqdm
from datasets import Dataset, DatasetDict


os.environ["TOKENIZERS_PARALLELISM"] = 'false'
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
logging.set_verbosity_error()



def evaluate(opt):
    """Score a fine-tuned checkpoint on a labelled test CSV and print the metrics.

    The test CSV must contain a 'document' column and one column per label name
    in the module-level ``ID2LABEL_EN``.

    Args:
        opt: dict with keys
            'ckpt_path'         - checkpoint directory (model and tokenizer),
            'test_dataset_path' - CSV file to evaluate on,
            'device'            - torch device string,
            'batch_size'        - evaluation batch size,
            'num_workers'       - DataLoader worker count.

    Returns:
        dict with keys 'micro_f1', 'roc_auc' and 'accuracy', computed over the
        whole test set.

    Raises:
        ValueError: if the test set yields no batches.
    """
    tokenizer = AutoTokenizer.from_pretrained(opt['ckpt_path'])

    # Load the local CSV file, reading every column as a string.
    test_df = pd.read_csv(opt['test_dataset_path'], dtype=str)
    print(test_df.dtypes)

    dataset = DatasetDict({'test': Dataset.from_pandas(test_df)})
    dataset = dataset.map(preprocess_data,
                          batched=True,
                          remove_columns=dataset['test'].column_names,
                          fn_kwargs={'tokenizer': tokenizer,
                                     'labels': list(ID2LABEL_EN.values())
                                     }
                          )
    dataset.set_format('torch')
    dataloader = torch.utils.data.DataLoader(dataset['test'],
                                             batch_size=opt['batch_size'],
                                             shuffle=False,
                                             num_workers=opt['num_workers'],
                                             collate_fn=DataCollatorWithPadding(tokenizer=tokenizer)
                                             )

    device = torch.device(opt['device'])
    model = AutoModelForSequenceClassification.from_pretrained(opt['ckpt_path']).to(device)
    model.eval()

    all_logits = []
    all_labels = []
    # no_grad: inference only, so no autograd graph is built for each batch.
    with torch.no_grad():
        for data in tqdm(dataloader, total=len(dataloader), ncols=100):
            # Everything except the targets is a model input.
            inputs = {key: value.to(device) for key, value in data.items() if key != 'labels'}
            all_logits.append(model(**inputs).logits.cpu())
            all_labels.append(data['labels'])

    if not all_logits:
        raise ValueError(f"no test examples found in {opt['test_dataset_path']}")

    # Score once over the full test set. Averaging per-batch scores (as before)
    # over-weights a short final batch and does not equal the dataset-level
    # micro-averaged metric.
    score = multi_label_metrics(torch.cat(all_logits), torch.cat(all_labels))
    micro_f1 = score['f1']
    roc_auc = score['roc_auc']
    accuracy = score['accuracy']
    print(f'micro_f1: {micro_f1:.4f}, roc_acu: {roc_auc:.4f}, accuracy: {accuracy:.4f}')

    return {'micro_f1': micro_f1, 'roc_auc': roc_auc, 'accuracy': accuracy}



# Evaluation entry point: score one checkpoint on one test CSV.
# NOTE(review): preprocess_data looks up label columns named after
# ID2LABEL_EN.values(); confirm that this CSV and checkpoint belong to the
# label set currently defined in the constants cell.
if __name__ == '__main__':
    opt = {'ckpt_path': './weights/20241031T10-30-48/checkpoint-12500',
           'test_dataset_path': './data/preprocess/DVforEC(5)_test.csv',
           'device': 'cuda:0',  # fixed GPU device string
           'batch_size': 64,
           'num_workers': 4,
           }

    evaluate(opt)

### test & record result on csv

In [None]:
import os
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import DataCollatorWithPadding, logging
from tqdm import tqdm

os.environ["TOKENIZERS_PARALLELISM"] = 'false'
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
logging.set_verbosity_error()

def evaluate(opt):
    """Predict one label per test row and write a label-vs-prediction CSV.

    Note: this definition shadows the metric-printing ``evaluate`` defined in
    the earlier test cell.

    The written CSV has the columns 'label', 'document', 'result' (predicted
    label) and 'ans' ('T' when the prediction equals the label, else 'F').
    Class indices are rendered with the label names stored in the checkpoint's
    own config (``model.config.id2label``), so the names always match the
    model that produced the predictions.

    Args:
        opt: dict with keys
            'ckpt_path'         - checkpoint directory (model and tokenizer),
            'test_dataset_path' - CSV with 'label', 'document' and one column
                                  per name in ``ID2LABEL_EN``,
            'device'            - torch device string,
            'batch_size'        - inference batch size,
            'num_workers'       - DataLoader worker count,
            'result_path'       - optional output CSV path; defaults to
                                  './data/result/DVforEC(4_8l)-001_test_result.csv'.

    Returns:
        None.
    """
    tokenizer = AutoTokenizer.from_pretrained(opt['ckpt_path'])
    test_df = pd.read_csv(opt['test_dataset_path'], dtype=str)

    dataset = DatasetDict({'test': Dataset.from_pandas(test_df)})
    dataset = dataset.map(preprocess_data,
                          batched=True,
                          remove_columns=dataset['test'].column_names,
                          fn_kwargs={'tokenizer': tokenizer,
                                     'labels': list(ID2LABEL_EN.values())}
                          )
    dataset.set_format('torch')
    dataloader = torch.utils.data.DataLoader(dataset['test'],
                                             batch_size=opt['batch_size'],
                                             shuffle=False,
                                             num_workers=opt['num_workers'],
                                             collate_fn=DataCollatorWithPadding(tokenizer=tokenizer)
                                             )

    device = torch.device(opt['device'])
    model = AutoModelForSequenceClassification.from_pretrained(opt['ckpt_path']).to(device)
    model.eval()

    results = []
    # no_grad: inference only, so no autograd graph is built for each batch.
    with torch.no_grad():
        for data in tqdm(dataloader, total=len(dataloader), ncols=100):
            inputs = {'input_ids': data['input_ids'].to(device),
                      'token_type_ids': data['token_type_ids'].to(device),
                      'attention_mask': data['attention_mask'].to(device)}
            logits = model(**inputs).logits
            # Index of the highest-scoring class for every row of the batch.
            results.extend(logits.argmax(dim=-1).cpu().tolist())

    # Work on an explicit copy so the column assignments below do not write
    # into a slice of test_df (SettingWithCopyWarning).
    report_df = test_df[['label', 'document']].copy()
    report_df['result'] = [str(index) for index in results]

    # 'T' where the predicted index equals the (string) label, otherwise 'F'.
    report_df['ans'] = np.where(report_df['label'] == report_df['result'], 'T', 'F')

    # Replace numeric class indices by label names from the checkpoint config.
    emotion_map = {str(index): name for index, name in model.config.id2label.items()}
    report_df['result'] = report_df['result'].map(emotion_map)
    report_df['label'] = report_df['label'].astype(str).map(emotion_map)

    # Save the report, creating the output directory when it is missing.
    result_file = opt.get('result_path', './data/result/DVforEC(4_8l)-001_test_result.csv')
    os.makedirs(os.path.dirname(result_file) or '.', exist_ok=True)
    report_df.to_csv(result_file, index=False, encoding='utf-8-sig')
    print(f"Updated {result_file} with ans column")

# Report entry point: `evaluate` here is the CSV-writing definition from this cell.
if __name__ == '__main__':
    opt = {'ckpt_path': './weights/20241031T10-30-48/checkpoint-12500',
           'test_dataset_path': './data/preprocess/EDC_test.csv',
           'device': 'cuda:0',  # fixed GPU device string
           'batch_size': 64,
           'num_workers': 4,
           }

    evaluate(opt)


## predict

In [None]:
import time
import torch
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer

def infer(sentences):
    """Run the loaded classifier on each sentence and tabulate the result.

    Reads the module-level ``model``, ``tokenizer`` and ``device`` (they are
    only read, so no ``global`` declaration is needed) and the label names in
    ``ID2LABEL_KOR``.

    Args:
        sentences: iterable of raw sentences (str).

    Returns:
        pandas.DataFrame with one row per sentence: the cleaned sentence
        ('문장'), the elapsed tokenization + forward time in seconds ('추론시간'),
        the highest-probability label ('추론 감정'), and one column per label
        holding its sigmoid probability.
    """
    id2label = ID2LABEL_KOR
    results = []

    for sentence in sentences:
        sentence = clean(sentence)

        infer_stime = time.time()
        # no_grad: inference only. truncation=True mirrors the tokenization
        # used for training and keeps over-long inputs within the model limit.
        with torch.no_grad():
            encoding = tokenizer(sentence, return_tensors='pt', truncation=True).to(device)
            logits = model(**encoding).logits
            # Drop only the batch dimension: (1, num_labels) -> (num_labels,).
            preds = torch.sigmoid(logits.squeeze(0))
        infer_etime = time.time()

        result = {'문장': sentence,
                  '추론시간': infer_etime - infer_stime
                  }

        # Label with the highest probability.
        max_prob_idx = torch.argmax(preds).item()
        result['추론 감정'] = id2label[max_prob_idx]

        # Per-label probabilities, one column per label name.
        probs = preds.tolist()
        for label_id, label in id2label.items():
            result[label] = probs[label_id]

        results.append(result)

    results = pd.DataFrame(results)

    return results


# Prediction entry point: load a checkpoint and classify a few sentences.
# `model`, `tokenizer` and `device` are module-level names read by infer().
if __name__ == '__main__':
    ckpt_path = './weights/20241031T10-30-48/checkpoint-12500'
    # Fall back to the CPU so the cell also runs on a runtime without a GPU.
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    model = AutoModelForSequenceClassification.from_pretrained(ckpt_path).to(device)
    tokenizer = AutoTokenizer.from_pretrained(ckpt_path)

    sentences = ['아내가 회사에서 잘렸어.']

    model.eval()
    ret = infer(sentences)
    # Transposed so each label/probability is shown on its own line.
    print(ret.T)
