In [1]:
# Mount Google Drive at /content/drive so the files stored there are
# reachable from this Colab session.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd drive/MyDrive/final_project

/content/drive/MyDrive/final_project


In [3]:
!pip install -r requirements.txt

Collecting kobert_tokenizer (from -r requirements.txt (line 8))
  Cloning https://github.com/SKTBrain/KoBERT.git to /tmp/pip-install-lu5lxe6r/kobert-tokenizer_114aa03a13774180bcbf21004393f656
  Running command git clone --filter=blob:none --quiet https://github.com/SKTBrain/KoBERT.git /tmp/pip-install-lu5lxe6r/kobert-tokenizer_114aa03a13774180bcbf21004393f656
  Resolved https://github.com/SKTBrain/KoBERT.git to commit 47a69af87928fc24e20f571fe10c3cc9dd9af9a3
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mxnet (from -r requirements.txt (line 1))
  Downloading mxnet-1.9.1-py3-none-manylinux2014_x86_64.whl (49.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.1/49.1 MB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gluonnlp (from -r requirements.txt (line 2))
  Downloading gluonnlp-0.10.0.tar.gz (344 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m344.5/344.5 kB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m
[?25h  P

In [5]:
import os
import torch
from transformers import AdamW, BertModel
from model.classifier import BERTClassifier, kobert_input
from model.dataloader import WellnessTextClassificationDataset
from kobert_tokenizer import KoBERTTokenizer
import torch.nn as nn
import random

In [6]:
import openpyxl
from openpyxl import Workbook, load_workbook

In [7]:
# Load the emotion (background) category table
def load_emotion_category(category_path='./data/wellness_dialog_category_배경.txt'):
    """Read the category file and return a mapping from class index to name.

    Each non-blank line is expected to look like ``<name><4 spaces><index>``.
    The text before the first four-space separator becomes the value and the
    text after it (surrounding whitespace removed) becomes the key.

    Args:
        category_path: Path of the category file. Defaults to the location
            this notebook uses.

    Returns:
        dict: ``{index_as_str: category_name}``.

    Raises:
        IndexError: If a non-blank line does not contain the four-space
            separator.
    """
    category = {}
    # `with` guarantees the handle is closed; the explicit encoding keeps the
    # Korean text readable regardless of the platform's default locale.
    with open(category_path, 'r', encoding='utf-8') as c_f:
        for line_data in c_f:
            # Strip only the line terminator instead of blindly dropping the
            # last character, so a final line without a trailing newline
            # keeps its full index.
            line_data = line_data.rstrip('\n')
            if not line_data.strip():
                # Tolerate blank lines (e.g. an empty line at end of file).
                continue
            data = line_data.split('    ')
            category[data[1].strip()] = data[0]
    return category

In [8]:
if __name__ == "__main__":
    # Inference script: predict a category for every sentence in the corpus
    # workbook (column 3) and write the predicted label into column 4.

    # Select the device: GPU when available, otherwise CPU.
    ctx = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(ctx)

    # Load the fine-tuned checkpoint directly onto the selected device.
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    checkpoint_path = "./checkpoint"
    save_ckpt_path = checkpoint_path + "/kobert-wellness-text-classification_배경.pth"
    checkpoint = torch.load(save_ckpt_path, map_location=device)

    # Build the classifier on top of the pretrained KoBERT encoder and restore
    # the saved weights.
    bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
    model = BERTClassifier(bertmodel)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)
    model.eval()  # switch the module to evaluation mode

    # Tokenizer matching the pretrained encoder.
    tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')

    # Mapping from predicted class index (as a string) to category name.
    category = load_emotion_category()

    # Open the corpus workbook and take its first sheet.
    corpus_file = './data/감성대화말뭉치_배경분류.xlsx'
    wb = load_workbook(filename=corpus_file)
    ws = wb[wb.sheetnames[0]]

    # Length argument forwarded to kobert_input.
    max_seq_len = 512

    # Inference only: no_grad avoids building an autograd graph for every row.
    with torch.no_grad():
        # min_row=2 skips the header row; restricting to columns 3-4 yields
        # exactly the sentence cell and the prediction cell for each row.
        for sent_cell, pred_cell in ws.iter_rows(min_row=2, min_col=3, max_col=4):
            sent = sent_cell.value
            if sent is None or not str(sent).strip():
                # Nothing to classify in an empty cell; leave the row untouched.
                continue

            # Build the model inputs for this sentence and run the classifier.
            data = kobert_input(tokenizer, sent, device, max_seq_len)
            output = model(**data)

            # The first element of the output holds the logits. Softmax is
            # monotonic, so the arg-max can be taken on the raw logits.
            logit = output[0]
            max_index = torch.argmax(logit.squeeze()).item()

            # Translate the class index into its category name and store it.
            cate_pred = category[str(max_index)]
            pred_cell.value = cate_pred

    # Persist the predictions back into the same workbook file.
    wb.save(corpus_file)

config.json:   0%|          | 0.00/535 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/369M [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [9]:
# Show the index -> category-name mapping returned by load_emotion_category().
print(category)

{'0': '배경/가족', '1': '배경/건강문제', '2': '배경/결혼', '3': '배경/경제적문제', '4': '배경/공부', '5': '배경/군대', '6': '배경/다이어트', '7': '배경/대인관계', '8': '배경/대학', '9': '배경/배우자', '10': '배경/부모', '11': '배경/사고', '12': '배경/사업', '13': '배경/생활,거주', '14': '배경/성격', '15': '배경/시댁', '16': '배경/아르바이트', '17': '배경/애완동물', '18': '배경/어린시절', '19': '배경/여행', '20': '배경/연애', '21': '배경/운동', '22': '배경/유학', '23': '배경/음주', '24': '배경/임신', '25': '배경/자녀', '26': '배경/종교', '27': '배경/직장', '28': '배경/진로', '29': '배경/취미', '30': '배경/취업', '31': '배경/친구', '32': '배경/학교', '33': '배경/학업'}
