In [1]:
!pip install rel
!pip install konlpy
!pip install tensorflow==2.13.1 --upgrade
!pip install keras==2.13.1 --upgrade
!pip install numpy --upgrade # 발생하는 에러는 무시해도 됨

Collecting rel
  Downloading rel-0.4.9.5-py2.py3-none-any.whl (13 kB)
Installing collected packages: rel
Successfully installed rel-0.4.9.5
Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (488 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m488.6/488.6 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.0 konlpy-0.6.0
Collecting tensorflow==2.13.1
  Downloading tensorflow-2.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (479.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m479.7/479.7 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting gast<=0.4.0,>=0.2.1 (from tensorflow==2.

# 꼭 런타임 다시 시작 후 아래 코드 실행

In [None]:
import websocket
import rel
import json
import re
import numpy as np
import torch
import pickle
from keras.models import load_model
from gensim.models import Doc2Vec
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import AutoTokenizer, AutoModel


TURTLEMQ_URL = "ws://175.45.195.151/turtle/"

# Interval
STOPWORDS_DICT_PATH = "/content/drive/MyDrive/Colab Notebooks/model/stopwords"
TOKENIZER_PATH = "/content/drive/MyDrive/Colab Notebooks/model/tokenizer.pickle"

# MLP
IMPRISONMENT_MLP_MODEL_PATH = "/content/drive/MyDrive/Colab Notebooks/model/imprisonment_231119.keras"
PROBATION_MODEL_PATH = "/content/drive/MyDrive/Colab Notebooks/model/probation_231120.keras"
FINE_MODEL_PATH = "/content/drive/MyDrive/Colab Notebooks/model/fine_231122.keras"

# KOBERT
IMPRISONMENT_KOBERT_MODEL_PATH = "/content/drive/MyDrive/Colab Notebooks/model/kobert_imprisonment_231119.h5"

# DOC2VEC
D2V_MODEL_PATH = "/content/drive/MyDrive/Colab Notebooks/model/d2v_231117.model"


class Worker:
    def __init__(self):
        # 형태소 분석기
        self.okt = Okt()

        # 불용어 사전
        self.stopwords = set()
        with open(STOPWORDS_DICT_PATH, "r") as f:
            for line in f.readlines():
                self.stopwords.add(line.strip())

        # 토크나이저 불러오기
        self.tokenizer = {}
        with open(TOKENIZER_PATH, 'rb') as handle:
            self.tokenizer = pickle.load(handle)

        # MLP 모델 불러오기
        self.imprisonment_mlp_model = load_model(IMPRISONMENT_MLP_MODEL_PATH)
        self.probation_model = load_model(PROBATION_MODEL_PATH)
        self.fine_model = load_model(FINE_MODEL_PATH)

        # KoBERT 모델 불러오기
        self.imprisonment_kobert_model = load_model(IMPRISONMENT_KOBERT_MODEL_PATH)
        self.kobert_tokenizer = AutoTokenizer.from_pretrained("monologg/kobert")
        self.kobert_model = AutoModel.from_pretrained("monologg/kobert")
        self.kobert_model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

        # Doc2Vec 모델
        self.d2v_model = Doc2Vec.load(D2V_MODEL_PATH)


    def on_open(self, ws):
        print("Connection opened")
        self.send('{"type":"REGISTER_WORKER", "data":""}') # 초기 패킷 (워커 등록)


    def on_message(self, ws, message):
        print("TurtleMQ →", message)

        message = json.loads(message)

        if message['type'] == "REQUEST_TASK":
            result = { "imprisonment": 0, "probation": 0, "fine": 0, "judgementDecision": "예상 판결은 아직 지원되지 않습니다.", "similarPrecedents": [] }

            # 텍스트 클렌징 진행
            clean_text = self.get_clean_text(message['data'].strip())

            # 띄어쓰기 수정
            clean_text = self.correct_spacing(clean_text)

            # 명사 추출
            extracted_nouns = self.extract_nouns(clean_text)

            # 토크나이징
            tokens = self.get_tokenized_sequences(extracted_nouns)


            # 징역 예측 (MLP모델과 KoBERT 모델 결과의 평균)
            result['imprisonment'] = int((self.predict_imprisonment_kobert(extracted_nouns) + self.predict_imprisonment_mlp(tokens)) / 2)

            # 집행유예 예측
            result['probation'] = int(self.predict_probation(tokens) * 100)

            # 벌금 예측
            result['fine'] = int(self.predict_fine(tokens)) * 10000


            # 유사 판례 계산
            result['similarPrecedents'] = self.get_similar_precedents(extracted_nouns)

            response = { "type": "RESPONSE_TASK", "taskId": message["taskId"], "data": json.dumps(result) }
            self.send(json.dumps(response))



    def on_error(self, ws, error):
        print(error)

    def on_close(self, ws, close_status_code, close_msg):
        self.on_exit()

    def on_exit(self):
        if self.ws.keep_running:
            self.ws.close()

        print("Connection closed")
        raise SystemExit("Socket connection is closed.")

    def run(self):
        self.ws = websocket.WebSocketApp(TURTLEMQ_URL,
                                on_open=self.on_open,
                                on_message=self.on_message,
                                on_error=self.on_error,
                                on_close=self.on_close)

        get_ipython().events.register('post_execute', self.on_exit)

        self.ws.run_forever(dispatcher=rel, reconnect=5)  # 연결 실패면 5초뒤 다시 연결 시도
        rel.dispatch()


    # 텍스트 클렌징
    def get_clean_text(self, text) -> str:
        text = text.replace(',','').replace('"','').replace('\'','').replace('.','').replace('(',' ').\
            replace(')','').replace('!','').replace('?','').replace(':','').replace(';','').lower()
        text = text.replace("\n"," ")
        text = re.sub(r'\d+?\.\s\d+\.\s\d+\.', '', text) # 날짜 제거
        text = re.sub(r'\b\w+법원|지원', '', text) # 법원명 제거
        text = re.sub('수사보고|범 죄 사 실|범죄사실', '', text) # 판례의 기본적인 문구 제거
        text = re.sub(r'[「」『』\[\],.:%○]', '', text) # 특수기호 제거
        return text

    # 띄어쓰기 고치기
    def correct_spacing(self, text):
        tagged = self.okt.pos(text)
        corrected = ""
        for i in tagged:
            if i[1] in ('Josa', 'PreEomi', 'Eomi', 'Suffix', 'Punctuation'):
                corrected += i[0]
            else:
                corrected += " "+i[0]
        if corrected[0] == " ":
            corrected = corrected[1:]
        return corrected

    # 명사 추출
    def extract_nouns(self, text):
        nouns = []
        for noun in self.okt.nouns(text):
            if noun.isalpha() and (noun not in self.stopwords): # 단어에 숫자가 아닌 경우에만 저장 and 불용어 사전에 포함되지 않는 경우
                nouns.append(noun)
        return nouns

    # 토크나이징
    def get_tokenized_sequences(self, nouns):
        out = self.tokenizer.texts_to_sequences([nouns])
        return pad_sequences(out, 256)

    # 징역 예측 (MLP)
    def predict_imprisonment_mlp(self, tokens):
        return self.imprisonment_mlp_model.predict(tokens, verbose=0)

    # 징역 예측 (KoBERT)
    def predict_imprisonment_kobert(self, nouns):
        # 명사 병합
        processed_text = ' '.join(nouns)

        # 토큰화
        tokenized_text = self.kobert_tokenizer.tokenize(processed_text)
        tokenized_text = " ".join([word for word in tokenized_text if word != '[UNK]'])
        inputs = self.kobert_tokenizer(tokenized_text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs = {k: v.to("cuda" if torch.cuda.is_available() else "cpu") for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.kobert_model(**inputs)

        embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()[0]
        return self.imprisonment_kobert_model.predict(np.array([embedding]), verbose=0)[0][0]

    # 집행유예 예측 (MLP)
    def predict_probation(self, tokens):
        return self.probation_model.predict(tokens, verbose=0)

    # 벌금 예측 (MLP)
    def predict_fine(self, tokens):
        return self.fine_model.predict(tokens, verbose=0)

    # 유사 판례 계산
    def get_similar_precedents(self, nouns):
        result = []
        input_data_vector = self.d2v_model.infer_vector(nouns)
        for case in self.d2v_model.dv.most_similar([input_data_vector]):
            # 결과 5개만 추출
            if len(result) >= 5:
                break
            result.append('{} {}'.format(case[0], int(round(case[1], 2) * 100)))
        return result


    def send(self, data):
        if self.ws.keep_running:
            self.ws.send(data)
            print("TurtleMQ ←", data)

worker = Worker()
worker.run()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Connection opened
TurtleMQ ← {"type":"REGISTER_WORKER", "data":""}
TurtleMQ → {"type":"REGISTER_WORKER","messageId":null,"data":""}
TurtleMQ → {"type":"REQUEST_TASK","messageId":null,"data":"집해유예기간 경합범가중 ","taskId":"d00aef36-645c-4029-86db-558858958057"}


  result['imprisonment'] = int((self.predict_imprisonment_kobert(extracted_nouns) + self.predict_imprisonment_mlp(tokens)) / 2)
  result['probation'] = int(self.predict_probation(tokens) * 100)
  result['fine'] = int(self.predict_fine(tokens)) * 10000


TurtleMQ ← {"type": "RESPONSE_TASK", "taskId": "d00aef36-645c-4029-86db-558858958057", "data": "{\"imprisonment\": 3, \"probation\": 24, \"fine\": 0, \"judgementDecision\": \"\\uc608\\uc0c1 \\ud310\\uacb0\\uc740 \\uc544\\uc9c1 \\uc9c0\\uc6d0\\ub418\\uc9c0 \\uc54a\\uc2b5\\ub2c8\\ub2e4.\", \"similarPrecedents\": [\"\\ucc3d\\uc6d0\\uc9c0\\ubc29\\ubc95\\uc6d0\\uc9c4\\uc8fc\\uc9c0\\uc6d0/2022\\uace0\\ub2e81183 42\", \"\\uc758\\uc815\\ubd80\\uc9c0\\ubc29\\ubc95\\uc6d0/2021\\uace0\\ub2e84756 41\", \"\\ucc3d\\uc6d0\\uc9c0\\ubc29\\ubc95\\uc6d0/2020\\uace0\\ub2e83940 40\", \"\\ucc3d\\uc6d0\\uc9c0\\ubc29\\ubc95\\uc6d0\\ud1b5\\uc601\\uc9c0\\uc6d0/2021\\uace0\\uc815138 40\", \"\\ucc3d\\uc6d0\\uc9c0\\ubc29\\ubc95\\uc6d0\\ud1b5\\uc601\\uc9c0\\uc6d0/2021\\uace0\\ub2e81014 40\"]}"}
TurtleMQ → {"type":"REQUEST_TASK","messageId":null,"data":"자동차전용도로 음주측정 ","taskId":"9bc9cee3-a169-4a02-bc5e-a651307caa99"}
TurtleMQ ← {"type": "RESPONSE_TASK", "taskId": "9bc9cee3-a169-4a02-bc5e-a651307caa99", "data": "{\"im

In [None]:
ㅇ