## Worker

In [1]:
import websocket
import rel
import json
import re
import pandas as pd
import numpy as np
from keras.models import load_model
from gensim.models import Doc2Vec, KeyedVectors
from PyKomoran import *
from sklearn.preprocessing import StandardScaler


# WebSocket endpoint the worker connects to (passed to websocket.WebSocketApp in Worker.run).
TURTLEMQ_URL = "ws://172.18.0.3:8080/turtle/"
# User dictionary handed to the "EXP" Komoran analyzer via set_user_dic.
KOMORAN_USER_DIC_PATH = "./user_dic/dic1.user"
# Stop-word list, read line by line into a set in Worker.__init__.
STOPWORDS_DICT_PATH = "./user_dic/stopwords"
# Word2Vec vectors, loaded with KeyedVectors.load_word2vec_format(binary=False).
W2V_MODEL_PATH = "./model/w2v_extracted_nouns_231117.txt"
# Excel sheet whose 'extracted_nouns' column is used to fit the StandardScaler.
DATA_PATH = "./data/04_data_output.xlsx"
# Doc2Vec model used for the similar-precedent lookup.
D2V_MODEL_PATH = "./model/d2v_231117.model"
# Keras model used for the imprisonment prediction.
IMPR_MODEL_PATH = "./model/imprisonment_231117.h5"

class Worker:
    """WebSocket worker that answers ``REQUEST_TASK`` messages from TurtleMQ.

    For every task the worker cleanses the incoming text, extracts nouns with
    two Komoran analyzers, predicts an imprisonment value with a Keras model
    fed by scaled, averaged Word2Vec vectors, and looks up the most similar
    documents in a Doc2Vec model. The result is sent back as a
    ``RESPONSE_TASK`` message.
    """

    def __init__(self):
        """Load the analyzers, dictionaries and models, and fit the scaler."""
        # Load the training data; it is only needed here to fit the scaler.
        data = pd.read_excel(DATA_PATH)

        # Morphological analyzers (light and full); only the full one gets
        # the user dictionary.
        self.komoran_light = Komoran("STABLE")
        self.komoran_full = Komoran("EXP")
        self.komoran_full.set_user_dic(KOMORAN_USER_DIC_PATH)

        # Stop-word dictionary, one entry per line. The encoding is explicit
        # so that reading Korean text does not depend on the platform locale.
        with open(STOPWORDS_DICT_PATH, "r", encoding="utf-8") as f:
            self.stopwords = {line.strip() for line in f}

        # Word2Vec embedding model.
        self.w2v_model = KeyedVectors.load_word2vec_format(W2V_MODEL_PATH, binary=False)

        # Turn every training text into the mean of its known word vectors.
        embeddings = []
        for text in data['extracted_nouns']:
            # Empty spreadsheet cells are not strings and cannot be split.
            if not isinstance(text, str):
                continue
            word_vectors = self._lookup_word_vectors(text.split())
            if word_vectors:
                embeddings.append(np.mean(word_vectors, axis=0))

        # Fit the standardization scaler on the training embeddings.
        self.scaler = StandardScaler().fit(embeddings)

        # Imprisonment prediction model.
        self.impr_model = load_model(IMPR_MODEL_PATH)

        # Doc2Vec model used for the similar-precedent search.
        self.d2v_model = Doc2Vec.load(D2V_MODEL_PATH)

    def on_open(self, ws):
        """Register this process as a worker once the socket is open."""
        print("Connection opened")
        self.send('{"type":"REGISTER_WORKER", "data":""}')  # initial packet (worker registration)

    def on_message(self, ws, message):
        """Handle one incoming message; only ``REQUEST_TASK`` is processed.

        Args:
            ws: The websocket app that delivered the message (unused).
            message: JSON text; for tasks it carries ``type``, ``data`` and
                ``taskId``.
        """
        print("TurtleMQ →", message)

        message = json.loads(message)

        # Messages without a "type" key are ignored instead of raising.
        if message.get('type') != "REQUEST_TASK":
            return

        result = { "imprisonment": 0, "probation": 0, "fine": 0, "judgementDecision": "예상 판결은 아직 지원되지 않습니다.", "similarPrecedents": [] }

        # Text cleansing.
        cleansing_text = self.get_cleansing_text(message['data'].strip())

        # Noun extraction.
        extracted_nouns = self.get_nouns_from_text(cleansing_text)

        # Imprisonment prediction (truncated to an integer).
        result['imprisonment'] = int(self.predict_imprisonment(' '.join(extracted_nouns)))

        # Similar precedents.
        result['similarPrecedents'] = self.get_similar_precedents(extracted_nouns)

        # The result is embedded as a JSON string inside the response envelope.
        response = { "type": "RESPONSE_TASK", "taskId": message["taskId"], "data": json.dumps(result) }
        self.send(json.dumps(response))

    def on_error(self, ws, error):
        """Print any error reported by the websocket app."""
        print(error)

    def on_close(self, ws, close_status_code, close_msg):
        """Shut the worker down when the connection is closed."""
        self.on_exit()

    def on_exit(self):
        """Close the socket if it is still running, then stop the process.

        Raises:
            SystemExit: Always, after the socket has been closed.
        """
        if self.ws.keep_running:
            self.ws.close()

        print("Connection closed")
        raise SystemExit("Socket connection is closed.")

    def run(self):
        """Create the websocket app, wire up the callbacks and start dispatching."""
        self.ws = websocket.WebSocketApp(TURTLEMQ_URL,
                                on_open=self.on_open,
                                on_message=self.on_message,
                                on_error=self.on_error,
                                on_close=self.on_close)

        # Inside IPython/Jupyter, close the socket when cell execution ends.
        # get_ipython only exists there, so a plain interpreter skips this.
        try:
            ipython = get_ipython()
        except NameError:
            ipython = None
        if ipython is not None:
            ipython.events.register('post_execute', self.on_exit)

        self.ws.run_forever(dispatcher=rel, reconnect=5)  # retry 5 seconds after a failed connection
        rel.dispatch()

    # Text cleansing
    def get_cleansing_text(self, text: str) -> str:
        """Strip dates, court names, boilerplate phrases and symbols from *text*.

        Args:
            text: Raw task text.

        Returns:
            The text with every matched span removed (nothing is inserted in
            its place).
        """
        text = re.sub(r'\d+?\.\s\d+\.\s\d+\.', '', text)  # remove dates such as "2021. 3. 5."
        # Remove court names.
        # NOTE(review): '|' has the lowest precedence, so this also removes every
        # bare "지원" anywhere in the text; confirm that is intended before
        # changing it, since trained models may depend on this preprocessing.
        text = re.sub(r'\b\w+법원|지원', '', text)
        text = re.sub('수사보고|범 죄 사 실|범죄사실', '', text)  # remove standard precedent phrases
        text = re.sub(r'[「」『』\[\],.:%○]', '', text)  # remove special symbols
        return text

    # Noun extraction
    def get_nouns_from_text(self, text: str) -> list:
        """Return the filtered nouns of *text* from both analyzers.

        Nouns from the full analyzer come first, followed by nouns from the
        light analyzer; duplicates between the two are kept.

        Args:
            text: Cleansed text.

        Returns:
            A list of noun strings.
        """
        nouns = self._filter_nouns(self.komoran_full.get_nouns(text))
        nouns.extend(self._filter_nouns(self.komoran_light.get_nouns(text)))
        return nouns

    def _filter_nouns(self, candidates) -> list:
        """Keep alphabetic nouns longer than one character that are not stop words."""
        # str.isalpha() is False for anything containing digits or spaces, so
        # accepted nouns need no further whitespace stripping.
        return [
            noun for noun in candidates
            if noun.isalpha() and (noun not in self.stopwords) and (len(noun) > 1)
        ]

    def _lookup_word_vectors(self, words) -> list:
        """Return the Word2Vec vectors of the words present in the vocabulary."""
        return [self.w2v_model[word] for word in words if word in self.w2v_model]

    # Imprisonment prediction
    def predict_imprisonment(self, nouns_str: str):
        """Predict the imprisonment value for a space-separated noun string.

        Args:
            nouns_str: Nouns joined by whitespace.

        Returns:
            The first value of the model's first output row, or ``0`` when
            none of the nouns is in the Word2Vec vocabulary.
        """
        # Convert the text into Word2Vec vectors.
        word_vectors = self._lookup_word_vectors(nouns_str.split())

        # None of the nouns exists in the vocabulary.
        if not word_vectors:
            return 0

        text_embedding = np.mean(word_vectors, axis=0).reshape(1, -1)
        model_input = self.scaler.transform(text_embedding)  # apply scaling
        prediction = self.impr_model.predict(model_input)
        return prediction[0][0]

    # Similar precedents
    def get_similar_precedents(self, nouns: list) -> list:
        """Return up to five similar documents as ``"<tag> <percent>"`` strings.

        Args:
            nouns: Noun tokens describing the case.

        Returns:
            A list of at most five strings, each the document tag followed by
            the similarity as a whole-number percentage.
        """
        input_data_vector = self.d2v_model.infer_vector(nouns)
        # Only five results are needed, so ask for exactly five.
        similar_cases = self.d2v_model.dv.most_similar([input_data_vector], topn=5)
        # Scale before rounding: int(round(s, 2) * 100) truncates values such
        # as 0.57 (-> 56.99999999999999) down to 56.
        return [
            '{} {}'.format(tag, int(round(similarity * 100)))
            for tag, similarity in similar_cases
        ]

    def send(self, data):
        """Send *data* over the socket, and echo it, while the app is running."""
        if self.ws.keep_running:
            self.ws.send(data)
            print("TurtleMQ ←", data)

# Build the worker (loads the dictionaries and models, fits the scaler) ...
worker = Worker()
# ... then connect to TurtleMQ and start dispatching websocket events.
worker.run()

2023-11-17 11:58:43.536509: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-17 11:58:43.644545: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-17 11:58:43.646495: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Connection opened
TurtleMQ ← {"type":"REGISTER_WORKER", "data":""}
TurtleMQ → {"type":"REGISTER_WORKER","messageId":null,"data":""}
TurtleMQ → {"type":"REQUEST_TASK","messageId":null,"data":"사고후미조치 도주치상 신호 또는 지시 위반 ","taskId":"867469cc-0f91-4584-b235-6d0150f54795"}
사고후미조치 도주치상 신호 또는 지시 위반
['사고후미조치', '도주치상', '후미', '조치', '도주', '치상', '신호', '지시']
TurtleMQ ← {"type": "RESPONSE_TASK", "taskId": "867469cc-0f91-4584-b235-6d0150f54795", "data": "{\"imprisonment\": 40, \"probation\": 0, \"fine\": 0, \"judgementDecision\": \"\\uc608\\uc0c1 \\ud310\\uacb0\\uc740 \\uc544\\uc9c1 \\uc9c0\\uc6d0\\ub418\\uc9c0 \\uc54a\\uc2b5\\ub2c8\\ub2e4.\", \"similarPrecedents\": [\"\\uc11c\\uc6b8\\uc11c\\ubd80\\uc9c0\\ubc29\\ubc95\\uc6d0/2022\\uace0\\ub2e8869 36\", \"\\ucc3d\\uc6d0\\uc9c0\\ubc29\\ubc95\\uc6d0\\uc9c4\\uc8fc\\uc9c0\\uc6d0/2021\\uace0\\ub2e81884 36\", \"\\uc758\\uc815\\ubd80\\uc9c0\\ubc29\\ubc95\\uc6d0/2021\\uace0\\ub2e81494 35\", \"\\ucc3d\\uc6d0\\uc9c0\\ubc29\\ubc95\\uc6d0\\uc9c4\\uc8fc\\uc9c0\\uc6d0