## Worker

In [1]:
import json
import re
from typing import List

import rel
import websocket
from gensim.models import Doc2Vec
from PyKomoran import *


# Path of the trained Doc2Vec model loaded by Worker.__init__.
MODEL_PATH = "./model/d2v_1110.model"
# WebSocket endpoint of the TurtleMQ server the worker connects to.
TURTLEMQ_URL = "ws://172.18.0.2:8080/turtle/"
# User dictionary handed to the "EXP" Komoran analyzer via set_user_dic().
KOMORAN_USER_DIC_PATH = "./user_dic/dic1.user"
# Stopword list; Worker.__init__ reads it line by line (one entry per line).
STOPWORDS_DICT_PATH = "./user_dic/stopwords"


class Worker:
    """TurtleMQ worker that answers "similar precedent" lookup tasks.

    The worker connects to TurtleMQ over a WebSocket, registers itself, and
    for every ``REQUEST_TASK`` message cleanses the submitted text, extracts
    nouns with two Komoran analyzers, and asks the Doc2Vec model for the most
    similar documents. The answer is sent back as a ``RESPONSE_TASK`` message.
    """

    # Cleansing patterns, applied in this order by get_cleansing_text().
    _DATE_PATTERN = re.compile(r'\d+?\.\s\d+\.\s\d+\.')
    # The alternation is grouped so that only whole words ending in
    # "법원"/"지원" are removed. The ungrouped form r'\b\w+법원|지원' parsed as
    # (\b\w+법원)|(지원) and stripped "지원" out of unrelated words.
    # NOTE(review): if the Doc2Vec model was trained on text cleansed with
    # the ungrouped pattern, confirm this does not skew inference.
    _COURT_PATTERN = re.compile(r'\b\w+(?:법원|지원)')
    _BOILERPLATE_PATTERN = re.compile('수사보고|범 죄 사 실|범죄사실')
    _SYMBOL_PATTERN = re.compile(r'[「」『』\[\],.:%○]')

    def __init__(self):
        """Load the Doc2Vec model, both Komoran analyzers and the stopwords."""
        # Set by run(); kept as None until then so send()/on_exit() are safe
        # to call before a connection exists.
        self.ws = None

        # Doc2Vec model
        self.d2v_model = Doc2Vec.load(MODEL_PATH)

        # Morphological analyzers (referred to below as "light" and "full");
        # only the full one gets the user dictionary.
        self.komoran_light = Komoran("STABLE")
        self.komoran_full = Komoran("EXP")
        self.komoran_full.set_user_dic(KOMORAN_USER_DIC_PATH)

        # Stopword set, one entry per line.
        # Explicit encoding: the platform default is not guaranteed to be
        # UTF-8 (assumes the file is UTF-8 -- TODO confirm).
        with open(STOPWORDS_DICT_PATH, "r", encoding="utf-8") as f:
            self.stopwords = {line.strip() for line in f}

    def on_open(self, ws):
        """WebSocket callback: register this process as a worker."""
        print("Connection opened")
        # Initial packet (worker registration)
        self.send('{"type":"REGISTER_WORKER", "data":""}')

    def on_message(self, ws, message):
        """WebSocket callback: handle one TurtleMQ message.

        Only JSON objects whose ``type`` is ``REQUEST_TASK`` are processed;
        anything else is logged and ignored.
        """
        print("TurtleMQ →", message)

        message = json.loads(message)

        # Ignore non-object payloads and messages without a matching type
        # instead of raising from inside the callback.
        if not isinstance(message, dict) or message.get("type") != "REQUEST_TASK":
            return

        # Text cleansing
        cleansing_text = self.get_cleansing_text(message['data'])

        # Noun extraction
        extracted_nouns = self.get_nouns_from_text(cleansing_text)

        result = {
            "imprisonment": 0,
            "probation": 0,
            "fine": 0,
            "judgementDecision": "예상 판결은 아직 지원되지 않습니다.",
            # Similar precedent lookup
            "similarPrecedents": self.get_similar_precedents(extracted_nouns),
        }

        # The result is embedded as a JSON string inside the outer message.
        response = {
            "type": "RESPONSE_TASK",
            "taskId": message["taskId"],
            "data": json.dumps(result),
        }
        self.send(json.dumps(response))

    def on_error(self, ws, error):
        """WebSocket callback: print the reported error."""
        print(error)

    def on_close(self, ws, close_status_code, close_msg):
        """WebSocket callback: delegate to on_exit()."""
        self.on_exit()

    def on_exit(self):
        """Close the socket if it is still running, then raise SystemExit.

        Raises:
            SystemExit: always, after the socket has been closed.
        """
        if self.ws is not None and self.ws.keep_running:
            self.ws.close()

        print("Connection closed")
        raise SystemExit("Socket connection is closed.")

    def run(self):
        """Create the WebSocketApp and run it on the ``rel`` dispatcher."""
        self.ws = websocket.WebSocketApp(TURTLEMQ_URL,
                                         on_open=self.on_open,
                                         on_message=self.on_message,
                                         on_error=self.on_error,
                                         on_close=self.on_close)

        # Inside IPython/Jupyter, run on_exit() after the cell finishes.
        # Outside IPython get_ipython is undefined, so the hook is skipped.
        try:
            shell = get_ipython()  # noqa: F821 - injected by IPython
        except NameError:
            shell = None
        if shell is not None:
            shell.events.register('post_execute', self.on_exit)

        # On connection failure, retry after 5 seconds.
        self.ws.run_forever(dispatcher=rel, reconnect=5)
        rel.dispatch()

    def get_cleansing_text(self, text) -> str:
        """Return *text* with dates, court names, boilerplate and symbols removed.

        Args:
            text: Raw input text.

        Returns:
            The text after applying, in order: date removal, court-name
            removal, removal of standard precedent phrases, and removal of
            selected special characters.
        """
        text = self._DATE_PATTERN.sub('', text)         # dates
        text = self._COURT_PATTERN.sub('', text)        # court names
        text = self._BOILERPLATE_PATTERN.sub('', text)  # stock precedent phrases
        text = self._SYMBOL_PATTERN.sub('', text)       # special characters
        return text

    def _filter_nouns(self, candidates) -> List[str]:
        """Keep purely alphabetic, non-stopword nouns longer than one character."""
        # isalpha() rejects digits and also any embedded space, so the kept
        # nouns never contain whitespace and need no further normalization.
        return [
            noun for noun in candidates
            if noun.isalpha() and noun not in self.stopwords and len(noun) > 1
        ]

    def get_nouns_from_text(self, text) -> List[str]:
        """Extract nouns from *text* with both analyzers.

        Args:
            text: Cleansed text.

        Returns:
            Filtered nouns from the full analyzer followed by filtered nouns
            from the light analyzer (duplicates are kept).
        """
        full_nouns = self._filter_nouns(self.komoran_full.get_nouns(text))
        light_nouns = self._filter_nouns(self.komoran_light.get_nouns(text))
        return full_nouns + light_nouns

    def get_similar_precedents(self, nouns: List[str], topn: int = 5) -> List[str]:
        """Return the documents most similar to *nouns*.

        Args:
            nouns: Tokens describing the query document.
            topn: Maximum number of results to return.

        Returns:
            Strings of the form ``"<tag> <similarity percent>"``, best match
            first. Empty when *nouns* is empty, because a vector inferred
            from no tokens carries no information about the query.
        """
        if not nouns:
            return []

        input_data_vector = self.d2v_model.infer_vector(nouns)
        similar = self.d2v_model.dv.most_similar([input_data_vector], topn=topn)
        # Scale before rounding: int(round(s, 2) * 100) truncated float
        # error (0.57 -> 56.99999999999999 -> 56).
        return [
            '{} {}'.format(tag, int(round(float(score) * 100)))
            for tag, score in similar
        ]

    def send(self, data):
        """Send *data* over the socket if it is running, and log it."""
        if self.ws is not None and self.ws.keep_running:
            self.ws.send(data)
            print("TurtleMQ ←", data)

# Build the worker (loads the model, analyzers and stopwords), then connect
# to TurtleMQ and start dispatching messages.
worker = Worker()
worker.run()

[Errno 111] Connection refused
[Errno 111] Connection refused


: 