## Document To Vector

### Doc2Vec 모델 학습

In [33]:
import logging
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
import pandas as pd
import os

INPUT_FILE_PATH = "./output/04_data_output.xlsx" # path of the input Excel file
MODEL_OUPUT_PATH = "./model/" # path of the folder the trained model is written to

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # logging setup: timestamp, level and message at INFO level

# Restartable corpus wrapper that turns the Excel rows into Doc2Vec input
class Doc2VecInput:
    """Iterable of ``TaggedDocument`` objects built from an Excel sheet.

    The sheet must provide the columns ``court_name``, ``case_number`` and
    ``extracted_nouns``. Every row yields one document whose single tag is
    ``'<court_name>/<case_number>'`` and whose words are the
    whitespace-separated tokens of ``extracted_nouns``.

    The object may be iterated repeatedly. The Excel file is parsed on the
    first pass only and the resulting DataFrame is reused afterwards, so a
    consumer that walks the corpus many times does not re-read the file on
    each pass.
    """

    def __init__(self, filepath):
        """Remember *filepath*; the file itself is read lazily on first use."""
        self.filepath = filepath
        self._dataframe = None  # cache, filled by the first iteration

    def _load_dataframe(self):
        """Return the sheet as a DataFrame, reading the file at most once."""
        if self._dataframe is None:
            self._dataframe = pd.read_excel(self.filepath)
        return self._dataframe

    def __iter__(self):
        """Yield one ``TaggedDocument`` per row that has noun tokens."""
        dataframe = self._load_dataframe()
        rows = zip(
            dataframe['court_name'],
            dataframe['case_number'],
            dataframe['extracted_nouns'],
        )
        for court_name, case_number, nouns in rows:
            if not isinstance(nouns, str):
                # An empty cell is read as NaN (a float) and has no tokens
                # to split, so the row is skipped instead of crashing.
                continue
            # The document ID is 'court name/case number'.
            tags = [court_name + "/" + case_number]
            # A generator keeps only one document in memory at a time.
            yield TaggedDocument(words=nouns.split(), tags=tags)

# Define the model input (a restartable iterable over the Excel rows)
document_input = Doc2VecInput(INPUT_FILE_PATH)

# Initialise the model, build the vocabulary, then train.
# build_vocab() makes one pass over the corpus and sets model.corpus_count,
# which train() needs as total_examples.
model = Doc2Vec(vector_size=256, min_count=2, epochs=1000)
model.build_vocab(document_input)
model.train(document_input, total_examples=model.corpus_count, epochs=model.epochs)

# Create the output directory if it is missing; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(MODEL_OUPUT_PATH, exist_ok=True)

# Save the trained model
model.save(os.path.join(MODEL_OUPUT_PATH, "d2v_1107.model"))

2023-11-07 07:06:35,999 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/m,d128,n5,w5,mc3,s0.001,t3>', 'datetime': '2023-11-07T07:06:35.999948', 'gensim': '4.2.0', 'python': '3.7.6 | packaged by conda-forge | (default, Jun  1 2020, 18:57:50) \n[GCC 7.5.0]', 'platform': 'Linux-5.15.49-linuxkit-pr-x86_64-with-debian-buster-sid', 'event': 'created'}
2023-11-07 07:06:36,003 : INFO : collecting all words and their counts
2023-11-07 07:06:42,515 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2023-11-07 07:06:44,335 : INFO : PROGRESS: at example #10000, processed 1581325 words (869413 words/s), 11253 word types, 9944 tags
2023-11-07 07:06:46,198 : INFO : PROGRESS: at example #20000, processed 3298534 words (922339 words/s), 14997 word types, 18040 tags
2023-11-07 07:06:46,396 : INFO : collected 15145 word types and 18211 unique tags from a corpus of 20171 examples and 3342240 words
2023-11-07 07:06:46,397 : INFO : Creating a fresh vocabulary
2023-

### 모델 불러오기 및 유사 판례 추출

In [39]:
from gensim.models import Doc2Vec

MODEL_PATH = "./model/d2v_1107.model"  # model file to load

# Query tokens: a whitespace-separated string split into a token list
inp_tokens = "무면허운전 음주운전 어린이보호구역".split()

# Load the model, infer a vector for the query and print the nearest documents
model = Doc2Vec.load(MODEL_PATH)
inp_vector = model.infer_vector(inp_tokens)
# `dv` is the gensim 4.x name for the document vectors; the old `docvecs`
# alias is deprecated and emits a warning.
for case in model.dv.most_similar([inp_vector]):
    print(case)  # (tag, similarity) tuple

2023-11-07 07:38:46,601 : INFO : loading Doc2Vec object from ./model/d2v_1107_extracted_nouns.model
2023-11-07 07:38:46,678 : INFO : loading dv recursively from ./model/d2v_1107_extracted_nouns.model.dv.* with mmap=None
2023-11-07 07:38:46,679 : INFO : loading wv recursively from ./model/d2v_1107_extracted_nouns.model.wv.* with mmap=None
2023-11-07 07:38:46,680 : INFO : setting ignored attribute cum_table to None
2023-11-07 07:38:47,002 : INFO : Doc2Vec lifecycle event {'fname': './model/d2v_1107_extracted_nouns.model', 'datetime': '2023-11-07T07:38:47.002091', 'gensim': '4.2.0', 'python': '3.7.6 | packaged by conda-forge | (default, Jun  1 2020, 18:57:50) \n[GCC 7.5.0]', 'platform': 'Linux-5.15.49-linuxkit-pr-x86_64-with-debian-buster-sid', 'event': 'loaded'}


('의정부지방법원/2023고합37', 0.4318314790725708)
('의정부지방법원고양지원/2023고합37', 0.4263140857219696)
('울산지방법원/2021고단3406', 0.4126560389995575)
('울산지방법원/2023고단987', 0.4079866111278534)
('의정부지방법원/2022고단1530', 0.3870875835418701)
('의정부지방법원고양지원/2022고단1530', 0.38507965207099915)
('서울남부지방법원/2020고정2086', 0.38165995478630066)
('서울남부지방법원/2023고단1524', 0.3773573338985443)
('서울남부지방법원/2022고단3022', 0.3755127191543579)
('창원지방법원/2021고단2784', 0.3686881959438324)


  # Remove the CWD from sys.path while we load stuff.
