## Document To Vector

### Doc2Vec 모델 학습

In [16]:
import logging
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
import pandas as pd
import os

# Path of the input Excel file
INPUT_FILE_PATH = "./output/04_data_output.xlsx"
# Path of the output folder for the trained model
MODEL_OUPUT_PATH = "./model/"

# Logging setup: INFO level, records rendered as "time : level : message"
_LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)

# Input format expected by the Doc2Vec model, wrapped in a re-iterable class
class Doc2VecInput:
    """Re-iterable stream of ``TaggedDocument`` objects built from an Excel file.

    Each row must provide the columns ``court_name``, ``case_number`` and
    ``extracted_nouns``. The document tag is ``"<court_name>/<case_number>"``
    and the words are the whitespace-separated tokens of ``extracted_nouns``.

    The spreadsheet is parsed once, on the first iteration, and then reused,
    so iterating the object several times (vocabulary scan plus every
    training epoch) does not re-read the file.

    Args:
        filepath: Path of the Excel file to read.
    """

    # Columns that must be present and non-empty for a row to be usable
    _REQUIRED_COLUMNS = ('court_name', 'case_number', 'extracted_nouns')

    def __init__(self, filepath):
        self.filepath = filepath
        self._dataframe = None  # populated lazily by _load()

    def _load(self):
        """Read the Excel file once and return the cached, cleaned rows."""
        if self._dataframe is None:
            raw = pd.read_excel(self.filepath)
            # Blank cells are read as NaN; they would break both the tag
            # concatenation and the token split, so drop those rows up front.
            cleaned = raw.dropna(subset=list(self._REQUIRED_COLUMNS))
            dropped = len(raw) - len(cleaned)
            if dropped:
                logging.warning(
                    "skipping %d row(s) with missing values in %s",
                    dropped, self.filepath,
                )
            self._dataframe = cleaned
        return self._dataframe

    def __iter__(self):
        """Yield one ``TaggedDocument`` per usable row, lazily."""
        dataframe = self._load()
        rows = zip(
            dataframe['court_name'],
            dataframe['case_number'],
            dataframe['extracted_nouns'],
        )
        for court_name, case_number, nouns in rows:
            tags = [f"{court_name}/{case_number}"]  # document ID is 'court name/case number'
            tokens = str(nouns).split()  # extracted_nouns supplies the tokens
            # yield hands documents out one at a time instead of building a list
            yield TaggedDocument(words=tokens, tags=tags)

# Define the model input (a re-iterable corpus, so it can be scanned repeatedly)
document_input = Doc2VecInput(INPUT_FILE_PATH)

# Initialise the model, build the vocabulary, then train
model = Doc2Vec(vector_size=256, min_count=5, epochs=10)
model.build_vocab(document_input)
model.train(document_input, total_examples=model.corpus_count, epochs=model.epochs)

# Create the output directory if needed; exist_ok avoids the check-then-create race
os.makedirs(MODEL_OUPUT_PATH, exist_ok=True)

# Save the trained model
model.save(os.path.join(MODEL_OUPUT_PATH, "d2v_1105.model"))

2023-11-05 14:48:42,337 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/m,d256,n5,w5,mc5,s0.001,t3>', 'datetime': '2023-11-05T14:48:42.337518', 'gensim': '4.2.0', 'python': '3.7.6 | packaged by conda-forge | (default, Jun  1 2020, 18:57:50) \n[GCC 7.5.0]', 'platform': 'Linux-5.15.49-linuxkit-pr-x86_64-with-debian-buster-sid', 'event': 'created'}
2023-11-05 14:48:42,341 : INFO : collecting all words and their counts


2023-11-05 14:48:44,937 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2023-11-05 14:48:46,684 : INFO : PROGRESS: at example #10000, processed 517170 words (296128 words/s), 9568 word types, 7888 tags
2023-11-05 14:48:46,936 : INFO : collected 9763 word types and 8345 unique tags from a corpus of 10495 examples and 543448 words
2023-11-05 14:48:46,937 : INFO : Creating a fresh vocabulary
2023-11-05 14:48:46,966 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=5 retains 3879 unique words (39.73% of original 9763, drops 5884)', 'datetime': '2023-11-05T14:48:46.966826', 'gensim': '4.2.0', 'python': '3.7.6 | packaged by conda-forge | (default, Jun  1 2020, 18:57:50) \n[GCC 7.5.0]', 'platform': 'Linux-5.15.49-linuxkit-pr-x86_64-with-debian-buster-sid', 'event': 'prepare_vocab'}
2023-11-05 14:48:46,967 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 532713 word corpus (98.02% of original 543448, drops 10735)', 'dateti

### 모델 불러오기 및 유사 판례 추출

In [17]:
from gensim.models import Doc2Vec

MODEL_PATH = "./model/d2v_1105.model"  # path of the saved Doc2Vec model

inp_tokens = "살인 강도".split()  # query tokens, split on whitespace

# Load the model, infer a vector for the query tokens and print the nearest documents
model = Doc2Vec.load(MODEL_PATH)
inp_vector = model.infer_vector(inp_tokens)
# gensim 4.x exposes document vectors as `dv`; `docvecs` is a deprecated alias
print(model.dv.most_similar([inp_vector]))

2023-11-05 14:49:57,949 : INFO : loading Doc2Vec object from ./model/d2v_1105.model
2023-11-05 14:49:57,974 : INFO : loading dv recursively from ./model/d2v_1105.model.dv.* with mmap=None
2023-11-05 14:49:57,975 : INFO : loading wv recursively from ./model/d2v_1105.model.wv.* with mmap=None
2023-11-05 14:49:57,976 : INFO : setting ignored attribute cum_table to None
2023-11-05 14:49:58,064 : INFO : Doc2Vec lifecycle event {'fname': './model/d2v_1105.model', 'datetime': '2023-11-05T14:49:58.064740', 'gensim': '4.2.0', 'python': '3.7.6 | packaged by conda-forge | (default, Jun  1 2020, 18:57:50) \n[GCC 7.5.0]', 'platform': 'Linux-5.15.49-linuxkit-pr-x86_64-with-debian-buster-sid', 'event': 'loaded'}


[('의정부지방법원/2021고단360', 0.9436867237091064), ('의정부지방법원/2020고단7079', 0.9436052441596985), ('서울남부지방법원/2021고단1250', 0.943051815032959), ('의정부지방법원/2021고단717', 0.9414937496185303), ('서울동부지방법원/2021고단2483', 0.9405165910720825), ('의정부지방법원/2021고단2777', 0.9365559220314026), ('서울동부지방법원/2021고단2887', 0.9355080127716064), ('서울동부지방법원/2022고단1078', 0.9339799880981445), ('의정부지방법원/2021고단2533', 0.9333634376525879), ('서울남부지방법원/2022고단2516', 0.9331834316253662)]


  if __name__ == '__main__':
