# GPT (시간 체크)

In [None]:
import re
import os
import torch
import time
import gc
from typing import List
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyMuPDFLoader
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.embeddings.base import Embeddings

# Choose the compute device: the Apple-Silicon GPU backend (MPS) when PyTorch
# reports it as available, otherwise the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

class SentenceTransformerEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter around a SentenceTransformer model.

    The model is created on the module-level ``device``. A Hugging Face
    tokenizer for the same checkpoint is kept on ``self.tokenizer``.
    """

    def __init__(self, model_name):
        """Load the SentenceTransformer and the tokenizer for ``model_name``."""
        # NOTE(review): trust_remote_code=True presumably allows code shipped
        # with the checkpoint to run locally - confirm the model is trusted.
        self.model = SentenceTransformer(model_name, device=device, trust_remote_code=True)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def embed_documents(self, texts: List[str], batch_size: int = 32) -> List[List[float]]:
        """Embed ``texts`` in slices of ``batch_size`` items.

        Returns one list of floats per input text, in input order.
        """
        vectors = []
        for offset in range(0, len(texts), batch_size):
            encoded = self.model.encode(
                texts[offset:offset + batch_size],
                convert_to_tensor=True,
                device=device,
            )
            # Move the tensor off the accelerator before converting it to
            # plain Python lists.
            vectors.extend(encoded.cpu().tolist())
        return vectors

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string via ``embed_documents``."""
        single_result = self.embed_documents([text])
        return single_result[0]

def clean_text(text: str) -> str:
    """Strip boilerplate from a page of text and normalise its whitespace.

    Removed, in this order:

    1. the span from "법제처" to "국가법령정보센터" (presumably the page
       banner of the source site);
    2. everything enclosed in ``<...>``, ``[...]`` or ``(...)``, including
       spans that cross line breaks;
    3. phone-number-like digit groups such as ``02-123-4567``.

    Finally each whitespace run is collapsed to one space and the ends are
    trimmed.

    Args:
        text: Raw page text.

    Returns:
        The cleaned text.
    """
    text = re.sub(r"법제처\s+.*?\s+국가법령정보센터", "", text)
    text = re.sub(r"<.*?>|\[.*?]|\(.*?\)", "", text, flags=re.DOTALL)
    # A former extra pass for "(...) <phone number>" was removed here: after
    # the substitution above no "(" that is followed by a ")" can remain, so
    # that pattern could never match. The output is unchanged.
    text = re.sub(r"\d{2,4}[-.\s]\d{3,4}[-.\s]\d{4}", "", text)
    return re.sub(r"\s+", " ", text).strip()

# Tokenizers already loaded by tokenizer_len, keyed by checkpoint name.
_TOKENIZER_CACHE = {}


def tokenizer_len(text: str) -> int:
    """Return how many token ids the e5 tokenizer produces for ``text``.

    The tokenizer is loaded on first use and then reused. The text splitter
    calls this function many times per page, and loading the tokenizer on
    every call dominated the splitting time.

    Args:
        text: Text to measure.

    Returns:
        ``len(tokenizer.encode(text))``.
    """
    checkpoint = "intfloat/multilingual-e5-large"
    tokenizer = _TOKENIZER_CACHE.get(checkpoint)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        _TOKENIZER_CACHE[checkpoint] = tokenizer
    return len(tokenizer.encode(text))

def process_document(doc: Document) -> List[str]:
    """Clean one loaded document and cut its text into overlapping chunks.

    Chunk length is measured with ``tokenizer_len``: at most 1000 units per
    chunk, with 200 units of overlap between neighbours.

    Returns:
        The chunk strings produced by the splitter.
    """
    page_text = clean_text(doc.page_content)

    # NOTE(review): the embedding checkpoint used in this cell is believed to
    # accept at most 512 tokens per input - confirm that 1000-token chunks are
    # not silently truncated at embedding time.
    chunker = RecursiveCharacterTextSplitter(
        length_function=tokenizer_len,
        chunk_overlap=200,
        chunk_size=1000,
    )

    chunks = chunker.split_text(page_text)
    return chunks

def process_file(file_path: str, model_name: str, persist_directory: str):
    """Load a PDF, split it into chunks, embed them and store them in Chroma.

    The wall-clock time of each stage (load, split, embed + persist) is
    printed as it completes.

    Args:
        file_path: Path of the PDF to ingest.
        model_name: Checkpoint name passed to ``SentenceTransformerEmbeddings``.
        persist_directory: Directory of the persistent Chroma store.

    Returns:
        Seconds from the start of loading to the end of the embed/persist
        stage. Constructing the embedding model is not included.
    """
    embedding_model = SentenceTransformerEmbeddings(model_name)

    # Stage 1: load the PDF.
    loader_start_time = time.time()
    loader = PyMuPDFLoader(file_path)
    documents = loader.load()
    loader_end_time = time.time()
    print(f"문서 로드 시간: {loader_end_time - loader_start_time:.2f} 초")

    # Stage 2: clean and split every loaded document into one flat chunk list.
    # (The loop previously used `doc.page_content` as its target, which raised
    # NameError because `doc` was never bound.)
    split_start_time = time.time()
    docs_split = [chunk for doc in documents for chunk in process_document(doc)]
    split_end_time = time.time()
    print(f"문서 분할 시간: {split_end_time - split_start_time:.2f} 초")

    # The LangChain Chroma constructor has no `anonymized_telemetry` parameter,
    # so the opt-out is requested through the environment variable that
    # Chroma's settings read. setdefault keeps any value the user already set.
    os.environ.setdefault("ANONYMIZED_TELEMETRY", "False")
    client = Chroma(embedding_function=embedding_model, persist_directory=persist_directory)

    # Stage 3: embed and store. The chunks are plain strings, so add_texts is
    # used; add_documents expects Document objects.
    embedding_start_time = time.time()
    batch_size = 1000  # number of chunks handed to Chroma per call
    for i in range(0, len(docs_split), batch_size):
        batch = docs_split[i:i + batch_size]
        client.add_texts(batch)
        client.persist()  # flush after every batch
    embedding_end_time = time.time()
    print(f"벡터 추가 및 저장 시간: {embedding_end_time - embedding_start_time:.2f} 초")

    # Drop the large objects, collect garbage, then release cached MPS memory.
    del embedding_model, loader, documents, docs_split, client
    gc.collect()
    if torch.backends.mps.is_available():
        torch.mps.empty_cache()

    total_time = embedding_end_time - loader_start_time
    return total_time

def main():
    """Run ``process_file`` on every ``.pdf`` entry of the input directory.

    Prints a progress line before and a timing line after each file.
    """
    directory_path = 'file/현행법령/'
    persist_directory = "file/chroma_storage/multilingual_law/"
    model_name = "intfloat/multilingual-e5-large"

    for file_name in os.listdir(directory_path):
        if not file_name.endswith('.pdf'):
            continue

        print(f"{file_name} 파일 처리 중...")
        total_processing_time = process_file(
            os.path.join(directory_path, file_name),
            model_name,
            persist_directory,
        )
        print(f"{file_name} 처리 완료. 총 소요 시간: {total_processing_time:.2f} 초")

    print("모든 파일 처리 완료.")


# Run the ingestion only when executed as a script / notebook cell.
if __name__ == "__main__":
    main()

# 노배치

In [None]:
import re
import os
import torch
import time
import gc
from typing import List
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyMuPDFLoader
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.embeddings.base import Embeddings

# Default to the CPU and switch to the Apple-Silicon GPU backend (MPS) when
# PyTorch reports it as available.
device = torch.device("cpu")
if torch.backends.mps.is_available():
    device = torch.device("mps")
print(f"Using device: {device}")

class SentenceTransformerEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter that encodes all texts in one call.

    The SentenceTransformer lives on the module-level ``device``; the
    matching Hugging Face tokenizer is exposed as ``self.tokenizer``.
    """

    def __init__(self, model_name):
        """Load the model and tokenizer identified by ``model_name``."""
        # NOTE(review): trust_remote_code=True presumably allows code shipped
        # with the checkpoint to run locally - confirm the model is trusted.
        self.model = SentenceTransformer(model_name, device=device, trust_remote_code=True)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Encode ``texts`` with a single ``encode`` call.

        Returns one list of floats per input text.
        """
        encoded = self.model.encode(texts, convert_to_tensor=True, device=device)
        on_cpu = encoded.cpu()
        return on_cpu.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Embed one query string by delegating to ``embed_documents``."""
        vectors = self.embed_documents([text])
        return vectors[0]

def clean_text(documents: List[Document]) -> List[Document]:
    """Return cleaned copies of ``documents``.

    For each document's ``page_content`` the following are removed, in order:

    1. the span from "법제처" to "국가법령정보센터" (presumably the page
       banner of the source site);
    2. everything enclosed in ``<...>``, ``[...]`` or ``(...)``, including
       spans that cross line breaks;
    3. phone-number-like digit groups such as ``02-123-4567``.

    Whitespace runs are then collapsed to one space and the ends trimmed.

    Args:
        documents: Documents to clean. They are not modified.

    Returns:
        New ``Document`` objects in input order. Each one reuses the original
        ``metadata`` object; it is not copied.
    """
    cleaned_documents = []
    for doc in documents:
        text = doc.page_content
        text = re.sub(r"법제처\s+.*?\s+국가법령정보센터", "", text)
        text = re.sub(r"<.*?>|\[.*?]|\(.*?\)", "", text, flags=re.DOTALL)
        # A former extra pass for "(...) <phone number>" was removed here:
        # after the substitution above no "(" that is followed by a ")" can
        # remain, so that pattern could never match. The output is unchanged.
        text = re.sub(r"\d{2,4}[-.\s]\d{3,4}[-.\s]\d{4}", "", text)
        text = re.sub(r"\s+", " ", text)
        cleaned_documents.append(Document(
            page_content=text.strip(),
            metadata=doc.metadata
        ))
    return cleaned_documents

def tokenizer_len(text: str) -> int:
    """Return the e5 tokenizer's token count for ``text`` and print the timing.

    The tokenizer is loaded once and kept on the function object. Previously
    it was reloaded on every call, and that load time was included in the
    printed figure. The printed time now covers tokenisation only.

    Args:
        text: Text to measure.

    Returns:
        ``len(tokenizer.encode(text))``.
    """
    tokenizer = getattr(tokenizer_len, "_tokenizer", None)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-large")
        tokenizer_len._tokenizer = tokenizer

    # Timing for tokenizer performance testing.
    start_time = time.time()
    token_length = len(tokenizer.encode(text))
    end_time = time.time()
    print(f"토큰 길이 계산 시간: {end_time - start_time:.2f} 초")
    return token_length

def process_document(doc: Document, embedding_model) -> List[str]:
    """Clean one document and split its text into overlapping chunks.

    Chunk length is measured with ``embedding_model.tokenizer``: at most 1000
    token ids per chunk, with 200 of overlap. The chunk count and the time
    spent splitting are printed.

    Args:
        doc: The document to process.
        embedding_model: Object exposing a ``tokenizer`` with ``encode``.

    Returns:
        The chunk strings produced by the splitter.
    """
    # clean_text takes and returns a *list* of Documents. Wrap the single
    # document and take the text of the one result. Passing `doc` directly
    # failed and would have handed a list to split_text.
    cleaned_text = clean_text([doc])[0].page_content

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=lambda text: len(embedding_model.tokenizer.encode(text)),
    )

    split_start_time = time.time()
    split_texts = splitter.split_text(cleaned_text)
    split_end_time = time.time()

    # Report how many chunks were produced and how long splitting took.
    print(f"문서 분할 완료: {len(split_texts)}개의 청크 생성, 분할 시간: {split_end_time - split_start_time:.2f} 초")
    return split_texts

def process_file(file_path: str, model_name: str, persist_directory: str):
    """Load a PDF, split each loaded document, then embed everything at once.

    Timing is printed for loading, for each document's split, for the whole
    split stage, and for the embed + persist stage.

    Args:
        file_path: Path of the PDF to ingest.
        model_name: Checkpoint name passed to ``SentenceTransformerEmbeddings``.
        persist_directory: Directory of the persistent Chroma store.

    Returns:
        Seconds from the start of loading to the end of the embed/persist
        stage. Constructing the embedding model is not included.
    """
    embedding_model = SentenceTransformerEmbeddings(model_name)

    loader_start_time = time.time()
    loader = PyMuPDFLoader(file_path)
    documents = loader.load()
    loader_end_time = time.time()
    print(f"문서 로드 시간: {loader_end_time - loader_start_time:.2f} 초")

    docs_split = []

    # Split each document and report its individual processing time.
    # enumerate supplies the `idx` used in the progress line; it was
    # previously undefined and raised NameError.
    for idx, doc in enumerate(documents):
        doc_split_start_time = time.time()
        split_texts = process_document(doc, embedding_model)
        docs_split.extend(split_texts)
        doc_split_end_time = time.time()
        print(f"문서 {idx + 1} 분할 완료, 시간: {doc_split_end_time - doc_split_start_time:.2f} 초")

    # All documents are split; report totals before embedding.
    split_end_time = time.time()
    print(f"전체 문서 분할 완료. 총 생성된 청크 수: {len(docs_split)}, 총 분할 시간: {split_end_time - loader_end_time:.2f} 초")

    client = Chroma(embedding_function=embedding_model, persist_directory=persist_directory)

    embedding_start_time = time.time()
    # The chunks are plain strings, so add_texts is used; add_documents
    # expects Document objects. Skip the call when nothing was extracted,
    # e.g. a PDF without a text layer.
    if docs_split:
        client.add_texts(docs_split)
    client.persist()
    embedding_end_time = time.time()
    print(f"벡터 추가 및 저장 시간: {embedding_end_time - embedding_start_time:.2f} 초")

    # Drop the large objects, collect garbage, then release cached MPS memory.
    del embedding_model, loader, documents, docs_split, client
    gc.collect()
    if torch.backends.mps.is_available():
        torch.mps.empty_cache()

    total_time = embedding_end_time - loader_start_time
    return total_time

def main():
    """Ingest each ``.pdf`` entry of the input directory with ``process_file``.

    A progress line is printed before each file and a timing line after it.
    """
    directory_path = 'file/현행법령/'
    persist_directory = "file/chroma_storage/multilingual_law/"
    model_name = "intfloat/multilingual-e5-large"

    pdf_names = list(filter(lambda entry: entry.endswith('.pdf'), os.listdir(directory_path)))

    for file_name in pdf_names:
        print(f"{file_name} 파일 처리 중...")

        file_path = os.path.join(directory_path, file_name)
        total_processing_time = process_file(file_path, model_name, persist_directory)

        print(f"{file_name} 처리 완료. 총 소요 시간: {total_processing_time:.2f} 초")

    print("모든 파일 처리 완료.")


# Run the ingestion only when executed as a script / notebook cell.
if __name__ == "__main__":
    main()

In [None]:
import re
import os
import torch
import time
import gc
from typing import List
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyMuPDFLoader
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.embeddings.base import Embeddings
from concurrent.futures import ProcessPoolExecutor

# Use the CPU unless PyTorch reports the Apple-Silicon GPU backend (MPS).
device = torch.device("cpu" if not torch.backends.mps.is_available() else "mps")
print(f"Using device: {device}")

class SentenceTransformerEmbeddings(Embeddings):
    """Batching LangChain ``Embeddings`` adapter for a SentenceTransformer.

    The model runs on the module-level ``device``; the checkpoint's Hugging
    Face tokenizer is available as ``self.tokenizer``.
    """

    def __init__(self, model_name):
        """Load the model and tokenizer named ``model_name``."""
        # NOTE(review): trust_remote_code=True presumably allows code shipped
        # with the checkpoint to run locally - confirm the model is trusted.
        self.model = SentenceTransformer(model_name, device=device, trust_remote_code=True)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def embed_documents(self, texts: List[str], batch_size: int = 32) -> List[List[float]]:
        """Return one embedding per text, encoding ``batch_size`` texts at a time."""
        collected = []
        for begin in range(0, len(texts), batch_size):
            end = begin + batch_size
            tensor = self.model.encode(texts[begin:end], convert_to_tensor=True, device=device)
            # Copy back to host memory, then convert to nested Python lists.
            rows = tensor.cpu().tolist()
            collected.extend(rows)
        return collected

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query by embedding a one-element list."""
        embedded = self.embed_documents([text])
        return embedded[0]

def clean_text(text: str) -> str:
    """Remove boilerplate spans from ``text`` and normalise whitespace.

    The removal passes run in this order:

    1. the span from "법제처" to "국가법령정보센터" (presumably the page
       banner of the source site);
    2. everything enclosed in ``<...>``, ``[...]`` or ``(...)``; this pass
       uses ``re.DOTALL`` so a span may cross line breaks;
    3. phone-number-like digit groups such as ``02-123-4567``.

    A former fourth pass for "(...) <phone number>" was dropped. It ran after
    pass 2, which leaves no "(" that is followed by a ")", so it could never
    match. The output is unchanged.

    Args:
        text: Raw page text.

    Returns:
        The cleaned text with whitespace runs collapsed to single spaces and
        both ends trimmed.
    """
    removal_passes = (
        (r"법제처\s+.*?\s+국가법령정보센터", 0),
        (r"<.*?>|\[.*?]|\(.*?\)", re.DOTALL),
        (r"\d{2,4}[-.\s]\d{3,4}[-.\s]\d{4}", 0),
    )
    for pattern, flags in removal_passes:
        text = re.sub(pattern, "", text, flags=flags)
    return re.sub(r"\s+", " ", text).strip()

# Tokenizer used by tokenizer_len, loaded on first use (once per process).
_E5_TOKENIZER = None


def tokenizer_len(text: str) -> int:
    """Return how many token ids the e5 tokenizer produces for ``text``.

    The tokenizer is loaded on the first call and then reused. Loading it on
    every call, as before, made each length measurement far more expensive
    than the tokenisation itself.

    Args:
        text: Text to measure.

    Returns:
        ``len(tokenizer.encode(text))``.
    """
    global _E5_TOKENIZER
    if _E5_TOKENIZER is None:
        _E5_TOKENIZER = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-large")
    return len(_E5_TOKENIZER.encode(text))

def process_document_parallel(doc: Document) -> List[str]:
    """Clean one document and split it into chunks.

    This is the function mapped over the documents by the process pool in
    ``process_file``. Chunk length is measured with ``tokenizer_len``: at most
    1000 units per chunk, with 200 units of overlap.

    Returns:
        The chunk strings produced by the splitter.
    """
    page_text = clean_text(doc.page_content)
    splitter_options = {
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": tokenizer_len,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(page_text)

def process_file(file_path: str, model_name: str, persist_directory: str):
    """Load a PDF, split its documents in parallel, embed and store in Chroma.

    The wall-clock time of each stage (load, split, embed + persist) is
    printed as it completes.

    Args:
        file_path: Path of the PDF to ingest.
        model_name: Checkpoint name passed to ``SentenceTransformerEmbeddings``.
        persist_directory: Directory of the persistent Chroma store.

    Returns:
        Seconds from the start of loading to the end of the embed/persist
        stage. Constructing the embedding model is not included.
    """
    embedding_model = SentenceTransformerEmbeddings(model_name)

    # Stage 1: load the PDF.
    loader_start_time = time.time()
    loader = PyMuPDFLoader(file_path)
    documents = loader.load()
    loader_end_time = time.time()
    print(f"문서 로드 시간: {loader_end_time - loader_start_time:.2f} 초")

    # Stage 2: split the documents in worker processes.
    # NOTE(review): worker processes must be able to import
    # process_document_parallel. When this runs from a notebook with the
    # "spawn" start method that may fail - confirm, or run as a script.
    split_start_time = time.time()
    with ProcessPoolExecutor() as executor:
        docs_split = list(executor.map(process_document_parallel, documents))

    # Flatten the per-document chunk lists into one list.
    docs_split = [item for sublist in docs_split for item in sublist]
    split_end_time = time.time()
    print(f"문서 분할 시간: {split_end_time - split_start_time:.2f} 초")

    # The LangChain Chroma constructor has no `anonymized_telemetry` parameter,
    # so the opt-out is requested through the environment variable that
    # Chroma's settings read. setdefault keeps any value the user already set.
    os.environ.setdefault("ANONYMIZED_TELEMETRY", "False")
    client = Chroma(embedding_function=embedding_model, persist_directory=persist_directory)

    # Stage 3: embed and store. The chunks are plain strings, so add_texts is
    # used; add_documents expects Document objects.
    embedding_start_time = time.time()
    batch_size = 1000  # number of chunks handed to Chroma per call
    for i in range(0, len(docs_split), batch_size):
        batch = docs_split[i:i + batch_size]
        client.add_texts(batch)
        client.persist()  # flush after every batch
    embedding_end_time = time.time()
    print(f"벡터 추가 및 저장 시간: {embedding_end_time - embedding_start_time:.2f} 초")

    # Drop the large objects, collect garbage, then release cached MPS memory.
    del embedding_model, loader, documents, docs_split, client
    gc.collect()
    if torch.backends.mps.is_available():
        torch.mps.empty_cache()

    total_time = embedding_end_time - loader_start_time
    return total_time

def main():
    """Call ``process_file`` for each ``.pdf`` entry of the input directory.

    Prints a progress line before and a timing line after every file.
    """
    directory_path = 'file/현행법령/'
    persist_directory = "file/chroma_storage/multilingual_law/"
    model_name = "intfloat/multilingual-e5-large"

    file_names = []
    for entry in os.listdir(directory_path):
        if entry.endswith('.pdf'):
            file_names.append(entry)

    for file_name in file_names:
        print(f"{file_name} 파일 처리 중...")

        total_processing_time = process_file(
            os.path.join(directory_path, file_name), model_name, persist_directory
        )

        print(f"{file_name} 처리 완료. 총 소요 시간: {total_processing_time:.2f} 초")

    print("모든 파일 처리 완료.")


# Run the ingestion only when executed as a script / notebook cell.
if __name__ == "__main__":
    main()

# 퍼플렉시티

In [1]:
import os
import torch
from typing import List
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyMuPDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings.base import Embeddings
import time

# Select the Apple-Silicon GPU backend (MPS) when available, else the CPU.
if not torch.backends.mps.is_available():
    device = torch.device("cpu")
else:
    device = torch.device("mps")
print(f"Using device: {device}")

class SentenceTransformerEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter backed by a SentenceTransformer.

    The model is created on the module-level ``device``; ``self.tokenizer``
    holds the Hugging Face tokenizer of the same checkpoint.
    """

    def __init__(self, model_name):
        """Load the model and the tokenizer for ``model_name``."""
        # NOTE(review): trust_remote_code=True presumably allows code shipped
        # with the checkpoint to run locally - confirm the model is trusted.
        self.model = SentenceTransformer(model_name, device=device, trust_remote_code=True)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Encode all ``texts`` in one call and return lists of floats."""
        return self.model.encode(texts, convert_to_tensor=True, device=device).cpu().tolist()

    def embed_query(self, text: str) -> List[float]:
        """Embed one query string via ``embed_documents``."""
        query_batch = [text]
        return self.embed_documents(query_batch)[0]


def process_file(file_path: str, model_name: str, persist_directory: str):
    """Load a PDF, split it into chunks and store their embeddings in Chroma.

    Args:
        file_path: Path of the PDF to ingest.
        model_name: Checkpoint name passed to ``SentenceTransformerEmbeddings``.
        persist_directory: Directory of the persistent Chroma store.

    Returns:
        None. The caller is expected to do its own timing.
    """
    import gc

    # NOTE(review): the model is rebuilt for every file and released again
    # below. If memory allows, reusing one instance across files would
    # presumably remove most of the per-file time - confirm by measuring.
    embedding_model = SentenceTransformerEmbeddings(model_name)
    loader = PyMuPDFLoader(file_path)
    documents = loader.load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=lambda text: len(embedding_model.tokenizer.encode(text)),
    )

    docs_split = splitter.split_documents(documents)

    # Only build and persist the store when there is something to add. A PDF
    # that yields no chunks (e.g. no text layer) is skipped rather than being
    # handed to Chroma as an empty batch.
    if docs_split:
        client = Chroma.from_documents(docs_split, embedding_model, persist_directory=persist_directory)
        client.persist()
        del client

    # Memory management: drop the large objects first ...
    del embedding_model, loader, documents, docs_split, splitter

    # ... then collect garbage *before* emptying the MPS cache, so memory held
    # only by collectable objects can be released as well. This matches the
    # order used by the other cells.
    gc.collect()
    if torch.backends.mps.is_available():
        torch.mps.empty_cache()

def main():
    """Time ``process_file`` on each ``.pdf`` entry of the input directory.

    One line with the elapsed wall-clock seconds is printed per file.
    """
    directory_path = 'file/현행법령/'
    # NOTE(review): the other cells of this notebook use this same directory
    # with a different embedding model. Confirm the store is not shared
    # between models.
    persist_directory = "file/chroma_storage/multilingual_law/"
    model_name = "jinaai/jina-embeddings-v3"

    pdf_names = [entry for entry in os.listdir(directory_path) if entry.endswith('.pdf')]
    for file_name in pdf_names:
        file_path = os.path.join(directory_path, file_name)
        start_time = time.time()
        process_file(file_path, model_name, persist_directory)
        end_time = time.time()
        print(f"{file_name} 처리 완료.\t소요 시간 : {end_time - start_time:.2f} 초")

    print("모든 파일 처리 완료.")


# Run the ingestion only when executed as a script / notebook cell.
if __name__ == "__main__":
    main()

Using device: mps


  client.persist()


첨단재생의료 및 첨단바이오의약품 안전 및 지원에 관한 법률(법률)(제20331호)(20240521).pdf 처리 완료.	소요 시간 : 24.15 초
차세대전자소송 추진단 설치 및 운영에 관한 규칙(대법원규칙)(제02874호)(20200101).pdf 처리 완료.	소요 시간 : 12.24 초
민사조정법(법률)(제16910호)(20200305).pdf 처리 완료.	소요 시간 : 12.47 초
취업 후 학자금 상환 특별법 시행규칙(교육부령)(제00333호)(20240701).pdf 처리 완료.	소요 시간 : 12.19 초
국군방첩사령부령(대통령령)(제33409호)(20230418).pdf 처리 완료.	소요 시간 : 11.06 초
정부대표 및 특별사절의 임명과 권한에 관한 법률(법률)(제17160호)(20200331).pdf 처리 완료.	소요 시간 : 10.24 초
달빛철도 건설을 위한 특별법(법률)(제20293호)(20240814).pdf 처리 완료.	소요 시간 : 12.48 초
한국마사회법 시행령(대통령령)(제34577호)(20240621).pdf 처리 완료.	소요 시간 : 10.77 초
연안관리법 시행규칙(해양수산부령)(제00460호)(20210219).pdf 처리 완료.	소요 시간 : 12.92 초
허베이 스피리트호 유류오염사고 피해주민의 지원 및 해양환경의 복원 등에 관한 특별법 시행령(대통령령)(제30977호)(20200828).pdf 처리 완료.	소요 시간 : 12.95 초
실내공기질 관리법(법률)(제19720호)(20240315).pdf 처리 완료.	소요 시간 : 14.30 초
의료ㆍ요양 등 지역 돌봄의 통합지원에 관한 법률(법률)(제20415호)(20260327).pdf 처리 완료.	소요 시간 : 12.58 초
대한민국과아메리카합중국간의상호방위조약제4조에의한시설과구역및대한민국에서의합중국군대의지위에관한협정의시행에관한민사특별법시행령(대통령령)(제27960호)(2017033

KeyboardInterrupt: 