In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# !sudo apt update
# !sudo apt install -y pciutils
# !curl -fsSL https://ollama.com/install.sh | sh



In [None]:
# # Start Ollama server in background
# !ollama serve > /dev/null 2>&1 &

# # !ollama pull llama3
# # !ollama pull llama3.2:1b
# !ollama pull llama2

In [None]:
# pip install opencv-python sentence-transformers chromadb langchain langchain_ollama langchain_community PyPDF2

In [None]:
from langchain_community.llms import Ollama
import os
import PyPDF2
import threading
import subprocess
import time
import requests
import json
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate


# Install and run Ollama (these commands must be run in a Colab cell first)
# !sudo apt update
# !sudo apt install -y pciutils
# !curl -fsSL https://ollama.com/install.sh | sh

def run_ollama_serve():
    """Spawn ``ollama serve`` as a child process.

    The process handle is not kept and the call does not wait for the
    child, so this returns ``None`` as soon as the process is launched.
    """
    serve_command = ["ollama", "serve"]
    subprocess.Popen(serve_command)

def start_ollama_server(host="127.0.0.1", port=11434, timeout=5.0):
    """Make sure an Ollama server is listening, launching one only if needed.

    Re-running the notebook used to spawn a second ``ollama serve`` that
    failed with "address already in use". This version first probes the
    TCP port and returns immediately when something is already listening.
    Otherwise it launches the server in a daemon thread and polls the port
    until it accepts connections or ``timeout`` seconds have elapsed.

    Args:
        host: Address the server is expected to listen on.
        port: TCP port the server is expected to listen on.
        timeout: Maximum number of seconds to wait for a freshly launched
            server to become reachable.

    Returns:
        None. The call is best-effort: if the server is still unreachable
        after ``timeout`` seconds the function simply returns, as the
        original fixed-sleep implementation did.
    """
    import socket  # local import: only this function needs it

    def _server_is_listening():
        # A successful TCP connect is enough to know the port is bound.
        try:
            with socket.create_connection((host, port), timeout=0.5):
                return True
        except OSError:
            return False

    # Already running (e.g. the cell was executed before): nothing to start.
    if _server_is_listening():
        return

    thread = threading.Thread(target=run_ollama_serve)
    thread.daemon = True
    thread.start()

    # Wait for readiness instead of sleeping for a fixed amount of time.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if _server_is_listening():
            return
        time.sleep(0.2)


def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF file.

    Args:
        pdf_path: Path of the PDF file to read (opened in binary mode).

    Returns:
        A single string with the text of all pages joined in page order,
        with no separator inserted between pages.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Treat a falsy extraction result (None or "") as "no text" so a
        # single text-less page cannot abort the whole extraction, and join
        # once instead of growing the string page by page.
        return "".join(page.extract_text() or "" for page in reader.pages)

def create_vector_store(
    pdf_text,
    chunk_size=1000,
    chunk_overlap=200,
    model_name="intfloat/multilingual-e5-large-instruct",
    persist_directory="./chroma_db/CV",
):
    """Split text into chunks, embed them and store them in a Chroma store.

    Args:
        pdf_text: Full text to index.
        chunk_size: Chunk size passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Chunk overlap passed to the splitter.
        model_name: HuggingFace embedding model name.
        persist_directory: Directory handed to ``Chroma.from_texts``.

    Returns:
        The vector store returned by ``Chroma.from_texts``.

    Raises:
        ValueError: If ``pdf_text`` is empty or contains only whitespace
            (for example a PDF from which no text could be extracted).
    """
    # Fail fast with a clear message instead of handing an empty chunk list
    # to the embedding model / vector store.
    if not pdf_text or not pdf_text.strip():
        raise ValueError("pdf_text is empty; there is nothing to index.")

    # Split the text into overlapping chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = text_splitter.split_text(pdf_text)

    # Embedding model used to vectorise the chunks.
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    # NOTE(review): the store is given a persist directory, so re-running
    # against the same directory presumably adds to what is already there --
    # confirm, and clear the directory if duplicate chunks show up.
    return Chroma.from_texts(
        texts=chunks,
        embedding=embeddings,
        persist_directory=persist_directory,
    )


def load_llm(model_name="llama3.2"):
    """Build and return an ``Ollama`` LLM client for ``model_name``."""
    return Ollama(model=model_name)


def answer_interview_question(vector_store, question, k=4, model_name=None):
    """Answer an interview question using CV chunks retrieved from a store.

    Builds a prompt with ``{context}`` and ``{question}`` placeholders, wires
    it into a "stuff" ``RetrievalQA`` chain over ``vector_store`` and returns
    the chain's ``"result"`` for the question.

    Args:
        vector_store: Object exposing ``as_retriever(search_kwargs=...)``.
        question: The interview question to answer.
        k: Number of chunks requested from the retriever.
        model_name: Model passed to ``load_llm``; when ``None`` the default
            of ``load_llm`` is used.

    Returns:
        The value stored under ``"result"`` in the chain response.
    """
    # Prompt template. The first two sentences were corrected: the original
    # read "cadidate's friend" and "evaluate answer the candidate's
    # suitability", which garbled the instruction given to the model.
    template = """
    You are a candidate's friend.
    Objectively evaluate the candidate's suitability for the job based on the CV information below and the interview question.
    Based on the CV information below, please formulate the best answers to the interview questions to be hired.
    If the question is in Korean, answer in Korean.

    CV information:
    {context}

    Interview question: {question}
    Your answer should include
    1. Don't use "they", just use candidate's name.
    2. highlight relevant experience and skills from your CV
    3. include specific achievements and examples
    4. explain what value you can bring to the company
    5. maintain a professional and confident tone
    6. Answer with the numbering format.
    7. Answer the question in detail.

    """

    PROMPT = PromptTemplate(
        template=template,
        input_variables=["context", "question"]
    )

    # Set up the LLM and the retrieval chain.
    llm = load_llm() if model_name is None else load_llm(model_name)
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(search_kwargs={"k": k}),
        chain_type_kwargs={"prompt": PROMPT}
    )

    # Run the chain and return only the generated answer.
    response = qa_chain.invoke({"query": question})

    return response["result"]

def main(pdf_path, question=None):
    """Run the full pipeline: start Ollama, index the CV, answer a question.

    Args:
        pdf_path: Path of the CV PDF to index.
        question: Interview question to answer. When ``None`` the question
            is read interactively with ``input()``; passing it explicitly
            lets the pipeline run without user interaction.

    Returns:
        The answer produced by ``answer_interview_question``.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist. This is checked
            up front so the user is not asked for a question (and no server
            is started) for a run that cannot succeed.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"CV PDF not found: {pdf_path}")

    # Start the Ollama server.
    start_ollama_server()

    if question is None:
        question = input("Ask for me the question e.g. Does this cadidate suitable for the job as a data engineer at our company? ")

    # Extract the text from the PDF.
    pdf_text = extract_text_from_pdf(pdf_path)

    # Build the vector store.
    vector_store = create_vector_store(pdf_text)

    # Generate the answer to the question.
    return answer_interview_question(vector_store, question)




In [None]:
if __name__ == "__main__":
    # Location of the CV PDF (on Colab, use "/content/CV.pdf" instead).
    pdf_path = "CV.pdf"

    # Run the whole pipeline, then show the answer under a heading.
    result = main(pdf_path)
    print("\nAnswer:", result, sep="\n")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Error: listen tcp 127.0.0.1:11434: bind: address already in use


In [None]:
# import shutil
# if os.path.exists("./chroma_db"):
#     shutil.rmtree("./chroma_db")
