In [1]:
import getpass
import os
import shutil

import chromadb
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.vectorstores import utils as chromautils
from langchain_community.document_loaders import DirectoryLoader, UnstructuredMarkdownLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings, OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import MarkdownHeaderTextSplitter
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
# Credentials must not live in the notebook. Reuse whatever the environment
# already provides and prompt (without echo) only for keys that are missing.
# An empty answer leaves the variable unset, so a backend that does not need
# that key can still be used.
for _secret_name in ("OPENAI_API_KEY", "HUGGINGFACEHUB_API_TOKEN"):
    if not os.environ.get(_secret_name):
        _secret_value = getpass.getpass(f"{_secret_name} (leave blank to skip): ")
        if _secret_value:
            os.environ[_secret_name] = _secret_value

# Directory scanned for the Markdown source files (see load_documents).
DATA_PATH = 'markdown/'
# On-disk location handed to the persistent Chroma client.
CHROMA_DB_PATH = './db/chromadb/'
# NOTE(review): no active code in the visible cells reads these two values.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

# Instruction block substituted into the {system} slot of PROMPT_TEMPLATE.
# The text is sent to the model verbatim, so edits here change model behaviour.
SYSTEM_PROMPT = """<|SYSTEM|>
            The user is a researcher who wants to set up a synthetic biology experiment.
            The assistant is an expert on DNA assembly.
            The assistant have documents that describe experiments; Heading level 2 following two hashes(##) is the top heading in a document, which means the name of an experiment task consisting of many unit processes.
            If given information of the target experiment, the name of the target experiment will be a level-2 heading and the assistant needs to explain its experimental steps which are called "unit process" for each.
            The assistant should specify in English the device or materials(including their volume) which should be used for the experiment.
            When the assistant writes the experimental steps(that is unit process) for planning an experiment, it should follow the format of the documentation provided;
            Heading level 3 with three hashes(###) should be used as each unit process title with no indices other than the hashes such as "Step 1", "1." and so on.
            The title of the unit process should be followed by the details of the unit process including "Material", "Equipment" and its "Method" which should be with 4 hashes(####). "Material", "Equipment" and "Method" as level-4 headings should be written in English.
            More than 4 hashes or no hashes should be used for describing the details of the unit process.
"""

# Full prompt skeleton with three placeholders: {system} (SYSTEM_PROMPT),
# {context} (retrieved chunk text) and {question} (the user's query).
PROMPT_TEMPLATE = """
{system}

Extract the relevant content and hierarchy based on the following context.:
{context}
---

Answer the part of the question based on the above context: {question}
You must answer in a structured format separated by headers, such as above context.
"""

In [2]:
def load_documents():
    """Load every ``*.md`` file directly under ``DATA_PATH`` as raw text.

    Each file is read with ``TextLoader``, so its Markdown markup is kept
    unchanged in ``page_content``.

    Returns:
        The list of documents produced by ``DirectoryLoader.load()``.
    """
    loader = DirectoryLoader(
        DATA_PATH,
        glob="*.md",
        show_progress=True,
        loader_cls=TextLoader,
        # The notes contain Korean text; pin the encoding instead of relying
        # on the platform's locale default, which is not UTF-8 everywhere.
        loader_kwargs={"encoding": "utf-8"},
    )
    return loader.load()

In [3]:
# Read the Markdown sources once; later cells split and index this list.
documents = load_documents()

100%|██████████| 4/4 [00:00<00:00, 2630.89it/s]


In [4]:
def split_text(documents: list[Document]):
    """Split each document's Markdown at its level 1-3 headers.

    Args:
        documents: Documents whose ``page_content`` is Markdown text.

    Returns:
        A list with one entry per input document, in input order. Each entry
        is the list of pieces produced by the header splitter for that
        document, with the header lines kept in the piece text.
    """
    header_levels = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]
    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=header_levels,
        strip_headers=False,
    )
    # One sub-list per source document; callers index the result by document.
    return [splitter.split_text(doc.page_content) for doc in documents]

In [5]:
# chunks[i] holds the header-delimited pieces of documents[i].
chunks = split_text(documents)

In [6]:
# Sentence-embedding model used to index and query the chunks.
# The device is fixed to 'cuda' and embeddings are requested normalized.
embeddings_model = HuggingFaceEmbeddings(
    model_name="jhgan/ko-sroberta-nli",
    model_kwargs=dict(device="cuda"),
    encode_kwargs=dict(normalize_embeddings=True),
)



In [7]:
# Connect to the on-disk ChromaDB store.
persist_dir = CHROMA_DB_PATH
client = chromadb.PersistentClient(path=CHROMA_DB_PATH)

# NOTE(review): this 'labnote' collection is not the one the vector store
# writes to ('labnote_4'), and nothing in the visible cells reads it. It is
# kept so the name `collection` stays defined.
collection = client.get_or_create_collection(name='labnote')

# `chunks` is one list per source file. Flatten it so every file is indexed,
# however many files were loaded, instead of addressing files by fixed index.
all_chunks = [piece for file_chunks in chunks for piece in file_chunks]
if not all_chunks:
    raise ValueError(
        f"No chunks to index; check that {DATA_PATH!r} contains Markdown files."
    )

# NOTE(review): no explicit ids are passed, so re-running this cell against
# the same persistent store presumably adds the chunks again; confirm.
vectordb = Chroma.from_documents(
    documents=all_chunks,
    embedding=embeddings_model,
    client=client,
    collection_name='labnote_4',
    persist_directory=persist_dir
)

print(f"Documents Loaded: {vectordb._collection.count()}")

Documents Loaded: 189


In [8]:
# Retrieval smoke test: run one sample query through both lookup paths and
# show the first two hits of each.
query = "Spacer_connection에서 Golden Gate assembly mixture 제작"

# Path 1: direct similarity search with the vector store's default settings.
doc1 = vectordb.similarity_search(query)

# Path 2: the same query through a retriever that requests 10 results.
retriever = vectordb.as_retriever(search_kwargs={"k": 10})
doc2 = retriever.invoke(query)

for hits in (doc1, doc2):
    print("====================================")
    print(len(hits))
    # Slicing, unlike hits[0] / hits[1], is safe with fewer than two hits.
    for hit in hits[:2]:
        print(hit.page_content)

4
### \[Liquid handling\] Golden Gate assembly mixture 제작  
#### 20240604  
#### 시약  
-   DNA parts (spacer), lycopene 들어간 vector (V6L, V7L)  
-   DW  
-   T4 DNA ligase (HC) (Promega)  
-   BsaI restriction enzyme (NEB)  
-   10x T4 DNA ligase buffer (Promega)  
#### 소모품  
-   Pipet tip (10p, 200p tip)  
-   PCR tube  
-   PCR tube rack  
#### 장비  
-   Pipet (10p, 2.5p, 200p)  
-   freezer  
#### 방법  
-   대량 자동화 수행 시, Janus, Echo 525 를 위한 추가적인 프로토콜이 필요함
-   농도가 측정된 파트들을 계산하여 10 nM (100 fmol/10 $\mu l$ ) 이상이 되도록 함 (volume은 10 $\mu l$ 로 맞춤)
-   볼륨을 맞추기 위해 DW를 넣어줌
-   10x ligase buffer와 ligase, restriction enzyme을 넣음
-   많은 양을 제작할 때는 stock으로 만든 후 소분
-   실험에 사용된 부품과 양은 Assembly_025.xlsx의 240604 sheet를 참고  
![](images/paste-1.png)  
#### 결과물  
-   Golden Gate assembly를 위한 vector assembly mixture 2 종
### \[Liquid handling\] Golden Gate assembly mixture 제작  
#### 20240604  
#### 시약  
-   DNA parts (spacer), lycopene 들어간 vector (V6L, V7L)  
-   DW  
-   T4 DNA ligase (HC) (Promega)  
-   BsaI

In [9]:
# Load the Hugging Face model with 4-bit NF4 quantization.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

hf_evee = HuggingFacePipeline.from_model_id(
    # ID of the model to load.
    model_id="yanolja/EEVE-Korean-Instruct-10.8B-v1.0",
    # Task to run: text generation.
    task="text-generation",
    # GPU device index.
    # NOTE(review): this cell's captured output shows the library resetting
    # `device` to None because the quantized model is already on the GPU.
    device=0,
    # Extra pipeline arguments: up to 2048 new tokens, repetition penalty 1.2.
    pipeline_kwargs={
        "max_new_tokens": 2048,
        "repetition_penalty": 1.2,
    },
    # NOTE(review): "temperature" is passed with the model-loading kwargs, not
    # the pipeline kwargs; confirm it actually affects generation.
    model_kwargs=dict(
        quantization_config=quantization_config,
        temperature=0.2,
    ),
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Setting the `device` argument to None from 0 to avoid the error caused by attempting to move the model that was already loaded on the GPU using the Accelerate module to the same or another device.


In [10]:
def query_chroma(query_text, llm):
    """Answer ``query_text`` with ``llm``, using context retrieved from ``vectordb``.

    Args:
        query_text: The user's question. It drives both the retrieval and the
            ``{question}`` slot of the prompt.
        llm: A LangChain model or runnable that can be piped after a prompt.

    Returns:
        The model's answer as a string. The answer is also printed.
    """
    retriever = vectordb.as_retriever(
        search_type='mmr',
        search_kwargs={'k': 5, 'lambda_mult': 0.2},
    )

    # Retrieve for the caller's question. Reading a notebook-level variable
    # here would fetch context for an unrelated earlier query.
    docs = retriever.invoke(query_text)

    # Bind the system text as a partial variable rather than str.format-ing it
    # into the template, so braces inside SYSTEM_PROMPT could never be
    # mistaken for placeholders.
    prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE).partial(
        system=SYSTEM_PROMPT
    )

    format_docs = '\n\n'.join(d.page_content for d in docs)

    chain = prompt | llm | StrOutputParser()

    response = chain.invoke({'context': format_docs, 'question': query_text})

    print("response:\n", response)
    return response

def llm_list():
    """Ask the user to pick a language-model backend and return it.

    The menu is printed once, then the prompt repeats until the user enters a
    number that matches a menu entry.

    Returns:
        The selected model object.
    """
    # Factories instead of instances: only the chosen backend is constructed,
    # so an unavailable one does not block the others.
    options = [
        (lambda: Ollama(model="solar:latest"), "Solar model"),
        (lambda: Ollama(model="llama-3-Korean-Bllossom-8B:latest"), "llama-3-Korean-Bllossom-8B"),
        (lambda: ChatOpenAI(model="gpt-4o"), "gpt-4o"),
        (lambda: hf_evee, "EEVE"),
    ]
    print("Please choose an model:")
    for number, (_, description) in enumerate(options, start=1):
        print(f"{number}. {description}")

    while True:
        choice = input("Enter the number of your choice: ")
        try:
            index = int(choice) - 1
        except ValueError:
            index = -1
        # Explicit range check: 0 or a negative number would otherwise wrap
        # around and silently select an entry from the end of the list.
        if 0 <= index < len(options):
            break
        print(f"Invalid choice {choice!r}; enter a number from 1 to {len(options)}.")

    print("\n===================================================\n")
    make_llm = options[index][0]
    return make_llm()

def query_data():
    """Run one interactive question: pick a model, read a question, answer it.

    Returns:
        The answer string from ``query_chroma``, so callers can reuse it.
    """
    llm = llm_list()
    print(llm)
    print("\n===================================================\n")
    # NOTE(review): lower-casing also changes case-sensitive identifiers in
    # the question (for example part or enzyme names); confirm it is intended.
    query_text = input("Enter your question:").lower()
    return query_chroma(query_text, llm)


# Start one interactive question/answer round when this code runs as the main module.
if __name__ == "__main__":
    query_data()

Please choose an model:
1. Solar model
2. llama-3-Korean-Bllossom-8B
3. gpt-4o
4. EEVE




client=<openai.resources.chat.completions.Completions object at 0x7ff7aeee0b50> async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x7ff7a46e5d80> model_name='gpt-4o' openai_api_key=SecretStr('**********') openai_proxy=''




  warn_deprecated(


response:
 ## Multi-module to plasmid (gibson assembly)

### [Thermocycling] Thermocycler를 이용한 Gibson assembly 진행

#### 시약
- Gibson assembly를 위해 제작한 module mixture 종

#### 장비
- Thermocycler (Bio-rad)

#### 방법
- part mixture를 홈에 맞춰 Thermocycler에 넣음
- 뚜껑을 닫고 조임
- Gibson assembly 조건에 맞추어 작동

| Steps | temperature | time  | description      |
|-------|-------------|-------|------------------|
| 1     | 37℃         | 50min | initial reaction |
| 2     | 50℃         | 5h    | reaction         |
| 3     | 4℃          | \~    |                  |

- Gibson assembly 반응이 끝난 뒤 샘플을 냉동고에 보관

#### 결과물
- MVA pathway가 모두 들어간 gibson assembly product -10 μL PCR tube
