**Step 1: Literature search and text collection**

In [1]:
# Search PubMed and get article abstracts

# pip install biopython pandas openpyxl

from Bio import Entrez
import pandas as pd

# Define your email to use with NCBI Entrez
Entrez.email = "your@email.com"

def search_pubmed(keyword):
    
    # Adjust the search term to focus on abstracts
    search_term = f"{keyword}[Abstract]"
    handle = Entrez.esearch(db="pubmed", term=search_term, retmax=500)
    record = Entrez.read(handle)
    handle.close()
    # Get the list of Ids returned by the search
    id_list = record["IdList"]
    return id_list

def fetch_details(id_list):
    ids = ','.join(id_list)
    handle = Entrez.efetch(db="pubmed", id=ids, retmode="xml")
    records = Entrez.read(handle)
    handle.close()

    # Create a list to hold our article details
    articles = []

    for pubmed_article in records['PubmedArticle']:
        article = {}
        article_data = pubmed_article['MedlineCitation']['Article']
        article['Title'] = article_data.get('ArticleTitle')
        
        # Directly output the abstract
        abstract_text = article_data.get('Abstract', {}).get('AbstractText', [])
        if isinstance(abstract_text, list):
            abstract_text = ' '.join(abstract_text)
        article['Abstract'] = abstract_text

        article['Journal'] = article_data.get('Journal', {}).get('Title')

        articles.append(article)

    return articles



# Example usage
keyword = "yarrowia carotene"
id_list = search_pubmed(keyword)
articles = fetch_details(id_list)

# Convert our list of articles to a DataFrame
df = pd.DataFrame(articles)

# Saving the DataFrame to an Excel file
excel_filename = keyword+"_pubmed_search_results.xlsx"
df.to_excel(excel_filename, index=False)

print(f"Saved search results to {excel_filename}")


Saved search results to yarrowia carotene_pubmed_search_results.xlsx


**Step 2: Entity and relationship extraction with LLM**

In [2]:
import pandas as pd
import os
# import requests  # not needed now
from openai import OpenAI
from tqdm.auto import tqdm

# Initialize the OpenAI-compatible client pointing to Ollama
# (API key can be any non-empty string; Ollama doesn't validate it)
client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")

def ask_questions(abstract, questions, system_prompts):
    responses = []
    for question, system_prompt in zip(questions, system_prompts):
        prompt_text = question + " " + str(abstract)
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt_text}
        ]
        
        try:
            # Use the OpenAI-compatible client against Ollama
            completion = client.chat.completions.create(
                model="qwen3:8b",  # Verify model name with `ollama list`
                messages=messages,
                max_tokens=5000,
                timeout=10000  # seconds
            )
            answer = completion.choices[0].message.content
            responses.append(answer.strip() if answer else "")
        except Exception as e:
            tqdm.write(f"Error getting response: {e}")
            responses.append("")
    
    return responses

# ---------------------------------------------------
# Example usage reading from Excel and saving results
# ---------------------------------------------------

# Read the Excel file
file_path = excel_filename  # Replace with your file path
df = pd.read_excel(file_path)

questions = [" "]  # Maintain a placeholder for text structure
system_prompts = [
    "/no_think You are a specialized analyzer for scientific paper abstracts with a focus on identifying causal relationships between key entities in biological studies. Your primary task is to extract and identify all causal relationships present in an abstract between the following entities: Performance, Species, Genes, Methods of genetic engineering (such as knockout or expression), Enzymes, Proteins, and Bioprocess conditions (e.g., growth conditions). For each abstract provided, identify every causal relationship between these entities. Your output should strictly follow this format: (Entity A, Entity B), (Entity C, Entity D), ... with no additional text.",
]

total_rows = len(df)

# Prepare a list to collect outputs (avoid per-row df writes)
extracted_entities = [""] * total_rows

# Use tqdm for a persistent, top-line progress bar
# tqdm.write() will print messages without breaking the bar; the bar stays on top.
with tqdm(total=total_rows, desc="Processing abstracts", dynamic_ncols=True, leave=True) as pbar:
    # Do NOT clear the screen each iteration
    for i, row in df.iterrows():
        # Get response from Ollama
        response = ask_questions(row['Abstract'], [questions[0]], [system_prompts[0]])[0]
        extracted_entities[i] = response

        # Print outputs without disrupting the bar
        tqdm.write(f"Response for Row {i+1}:")
        tqdm.write(f"Answer to Question 2: {response}\n")

        # Advance the progress bar
        pbar.update(1)

# Assign the collected results in one go
df['Extracted entities'] = extracted_entities

# Save the updated DataFrame
output_file_path = 'updated(Qwen3_8b)_' + keyword + '_causal.xlsx'
df.to_excel(output_file_path, index=False)


Processing abstracts:   0%|                                                                     | 0/97 [00:00<…

Response for Row 1:
Answer to Question 2: <think>

</think>

(Helicase-CDA system, Performance), (YALI1_A01766g, Helicase-CDA system), (Helicase-CDA system, Genes), (Helicase-CDA system, Bioprocess conditions), (YALI1_B16239g, Genes), (YALI1_B16239g, Proteins), (G1637A substitution, YALI1_B16239g), (YALI1_B16239g, Enzymes), (ERG1, YALI1_B16239g), (Helicase-CDA system, Methods of genetic engineering), (β-carotene production, Performance), (CDA-14, β-carotene production), (Helicase-CDA system, β-carotene production), (Y. lipolytica, Species), (NHEJ, Methods of genetic engineering), (Helicase-CDA system, NHEJ)

Response for Row 2:
Answer to Question 2: <think>

</think>

()

Response for Row 3:
Answer to Question 2: <think>

</think>

(Yarrowia lipolytica, canthaxanthin biosynthesis pathway)  
(Yarrowia lipolytica, acetyl-CoA flux)  
(Yarrowia lipolytica, hexose catabolism)  
(Methylococcus capsulatus biomass, methanotroph-derived medium)  
(molasses, canthaxanthin titer)  
(canthaxanthin

**Step 3: Combine entities with similar meanings**

In [3]:
import pandas as pd
import re
import requests
import numpy as np
import concurrent.futures

##################################################
# 1) READ EXCEL AND EXTRACT ENTITIES
##################################################
df = pd.read_excel(output_file_path, engine="openpyxl")
df["Extracted entities"] = df["Extracted entities"].fillna("")
column_values = df["Extracted entities"].astype(str).tolist()

pattern = r"\(([^,]+), ([^)]+)\)"
entities = []
for value in column_values:
    matches = re.findall(pattern, value)
    for (e1, e2) in matches:
        entities.append(e1)
        entities.append(e2)

# Remove duplicates (preserving the order of first appearance)
entities = list(dict.fromkeys(entities))

##################################################
# 2) GET OLLAMA EMBEDDINGS (PARALLEL)
##################################################
def get_ollama_embedding(text, model):
    """
    Calls Ollama's OpenAI-style /v1/embeddings endpoint.
    Returns a Python list of floats or None on error.
    """
    try:
        r = requests.post(
            "http://localhost:11434/v1/embeddings",
            json={"model": model, "input": text},
            timeout=30
        )
        r.raise_for_status()
        data = r.json()
        # data["data"][0]["embedding"] => the actual embedding vector
        return data["data"][0]["embedding"]
    except Exception as e:
        print(f"Error embedding '{text}': {e}")
        return None

model_name = "nomic-embed-text:latest"  # Replace with your actual Ollama model name

# --- PARALLELIZE EMBEDDING REQUESTS ---
all_embeddings = []
with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
    # Submit a future for each entity
    future_to_entity = {executor.submit(get_ollama_embedding, ent, model_name): ent for ent in entities}
    
    # Collect results as they complete
    for future in concurrent.futures.as_completed(future_to_entity):
        ent = future_to_entity[future]
        try:
            emb = future.result()
            all_embeddings.append((ent, emb))
        except Exception as e:
            print(f"Error for entity '{ent}': {e}")
            all_embeddings.append((ent, None))

# Re-sort embeddings back to original entity order
emb_dict = dict(all_embeddings)  # { "EntityString": embedding or None }
vectors = []
for ent in entities:
    emb_vec = emb_dict[ent]
    if emb_vec is not None:
        vectors.append(np.array(emb_vec, dtype=np.float32))
    else:
        vectors.append(None)

##################################################
# 3) COSINE SIMILARITY (VECTORIZED IN NUMPY)
##################################################
# We need a consistent dimensionality, so fill None embeddings with zeros
valid_vectors = [v for v in vectors if v is not None]
if not valid_vectors:
    print("No valid embeddings found, cannot proceed.")
    exit()

dim = len(valid_vectors[0])
for i, v in enumerate(vectors):
    if v is None:
        vectors[i] = np.zeros(dim, dtype=np.float32)

# Create a single 2D array: shape (N, D)
matrix = np.stack(vectors)  # shape (N, D)

# Dot product matrix (N x N)
dot_matrix = matrix @ matrix.T
norms = np.linalg.norm(matrix, axis=1, keepdims=True)  # shape (N,1)
denominator = norms @ norms.T                           # shape (N,N)
similarity_matrix = dot_matrix / denominator

threshold = 0.8
N = len(entities)
similar_phrases = {}

# --------------------------------------------------------
# We use np.triu_indices(N, k=1) => all i<j pairs in [0..N-1].
# This covers every unique pair exactly once, no duplication.
# --------------------------------------------------------
upper_indices = np.triu_indices(N, k=1)  # i<j
sim_vals = similarity_matrix[upper_indices]  # 1D array: sim for each pair (i<j)
above_thresh = np.where(sim_vals > threshold)[0]

for idx in above_thresh:
    i = upper_indices[0][idx]  # row index
    j = upper_indices[1][idx]  # col index
    # If sim > threshold, we say entity j is similar to entity i
    similar_phrases[entities[j]] = entities[i]

##################################################
# 4) REPLACE SIMILAR PHRASES IN THE DATAFRAME
##################################################
total_rows = len(df)
for row_idx in range(total_rows):
    if row_idx % 100 == 0 or row_idx == total_rows - 1:
        print(f"Progress: {100.0 * row_idx / total_rows:.1f}%")

    cell_value = str(df.at[row_idx, "Extracted entities"])
    
    # If "Yarrowia" appears in the cell, skip it
    if "Yarrowia" in cell_value:
        continue

    for similar, original in similar_phrases.items():
        # Also skip if "Yarrowia" is in the phrase itself
        if "Yarrowia" in similar:
            continue
        if similar in cell_value:
            cell_value = cell_value.replace(similar, original)

    df.at[row_idx, "Extracted entities"] = cell_value
modified_file_path = 'modified_' + output_file_path
df.to_excel(modified_file_path, index=False, engine="openpyxl")
print("Done. Saved modified file.")


Progress: 0.0%
Progress: 99.0%
Done. Saved modified file.


**Step 4.1: Plot knowledge graph**

In [4]:
from pyvis.network import Network
import pandas as pd
import re
import networkx as nx

# Load the Excel file
filepath = modified_file_path
df = pd.read_excel(filepath, engine='openpyxl')

# Initialize NetworkX Graph
G = nx.Graph()

# Nodes to exclude
words_to_exclude = []

# Regular expression to match the pattern (entity A, entity B)
pattern = r'\(([^,]+), ([^\)]+)\)'

# Iterate over the DataFrame rows to extract entity pairs and their sources
for _, row in df.iterrows():
    value = row['Extracted entities']
    source = row['Title']  # Extract source for each pair

    matches = re.findall(pattern, value)
    for entity_a, entity_b in matches:
        # Check if any word to exclude is part of the entity names
        if not any(word in entity_a for word in words_to_exclude) and not any(word in entity_b for word in words_to_exclude):
            G.add_node(entity_a, label=entity_a)
            G.add_node(entity_b, label=entity_b)
            G.add_edge(entity_a, entity_b, title=source)

def search_network(graph, keywords, depth=1):
    # Ensure all keywords are lowercase for case-insensitive search
    keyword_list = [kw.lower() for kw in keywords]

    # Helper function to check if a node label contains all keywords
    def contains_all_keywords(label):
        return all(kw in label.lower() for kw in keyword_list)

    # Collect nodes that contain all keywords in their label
    nodes_of_interest = set()
    for node, attr in graph.nodes(data=True):
        if 'label' in attr and contains_all_keywords(attr['label']):
            nodes_of_interest.add(node)

    # Expand search to include neighbors up to the specified depth
    for _ in range(depth):
        neighbors = set()
        for node in nodes_of_interest:
            neighbors.update(nx.neighbors(graph, node))
        nodes_of_interest.update(neighbors)
    
    # Return a subgraph containing only relevant nodes and edges
    return graph.subgraph(nodes_of_interest).copy()

# Perform search with a list of keywords
word_combinations = ["carotene"]  # Replace with your keywords
filtered_graph = search_network(G, word_combinations)

# Extract node names from the filtered graph
node_names = list(filtered_graph.nodes())

# Prepare a simple text summary of node names
node_names_text = ", ".join(node_names)

# Now, `node_names_text` contains a clean, comma-separated list of node names, ready for summarization
print(node_names_text)

# Initialize Pyvis network with the filtered graph
net = Network(height="2160px", width="100%", bgcolor="#222222", font_color="white")
net.from_nx(filtered_graph)

# Continue with setting options and saving the network as before
net.set_options("""
{
  "physics": {
    "barnesHut": {
      "gravitationalConstant": -80000,
      "centralGravity": 0.5,
      "springLength": 75,
      "springConstant": 0.05,
      "damping": 0.09,
      "avoidOverlap": 0.5
    },
    "maxVelocity": 100,
    "minVelocity": 0.1,
    "solver": "barnesHut",
    "timestep": 0.3,
    "stabilization": {
        "enabled": true,
        "iterations": 500,
        "updateInterval": 10,
        "onlyDynamicEdges": false,
        "fit": true
    }
  },
  "nodes": {
    "font": {
      "size": 30,
      "color": "white"
    }
  }
}
""")

# Save and show the network
net.write_html('filtered_entity_' + "_".join(word_combinations) + '_network.html')


Iterative overexpression, crtE, crtYB, β-carotene, Lipid droplets, crtE, DO-stat Fed-batch fermentation, β-Carot, (Pentose phosphate pathway, β-Carotene conversion, acetic acid consumption concentration, β-carot, Y. lipolytica, 11 genes in β-Carot, (Pentose phosphate pathway, β-Carotene synthesis pathway, β-Carotene yield, increasing precursor β-carotene supply, Genotoxicity, Yield, Hxk, FPP, Clinical evaluations, rate-limiting enzyme tHMGR, Multiple gene copies, PspCrtW, pH, erg13, β-Carot, (Pentose phosphate pathway, β-Carotene synthesis, NADP+ -dependent glyceraldehyde-3-phosphate dehydrogenase, (Pentose phosphate pathway, genes in β-Carotene production pathway, β-Carot, (Pentose phosphate pathway, β-Carotene yield, Genes, Multifunctional carotene synthase, Zeaxanthin, Clinical pathology, crtW, (Pentose phosphate pathway, genes in β-Carotene production pathway, content, HBFD, <i>Y. lipolytica</i> PO1h, β-carotene synthesis related genes, Phytoene, Engineered strain, Escherichia coli

**Step 4.2 produce summarization report**

In [5]:
from IPython.display import Markdown
import requests  # Required for API calls

def trim_text(text, max_length):
    if len(text) > max_length:
        return text[:max_length].rsplit(' ', 1)[0] + "..."  # Trim to max_length, avoid cutting words in half
    else:
        return text

# Apply the trimming function to node_names_text
cut_off_chunk_size = 5000
trimmed_node_names_text = trim_text(node_names_text, cut_off_chunk_size)
keyword = ", ".join(word_combinations)

# Construct the prompt with the potentially trimmed node_names_text
prompt = "These are the terms related to " + filepath + keyword + ", categorize them and write a summary report.   " + trimmed_node_names_text

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]

try:
    # Use Ollama's OpenAI-compatible API
    response = requests.post(
        'http://localhost:11434/v1/chat/completions',
        json={
            'model': 'qwen3:8b',  # Verify with `ollama list`
            'messages': messages,
            'max_tokens': 5000,
            'timeout': 10000  # 10 seconds
        }
    )
    response.raise_for_status()
    response1 = response.json()['choices'][0]['message']['content']
    
except Exception as e:
    print(f"API Error: {e}")
    response1 = "Failed to generate response"

display(Markdown(response1))

<think>
Okay, let's see. The user has a list of terms related to modified_updated(Qwen3_8b)_yarrowia carotene_causal.xlsx, and they want them categorized and a summary report written. First, I need to understand what each term refers to. 

Looking at the terms, many are related to β-Carotene, like β-Carotene synthesis pathway, genes involved in its production, fermentation processes, and specific enzymes like crtE, crtYB, crtZ. Then there's mention of Yarrowia lipolytica, which is a yeast used for producing carotenoids. There are also terms about metabolic pathways like the Pentose phosphate pathway, mevalonate pathway, and maybe the phosphoketolase-phosphotransacetylase (PK-PTA) pathway. 

Some terms relate to genetic engineering methods such as CRISPR-iCas9, Codon-adapted CarRA, and Multigene cassette. There are also terms about fermentation conditions and optimization, like DO-stat Fed-batch fermentation, optimized medium, and specific nutrient conditions. 

Other terms mention safety and toxicity evaluations, like Genotoxicity, Ames test, Micronucleus test, and Safety profile. There's also some mention of bioprocess metrics, such as Yield, β-Carotene content, titer, and mg/g DCW. 

I need to categorize these into sections. Let me start by grouping similar terms. For example:

1. **Genetic Engineering & Synthetic Biology**: Terms like CRISPR-iCas9, multigene cassette, codon-optimized genes, etc.
2. **Metabolic Pathways**: Pentose phosphate pathway, mevalonate pathway, carotenogenesis genes, etc.
3. **Enzymes & Key Genes**: crtE, crtYB, crtZ, CarB, CrtW, HMG-CoA reductase, etc.
4. **Fermentation & Bioprocess Optimization**: Fed-batch, DO-stat, optimized medium, nutrient conditions, etc.
5. **Yeast Strain Engineering**: Yarrowia lipolytica, engineered strains, lipid overproducer, etc.
6. **Carotenoid Types & Synthesis**: β-Carotene, Astaxanthin, Zeaxanthin, etc.
7. **Safety & Toxicity Studies**: Genotoxicity, Ames test, micronucleus test, safety profile.
8. **Performance & Yield Metrics**: Yield, titer, content, mg/g DCW, etc.
9. **Experimental Methods**: 13C labeling, flux adjustment, quantitative analysis, etc.

Wait, some terms like "β-Carotene ketolase" might fall under enzymes, and "Lipid droplets" could be under sub-organelles or lipid metabolism. Also, terms like "Clinical evaluations" and "CD-1 mice" relate to safety and toxicity. 

I should make sure not to miss any terms and check if some belong to multiple categories. For example, "Pentose phosphate pathway" is under metabolic pathways but is also mentioned with genes, so maybe include it in both if needed. However, the user wants them categorized, so perhaps group by main theme. 

Now, structuring the summary report would involve an overview, then each section detailing the categories, and a conclusion. I need to ensure that all terms are covered and the summary is comprehensive but concise. Also, the user might be looking for insights into how these elements interrelate in carotenoid production, so highlighting key areas like genetic engineering, pathway optimization, and bioprocess conditions would be important. 

I should also check for any repeated terms, like "β-Carot" and "β-Carotene" which might be typos or variations. Need to treat them as the same term. Terms like "Genetic engineering methods" and "Genetic engineering" can be grouped together. 

Finally, the summary should highlight the main strategies used in β-carotene production using Y. lipolytica, the key gene clusters, and the methods for optimizing yield and safety. Make sure to mention the integration of different techniques like metabolic pathway engineering, genetic modifications, and bioprocess optimization.
</think>

### Summary Report: Categorization of Terms Related to β-Carotene Production in *Y. lipolytica*  

#### **1. Genetic Engineering & Synthetic Biology**  
- **Key Genes**:  
  - *crtE*, *crtYB*, *crtZ*, *crtI*, *crtW*, *crtB*, *crtI* (carotenogenesis genes).  
  - *HMG-CoA reductase* (rate-limiting enzyme tHMGR), *GGS1/crtE*, *CarB*, *CarRA*, *CarRP*, *CarB* (codon-adapted).  
  - *Genes in β-Carotene Production Pathway* (11 genes, Pathway 11 genes).  
- **Genetic Tools**:  
  - CRISPR-iCas9 for gene editing, Multigene cassette, Helicase-CDA system, Codon-adapted genes.  
  - *MT2 promoter*, *Strong promoters* for overexpression.  
  - *mIAA7 degron* for regulated protein expression.  
- **Strain Optimization**:  
  - **Engineered *Yarrowia lipolytica*** (e.g., strain PO1h, lipid overproducer strains).  
  - **Copy number of β-carotene synthesis-related genes** (Multiple gene copies).  
  - **Multifunctional carotene synthase**, **PspCrtW**.  

---

#### **2. Metabolic Pathways & Enzymatic Mechanisms**  
- **Key Pathways**:  
  - **Pentose Phosphate Pathway (PPP)** (repeatedly mentioned, critical for NADPH production).  
  - **Mevalonate pathway** (key gene: *HMG-CoA*).  
  - **Phosphoketolase-phosphotransacetylase (PK-PTA) pathway**.  
- **Intermediate Metabolites**:  
  - **FPP** (farnesyl pyrophosphate), **Acetyl-CoA**, **Mevalonate**, **GPP** (geranyl pyrophosphate).  
  - **Intermediate accumulation** (e.g., lycopene inhibition).  
- **Enzymes**:  
  - **β-Carotene ketolase**, **Hydroxylase (crtZ)**, **NADP+-dependent glyceraldehyde-3-phosphate dehydrogenase**, **Redox rebalancing**.  
  - **Lycopene inhibition** as a regulatory mechanism.  

---

#### **3. Fermentation & Bioprocess Optimization**  
- **Fermentation Strategies**:  
  - **DO-stat Fed-batch fermentation**, **Optimized medium** (e.g., YPD/YNB flask cultures, Canola oil-containing yeast-peptone).  
  - **Specific nutrient conditions** for lipid and carotenoid biosynthesis.  
  - **Peroxisomes** as sub-organelles involved in carotenoid synthesis.  
- **Bioprocess Metrics**:  
  - **β-Carotene yield**, **titer**, **content (mg/g DCW)**, **Lipid droplets**.  
  - **Flux adjustment**, **ATP expenditure**, **Growth profile**, **Performance** (e.g., yield optimization).  
- **Strain Engineering**:  
  - **Morphological engineering** (yeast form, hyphae).  
  - **Lipid biosynthesis pathway engineering**, **Central carbon pathway engineering**.  

---

#### **4. Carotenoid Types & Synthesis Outcomes**  
- **Target Compounds**:  
  - **β-Carotene**, **Astaxanthin**, **Zeaxanthin**, **Phytoene**, **Lycopene**.  
  - **Synthetic β-carotene** and **Astaxanthin production synthesis**.  
- **Conversion Pathways**:  
  - **β-Carotene biosynthetic pathway**, **β-Carotene synthesis pathway**, **Carotenoid-producing strains**.  
  - **β-Carotene conversion**, **β-Carotene hydroxylase (crtZ)**, **β-Carotene ketolase** (CrtW).  

---

#### **5. Safety, Toxicity, and Regulatory Aspects**  
- **Genotoxicity Studies**:  
  - **Ames test**, **Micronucleus test**, **Genotoxicity** (e.g., *Y. lipolytica* safety profile).  
  - **Clinical evaluations**, **Clinical pathology**, **Histopathological evaluations**, **Safety profile**.  
- **Toxicity Data**:  
  - **NOAEL** (no observed adverse effect level), **Adverse effects**.  
  - **Chinese Hamster Ovary WBL cells** for toxicity testing.  

---

#### **6. Experimental Methods & Analytical Tools**  
- **Quantification Techniques**:  
  - **13C metabolite labeling**, **Quantification of β-carotene content**, **Mass spectrometry**.  
- **Analytical Metrics**:  
  - **Yield**, **Titer**, **Content**, **DCW (dry cell weight)**, **Bioprocess conditions**.  
- **Modeling & Optimization**:  
  - **Machine learning**, **Flux adjustment**, **Redox rebalancing**, **GGS1 Expression differences**.  

---

### **Key Insights**  
1. **Genome Engineering**: Multigene cassette and codon-optimized genes (e.g., *crtE*, *crtYB*) are critical for enhancing β-carotene synthesis.  
2. **Pathway Optimization**: The PPP and mevalonate pathway are central for NADPH and isoprenoid precursors, while targeted flux adjustments boost yield.  
3. **Bioprocess Innovation**: Fed-batch fermentation and optimized media (e.g., lipid-rich substrates) maximize carotenoid production.  
4. **Safety Validation**: Rigorous genotoxicity and clinical evaluations ensure the safety of engineered strains for industrial applications.  
5. **Integration of Tools**: CRISPR-iCas9, Helicase-CDA system, and metabolic engineering synergize to achieve high β-carotene titer (mg/g DCW) in *Y. lipolytica*.  

This report highlights the multidisciplinary approach to β-carotene production, blending genetic, metabolic, and bioprocess engineering for scalable, safe, and efficient bioproduction.