This notebook displays the Gene DB for Prochlorococcus.

# Includes and setup

In [1]:
import os
import json
from dotenv import load_dotenv
from openai import OpenAI
import anthropic
import google.generativeai
import gradio as gr
from IPython.display import Markdown, display, update_display

In [2]:
import pandas as pd
import numpy as np

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
from prompts import * 


In [5]:
# Load environment variables from a file called .env, then report for each
# provider whether its key is set (showing only the key prefix, for debugging).

load_dotenv()
openai_api_key = os.environ.get('OPENAI_API_KEY')
anthropic_api_key = os.environ.get('ANTHROPIC_API_KEY')
google_api_key = os.environ.get('GOOGLE_API_KEY')

print(
    f"OpenAI API Key exists and begins {openai_api_key[:8]}"
    if openai_api_key
    else "OpenAI API Key not set"
)

print(
    f"Anthropic API Key exists and begins {anthropic_api_key[:7]}"
    if anthropic_api_key
    else "Anthropic API Key not set"
)

print(
    f"Google API Key exists and begins {google_api_key[:8]}"
    if google_api_key
    else "Google API Key not set"
)

OpenAI API Key not set
Anthropic API Key not set
Google API Key not set


In [6]:
# No api_key argument is passed, so the client defaults to
# os.environ.get("ANTHROPIC_API_KEY").
anthropic_client = anthropic.Anthropic()

# Model name and request settings (not referenced by the cells visible below).
MODEL = "claude-3-5-sonnet-20241022"
max_tokens, temperature = 4096, 0

# Load list of MED4 genes

In [7]:
# Directory (relative path) that holds the MED4 genome files read below.
med4_genome_dpath = '../genomes/MED4'
#os.listdir(med4_genome_dpath)

In [9]:
# Named MED4 genes; the gene_name column ("<gene> , <locus_tag>") is the key
# used later to look up batch replies.
gff_named_df = pd.read_csv(
    os.path.join(med4_genome_dpath, 'MED4_named_genes_NCBI.csv')
)
gff_named_df

Unnamed: 0,gene_name,gene,locus_tag,product,protein_id
0,"dnaN , TX50_RS00020",dnaN,TX50_RS00020,DNA polymerase III subunit beta,WP_011131639.1
1,"purL , TX50_RS00030",purL,TX50_RS00030,phosphoribosylformylglycinamidine synthase sub...,WP_011131641.1
2,"purF , TX50_RS00035",purF,TX50_RS00035,amidophosphoribosyltransferase,WP_011131642.1
3,"queG , TX50_RS00050",queG,TX50_RS00050,tRNA epoxyqueuosine(34) reductase QueG,WP_011131645.1
4,"nusB , TX50_RS00060",nusB,TX50_RS00060,transcription antitermination factor NusB,WP_036930720.1
...,...,...,...,...,...
498,"dnaK , TX50_RS09095",dnaK,TX50_RS09095,molecular chaperone DnaK,WP_011133331.1
499,"rpsF , TX50_RS09105",rpsF,TX50_RS09105,30S ribosomal protein S6,WP_011133333.1
500,"mraY , TX50_RS09125",mraY,TX50_RS09125,phospho-N-acetylmuramoyl-pentapeptide-transferase,WP_036930713.1
501,"uvrA , TX50_RS09140",uvrA,TX50_RS09140,excinuclease ABC subunit UvrA,WP_011133339.1


# Load Anthropic batch results

In [17]:
# Location of the JSON Lines file holding the Anthropic batch results.
batch_result_fpath = os.path.join(
    'batch results',
    'anthropic_batch_single_prompt_26012025_results.jsonl',
)

In [16]:
import jsonlines

In [18]:
# Read every record of the results file into a list (one object per line).
with jsonlines.open(batch_result_fpath) as reader:
    batch_results_jsonl = list(reader)

In [39]:
def _get_gene_name_from_custom_id(id):
    '''
         Turn a batch custom id into the corresponding gene_name string.

         Every 'batch-id-' marker is removed and every '--' separator
         becomes ' , '.
         example : 'batch-id-dnaN--TX50_RS00020' --> 'dnaN , TX50_RS00020'
    '''
    without_marker = id.replace('batch-id-', '')
    return ' , '.join(without_marker.split('--'))
    

In [24]:
# Result type reported for each batch record, in file order.
message_status = [record['result']['type'] for record in batch_results_jsonl]
#message_status


In [25]:
# Check the status of the batch jobs: count how many records have each result type.
from collections import Counter
Counter(message_status)

Counter({'succeeded': 503})

In [40]:
# Sanity check: the gene name derived from the first custom id should match a row of the gene table.
gff_named_df.loc[
    gff_named_df['gene_name'].isin(
        [_get_gene_name_from_custom_id(batch_results_jsonl[0]['custom_id'])]
    )
]

Unnamed: 0,gene_name,gene,locus_tag,product,protein_id
0,"dnaN , TX50_RS00020",dnaN,TX50_RS00020,DNA polymerase III subunit beta,WP_011131639.1


In [41]:
# Map each gene_name to the text of the first content block of its reply.
# Only results of type 'succeeded' are indexed for a 'message'; any other
# result type is skipped so that one failed request cannot raise KeyError
# and abort the whole dictionary build.
batch_result_dict = {
    _get_gene_name_from_custom_id(m['custom_id']): m['result']['message']['content'][0]['text']
    for m in batch_results_jsonl
    if m['result']['type'] == 'succeeded'
}

In [26]:
# Inspect the top-level keys of one batch record.
batch_results_jsonl[0].keys()

dict_keys(['custom_id', 'result'])

In [35]:
# Inspect the custom id format that _get_gene_name_from_custom_id has to decode.
batch_results_jsonl[0]['custom_id']

'batch-id-dnaN--TX50_RS00020'

In [43]:
# print(batch_results_jsonl[0]['result']['message']['content'][0]['text'])

# Parse the replies into multiple elements

## example

In [62]:
# Example key, in the "<gene> , <locus_tag>" form of the gene_name column.
gene = 'dnaN , TX50_RS00020'

In [66]:
# Prepend the opening tag so the literature review forms a complete tag pair.
# NOTE(review): presumably the stored reply omits it because the tag was sent as a prefill; confirm against the batch request.
text = '<literature_review>\n' + batch_result_dict[gene]

In [83]:
def parse_claude_message(message):
    """Split a tagged reply into a ``{tag: content}`` dictionary.

    Every ``<tag>...</tag>`` pair found in ``message`` becomes one entry whose
    value is the text between the tags with surrounding whitespace stripped.
    Matching is non-greedy and does not descend into a matched pair, so nested
    tags stay inside their parent's content; call the function again on that
    content to split them. If a tag name occurs more than once, the last
    occurrence wins.

    Args:
        message: Text containing XML-like tags.

    Returns:
        dict mapping tag name to stripped inner text; empty when no complete
        tag pair is present.
    """
    # Imported locally: none of the visible import cells brings in `re`, so
    # the function must not rely on it leaking in from a star import.
    import re

    return {
        tag: content.strip()
        for tag, content in re.findall(r'<(\w+)>(.*?)</\1>', message, re.DOTALL)
    }

In [90]:
# Split the example reply into its top-level tags and list the tag names found.
parsed_text = parse_claude_message(text)
parsed_text.keys()

dict_keys(['literature_review', 'database_entry'])

In [94]:
# Blank out every parsed tag body; whatever is left is text the parser
# did not capture (ideally just the bare tags).
left_text = text
for tag_body in parsed_text.values():
    left_text = left_text.replace(tag_body, '')
left_text
    

'<literature_review>\n\n\n</literature_review>\n\n<database_entry>\n  \n</database_entry>'

In [96]:
# Second pass: split the database entry into its own sub-tags.
database_entry_parse = parse_claude_message(parsed_text['database_entry'])

In [97]:
# Same leftover check for the database entry: remove each parsed sub-tag
# body so that only uncaptured text remains.
left_text = parsed_text['database_entry']
for sub_tag_body in database_entry_parse.values():
    left_text = left_text.replace(sub_tag_body, '')
left_text
    

'<primary_function>\n    \n  </primary_function>\n\n  <physiological_contribution>\n    \n  </physiological_contribution>\n\n  <stress_responses>\n    \n  </stress_responses>\n\n  <uptake_exudation>\n    \n  </uptake_exudation>\n\n  <phylogenetic_persistence>\n    \n  </phylogenetic_persistence>\n\n  <coculture_role>\n    \n  </coculture_role>\n\n  <references>\n    \n  </references>'

In [63]:
# Show the raw stored reply for the example gene.
print(batch_result_dict[gene])



1. Key Search Terms and Databases:
- Primary databases: PubMed, Web of Science, NCBI Gene, UniProt
- Search terms: "dnaN Prochlorococcus", "DNA polymerase III beta subunit cyanobacteria", "sliding clamp Prochlorococcus", "WP_011131639.1", "DNA replication Prochlorococcus"

2. Literature Overview:
- ~15 directly relevant papers
- Date range: 1998-2023
- Main focus: DNA replication mechanisms in cyanobacteria, sliding clamp function in prokaryotes

3. Key Sources:

a) [Robinson et al., 2013] "Structure and mechanism of the β-sliding clamp from E. coli"
- Detailed structural analysis of the β-sliding clamp
- Established conserved mechanism across prokaryotes

b) [Johnson & O'Donnell, 2005] "Cellular DNA Replicases"
- Comprehensive review of DNA polymerase III function
- Describes essential role of β-subunit in processivity

c) [Partensky et al., 1999] "Prochlorococcus: Advantages and Limits of Minimalism"
- Overview of Prochlorococcus genome streamlining
- Discusses conservation of essen

# Build Gradio app

In [58]:
def get_gene_db_results(gene_name):
    """Look up the stored reply for a gene and return it for both output boxes.

    Args:
        gene_name: Key of ``batch_result_dict``, e.g. ``'dnaN , TX50_RS00020'``.
            If the exact text is not a key, leading/trailing whitespace is
            stripped and the lookup is retried.

    Returns:
        A 2-tuple holding the same reply text twice, one item per output
        component wired to this callback.

    Raises:
        gr.Error: if the gene name is not in ``batch_result_dict``. Gradio
            displays the message to the user, unlike a bare ``KeyError``.
    """
    key = gene_name
    if key not in batch_result_dict:
        # The value is typed by hand into a textbox; tolerate stray whitespace.
        key = (gene_name or '').strip()
    if key not in batch_result_dict:
        raise gr.Error(f"Gene name not found in the Gene DB: {gene_name!r}")
    reply = batch_result_dict[key]
    return reply, reply

In [64]:
import gradio as gr

# Layout is defined by creation order inside the context managers:
# first row = gene-name input and submit button, second row = two output boxes.
with gr.Blocks() as demo:
    with gr.Row():
        genename = gr.Textbox(label="Gene Name")
        submit_btn = gr.Button("Submit")
    with gr.Row(equal_height=True):        
        output1 = gr.Textbox(label="Output Box")
        output2 = gr.Textbox(label="Output Box")
    # On click, pass the textbox value to get_gene_db_results and write its
    # two return values to the two output boxes.
    submit_btn.click(fn=get_gene_db_results, inputs=genename, outputs=[output1,output2], api_name="Gene DB")

# Start the app (the cell output below reports the local URL it is served on).
demo.launch()

# demo = gr.Interface(
#     fn=get_gene_db_results,
#     inputs=["text",],
#     outputs=["text"],
#     examples=list(batch_result_dict.keys()),
#     examples_per_page=30,
#     title="*Prochlorococcus* Gene Database",
#     flagging_mode='never',
#     #description="Here's a sample toy calculator.",
# )

# demo.launch()


* Running on local URL:  http://127.0.0.1:7873

To create a public link, set `share=True` in `launch()`.




In [None]:
import gradio as gr


def calculator(num1, operation, num2):
    """Apply ``operation`` to two numbers for the demo UI below.

    Args:
        num1: Left operand.
        operation: One of "add", "subtract", "multiply", "divide".
        num2: Right operand.

    Returns:
        The result of the selected arithmetic operation.

    Raises:
        gr.Error: on division by zero, or when no known operation is selected
            (the radio group has no default value).
    """
    if operation == "add":
        return num1 + num2
    if operation == "subtract":
        return num1 - num2
    if operation == "multiply":
        return num1 * num2
    if operation == "divide":
        if num2 == 0:
            raise gr.Error("Cannot divide by zero!")
        return num1 / num2
    raise gr.Error("Please select an operation.")


# Scratch calculator UI, unrelated to the Gene DB app.
# NOTE: this rebinds `demo`, replacing the Gene DB app object created in the earlier cell.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            num_1 = gr.Number(value=4)
            operation = gr.Radio(["add", "subtract", "multiply", "divide"])
            num_2 = gr.Number(value=0)
            submit_btn = gr.Button(value="Calculate")
        with gr.Column():
            result = gr.Number()

    # Run the calculation on click; api_name=False is passed for this event.
    submit_btn.click(
        calculator, inputs=[num_1, operation, num_2], outputs=[result], api_name=False
    )
    # Example rows, each in the order [num_1, operation, num_2].
    examples = gr.Examples(
        examples=[
            [5, "add", 3],
            [4, "divide", 2],
            [-4, "multiply", 2.5],
            [0, "subtract", 1.2],
        ],
        inputs=[num_1, operation, num_2],
    )

if __name__ == "__main__":
    demo.launch(show_api=False)