# Housekeeping and setup

In [1]:
import os
import json
from dotenv import load_dotenv
from openai import OpenAI
import anthropic
import google.generativeai
import gradio as gr
from IPython.display import Markdown, display, update_display

In [2]:
import pandas as pd
import numpy as np

In [3]:
%load_ext autoreload
%autoreload 2

In [58]:
from prompts import * 


In [5]:
# Read the API credentials from the local .env file, then report for each
# provider whether a key was found. Only a short prefix of each key is printed,
# which is enough to confirm that the expected key was picked up.

load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')
anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')
google_api_key = os.getenv('GOOGLE_API_KEY')
perplexity_api_key = os.getenv('PERPLEXITY_API_KEY')

# (provider label, key value, number of leading characters to show)
for provider, key_value, prefix_len in (
    ("OpenAI", openai_api_key, 8),
    ("Anthropic", anthropic_api_key, 7),
    ("Google", google_api_key, 8),
    ("Perplexity", perplexity_api_key, 6),
):
    if not key_value:
        print(f"{provider} API Key not set")
    else:
        print(f"{provider} API Key exists and begins {key_value[:prefix_len]}")

OpenAI API Key exists and begins sk-proj-
Anthropic API Key exists and begins sk-ant-
Google API Key exists and begins AIzaSyCF
Perplexity API Key exists and begins pplx-H


| Model               | Context Length |
|---------------------|----------------|
| sonar-reasoning-pro | 127k           |
| sonar-reasoning     | 127k           |
| sonar-pro           | 200k           |
| sonar               | 127k           |


In [69]:
# Perplexity is accessed through the OpenAI SDK by pointing the client at the
# Perplexity base URL and authenticating with the Perplexity key.
from openai import OpenAI
perplexity_client = OpenAI(api_key=perplexity_api_key, base_url="https://api.perplexity.ai")
# Request settings shared by the single-gene call and the batch requests below.
MODEL = 'sonar-pro'
max_tokens=4096
temperature=0

## Perplexity API example

In [7]:

# Minimal smoke test of the Perplexity client: a system message plus one user
# question, sent as a single non-streaming chat completion.
messages = [
    {
        "role": "system",
        "content": (
            "You are an artificial intelligence assistant and you need to "
            "engage in a helpful, detailed, polite conversation with a user."
        ),
    },
    {   
        "role": "user",
        "content": (
            "How many stars are in the universe?"
        ),
    },
]


# Chat completion without streaming
response = perplexity_client.chat.completions.create(
    model=MODEL,
    messages=messages,
)
#print(response)

# # Chat completion with streaming (kept for reference; the client in this
# # notebook is `perplexity_client`)
# response_stream = perplexity_client.chat.completions.create(
#     model="sonar-pro",
#     messages=messages,
#     stream=True,
# )
# for response in response_stream:
#     print(response)


In [8]:
response.to_dict()

{'id': 'baef9c5b-f737-4d39-b3e5-ddaf97f96469',
 'choices': [{'finish_reason': 'stop',
   'index': 0,
   'message': {'content': "Hello I'm glad you're interested in learning about the number of stars in the universe. Estimating this number is quite complex, as it involves understanding the vastness of the cosmos and the limitations of our current observational capabilities.\n\n### Current Estimates\n\nAstronomers have made several estimates based on observations and simulations. Here are a few key points:\n\n1. **Observable Universe**: The observable universe is the part of the universe we can see, limited by how far light has had time to travel since the Big Bang. Estimates for the number of stars in the observable universe vary widely:\n   - Some sources suggest there are approximately **200 billion trillion** (200 sextillion) stars[2][4].\n   - Others estimate around **70 billion trillion** (7 x 10^22) stars[3].\n\n2. **Galaxies and Stars**: The number of galaxies in the observable u

In [10]:
display(Markdown(response.choices[0].message.content))

Hello I'm glad you're interested in learning about the number of stars in the universe. Estimating this number is quite complex, as it involves understanding the vastness of the cosmos and the limitations of our current observational capabilities.

### Current Estimates

Astronomers have made several estimates based on observations and simulations. Here are a few key points:

1. **Observable Universe**: The observable universe is the part of the universe we can see, limited by how far light has had time to travel since the Big Bang. Estimates for the number of stars in the observable universe vary widely:
   - Some sources suggest there are approximately **200 billion trillion** (200 sextillion) stars[2][4].
   - Others estimate around **70 billion trillion** (7 x 10^22) stars[3].

2. **Galaxies and Stars**: The number of galaxies in the observable universe is estimated to be around **2 trillion**[4]. If we assume each galaxy has about 100 billion stars (similar to the Milky Way), we can estimate the total number of stars. However, this method might overestimate, as not all galaxies are as large as the Milky Way[5].

3. **Milky Way**: Our own galaxy, the Milky Way, is thought to contain between **100 billion and 400 billion stars**[1][5].

### Challenges in Estimation

- **Observational Limitations**: We can only observe objects within a certain distance from Earth, as light from farther objects hasn't had time to reach us yet.
- **Variability Among Galaxies**: Galaxies vary greatly in size and star content, making it difficult to use a single galaxy as a representative model.
- **Stellar Lifecycles**: Stars are born and die over cosmic time, affecting the total number of stars at any given moment.

In summary, while we don't have an exact count, estimates suggest there are tens to hundreds of sextillion stars in the observable universe. However, these numbers are subject to change as our understanding and observational capabilities improve.

In [81]:
def get_messages(x, template_prompt, placeholders_map):
    """Build the chat ``messages`` payload for one gene.

    Each placeholder in ``template_prompt`` is replaced by the corresponding
    field of ``x`` and the filled-in prompt is wrapped in a single user message.

    Parameters
    ----------
    x : mapping-like
        One record (e.g. a row of the GFF DataFrame), indexed by the values of
        ``placeholders_map``.
    template_prompt : str
        Prompt text containing the placeholders to substitute.
    placeholders_map : dict
        Maps placeholder text (e.g. ``'{{GENE_NAME}}'``) to the key/column of
        ``x`` whose value replaces it.

    Returns
    -------
    list of dict
        A one-element list holding the user message, with the prompt as a
        single ``text`` content part.

    Notes
    -----
    Field values are converted with ``str()`` before substitution, so a missing
    value such as NaN is rendered as ``'nan'`` instead of raising ``TypeError``.
    Fill or drop missing fields beforehand if that text is not wanted.
    """
    user_prompt = template_prompt
    for placeholder, column in placeholders_map.items():
        # str.replace only accepts strings; row fields may be NaN or numeric.
        user_prompt = user_prompt.replace(placeholder, str(x[column]))

    return [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": user_prompt,
                }
            ]
        },
    ]
    

In [39]:
def get_messages_system_unstructured(x, template_prompt, placeholders_map):
    """Build the chat ``messages`` payload for one gene from a prompt template.

    Despite the name, no system message is produced: the filled-in prompt is
    returned as a single user message.

    Parameters
    ----------
    x : mapping-like
        One record (e.g. a row of the GFF DataFrame), indexed by the values of
        ``placeholders_map``.
    template_prompt : str
        Prompt text containing the placeholders to substitute.
    placeholders_map : dict
        Maps placeholder text (e.g. ``'{{GENE_NAME}}'``) to the key/column of
        ``x`` whose value replaces it.

    Returns
    -------
    list of dict
        A one-element list holding the user message, with the prompt as a
        single ``text`` content part.

    Notes
    -----
    Field values are converted with ``str()`` before substitution, so a missing
    value such as NaN is rendered as ``'nan'`` instead of raising ``TypeError``.
    """
    filled_prompt = template_prompt
    for placeholder, column in placeholders_map.items():
        # str.replace only accepts strings; row fields may be NaN or numeric.
        filled_prompt = filled_prompt.replace(placeholder, str(x[column]))

    user_message = {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": filled_prompt,
            }
        ]
    }
    return [user_message]
    

# Load list of MED4 genes

In [12]:
med4_genome_dpath = '../genomes/MED4'
os.listdir(med4_genome_dpath)

['cds_from_genomic.fna',
 'GCF_000011465.1_ASM1146v1_genomic.fna',
 'GCF_000011465.1_ASM1146v1_genomic.fna.fai',
 'genomic.gbff',
 'genomic.gff',
 'genomic.gtf',
 'MED4.bed',
 'MED4_biocyc_pathways.txt',
 'MED4_named_genes_NCBI.csv',
 'MED4_pathways.csv',
 'MED4_PMM2locus.csv',
 'MED4_protein.faa',
 'MED4_user_ko.txt',
 'Pro_MED4.gff',
 'README.get_pmm',
 'sequence_report.jsonl',
 'uniprot-taxonomy-59919.tab']

In [13]:
import gffpandas.gffpandas as gffpd

In [14]:
annotation = gffpd.read_gff3(os.path.join(med4_genome_dpath, 'genomic.gff'))

In [15]:
gff_df = annotation.filter_feature_of_type(['CDS']).attributes_to_columns()

In [16]:
gff_columns = [#'phase','Name', 'Note', 'exception', 'gbkey', 'inference', 'partial','pseudo',
               'gene_name', 'gene', 'locus_tag', 
       'product', 'protein_id', ]
#gff_df[gff_columns]

In [17]:
# Display name per CDS: the bare locus tag when the gene symbol is missing,
# otherwise "<gene> , <locus_tag>".
mask = gff_df.gene.isna()
gene_and_locus = gff_df['gene'] + ' , ' + gff_df['locus_tag']
gff_df['gene_name'] = gff_df['locus_tag'].where(mask, gene_and_locus)

In [18]:
gff_df[gff_columns]

Unnamed: 0,gene_name,gene,locus_tag,product,protein_id
2,"dnaN , TX50_RS00020",dnaN,TX50_RS00020,DNA polymerase III subunit beta,WP_011131639.1
4,TX50_RS00025,,TX50_RS00025,hypothetical protein,WP_011131640.1
6,"purL , TX50_RS00030",purL,TX50_RS00030,phosphoribosylformylglycinamidine synthase sub...,WP_011131641.1
8,"purF , TX50_RS00035",purF,TX50_RS00035,amidophosphoribosyltransferase,WP_011131642.1
10,TX50_RS00040,,TX50_RS00040,DNA topoisomerase 4 subunit A,WP_011131643.1
...,...,...,...,...,...
3856,"uvrA , TX50_RS09140",uvrA,TX50_RS09140,excinuclease ABC subunit UvrA,WP_011133339.1
3858,TX50_RS09145,,TX50_RS09145,AAA family ATPase,WP_011133340.1
3860,TX50_RS09150,,TX50_RS09150,AarF/ABC1/UbiB kinase family protein,WP_011133341.1
3862,TX50_RS09155,,TX50_RS09155,alpha/beta hydrolase,WP_011133342.1


In [19]:
gff_named_df = gff_df.loc[~gff_df.gene.isna(), gff_columns].drop_duplicates()

In [20]:
gff_named_df.to_csv(os.path.join(med4_genome_dpath, 'MED4_named_genes_NCBI.csv'), index=False)

In [21]:
# Maps each placeholder used in the prompt templates to the gff_df column
# whose value is substituted for it.
placeholders_map = {
    '{{GENE_NAME}}' : 'gene_name',
    '{{GENE_PRODUCT}}' : 'product',
    '{{GENE_PROTEIN_ID}}' : 'protein_id',
}

# Example

In [22]:
gff_df.loc[gff_df.gene.isin(['pstA']), gff_columns].squeeze()

gene_name                         pstA , TX50_RS03870
gene                                             pstA
locus_tag                                TX50_RS03870
product       phosphate ABC transporter permease PstA
protein_id                             WP_011132358.1
Name: 1621, dtype: object

In [74]:
# Select the prompt template (presumably provided by `from prompts import *`).
# The second assignment overrides the first, so `prompt_build_gene_entry` is
# the template in effect after this cell runs.
template_prompt = short_prompt_unstructured # _unstructured_from_copilot
template_prompt = prompt_build_gene_entry

In [60]:
x = gff_df.loc[gff_df.gene.isin(['pstA']), gff_columns].squeeze()

get_messages_system_unstructured(x, template_prompt, placeholders_map)

[{'role': 'user',
  'content': [{'type': 'text',
    'text': "\ngive me an overview of the gene pstA , TX50_RS03870 (phosphate ABC transporter permease PstA) in prochlorococcus med4. \nSpecifically its general function, its role in stress response and its potential role in coculture with heterotrophic bacteria, \nand in nutrient uptake and exchange with heterotrophic bacteria.\nRemember to rely solely on published research and factual information. If you don't have information on a specific aspect of the gene or topic, clearly state this in the relevant section. End the section with a list of references in standard format, Use citations in the format [Author, Year] throughout your response.\n"}]}]

In [68]:
print(get_messages_system_unstructured(x, template_prompt, placeholders_map))
print("\ngive me an overview of the gene pstA , TX50_RS03870 (phosphate ABC transporter permease PstA) in prochlorococcus med4. \nSpecifically its general function, its role in stress response and its potential role in coculture with heterotrophic bacteria, \nand in nutrient uptake and exchange with heterotrophic bacteria.\nRemember to rely solely on published research and factual information. If you don't have information on a specific aspect of the gene or topic, clearly state this in the relevant section. End the section with a list of references in standard format, Use citations in the format [Author, Year] throughout your response.\n")

[{'role': 'user', 'content': [{'type': 'text', 'text': "\ngive me an overview of the gene pstA , TX50_RS03870 (phosphate ABC transporter permease PstA) in prochlorococcus med4. \nSpecifically its general function, its role in stress response and its potential role in coculture with heterotrophic bacteria, \nand in nutrient uptake and exchange with heterotrophic bacteria.\nRemember to rely solely on published research and factual information. If you don't have information on a specific aspect of the gene or topic, clearly state this in the relevant section. End the section with a list of references in standard format, Use citations in the format [Author, Year] throughout your response.\n"}]}]

give me an overview of the gene pstA , TX50_RS03870 (phosphate ABC transporter permease PstA) in prochlorococcus med4. 
Specifically its general function, its role in stress response and its potential role in coculture with heterotrophic bacteria, 
and in nutrient uptake and exchange with heterotr

In [82]:

# Single-gene request: get_messages() substitutes placeholders such as
# {{GENE_NAME}} with the values of row `x` before sending, because the SDK
# does not support template variables.
response = perplexity_client.chat.completions.create(
    model=MODEL,
    messages=get_messages(x, template_prompt, placeholders_map),
    
    max_tokens=max_tokens,
    temperature=temperature,
)



In [83]:
#response

In [87]:
#display(Markdown(response.choices[0].message.content))
print(response.choices[0].message.content)

<research_process>

1. Key search terms and databases:
- Databases: PubMed, Web of Science, Google Scholar
- Search terms: "Prochlorococcus pstA", "phosphate ABC transporter permease PstA", "Prochlorococcus phosphate transport", "Prochlorococcus pst operon", "Prochlorococcus phosphate limitation"

2. Overview of available literature:
- Number of relevant papers: Approximately 20-25 directly relevant papers
- Date range: 2000-2025
- Main research focuses: Phosphate transport in Prochlorococcus, gene expression under phosphate limitation, ABC transporter function in cyanobacteria

3. Summaries of key sources:

[Martiny et al., 2006] Analyzed Prochlorococcus genomes and found that pstA is part of the pst operon, which is present in all sequenced strains. The operon is upregulated under phosphate limitation.

[Scanlan et al., 2009] Reviewed phosphate acquisition in marine cyanobacteria, including Prochlorococcus. PstA is described as a critical component of the high-affinity phosphate upta

In [88]:
response.citations

['https://www.biorxiv.org/content/10.1101/2021.07.20.453049v1.full.pdf',
 'https://onlinelibrary.wiley.com/doi/10.1111/j.1462-2920.2011.02612.x',
 'https://eprints.soton.ac.uk/436684/1/Carter_Gates_Michael_PhD_Thesis_Dec_2019.pdf',
 'https://pubmed.ncbi.nlm.nih.gov/19187282/',
 'https://string-db.org/network/167539.Pro_0596',
 'https://www.researchgate.net/publication/268987175_Prochlorococcus_The_structure_and_function_of_collective_diversity',
 'https://www.uniprot.org/uniprotkb/Q7VCZ3/entry',
 'https://string-db.org/network/59920.PMN2A_0309',
 'https://www.science.org/doi/10.1126/sciadv.adp1949',
 'https://journals.asm.org/doi/10.1128/AEM.01178-18']

In [86]:
print(response.to_json())

{
  "id": "1131a9fd-f31c-4f27-a901-2e09381d66d3",
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "<research_process>\n\n1. Key search terms and databases:\n- Databases: PubMed, Web of Science, Google Scholar\n- Search terms: \"Prochlorococcus pstA\", \"phosphate ABC transporter permease PstA\", \"Prochlorococcus phosphate transport\", \"Prochlorococcus pst operon\", \"Prochlorococcus phosphate limitation\"\n\n2. Overview of available literature:\n- Number of relevant papers: Approximately 20-25 directly relevant papers\n- Date range: 2000-2025\n- Main research focuses: Phosphate transport in Prochlorococcus, gene expression under phosphate limitation, ABC transporter function in cyanobacteria\n\n3. Summaries of key sources:\n\n[Martiny et al., 2006] Analyzed Prochlorococcus genomes and found that pstA is part of the pst operon, which is present in all sequenced strains. The operon is upregulated under phosphate limitation.\n\

# Run batch

Run the queries in batch, starting only with the genes that have a name.

As Perplexity does not support batch requests, the queries are run one at a time.


In [89]:
def get_custom_batch_id(x):
    """Return the request id for record ``x``, derived from its ``gene_name``.

    The id is ``batch-id-<gene_name>`` with every ``', '`` and then every
    remaining space replaced by ``'-'``.
    """
    return (
        "batch-id-{}".format(x["gene_name"])
        .replace(', ', '-')
        .replace(' ', '-')
    )


In [90]:
def get_batch_request(x, placeholders_map):
    """Assemble one request record for row ``x``.

    The record pairs a ``custom_id`` (from ``get_custom_batch_id``) with the
    ``params`` passed to the chat-completion call. Model, token limit,
    temperature and prompt template are read from the notebook-level
    ``MODEL``, ``max_tokens``, ``temperature`` and ``template_prompt``.
    """
    custom_id = get_custom_batch_id(x)
    request_params = {
        "model": MODEL,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "messages": get_messages(x, template_prompt, placeholders_map),
    }
    return {"custom_id": custom_id, "params": request_params}


In [91]:
get_batch_request(x, placeholders_map)

{'custom_id': 'batch-id-pstA--TX50_RS03870',
 'params': {'model': 'sonar-pro',
  'max_tokens': 4096,
  'temperature': 0,
  'messages': [{'role': 'user',
    'content': [{'type': 'text',
      'text': "\nYou are a highly skilled research assistant specializing in microbiology, with a focus on Prochlorococcus bacteria. Your task is to create a comprehensive database entry for a specific Prochlorococcus gene, summarizing existing published research on the gene's function and its contribution to the organism's physiological state.\n\n\nHere are details of the gene you will be researching:\n<gene_name>\npstA , TX50_RS03870\n</gene_name>\n\n<gene_product>\nphosphate ABC transporter permease PstA\n</gene_product>\n\n<gene_protein_id>\nWP_011132358.1\n</gene_protein_id>\n\nBegin by conducting a thorough literature review. Wrap your research process inside <research_process> tags, including:\n\n1. Key search terms and databases you would use.\n2. An overview of the available literature, includi

In [92]:
gff_named_df.loc[gff_named_df.duplicated()]

Unnamed: 0,gene_name,gene,locus_tag,product,protein_id


In [93]:
batch_requests = gff_named_df.apply(lambda x: get_batch_request(x, placeholders_map), axis=1).to_list()

In [94]:
batch_requests[5]

{'custom_id': 'batch-id-ftsY--TX50_RS00065',
 'params': {'model': 'sonar-pro',
  'max_tokens': 4096,
  'temperature': 0,
  'messages': [{'role': 'user',
    'content': [{'type': 'text',
      'text': "\nYou are a highly skilled research assistant specializing in microbiology, with a focus on Prochlorococcus bacteria. Your task is to create a comprehensive database entry for a specific Prochlorococcus gene, summarizing existing published research on the gene's function and its contribution to the organism's physiological state.\n\n\nHere are details of the gene you will be researching:\n<gene_name>\nftsY , TX50_RS00065\n</gene_name>\n\n<gene_product>\nsignal recognition particle-docking protein FtsY\n</gene_product>\n\n<gene_protein_id>\nWP_011131648.1\n</gene_protein_id>\n\nBegin by conducting a thorough literature review. Wrap your research process inside <research_process> tags, including:\n\n1. Key search terms and databases you would use.\n2. An overview of the available literature

In [95]:
len(batch_requests)

503

In [75]:


# NOTE(review): `anthropic_client` is not defined in the visible part of this
# notebook, so this cell would raise NameError on a fresh kernel. The requests
# in `batch_requests` are built with the Perplexity MODEL, and the section text
# says the queries are run one at a time - this looks like a leftover from an
# Anthropic batch workflow; confirm whether it should be removed.
message_batch = anthropic_client.beta.messages.batches.create(
    requests=batch_requests,
)
print(message_batch)


BetaMessageBatch(id='msgbatch_01FcV432uRZzKGFFejvr2dcj', archived_at=None, cancel_initiated_at=None, created_at=datetime.datetime(2025, 1, 25, 21, 47, 58, 453189, tzinfo=datetime.timezone.utc), ended_at=None, expires_at=datetime.datetime(2025, 1, 26, 21, 47, 58, 453189, tzinfo=datetime.timezone.utc), processing_status='in_progress', request_counts=BetaMessageBatchRequestCounts(canceled=0, errored=0, expired=0, processing=503, succeeded=0), results_url=None, type='message_batch')


In [96]:
# Directory that receives one JSON result file per request; created if it does
# not exist yet.
out_dpath = os.path.join('batch results', 'perplexity_single_structured_prompt_09022025',)
os.makedirs(out_dpath, exist_ok=True)

In [None]:
# naive batch call - no parallelism, call one by one

In [100]:
# Send the requests one at a time and store each response in its own JSON file.
# A request whose result file already exists is skipped, so after a failure
# part-way through (e.g. an authentication error) the cell can simply be re-run
# and only the missing genes are queried again.
for request in batch_requests:
    fpath = os.path.join(out_dpath, f'perplexity_result_{request["custom_id"]}.json')
    if os.path.exists(fpath):
        continue  # already fetched in a previous run

    response = perplexity_client.chat.completions.create(**request['params'])
    results = {'custom_id' : request['custom_id'], 'result': response.to_dict()}

    # Write to a temporary name and rename afterwards, so an interrupted write
    # never leaves a truncated file that a later run would treat as complete.
    tmp_fpath = fpath + '.tmp'
    with open(tmp_fpath, 'w', encoding ='utf8') as json_file:
        json.dump(results, json_file, ensure_ascii = False)
    os.replace(tmp_fpath, fpath)
    print('.', end='')


.................................................................................................................................................................................................................................................................................................................................................................................

AuthenticationError: <html>
<head><title>401 Authorization Required</title></head>
<body>
<center><h1>401 Authorization Required</h1></center>
<hr><center>openresty/1.25.3.1</center>
<script>(function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'90f4791aee57c222',t:'MTczOTExMDU0Mi4wMDAwMDA='};var a=document.createElement('script');a.nonce='';a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();</script></body>
</html>

In [101]:
# concat the files

In [106]:
# Output file
ofpath = os.path.join('batch results', 'perplexity_single_structured_prompt_09022025.jsonl',)

# Concatenate the per-request JSON files into a single JSON Lines file, one
# record per line. The directory listing is sorted because os.listdir returns
# names in arbitrary order, which would make the row order differ between runs.
with open(ofpath, "w", encoding='utf8') as ofh:
    for fname in sorted(os.listdir(out_dpath)):
        if not fname.endswith('.json'):
            continue
        fpath = os.path.join(out_dpath, fname)
        with open(fpath, "r", encoding='utf8') as ifh:
            text = ifh.read()
        ofh.write(text)
        ofh.write('\n')
                
