# Housekeeping and setup

In [1]:
import os
import json
from dotenv import load_dotenv
from openai import OpenAI
import anthropic
import google.generativeai
import gradio as gr
from IPython.display import Markdown, display, update_display

In [2]:
import pandas as pd
import numpy as np

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
from prompts import * 
from prompts import * 

In [7]:
# Read the provider API keys from the local .env file, then report for each
# provider whether a key was found. Only a short key prefix is echoed, which
# is enough to help with debugging.

load_dotenv()
openai_api_key = os.environ.get('OPENAI_API_KEY')
anthropic_api_key = os.environ.get('ANTHROPIC_API_KEY')
google_api_key = os.environ.get('GOOGLE_API_KEY')
perplexity_api_key = os.environ.get('PERPLEXITY_API_KEY')

print(
    f"OpenAI API Key exists and begins {openai_api_key[:8]}"
    if openai_api_key
    else "OpenAI API Key not set"
)

print(
    f"Anthropic API Key exists and begins {anthropic_api_key[:7]}"
    if anthropic_api_key
    else "Anthropic API Key not set"
)

print(
    f"Google API Key exists and begins {google_api_key[:8]}"
    if google_api_key
    else "Google API Key not set"
)

print(
    f"Perplexity API Key exists and begins {perplexity_api_key[:6]}"
    if perplexity_api_key
    else "Perplexity API Key not set"
)

OpenAI API Key exists and begins sk-proj-
Anthropic API Key exists and begins sk-ant-
Google API Key exists and begins AIzaSyCF
Perplexity API Key exists and begins pplx-H


|Model                |Context Length |
|---------------------|---------------|
|sonar-reasoning-pro  |127k           |
|sonar-reasoning      |127k           |
|sonar-pro            |200k           |
|sonar                |127k           |


In [9]:
from openai import OpenAI

# Perplexity model used for this smoke test (see the context-length table above).
MODEL = 'sonar'

# The OpenAI SDK client is pointed at the Perplexity base URL.
client = OpenAI(api_key=perplexity_api_key, base_url="https://api.perplexity.ai")

messages = [
    {
        "role": "system",
        "content": "You are an artificial intelligence assistant and you need to engage in a helpful, detailed, polite conversation with a user.",
    },
    {
        "role": "user",
        "content": "How many stars are in the universe?",
    },
]

# Non-streaming chat completion: one request, one complete response object.
response = client.chat.completions.create(model=MODEL, messages=messages)
print(response)

# Streaming variant, kept for reference:
# response_stream = client.chat.completions.create(
#     model="sonar-pro",
#     messages=messages,
#     stream=True,
# )
# for response in response_stream:
#     print(response)


ChatCompletion(id='9ad416b0-94c7-4370-887c-f0aa70e1e646', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="Hello I'm happy to help you explore one of the most fascinating questions in astronomy: how many stars are in the universe?\n\nEstimating the number of stars in the universe is a complex task, as it involves counting not just the stars we can see but also those that are too distant or too faint to be detected. Here's a breakdown of the current understanding:\n\n1. **Number of Stars in the Milky Way**: Our home galaxy, the Milky Way, is estimated to contain between 100 billion and 400 billion stars, depending on the source and method of estimation[1][4][5].\n\n2. **Number of Galaxies**: The observable universe is thought to contain approximately 2 trillion galaxies, although this number can vary based on the sensitivity of telescopes and observational methods[2][3][5].\n\n3. **Total Number of Stars in the Universe**: Estimates for

In [10]:
response

ChatCompletion(id='9ad416b0-94c7-4370-887c-f0aa70e1e646', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="Hello I'm happy to help you explore one of the most fascinating questions in astronomy: how many stars are in the universe?\n\nEstimating the number of stars in the universe is a complex task, as it involves counting not just the stars we can see but also those that are too distant or too faint to be detected. Here's a breakdown of the current understanding:\n\n1. **Number of Stars in the Milky Way**: Our home galaxy, the Milky Way, is estimated to contain between 100 billion and 400 billion stars, depending on the source and method of estimation[1][4][5].\n\n2. **Number of Galaxies**: The observable universe is thought to contain approximately 2 trillion galaxies, although this number can vary based on the sensitivity of telescopes and observational methods[2][3][5].\n\n3. **Total Number of Stars in the Universe**: Estimates for

In [7]:
def get_messages(x, placeholders_map, prompt_template=None):
    ''' Build the chat messages for one gene from a line in the gff DF.

        x : a line from the dataframe (a Series, or any mapping indexable by
            column name)
        placeholders_map : dict mapping placeholder strings in the prompt
            (e.g. '{{GENE_NAME}}') to the column names in x that supply their values
        prompt_template : optional prompt text containing the placeholders;
            when None, the global prompt_build_gene_entry is used

        Returns a two-element list: a user message holding the filled-in prompt,
        followed by an assistant message pre-filled with "<literature_review>".
        Missing values (NaN/None) are substituted as an empty string.
    '''
    user_prompt = prompt_build_gene_entry if prompt_template is None else prompt_template
    for placeholder, column in placeholders_map.items():
        value = x[column]
        # str.replace only accepts str; absent values show up as NaN/None in the
        # dataframe and would otherwise raise a TypeError here.
        replacement = '' if pd.isna(value) else str(value)
        user_prompt = user_prompt.replace(placeholder, replacement)

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": user_prompt,
                }
            ]
        },
        {
            # Pre-filled start of the assistant turn.
            "role": "assistant",
            "content": [
                {
                    "type": "text",
                    "text": "<literature_review>"
                }
            ]
        }
    ]
    return messages
    

# Load list of MED4 genes

In [8]:
# Relative path of the directory holding the MED4 genome files; list its contents.
med4_genome_dpath = '../genomes/MED4'
os.listdir(med4_genome_dpath)

['cds_from_genomic.fna',
 'GCF_000011465.1_ASM1146v1_genomic.fna',
 'GCF_000011465.1_ASM1146v1_genomic.fna.fai',
 'genomic.gbff',
 'genomic.gff',
 'genomic.gtf',
 'MED4.bed',
 'MED4_biocyc_pathways.txt',
 'MED4_pathways.csv',
 'MED4_PMM2locus.csv',
 'MED4_protein.faa',
 'MED4_user_ko.txt',
 'Pro_MED4.gff',
 'README.get_pmm',
 'sequence_report.jsonl',
 'uniprot-taxonomy-59919.tab']

In [9]:
import gffpandas.gffpandas as gffpd

In [10]:
# Parse the genomic.gff annotation file with gffpandas.
annotation = gffpd.read_gff3(os.path.join(med4_genome_dpath, 'genomic.gff'))

In [11]:
# Keep only CDS features and expand the GFF attributes field into separate columns.
gff_df = annotation.filter_feature_of_type(['CDS']).attributes_to_columns()

In [12]:
# Columns kept for the gene table ('gene_name' is added to gff_df by the next cell).
# Other attribute columns, currently not selected:
# 'phase', 'Name', 'Note', 'exception', 'gbkey', 'inference', 'partial', 'pseudo'
gff_columns = [
    'gene_name',
    'gene',
    'locus_tag',
    'product',
    'protein_id',
]

In [13]:
# Label every CDS row: "<gene> , <locus_tag>" when a gene symbol exists,
# otherwise just the locus_tag.
mask = gff_df.gene.isna()
gff_df['gene_name'] = (gff_df['gene'] + ' , ' + gff_df['locus_tag']).where(~mask, gff_df['locus_tag'])

In [14]:
gff_df[gff_columns]

Unnamed: 0,gene_name,gene,locus_tag,product,protein_id
2,"dnaN , TX50_RS00020",dnaN,TX50_RS00020,DNA polymerase III subunit beta,WP_011131639.1
4,TX50_RS00025,,TX50_RS00025,hypothetical protein,WP_011131640.1
6,"purL , TX50_RS00030",purL,TX50_RS00030,phosphoribosylformylglycinamidine synthase sub...,WP_011131641.1
8,"purF , TX50_RS00035",purF,TX50_RS00035,amidophosphoribosyltransferase,WP_011131642.1
10,TX50_RS00040,,TX50_RS00040,DNA topoisomerase 4 subunit A,WP_011131643.1
...,...,...,...,...,...
3856,"uvrA , TX50_RS09140",uvrA,TX50_RS09140,excinuclease ABC subunit UvrA,WP_011133339.1
3858,TX50_RS09145,,TX50_RS09145,AAA family ATPase,WP_011133340.1
3860,TX50_RS09150,,TX50_RS09150,AarF/ABC1/UbiB kinase family protein,WP_011133341.1
3862,TX50_RS09155,,TX50_RS09155,alpha/beta hydrolase,WP_011133342.1


In [16]:
# Restrict to rows that carry a gene symbol, keep the selected columns,
# and collapse fully identical rows.
gff_named_df = (
    gff_df
    .loc[gff_df['gene'].notna(), gff_columns]
    .drop_duplicates()
)

In [18]:
# Save the named-gene table into the genome directory (row index omitted).
gff_named_df.to_csv(os.path.join(med4_genome_dpath, 'MED4_named_genes_NCBI.csv'), index=False)

In [15]:
# Maps each placeholder token in the prompt to the dataframe column whose value
# replaces it (consumed by get_messages).
placeholders_map = {
    '{{GENE_NAME}}' : 'gene_name',
    '{{GENE_PRODUCT}}' : 'product',
    '{{GENE_PROTEIN_ID}}' : 'protein_id',
}

# Example

In [16]:
gff_df.loc[gff_df.gene.isin(['pstA']), gff_columns].squeeze()

gene_name                         pstA , TX50_RS03870
gene                                             pstA
locus_tag                                TX50_RS03870
product       phosphate ABC transporter permease PstA
protein_id                             WP_011132358.1
Name: 1621, dtype: object

In [28]:
# Use the pstA row as a worked example: squeeze() turns the single-row selection
# into a Series, which is then passed to get_messages.
x = gff_df.loc[gff_df.gene.isin(['pstA']), gff_columns].squeeze()

get_messages(x, placeholders_map)

[{'role': 'user',
  'content': [{'type': 'text',
    'text': "\nYou are a highly skilled research assistant specializing in microbiology, with a focus on Prochlorococcus bacteria. Your task is to create a comprehensive database entry for a specific Prochlorococcus gene, summarizing existing published research on the gene's function and its contribution to the organism's physiological state.\n\n\nHere are details of the gene you will be researching:\n<gene_name>\npstA , TX50_RS03870\n</gene_name>\n\n<gene_product>\nphosphate ABC transporter permease PstA\n</gene_product>\n\n<gene_protein_id>\nWP_011132358.1\n</gene_protein_id>\n\nBegin by conducting a thorough literature review. Wrap your research process inside <research_process> tags, including:\n\n1. Key search terms and databases you would use.\n2. An overview of the available literature, including:\n   - Number of relevant papers\n   - Date range of the research\n   - Main research focuses\n3. Summaries of 5-7 key sources, each wit

In [29]:

# Single-gene trial run before batching: send the pstA example row (x) through
# the Anthropic Messages API.
# get_messages() replaces placeholders like {{GENE_NAME}} with real values,
# because the SDK does not support variables.
# NOTE(review): anthropic_client, max_tokens and temperature are not assigned in
# any visible cell, and the only visible MODEL assignment is 'sonar' (Perplexity
# cell) while the recorded batch request shows 'claude-3-5-sonnet-20241022'.
# Presumably they come from `prompts` or a removed cell — confirm this cell
# survives Restart Kernel -> Run All.
message = anthropic_client.messages.create(
    model=MODEL,
    max_tokens=max_tokens,
    temperature=temperature,
    messages=get_messages(x, placeholders_map)
)
#print(message.content)


In [30]:
print (message.content[0].text)


1. Key Search Terms and Databases:
- Primary databases: PubMed, Web of Science, NCBI Gene, UniProt
- Search terms: "Prochlorococcus pstA", "phosphate ABC transporter Prochlorococcus", "TX50_RS03870", "WP_011132358.1", "phosphate transport cyanobacteria"

2. Literature Overview:
- ~25 relevant papers (1995-2023)
- Main focus areas: phosphate transport, stress response, marine phosphate acquisition
- Core research period: 2005-2023

3. Key Sources:

[Martiny et al., 2006] Nature
- Identified pstA as part of the phosphate-acquisition genes in Prochlorococcus
- Demonstrated differential regulation under phosphate limitation

[Casey et al., 2009] ISME Journal
- Characterized phosphate transport system in Prochlorococcus MED4
- Showed upregulation of pstA under P-stress conditions

[Moore et al., 2005] Nature
- Described genomic adaptations for phosphate acquisition
- Linked pstA to survival in low-phosphate environments

[Scanlan et al., 2009] Microbiology and Molecular Biology Reviews
- C

# Run batch

Run the queries in batch. Start only with the genes that have a name.

In [51]:
def get_custom_batch_id(x):
    """Return the custom id for one batch request.

    x : row (or mapping) providing a "gene_name" entry.

    The id is 'batch-id-' followed by the gene name, with every ', ' and then
    every remaining space replaced by '-'.
    """
    batch_id = 'batch-id-' + format(x["gene_name"])
    # Order matters: ', ' is collapsed first, leftover single spaces second.
    for separator in (', ', ' '):
        batch_id = batch_id.replace(separator, '-')
    return batch_id


In [52]:
def get_batch_request(x, placeholders_map):
    """Assemble one batch request entry for a gene row.

    x : row of the gene dataframe.
    placeholders_map : dict mapping prompt placeholders to column names in x.

    Returns a dict with 'custom_id' (from get_custom_batch_id) and 'params'
    (model, max_tokens, temperature taken from the globals MODEL, max_tokens,
    temperature, plus the messages built by get_messages).
    """
    custom_id = get_custom_batch_id(x)
    params = {
        'model': MODEL,
        'max_tokens': max_tokens,
        'temperature': temperature,
        'messages': get_messages(x, placeholders_map),
    }
    return {'custom_id': custom_id, 'params': params}


In [53]:
get_batch_request(x, placeholders_map)

{'custom_id': 'batch-id-pstA--TX50_RS03870',
 'params': {'model': 'claude-3-5-sonnet-20241022',
  'max_tokens': 4096,
  'temperature': 0,
  'messages': [{'role': 'user',
    'content': [{'type': 'text',
      'text': "\nYou are a highly skilled research assistant specializing in microbiology, with a focus on Prochlorococcus bacteria. Your task is to create a comprehensive database entry for a specific Prochlorococcus gene, summarizing existing published research on the gene's function and its contribution to the organism's physiological state.\n\n\nHere are details of the gene you will be researching:\n<gene_name>\npstA , TX50_RS03870\n</gene_name>\n\n<gene_product>\nphosphate ABC transporter permease PstA\n</gene_product>\n\n<gene_protein_id>\nWP_011132358.1\n</gene_protein_id>\n\nBegin by conducting a thorough literature review. Wrap your research process inside <research_process> tags, including:\n\n1. Key search terms and databases you would use.\n2. An overview of the available li

In [67]:
gff_named_df.loc[gff_named_df.duplicated()]

Unnamed: 0,gene_name,gene,locus_tag,product,protein_id


In [72]:
# One request dict per named gene, in dataframe order. Iterating the rows
# explicitly always yields a plain list of dicts (an empty list for an empty
# frame), whereas DataFrame.apply(axis=1) infers its return type from the
# results and does not return a Series for an empty frame.
batch_requests = [
    get_batch_request(row, placeholders_map)
    for _, row in gff_named_df.iterrows()
]

In [73]:
batch_requests[5]

{'custom_id': 'batch-id-ftsY--TX50_RS00065',
 'params': {'model': 'claude-3-5-sonnet-20241022',
  'max_tokens': 4096,
  'temperature': 0,
  'messages': [{'role': 'user',
    'content': [{'type': 'text',
      'text': "\nYou are a highly skilled research assistant specializing in microbiology, with a focus on Prochlorococcus bacteria. Your task is to create a comprehensive database entry for a specific Prochlorococcus gene, summarizing existing published research on the gene's function and its contribution to the organism's physiological state.\n\n\nHere are details of the gene you will be researching:\n<gene_name>\nftsY , TX50_RS00065\n</gene_name>\n\n<gene_product>\nsignal recognition particle-docking protein FtsY\n</gene_product>\n\n<gene_protein_id>\nWP_011131648.1\n</gene_protein_id>\n\nBegin by conducting a thorough literature review. Wrap your research process inside <research_process> tags, including:\n\n1. Key search terms and databases you would use.\n2. An overview of the ava

In [74]:
len(batch_requests)

503

In [75]:


# Submit all requests as a single message batch; the printed object shows the
# batch id, processing status and per-state request counts.
message_batch = anthropic_client.beta.messages.batches.create(
    requests=batch_requests,
)
print(message_batch)


BetaMessageBatch(id='msgbatch_01FcV432uRZzKGFFejvr2dcj', archived_at=None, cancel_initiated_at=None, created_at=datetime.datetime(2025, 1, 25, 21, 47, 58, 453189, tzinfo=datetime.timezone.utc), ended_at=None, expires_at=datetime.datetime(2025, 1, 26, 21, 47, 58, 453189, tzinfo=datetime.timezone.utc), processing_status='in_progress', request_counts=BetaMessageBatchRequestCounts(canceled=0, errored=0, expired=0, processing=503, succeeded=0), results_url=None, type='message_batch')
