# Single prompt from perplexity

This notebook builds a simple gene database using a single prompt to Perplexity, relying on the model's built-in web search. Because this is a first attempt, it does not use any of the more complex reasoning models; instead it uses the sonar-pro model with a relatively high search context size to improve the result.

Because Perplexity does not support structured outputs, the result undergoes a second transformation by GPT-4o-mini, which converts it to the DB format.

In [1]:
import os
import json
from IPython.display import Markdown, display, update_display
import pandas as pd
import numpy as np
import gffpandas.gffpandas as gffpd


In [2]:
# get api keys
import os

# load environment variables from .env file (requires `python-dotenv`)
from dotenv import load_dotenv

load_dotenv(override=True)



True

In [3]:
os.environ["LANGCHAIN_PROJECT"] = "cc1a3_plx_single"

In [4]:
%load_ext autoreload
%autoreload 2

In [32]:
import gene_db_classes 
import gene_db_prompts 
import extract_citation

# Initialize the models and create templates

In [6]:
# Perplexity chat model (sonar-pro) used for the web-search-backed first pass.
# Requires: pip install -qU "langchain-perplexity"
from langchain.chat_models import init_chat_model

perplexity_llm = init_chat_model(
    "sonar-pro",
    model_provider="perplexity",
    temperature=0,
    max_tokens=3000,
)

# Perplexity additional parameters (academic search mode, high search context size).
extra_body_perplexity = {
    "search_mode": "academic",
    "web_search_options": {"search_context_size": "high"},
}

# No structured output for tier 1 models in Perplexity, so
# `with_structured_output(gene_db_classes.GeneResearchSummarySimple)` is not applied to this model.

In [7]:
# OpenAI GPT-4o-mini chat model.
# Requires: pip install -qU "langchain[openai]"
from langchain.chat_models import init_chat_model

openai_llm = init_chat_model(
    "gpt-4o-mini",
    model_provider="openai",
    temperature=0.0,
)

# Same model, configured with GeneResearchSummarySimple as the structured-output schema.
structured_openai_llm = openai_llm.with_structured_output(
    gene_db_classes.GeneResearchSummarySimple
)

In [8]:

# Variant of the OpenAI model with the web search tool bound to it and
# GeneResearchSummarySimple supplied as the (strict) response format.
openai_web_search_tool = dict(type="web_search_preview")

structured_openai_llm_with_tool = openai_llm.bind_tools(
    [openai_web_search_tool],
    response_format=gene_db_classes.GeneResearchSummarySimple,
    strict=True,
)



In [9]:

from langchain_core.prompts import ChatPromptTemplate

# First-pass prompt: a single user message built from the single-prompt Perplexity template.
# (The system prompt gene_db_prompts.system_prompt_from_gemini_without_literature_placeholder
# is left disabled.)
_first_pass_messages = [
    ("user", gene_db_prompts.prompt_for_single_prompt_perplexity_prochlorococcus),
]
prompt_template = ChatPromptTemplate(_first_pass_messages)
def get_prompt_for_row(x):
    """Build the first-pass prompt for a single gene row.

    Parameters
    ----------
    x : mapping-like row (e.g. a pandas Series)
        Must provide the keys 'Name_gene', 'locus_tag', 'product',
        'protein_id' and 'old_locus_tag'.

    Returns
    -------
    The result of ``prompt_template.invoke`` with the row's values filled
    into the template variables.
    """
    # Template variable -> key of the row that supplies its value.
    row_key_for_variable = {
        'gene_name_or_id': 'Name_gene',
        'locus_tag': 'locus_tag',
        'product': 'product',
        'protein_id': 'protein_id',
        'old_locus_tag': 'old_locus_tag',
    }
    template_values = {
        variable: x[row_key] for variable, row_key in row_key_for_variable.items()
    }
    return prompt_template.invoke(template_values)



In [10]:
# Second-pass prompt: a single user message built from the reformatting template.
# (The system prompt gene_db_prompts.system_prompt_from_gemini_without_literature_placeholder
# is left disabled.)
_second_pass_messages = [
    ("user", gene_db_prompts.second_prompt_for_single_prompt_perplexity_prochlorococcus),
]
second_prompt_template = ChatPromptTemplate(_second_pass_messages)

def _get_citations_urls(perplexity_response):
    """Format the citation URLs of a Perplexity response as a numbered list.

    Parameters
    ----------
    perplexity_response
        Response object exposing an ``additional_kwargs`` mapping; the URLs
        are read from its ``'citations'`` entry.

    Returns
    -------
    str
        One line per URL in the form ``[i] url`` (numbered from 1), joined
        with newlines. An empty string is returned when the response carries
        no citations (missing, ``None`` or empty ``'citations'`` entry).
    """
    # Not every response is guaranteed to include citations; fall back to an
    # empty list instead of raising KeyError so batch runs are not interrupted.
    citation_urls = perplexity_response.additional_kwargs.get('citations') or []
    # Numbering starts at 1 — presumably to line up with the [n] markers in the
    # review text; confirm against the prompt.
    return '\n'.join(f'[{i}] {url}' for i, url in enumerate(citation_urls, start=1))

    

def get_second_prompt_for_gene(gene_name, perplexity_response):
    """Build the second-pass (reformatting) prompt for one gene.

    Parameters
    ----------
    gene_name
        Value substituted for the template's ``gene_name_or_id`` variable.
    perplexity_response
        First-pass response; its ``content`` is used as the review text and
        its citation URLs are formatted via ``_get_citations_urls``.

    Returns
    -------
    The result of ``second_prompt_template.invoke`` with the three template
    variables filled in.
    """
    review_text = perplexity_response.content
    citations_urls = _get_citations_urls(perplexity_response)
    template_values = {
        'gene_name_or_id': gene_name,
        'review_text': review_text,
        'citations_urls': citations_urls,
    }
    return second_prompt_template.invoke(template_values)


# Load MED4 genes

In [11]:
import os
med4_genome_dpath = '../../genomes/MED4'
os.listdir(med4_genome_dpath)

['cds_from_genomic.fna',
 'GCF_000011465.1_ASM1146v1_genomic.fna',
 'GCF_000011465.1_ASM1146v1_genomic.fna.fai',
 'genomic.gbff',
 'genomic.gff',
 'genomic.gtf',
 'MED4.bed',
 'MED4_biocyc_pathways.txt',
 'MED4_named_genes_NCBI.csv',
 'MED4_pathways.csv',
 'MED4_PMM2locus.csv',
 'MED4_protein.faa',
 'MED4_user_ko.txt',
 'Pro_MED4.gff',
 'README.get_pmm',
 'sequence_report.jsonl',
 'uniprot-taxonomy-59919.tab']

In [12]:
# Load the MED4 GFF3 annotation and split it into CDS and gene features,
# turning the GFF attributes into DataFrame columns.
annotation = gffpd.read_gff3(os.path.join(med4_genome_dpath, 'genomic.gff'))
gff_cds_df = annotation.filter_feature_of_type(['CDS']).attributes_to_columns()
gff_gene_df = annotation.filter_feature_of_type(['gene']).attributes_to_columns()

# Attach the gene-level attributes to each CDS row via the shared locus_tag.
# Columns present in both frames receive a `_cds` / `_gene` suffix
# (e.g. `Name_gene`, `gene_gene`), so un-suffixed names such as `gene` do not
# exist in the merged frame.
gff_df = pd.merge(gff_cds_df, gff_gene_df, on='locus_tag', suffixes=('_cds', '_gene'), how='left')

# The column subset used for display/export (`gff_columns`) is defined in the next cell.


In [13]:
# Columns of the merged CDS/gene table kept for display and export.
# Coordinates, IDs, and the remaining *_cds / *_gene attribute columns are left out.
gff_columns = [
    'locus_tag',
    'product',
    'protein_id',
    'pseudo',
    'Name_gene',
    'gene_gene',
    'gene_biotype',
    'old_locus_tag',
]

In [14]:
gff_df.columns

Index(['seq_id_cds', 'source_cds', 'type_cds', 'start_cds', 'end_cds',
       'score_cds', 'strand_cds', 'phase_cds', 'attributes_cds', 'Dbxref',
       'ID_cds', 'Name_cds', 'Note', 'Parent', 'end_range', 'exception',
       'gbkey_cds', 'gene_cds', 'inference', 'locus_tag', 'partial', 'product',
       'protein_id', 'pseudo', 'transl_table', 'seq_id_gene', 'source_gene',
       'type_gene', 'start_gene', 'end_gene', 'score_gene', 'strand_gene',
       'phase_gene', 'attributes_gene', 'ID_gene', 'Name_gene', 'gbkey_gene',
       'gene_gene', 'gene_biotype', 'old_locus_tag'],
      dtype='object')

In [15]:
gff_df[gff_columns].head(1).T

Unnamed: 0,0
locus_tag,TX50_RS00020
product,DNA polymerase III subunit beta
protein_id,WP_011131639.1
pseudo,
Name_gene,dnaN
gene_gene,dnaN
gene_biotype,protein_coding
old_locus_tag,PMM0001


In [16]:
gff_gene_df.gene_biotype.value_counts(dropna=False)

gene_biotype
protein_coding    1860
tRNA                37
rRNA                 3
tmRNA                1
ncRNA                1
SRP_RNA              1
RNase_P_RNA          1
Name: count, dtype: int64

These are different types of gene biotypes from NCBI's GFF (General Feature Format) annotation files, each representing a distinct functional category of genes:
* protein_coding: Genes that encode proteins - these are transcribed into mRNA and then translated into amino acid sequences to form functional proteins. This represents the majority of genes in most genomes.
* tmRNA: Transfer-messenger RNA genes that encode a unique type of RNA molecule found in bacteria. tmRNA rescues ribosomes that become stalled during translation by acting as both a tRNA and an mRNA, allowing the ribosome to complete translation and tag the incomplete protein for degradation.
* tRNA: Transfer RNA genes that produce tRNA molecules responsible for bringing amino acids to the ribosome during protein synthesis. Each tRNA recognizes specific codons in the mRNA and carries the corresponding amino acid.
* rRNA: Ribosomal RNA genes that encode the RNA components of ribosomes. These are essential structural and catalytic components of the protein synthesis machinery, including the 16S, 23S, and 5S rRNAs in bacteria.
* ncRNA: Non-coding RNA genes that produce functional RNA molecules that are not translated into proteins. This is a broad category that includes various regulatory RNAs like microRNAs, long non-coding RNAs, and small interfering RNAs.
* SRP_RNA: Signal Recognition Particle RNA genes that encode the RNA component of the signal recognition particle. This ribonucleoprotein complex recognizes signal sequences on newly synthesized proteins and directs them to their target membrane (the cytoplasmic membrane in bacteria such as Prochlorococcus, or the endoplasmic reticulum in eukaryotes) for proper cellular localization.
* RNase_P_RNA: Ribonuclease P RNA genes that encode the catalytic RNA subunit of RNase P, an enzyme responsible for processing the 5' end of precursor tRNA molecules during tRNA maturation.
These biotype classifications help researchers understand the functional diversity of genes beyond just protein-coding sequences, highlighting the important roles that various RNA molecules play in cellular processes.

In [17]:
gff_df[~gff_df.Name_gene.isna()][gff_columns]


Unnamed: 0,locus_tag,product,protein_id,pseudo,Name_gene,gene_gene,gene_biotype,old_locus_tag
0,TX50_RS00020,DNA polymerase III subunit beta,WP_011131639.1,,dnaN,dnaN,protein_coding,PMM0001
1,TX50_RS00025,hypothetical protein,WP_011131640.1,,TX50_RS00025,,protein_coding,PMM0002
2,TX50_RS00030,phosphoribosylformylglycinamidine synthase sub...,WP_011131641.1,,purL,purL,protein_coding,PMM0003
3,TX50_RS00035,amidophosphoribosyltransferase,WP_011131642.1,,purF,purF,protein_coding,PMM0004
4,TX50_RS00040,DNA topoisomerase 4 subunit A,WP_011131643.1,,TX50_RS00040,,protein_coding,PMM0005
...,...,...,...,...,...,...,...,...
1861,TX50_RS09140,excinuclease ABC subunit UvrA,WP_011133339.1,,uvrA,uvrA,protein_coding,PMM1712
1862,TX50_RS09145,AAA family ATPase,WP_011133340.1,,TX50_RS09145,,protein_coding,PMM1713
1863,TX50_RS09150,AarF/ABC1/UbiB kinase family protein,WP_011133341.1,,TX50_RS09150,,protein_coding,PMM1714
1864,TX50_RS09155,alpha/beta hydrolase,WP_011133342.1,,TX50_RS09155,,protein_coding,PMM1715


In [18]:
# Focus on protein-coding genes.
# To start with, keep only rows that are protein coding, are not flagged as
# pseudogenes, and carry a gene symbol (non-null `gene_gene`).
_keep_mask = (
    gff_df.gene_biotype.isin(['protein_coding'])
    & gff_df.pseudo.isna()
    & gff_df.gene_gene.notna()
)
gff_df_filter = gff_df[_keep_mask][gff_columns]


In [19]:
out_dpath = 'batch_results'
os.makedirs(out_dpath, exist_ok=True)

In [20]:
gff_df_filter.to_csv(os.path.join(out_dpath, 'MED4_genes_filter.csv'), index=False)


# Run a sample query to test the integration

In [21]:
gene_row = gff_df_filter.iloc[14]
gene_row

locus_tag                                           TX50_RS00130
product          type I glyceraldehyde-3-phosphate dehydrogenase
protein_id                                        WP_011131661.1
pseudo                                                      None
Name_gene                                                    gap
gene_gene                                                    gap
gene_biotype                                      protein_coding
old_locus_tag                                            PMM0023
Name: 22, dtype: object

In [22]:

query = get_prompt_for_row(gene_row) 
query

ChatPromptValue(messages=[HumanMessage(content="\nYou are a research assistant tasked with creating a comprehensive summary of a specific gene's function and its role in the coculture of Prochlorococcus and Alteromonas under nitrogen limitation conditions. Your goal is to provide a detailed, fact-based summary using published scientific literature.\n\nThe gene you will be researching is:\n<gene_name>\ngap\n</gene_name>\n\n<organism>\nProchlorococcus MED4\n</organism>\n\n<locus_tag>\nTX50_RS00130\n</locus_tag>\n\n<product>\ntype I glyceraldehyde-3-phosphate dehydrogenase\n</product>\n\n<protein_id>\nWP_011131661.1 \n</protein_id>\n\n<old_locus_tag>\nPMM0023\n</old_locus_tag>\n\n\nIn your research summary, cover the following areas:\n1. Gene function\n2. Impact on physiological state\n3. Involvement in stress response\n4. Role in nutrient exchange (uptake and release)\n5. Contribution to oxidative stress\n6. Influence on bacterial interaction in coculture\n\nFollow these steps to conduct

In [23]:
print(query.to_string())

Human: 
You are a research assistant tasked with creating a comprehensive summary of a specific gene's function and its role in the coculture of Prochlorococcus and Alteromonas under nitrogen limitation conditions. Your goal is to provide a detailed, fact-based summary using published scientific literature.

The gene you will be researching is:
<gene_name>
gap
</gene_name>

<organism>
Prochlorococcus MED4
</organism>

<locus_tag>
TX50_RS00130
</locus_tag>

<product>
type I glyceraldehyde-3-phosphate dehydrogenase
</product>

<protein_id>
WP_011131661.1 
</protein_id>

<old_locus_tag>
PMM0023
</old_locus_tag>


In your research summary, cover the following areas:
1. Gene function
2. Impact on physiological state
3. Involvement in stress response
4. Role in nutrient exchange (uptake and release)
5. Contribution to oxidative stress
6. Influence on bacterial interaction in coculture

Follow these steps to conduct your research and present your findings:

1. Search for and review published 

In [24]:

sample_result = perplexity_llm.invoke(query, extra_body=extra_body_perplexity)
sample_result.model_dump()


{'content': '<research_summary>\n<gene_function>\nThe gene gap (TX50_RS00130, old locus tag PMM0023) in Prochlorococcus MED4 encodes a type I glyceraldehyde-3-phosphate dehydrogenase (GAPDH). This enzyme is a key component of the glycolytic pathway and the Calvin-Benson-Bassham (CBB) cycle, catalyzing the conversion of glyceraldehyde-3-phosphate to 1,3-bisphosphoglycerate, a critical step in both central carbon metabolism and photosynthetic carbon fixation. GAPDH is essential for energy production and carbon assimilation in Prochlorococcus, supporting its role as a dominant primary producer in oligotrophic oceans[3]. While direct studies on the gap gene in Prochlorococcus MED4 are limited, its function is well-conserved across cyanobacteria and other photosynthetic organisms.\n</gene_function>\n\n<physiological_impact>\nGAPDH activity is central to the physiological state of Prochlorococcus, as it directly links photosynthetic carbon fixation to energy metabolism. Under nitrogen limita

In [25]:
print(sample_result.content)


<research_summary>
<gene_function>
The gene gap (TX50_RS00130, old locus tag PMM0023) in Prochlorococcus MED4 encodes a type I glyceraldehyde-3-phosphate dehydrogenase (GAPDH). This enzyme is a key component of the glycolytic pathway and the Calvin-Benson-Bassham (CBB) cycle, catalyzing the conversion of glyceraldehyde-3-phosphate to 1,3-bisphosphoglycerate, a critical step in both central carbon metabolism and photosynthetic carbon fixation. GAPDH is essential for energy production and carbon assimilation in Prochlorococcus, supporting its role as a dominant primary producer in oligotrophic oceans[3]. While direct studies on the gap gene in Prochlorococcus MED4 are limited, its function is well-conserved across cyanobacteria and other photosynthetic organisms.
</gene_function>

<physiological_impact>
GAPDH activity is central to the physiological state of Prochlorococcus, as it directly links photosynthetic carbon fixation to energy metabolism. Under nitrogen limitation, Prochlorococc

In [26]:
print(sample_result.additional_kwargs['citations'])


['https://dx.plos.org/10.1371/journal.pone.0133207', 'https://sfamjournals.onlinelibrary.wiley.com/doi/10.1046/j.1462-2920.2003.00456.x', 'https://www.frontiersin.org/articles/10.3389/fgene.2021.586293/full', 'https://www.nature.com/articles/s42003-024-07359-z', 'http://biorxiv.org/lookup/doi/10.1101/2024.05.14.594148', 'https://pnas.org/doi/full/10.1073/pnas.0601301103', 'https://aslopubs.onlinelibrary.wiley.com/doi/10.1002/lno.12683', 'https://dx.plos.org/10.1371/journal.pone.0109327', 'https://pnas.org/doi/full/10.1073/pnas.1733211100', 'https://journals.asm.org/doi/10.1128/JB.01948-06']


In [36]:
[(url, extract_citation.format_apa_citation(extract_citation.extract_citation_info(url)))
   for url in sample_result.additional_kwargs['citations']]


Error fetching the URL: 403 Client Error: Forbidden for url: https://sfamjournals.onlinelibrary.wiley.com/doi/10.1046/j.1462-2920.2003.00456.x
Error fetching the URL: 403 Client Error: Forbidden for url: https://pnas.org/doi/full/10.1073/pnas.0601301103
Error fetching the URL: 403 Client Error: Forbidden for url: https://aslopubs.onlinelibrary.wiley.com/doi/10.1002/lno.12683
Error fetching the URL: 403 Client Error: Forbidden for url: https://pnas.org/doi/full/10.1073/pnas.1733211100
Error fetching the URL: 403 Client Error: Forbidden for url: https://journals.asm.org/doi/10.1128/JB.01948-06


[('https://dx.plos.org/10.1371/journal.pone.0133207',
  'María-Carmen Fernández-Pinos,, Marta Casado,, Gemma Caballero,, Erik R. Zinser,, Jordi Dachs,, & Benjamin Piña (Year not found). PLOS One. PLOS ONE, 10, e0133207. https://doi.org/10.1371/journal.pone.0133207'),
 ('https://sfamjournals.onlinelibrary.wiley.com/doi/10.1046/j.1462-2920.2003.00456.x',
  'Could not generate citation'),
 ('https://www.frontiersin.org/articles/10.3389/fgene.2021.586293/full',
  'Author not found (2021/02/09). Dynamic Allocation of Carbon Storage and Nutrient-Dependent Exudation in a Revised Genome-Scale Model ofProchlorococcus. Frontiers in Genetics, 12, 586293. https://doi.org/10.3389/fgene.2021.586293'),
 ('https://www.nature.com/articles/s42003-024-07359-z',
  'Author not found (Year not found). Improved enzyme functional annotation prediction using contrastive learning with structural inference. Communications Biology, 7, 1. https://doi.org/10.1038/s42003-024-07359-z'),
 ('http://biorxiv.org/lookup/d

# Run OpenAI to do the reformatting

In [27]:
query2 = get_second_prompt_for_gene(gene_row['Name_gene'], sample_result)


In [28]:
query2

ChatPromptValue(messages=[HumanMessage(content='\nI have the following information about the gene gap in Prochlorococcus MED4:\n\n\n# REVIEW text to be reformatted:\n<research_summary>\n<gene_function>\nThe gene gap (TX50_RS00130, old locus tag PMM0023) in Prochlorococcus MED4 encodes a type I glyceraldehyde-3-phosphate dehydrogenase (GAPDH). This enzyme is a key component of the glycolytic pathway and the Calvin-Benson-Bassham (CBB) cycle, catalyzing the conversion of glyceraldehyde-3-phosphate to 1,3-bisphosphoglycerate, a critical step in both central carbon metabolism and photosynthetic carbon fixation. GAPDH is essential for energy production and carbon assimilation in Prochlorococcus, supporting its role as a dominant primary producer in oligotrophic oceans[3]. While direct studies on the gap gene in Prochlorococcus MED4 are limited, its function is well-conserved across cyanobacteria and other photosynthetic organisms.\n</gene_function>\n\n<physiological_impact>\nGAPDH activity 

In [29]:
sample_result_reformated = structured_openai_llm.invoke(query2)
sample_result_reformated.model_dump()

{'gene_name': 'gap',
 'gene_function': 'The gene gap encodes a type I glyceraldehyde-3-phosphate dehydrogenase (GAPDH), which is essential for energy production and carbon assimilation in Prochlorococcus, supporting its role as a dominant primary producer in oligotrophic oceans.',
 'research_findings': [{'finding_category': 'physiological',
   'finding_sub_category': 'nitrogen limitation',
   'finding_description': 'GAPDH activity is crucial for maintaining metabolic flux, allowing Prochlorococcus to store or release fixed carbon depending on nutrient availability.',
   'finding_evidence': 'metabolic modeling and physiological studies',
   'finding_type': 'physiological',
   'url': 'https://www.frontiersin.org/articles/10.3389/fgene.2021.586293/full',
   'title': 'Dynamic Allocation of Carbon Storage and Nutrient-Dependent Exudation in a Revised Genome-Scale Model of Prochlorococcus',
   'citation': 'Dynamic Allocation of Carbon Storage and Nutrient-Dependent Exudation in a Revised Gen

In [30]:
sample_result_reformated2 = structured_openai_llm_with_tool.invoke(query2)
sample_result_reformated2.model_dump()

{'content': [{'type': 'text',
   'text': '{"gene_name":"gap","gene_function":"Encodes glyceraldehyde-3-phosphate dehydrogenase (GAPDH), a key enzyme in glycolysis and the Calvin-Benson-Bassham cycle, facilitating the conversion of glyceraldehyde-3-phosphate to 1,3-bisphosphoglycerate.","research_findings":[{"finding_category":"physiological","finding_sub_category":"carbon metabolism","finding_description":"GAPDH activity is central to the physiological state of Prochlorococcus, linking photosynthetic carbon fixation to energy metabolism.","finding_evidence":"Metabolic modeling studies in Prochlorococcus.","finding_type":"gene function","url":"https://www.frontiersin.org/articles/10.3389/fgene.2021.586293/full","title":"Dynamic Allocation of Carbon Storage and Nutrient-Dependent Exudation in a Revised Genome-Scale Model of Prochlorococcus","citation":"Frontiers in Genetics, 2021","organism":"Prochlorococcus","phylogenetic_distance":"Direct","additional_notes":"The study provides insight

In [31]:
# The tool-bound model returns a message whose content is a list of typed blocks
# (see the output of the previous cell), so the JSON payload is validated into
# the schema by hand.
# Select the first block of type 'text' rather than assuming it sits at index 0;
# NOTE(review): other block types may precede it depending on the library
# version / output format — confirm against the langchain-openai docs.
_text_block = next(
    block
    for block in sample_result_reformated2.content
    if isinstance(block, dict) and block.get('type') == 'text'
)
formatted_response2 = gene_db_classes.GeneResearchSummarySimple.model_validate_json(_text_block['text'])
formatted_response2.model_dump()


{'gene_name': 'gap',
 'gene_function': 'Encodes glyceraldehyde-3-phosphate dehydrogenase (GAPDH), a key enzyme in glycolysis and the Calvin-Benson-Bassham cycle, facilitating the conversion of glyceraldehyde-3-phosphate to 1,3-bisphosphoglycerate.',
 'research_findings': [{'finding_category': 'physiological',
   'finding_sub_category': 'carbon metabolism',
   'finding_description': 'GAPDH activity is central to the physiological state of Prochlorococcus, linking photosynthetic carbon fixation to energy metabolism.',
   'finding_evidence': 'Metabolic modeling studies in Prochlorococcus.',
   'finding_type': 'gene function',
   'url': 'https://www.frontiersin.org/articles/10.3389/fgene.2021.586293/full',
   'title': 'Dynamic Allocation of Carbon Storage and Nutrient-Dependent Exudation in a Revised Genome-Scale Model of Prochlorococcus',
   'citation': 'Frontiers in Genetics, 2021',
   'organism': 'Prochlorococcus',
   'phylogenetic_distance': 'Direct',
   'additional_notes': 'The study 