# Single prompt from perplexity

This notebook implements creating a simple DB using a single prompt from perplexity, using the builtin web search in the model. Not using any of the complex reasoning model as this is the first attempt - but using the sonar-pro model and the relatively high context to improve the result.

Because perplexity does not support structured outputs, the result undergo a second transformation by chatgpt-4o-mini which change the format to the DB format.

In [14]:
import os
import json
from IPython.display import Markdown, display, update_display
import pandas as pd
import numpy as np


In [15]:
from langchain_core.prompts import ChatPromptTemplate


In [16]:
# get api keys
import os

# load environment variables from .env file (requires `python-dotenv`)
from dotenv import load_dotenv

load_dotenv(override=True)



True

In [None]:
# disable langsmith .. no subscription
os.environ["LANGCHAIN_PROJECT"] = "cc1a3_plx_single"
os.environ["LANGSMITH_TRACING"] = 'FALSE'


In [21]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [100]:
import gene_db_classes 
import gene_db_prompts 

# Initialize the models and create templates

In [101]:
# Initialize the chat model with OpenAI's GPT-4o-mini
# pip install -qU "langchain[openai]"
from langchain.chat_models import init_chat_model

openai_llm = init_chat_model("gpt-4o-mini", model_provider="openai", temperature=0.0)
structured_openai_llm = openai_llm.with_structured_output(gene_db_classes.GeneResearchSummarySimple)

In [111]:

import json
import pathvalidate
import hashlib


second_prompt_template = ChatPromptTemplate([
    ("system", gene_db_prompts.system_second_prompt_for_single_prompt_perplexity_prochlorococcus),
    ("user", gene_db_prompts.second_prompt_for_single_prompt_perplexity_prochlorococcus)
])


# second_prompt_template = ChatPromptTemplate(
#     gene_db_prompts.second_prompt_for_single_prompt_perplexity_prochlorococcus
# )

def _get_citations_urls(gene_first_report):
    citations_urls_list =gene_first_report['citations']
    formatted_list = [f'[{i}] {url}' for i, url in enumerate(citations_urls_list, start=1) ]
    return '\n'.join(formatted_list)

def url2fname(url, max_length=100):
    fprefix = pathvalidate.sanitize_filename(url)
    if len(fprefix) > max_length:
        hash_suffix = hashlib.md5(url.encode()).hexdigest()[:8]
        fprefix_shorter = fprefix[:max_length -  10]  # Leave room for hash
        fprefix = f'{fprefix_shorter}_{hash_suffix}'
    fname =  f'{fprefix}_bibtex.json'
    return fname

def _get_bibtex_of_url(idx: int, url: str, citation_bibtex_dpath : str):
    #return ''
    fpath = os.path.join(citation_bibtex_dpath, url2fname(url))

    if os.path.isfile(fpath):  # Specifically a file
        with open(fpath,) as fh:
            bibtex_dict = json.load(fh)
        if (bibtex_dict['status'] == 'success') and (not bibtex_dict['content'].startswith('Problem')):

            # valid bibtex
            bibtex_entry = f'\n\n## [{idx}] {url}\n\n {bibtex_dict["content"]}\n'
            return bibtex_entry
    # else
    return ''

def _get_json_schema():
    schema = gene_db_classes.GeneResearchSummarySimple.model_json_schema()
    return(json.dumps(schema, indent=2))

def _get_review_content(prompt_params: dict):
    # remove the 'think' section
    tag = '</think>'
    content = prompt_params['content']
    if tag in content:

        return  content.partition(tag)[2]
    #else
    return content


def _get_citations_bibtex(gene_first_report : dict, citation_bibtex_dpath : str):
    citations_urls_list =gene_first_report['citations']
    return ''.join([_get_bibtex_of_url(idx, url, citation_bibtex_dpath)
        for idx, url in enumerate(citations_urls_list, start=1)])
 

def get_second_prompt_for_gene(gene_first_report : dict, citation_bibtex_dpath : str):
    prompt_params = gene_first_report.copy()
    prompt_params['review_text'] = _get_review_content(prompt_params)
    prompt_params['citations_url'] = _get_citations_urls(prompt_params)
    prompt_params['cytations_bibtex'] = _get_citations_bibtex(prompt_params, citation_bibtex_dpath)
    prompt_params['json_schema'] = _get_json_schema()


    return second_prompt_template.invoke(prompt_params)


In [None]:
def _get_custom_batch_id(gene_first_report):
    #print(x)
    batchid = f'batch-id-{gene_first_report["locus_tag"]}'
    batchid = batchid.replace(', ', '-')
    batchid = batchid.replace(' ', '-')
    return batchid

def get_openai_batch_query_dict_for_gene(gene_first_report : dict, citation_bibtex_dpath : str):
    query2 = get_second_prompt_for_gene(gene_first_report, citation_bibtex_dpath)
    system_message = query2.messages[0].content
    user_message = query2.messages[1].content

    chat_gpt_batch_request_dict = {
        'custom_id': _get_custom_batch_id(gene_first_report),
        'method': 'POST',
        'url': '/v1/chat/completions',
        'body': {
            'model': 'gpt-4o-mini',
            'temperature' : 0.0,
            'messages': [
                {'role': 'system', 'content': system_message},
                {'role': 'user', 'content': user_message}
            ],
            'max_tokens': 5000
        }
    }

    return chat_gpt_batch_request_dict   




In [118]:
import jsonlines
def create_openai_batch_file_for_second_prompt(first_review_dpath : str, citation_bibtex_dpath : str, batch_fpath : str):
    with jsonlines.open(batch_fpath, mode='w') as writer:
        for fname in os.listdir(first_review_dpath):
            if fname.endswith('_first_query.json'):
                first_review_fpath = os.path.join(first_review_dpath, fname)
                print(fname)
                with open(first_review_fpath,'r') as fh:
                    gene_first_report = json.load(fh)
                    print(gene_first_report)
                    batch_query_dict = get_openai_batch_query_dict_for_gene(gene_first_report , citation_bibtex_dpath)
                    print(batch_query_dict)
                    writer.write(batch_query_dict)
    writer.close()


# create batch file, upload it and run the batch process
https://platform.openai.com/docs/guides/batch 

In [119]:
# create the file
first_review_dpath = 'batch_results/first_pass/'
citation_bibtex_dpath = 'batch_results/citations'
batch_fpath = 'batch_results/second_pass_batchfile.jsonl'
create_openai_batch_file_for_second_prompt(first_review_dpath, citation_bibtex_dpath, batch_fpath)


TX50_RS06200_first_query.json
{'locus_tag': 'TX50_RS06200', 'product': 'translation initiation factor IF-1', 'protein_id': 'WP_011132784.1', 'pseudo': nan, 'gene_name_or_id': 'infA', 'gene_gene': 'infA', 'gene_biotype': 'protein_coding', 'old_locus_tag': 'PMM1151', 'content': '<think>\nWe are given a gene named `infA` (translation initiation factor IF-1) from Prochlorococcus MED4 (locus tag TX50_RS06200, old locus tag PMM1151). We are to research this gene\'s function and roles in various aspects, focusing on Prochlorococcus, cyanobacteria, or similar marine microorganisms.\n\nSteps:\n1. We have search results from various sources. We must extract relevant information about `infA` (IF-1) from these sources.\n2. The search results are about cyanobacteria and related topics, but we must check if they mention `infA` or IF-1 specifically.\n\nReview of search results:\n\n[1] PMC3791720: This study is about translation initiation factor 3 (IF-3) in cyanobacteria (F. diplosiphon). It mentions

In [None]:
gene_locustag = 'TX50_RS02905'
gene_locustag = 'TX50_RS03495'

first_report_dpath = 'batch_results/first_pass/'
first_report_fpath = os.path.join(first_report_dpath, f'{gene_locustag}_first_query.json')
with open(first_report_fpath,) as fh:
    gene_first_report = json.load(fh)


In [105]:
first_report_fpath

'batch_results/first_pass/TX50_RS03495_first_query.json'

In [106]:
gene_first_report

{'locus_tag': 'TX50_RS03495',
 'product': '23S rRNA pseudouridine(2604) synthase RluF',
 'protein_id': 'WP_011132292.1',
 'pseudo': nan,
 'gene_name_or_id': 'rluF',
 'gene_gene': 'rluF',
 'gene_biotype': 'protein_coding',
 'old_locus_tag': 'PMM0658',
 'content': '<think>\nWe are given a gene named "rluF" with locus tag TX50_RS03495, product "23S rRNA pseudouridine(2604) synthase RluF", protein ID WP_011132292.1, and old locus tag PMM0658. The task is to research this gene in Prochlorococcus, cyanobacteria, or similar marine microorganisms, and if not found, extend to related cyanobacteria, gram-negative bacteria, or marine autotrophs.\n\nWe have four search results. Let\'s analyze them:\n\n[1] A PDF from MPG.PuRe about Marine Bacteroidetes. It discusses their role in organic matter recycling and carbon flux. It mentions techniques like NanoSIMS and HISH-SIMS. It also talks about Cytophagia and Bacteroidia classes. There is no mention of the gene rluF or pseudouridine synthases.\n\n[2] 

In [96]:
citation_bibtex_dpath = 'batch_results/citations'
query2 = get_second_prompt_for_gene(gene_first_report, citation_bibtex_dpath)


In [107]:
print(query2.messages[0].content)


You are a scientific literature analyst specialized in reformatting gene research summaries. Your task is to convert research reviews into structured JSON format following the `GeneResearchSummarySimple` schema.

## Required JSON Schema:
{
  "$defs": {
    "ResearchFindingSimple": {
      "description": "Description of the research process and findings related to a specific finding on a specific gene",
      "properties": {
        "finding_category": {
          "description": "Category of the finding, e.g. 'physiological', 'gene function', 'stress response', 'nutrient uptake', 'nutrient exudation', 'coculture role'",
          "title": "Finding Category",
          "type": "string"
        },
        "finding_sub_category": {
          "description": "sub category of the finding, e.g. 'high light stress' for stress response, nutrient name/type for nutrient exchange",
          "title": "Finding Sub Category",
          "type": "string"
        },
        "finding_description": {
   

In [109]:
query2.messages[1].content


'\nConvert the following research review about gene `rluF` in Prochlorococcus MED4 into the required JSON format.\n\n## Gene Information:\n- **Gene**: rluF\n- **Locus tag**: TX50_RS03495\n- **Old locus tag**: PMM0658\n- **Protein ID**: WP_011132292.1\n- **Product**: 23S rRNA pseudouridine(2604) synthase RluF\n\n## Research Review to Convert:\n\n<research_summary>\n<gene_function>\nThe gene *rluF* encodes a 23S rRNA pseudouridine synthase responsible for isomerizing uridine to pseudouridine at position 2604 in the 23S ribosomal RNA. This post-transcriptional modification enhances ribosomal stability and translational fidelity by optimizing RNA-protein interactions and RNA structure. In *Prochlorococcus* and other cyanobacteria, ribosomal RNA modifications like pseudouridylation are conserved mechanisms for maintaining proteome integrity under fluctuating environmental conditions. No direct studies on *rluF* in *Prochlorococcus* were identified, but its role is inferred from homologs in 

In [None]:


chat_gpt_batch_request_dict = {
    'custom_id': 'request-1',
    'method': 'POST',
    'url': '/v1/chat/completions',
    'body': {
        'model': 'gpt-4o-mini',
        'messages': [
            {'role': 'system', 'content': 'You are a helpful assistant.'},
            {'role': 'user', 'content': 'Hello world!'}
        ],
  'max_tokens': 5000
  }
}




{'custom_id': 'request-1',
 'method': 'POST',
 'url': '/v1/chat/completions',
 'body': {'model': 'gpt-3.5-turbo-0125',
  'messages': [{'role': 'system', 'content': 'You are a helpful assistant.'},
   {'role': 'user', 'content': 'Hello world!'}],
  'max_tokens': 1000}}

In [110]:
sample_result_reformated = structured_openai_llm.invoke(query2)
sample_result_reformated.model_dump()

{'gene_name_or_id': 'rluF',
 'locus_tag': 'TX50_RS03495',
 'old_locus_tag': 'PMM0658',
 'protein_id': 'WP_011132292.1',
 'product': '23S rRNA pseudouridine(2604) synthase RluF',
 'gene_function': 'Encodes a 23S rRNA pseudouridine synthase responsible for isomerizing uridine to pseudouridine at position 2604 in the 23S ribosomal RNA, enhancing ribosomal stability and translational fidelity.',
 'research_findings': [{'finding_category': 'physiological',
   'finding_sub_category': 'ribosomal stability',
   'finding_description': 'Pseudouridine modifications catalyzed by RluF influence cellular physiology by stabilizing ribosomal structure, which supports efficient protein synthesis.',
   'finding_evidence': 'Inferred from studies in *Escherichia coli* regarding ribosome assembly and function.',
   'finding_type': 'gene function',
   'url': ['https://pure.mpg.de/rest/items/item_2484987_2/component/file_3253133/content?download=true',
    'https://ouci.dntb.gov.ua/en/works/9ZNJbOr9/'],
   '

In [32]:
formatted_response2 = gene_db_classes.GeneResearchSummarySimple.model_validate_json(sample_result_reformated2.content[0]['text'])
formatted_response2.model_dump()


{'gene_name': 'gap',
 'gene_function': 'Encodes glyceraldehyde-3-phosphate dehydrogenase (GAPDH), a key enzyme in glycolysis and the Calvin-Benson-Bassham (CBB) cycle.',
 'research_findings': [{'finding_category': 'physiological',
   'finding_sub_category': 'carbon metabolism',
   'finding_description': 'GAPDH is essential for central carbon metabolism, impacting ATP generation and biosynthesis. Under nitrogen limitation, Prochlorococcus reallocates carbon fluxes, increasing glycogen storage and organic acid exudation. Disruption of gap impairs energy production and carbon assimilation, leading to reduced growth and metabolic imbalance under nutrient stress.',
   'finding_evidence': 'Genome-scale metabolic modeling and experimental data on Prochlorococcus MED4.',
   'finding_type': 'gene function',
   'url': 'https://www.frontiersin.org/articles/10.3389/fgene.2021.586293/full',
   'title': 'Dynamic Allocation of Carbon Storage and Nutrient-Dependent Exudation in a Revised Genome-Scale 

In [64]:
query2.model_dump()['messages']

[{'content': '\nI have the following information about the gene rluF in Prochlorococcus MED4:\n\n# Information about the gene:\n    gene_name_or_id: rluF\n    locus_tag: str = TX50_RS03495\n    old_locus_tag: PMM0658\n    protein_id: str = WP_011132292.1\n    product: str = 23S rRNA pseudouridine(2604) synthase RluF\n\n\n# REVIEW text to be reformatted:\n\n<research_summary>\n<gene_function>\nThe gene *rluF* encodes a 23S rRNA pseudouridine synthase responsible for isomerizing uridine to pseudouridine at position 2604 in the 23S ribosomal RNA. This post-transcriptional modification enhances ribosomal stability and translational fidelity by optimizing RNA-protein interactions and RNA structure. In *Prochlorococcus* and other cyanobacteria, ribosomal RNA modifications like pseudouridylation are conserved mechanisms for maintaining proteome integrity under fluctuating environmental conditions. No direct studies on *rluF* in *Prochlorococcus* were identified, but its role is inferred from 