This notebook displays the Gene DB for Prochlorococcus

# Includes and setup

In [1]:
import os
import json
from dotenv import load_dotenv
from openai import OpenAI
import anthropic
import google.generativeai
import gradio as gr
from IPython.display import Markdown, display, update_display

In [2]:
import pandas as pd
import numpy as np
import re

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
from prompts import * 


In [5]:
# Read the API credentials from a local .env file, then report which providers
# are configured. Only a short prefix of each key is echoed, which is enough to
# spot a wrong key while debugging.

def _report_api_key(provider, key, prefix_len):
    """Print whether `key` is set for `provider`, showing its first `prefix_len` characters."""
    if not key:
        print(f"{provider} API Key not set")
        return
    print(f"{provider} API Key exists and begins {key[:prefix_len]}")


load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')
anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')
google_api_key = os.getenv('GOOGLE_API_KEY')

_report_api_key("OpenAI", openai_api_key, 8)
_report_api_key("Anthropic", anthropic_api_key, 7)
_report_api_key("Google", google_api_key, 8)

OpenAI API Key not set
Anthropic API Key not set
Google API Key not set


In [6]:
# Anthropic API client. No api_key is passed explicitly; per the inline note
# below, the client then falls back to the ANTHROPIC_API_KEY environment variable.
anthropic_client = anthropic.Anthropic(
    # defaults to os.environ.get("ANTHROPIC_API_KEY")
    #api_key="my_api_key",
)
# Model name and request settings.
# NOTE(review): none of these are referenced again in the cells visible here —
# confirm they are still needed.
MODEL = "claude-3-5-sonnet-20241022"
max_tokens=4096
temperature=0

# Load list of MED4 genes

In [7]:
med4_genome_dpath = '../genomes/MED4'
#os.listdir(med4_genome_dpath)

In [8]:
gff_named_df = pd.read_csv(os.path.join(med4_genome_dpath, 'MED4_named_genes_NCBI.csv'))
gff_named_df

Unnamed: 0,gene_name,gene,locus_tag,product,protein_id
0,"dnaN , TX50_RS00020",dnaN,TX50_RS00020,DNA polymerase III subunit beta,WP_011131639.1
1,"purL , TX50_RS00030",purL,TX50_RS00030,phosphoribosylformylglycinamidine synthase sub...,WP_011131641.1
2,"purF , TX50_RS00035",purF,TX50_RS00035,amidophosphoribosyltransferase,WP_011131642.1
3,"queG , TX50_RS00050",queG,TX50_RS00050,tRNA epoxyqueuosine(34) reductase QueG,WP_011131645.1
4,"nusB , TX50_RS00060",nusB,TX50_RS00060,transcription antitermination factor NusB,WP_036930720.1
...,...,...,...,...,...
498,"dnaK , TX50_RS09095",dnaK,TX50_RS09095,molecular chaperone DnaK,WP_011133331.1
499,"rpsF , TX50_RS09105",rpsF,TX50_RS09105,30S ribosomal protein S6,WP_011133333.1
500,"mraY , TX50_RS09125",mraY,TX50_RS09125,phospho-N-acetylmuramoyl-pentapeptide-transferase,WP_036930713.1
501,"uvrA , TX50_RS09140",uvrA,TX50_RS09140,excinuclease ABC subunit UvrA,WP_011133339.1


# Load anthropic batch results

In [9]:
batch_result_fpath = os.path.join('batch results', 'anthropic_batch_single_prompt_26012025_results.jsonl')

In [10]:
import jsonlines

In [11]:
# Read the whole batch-results file into memory, one parsed object per JSONL line.
with jsonlines.open(batch_result_fpath) as reader:
    batch_results_jsonl = list(reader)

In [53]:
def _get_gene_name_from_custom_id(id):
    '''
         extract and return the gene_name from the custom id
         example : 'batch-id-dnaN--TX50_RS00020' --> dnaN, TX50_RS00020
    '''
    #return id.replace('batch-id-', '').replace('--', ' , ')
    id1 = id.replace('batch-id-', '')
    return (id1.split('-')[0])
    #return id.replace('batch-id-', '').replace('--', ' , ')
    

In [54]:
message_status = [m['result']['type'] for m in batch_results_jsonl]
#message_status


In [55]:
# check the status of the batch jobs
from collections import Counter
Counter(message_status)

Counter({'succeeded': 503})

In [57]:
gff_named_df.loc[gff_named_df.gene.isin([_get_gene_name_from_custom_id(batch_results_jsonl[0]['custom_id'])])]

Unnamed: 0,gene_name,gene,locus_tag,product,protein_id
0,"dnaN , TX50_RS00020",dnaN,TX50_RS00020,DNA polymerase III subunit beta,WP_011131639.1


In [58]:
# Map gene name -> text of the first content block of each batch reply.
# NOTE(review): this is a dict comprehension, so records whose custom ids yield
# the same gene name overwrite one another (last one wins) — confirm gene names
# are unique across the batch.
# NOTE(review): every record is assumed to carry result['message']; presumably
# that only holds for results of type 'succeeded' — verify before reusing this
# on a batch that contains failures.
batch_result_dict = {
    _get_gene_name_from_custom_id(m['custom_id']) :  m['result']['message']['content'][0]['text']
    for m in batch_results_jsonl            
}

In [59]:
batch_results_jsonl[0].keys()

dict_keys(['custom_id', 'result'])

In [18]:
batch_results_jsonl[0]['custom_id']

'batch-id-dnaN--TX50_RS00020'

In [19]:
# print(batch_results_jsonl[0]['result']['message']['content'][0]['text'])

# Parse the replies into multiple elements

## example

In [78]:
gene = 'dnaN'

In [79]:
text = '<literature_review>\n' + batch_result_dict[gene]

In [80]:
def parse_claude_message(message):
    """Collect the `<tag>...</tag>` blocks of `message` into a dict.

    Returns a dict mapping each tag name to its whitespace-stripped content.
    Matching is non-greedy and spans newlines, so tags nested inside a matched
    block stay in that block's content as raw text. When a tag name occurs more
    than once, the last occurrence is the one kept.
    """
    tag_pairs = re.findall(r'<(\w+)>(.*?)</\1>', message, re.DOTALL)
    return {name: body.strip() for name, body in tag_pairs}

In [81]:
def get_left_text(text, parsed_text):
    """Return `text` with every value of the `parsed_text` dict removed.

    Each value is deleted wherever it occurs as a substring, in the dict's
    iteration order. The keys are not used.
    """
    remaining = text
    for fragment in parsed_text.values():
        remaining = remaining.replace(fragment, '')
    return remaining
    

In [82]:
def get_inner_tags(text):
    """Return `text` with the content of every parsed tag removed.

    Despite the name, this does not return tag names. It parses `text` with
    `parse_claude_message` and strips each parsed content string out of the
    text, leaving the tag markup and anything outside the tags.

    The stripping is delegated to `get_left_text` so the two functions cannot
    drift apart.
    """
    return get_left_text(text, parse_claude_message(text))
    

In [192]:

def parse_database_batch_message(gene):
    """Parse the stored batch reply for `gene` into its two top-level parts.

    The reply text is read from `batch_result_dict[gene]` and prefixed with
    '<literature_review>\\n' before parsing (presumably the opening tag is not
    part of the stored reply — TODO confirm). `<conservation>` tags are
    flattened into plain 'Conservation: ' text first.

    Returns
    -------
    (literature_review, database_entry)
        `literature_review` is the text of that tag, or '' when the tag could
        not be parsed (a notice is printed in that case).
        `database_entry` is a dict keyed by section name. The
        'stress_responses', 'uptake_exudation' and 'references' sections are
        parsed one level further into dicts; the other sections stay strings.

    Raises
    ------
    KeyError
        If `gene` is not a key of `batch_result_dict`.
    AssertionError
        If the reply has no 'database_entry' tag, or if the entry's section
        names differ from the expected set.
    """
    expected_sections = {
        'primary_function', 'physiological_contribution', 'stress_responses', 'uptake_exudation',
        'phylogenetic_persistence', 'coculture_role', 'references',
    }
    nested_sections = ('stress_responses', 'uptake_exudation', 'references')

    # Flatten the <conservation> tags; the order of these replacements matters.
    flattened = (
        ('<literature_review>\n' + batch_result_dict[gene])
        .replace('<conservation>', 'Conservation: ')
        .replace('      Conservation:', '  Conservation:')
        .replace('</conservation>', '')
    )
    top_level = parse_claude_message(flattened)

    assert 'database_entry' in top_level, f'Missing database_entry. Wrong top keys: {top_level.keys()}, gene: {gene}'

    if 'literature_review' not in top_level:
        print(f'Missing literature_review ({gene})')
        literature_review = ''
    else:
        literature_review = top_level['literature_review']

    sections = parse_claude_message(top_level['database_entry'])
    assert set(sections) == expected_sections, f'Wrong database_sections: {sections.keys()}, gene: {gene}'

    # Parse the sections that hold several tagged sub-entries one level deeper.
    parsed_database_entry = {
        name: parse_claude_message(body) if name in nested_sections else body
        for name, body in sections.items()
    }
    return literature_review, parsed_database_entry
    
 

In [177]:
literature_review, database_entry = parse_database_batch_message(gene)

In [178]:
def database_section_dict2text(section_dict):
    """Render a dict of sub-entries as a Markdown bullet list.

    Each item becomes a '* name: text' bullet; bullets are separated by a
    blank line. An empty dict gives an empty string.
    """
    bullets = []
    for name, text in section_dict.items():
        bullets.append(f'* {name}: {text}')
    return '\n\n'.join(bullets)

def database_section2text(section_name, section_content):
    """Render one database section as a Markdown '###' heading plus body.

    `section_content` may be a plain string or a dict of sub-entries; a dict is
    first turned into a bullet list with `database_section_dict2text`. The
    heading is `section_name` with its first letter capitalised.
    """
    body = section_content
    if isinstance(body, dict):
        body = database_section_dict2text(body)
    heading = section_name.capitalize()
    return f'\n### {heading}\n\n{body}\n'

In [179]:
def database_entry2text(database_entry):
    """Render a parsed database entry (section name -> content) as Markdown.

    The output starts with a '# Database Entry' title, followed by each section
    rendered by `database_section2text`, in the dict's order.
    """
    rendered_sections = []
    for section_name, section_content in database_entry.items():
        rendered_sections.append(database_section2text(section_name, section_content))
    return '# Database Entry\n' + '\n'.join(rendered_sections)


In [180]:
def literature_review2text(literature_review):
    """Render the literature-review text as Markdown.

    Lines that start with a digit (the numbered section titles) become '###'
    headings followed by a blank line; all other lines are passed through
    unchanged. A '# Literature Review' title is put in front.
    """
    rendered = []
    for line in literature_review.split('\n'):
        # line[:1] is '' for an empty line, and ''.isdigit() is False.
        if line[:1].isdigit():
            rendered.append(f'### {line}\n')
        else:
            rendered.append(line)
    return '# Literature Review\n' + '\n'.join(rendered)


In [118]:
display(Markdown(literature_review2text(literature_review)))


# Literature Review
### 1. Key Search Terms and Databases:

- Primary databases: PubMed, Web of Science, NCBI Gene, UniProt
- Search terms: "dnaN Prochlorococcus", "DNA polymerase III beta subunit cyanobacteria", "sliding clamp Prochlorococcus", "WP_011131639.1", "DNA replication Prochlorococcus"

### 2. Literature Overview:

- ~15 directly relevant papers
- Date range: 1998-2023
- Main focus: DNA replication mechanisms in cyanobacteria, sliding clamp function in prokaryotes

### 3. Key Sources:


a) [Robinson et al., 2013] "Structure and mechanism of the β-sliding clamp from E. coli"
- Detailed structural analysis of the β-sliding clamp
- Established conserved mechanism across prokaryotes

b) [Johnson & O'Donnell, 2005] "Cellular DNA Replicases"
- Comprehensive review of DNA polymerase III function
- Describes essential role of β-subunit in processivity

c) [Partensky et al., 1999] "Prochlorococcus: Advantages and Limits of Minimalism"
- Overview of Prochlorococcus genome streamlining
- Discusses conservation of essential DNA replication machinery

### 4. Challenges:

- Limited direct studies on dnaN in Prochlorococcus
- Most information inferred from other bacterial systems
- Few studies on regulation under different conditions

### 5. Research Gaps:

- Limited information on strain-specific variations
- Lack of data on expression changes in coculture
- Unknown stress response patterns

### 6. Initial Observations:

- Highly conserved essential gene
- Critical for DNA replication fidelity
- Part of core genome maintained despite genome streamlining

### 7. Key Themes:

- Essential for cell division
- Highly conserved structure and function
- Part of minimal genome requirements

### 8. Potential Implications:

- May influence growth rate regulation
- Could be involved in stress response
- Potential target for studying cell cycle control

In [110]:
database_entry

{'primary_function': "The dnaN gene encodes the β-sliding clamp subunit of DNA polymerase III, essential for processive DNA replication in Prochlorococcus. It forms a ring-shaped homodimer that encircles DNA and tethers the polymerase to the template, enabling efficient and accurate DNA synthesis [Johnson & O'Donnell, 2005].\n    Conservation: Highly conserved across all prokaryotes, with maintained structure and function across all known Prochlorococcus strains.",
 'physiological_contribution': 'Functions as a central coordinator of DNA replication and repair processes, essential for cell division and genome maintenance. The β-clamp also serves as a platform for other proteins involved in DNA metabolism [Robinson et al., 2013].\n    Conservation: Core function is universally conserved across prokaryotes, including all Prochlorococcus ecotypes.',
 'stress_responses': {'response1': 'Limited direct data available for Prochlorococcus-specific stress responses.\n  Conservation: Information

In [111]:
display(Markdown(database_entry2text(database_entry)))


### Primary_function

The dnaN gene encodes the β-sliding clamp subunit of DNA polymerase III, essential for processive DNA replication in Prochlorococcus. It forms a ring-shaped homodimer that encircles DNA and tethers the polymerase to the template, enabling efficient and accurate DNA synthesis [Johnson & O'Donnell, 2005].
    Conservation: Highly conserved across all prokaryotes, with maintained structure and function across all known Prochlorococcus strains.


### Physiological_contribution

Functions as a central coordinator of DNA replication and repair processes, essential for cell division and genome maintenance. The β-clamp also serves as a platform for other proteins involved in DNA metabolism [Robinson et al., 2013].
    Conservation: Core function is universally conserved across prokaryotes, including all Prochlorococcus ecotypes.


### Stress_responses

* response1: Limited direct data available for Prochlorococcus-specific stress responses.
  Conservation: Information not available.

* response2: May be involved in DNA damage response pathways based on studies in other bacteria.
  Conservation: Likely conserved but requires experimental validation.


### Uptake_exudation

* uptake: No direct evidence for involvement in uptake processes.
  Conservation: Not applicable.

* exudation: No known role in exudation processes.
  Conservation: Not applicable.


### Phylogenetic_persistence

The dnaN gene is present in all sequenced Prochlorococcus strains and shows high sequence conservation. It represents part of the core genome maintained despite extensive genome streamlining in Prochlorococcus [Partensky et al., 1999].


### Coculture_role

No specific studies available on dnaN expression or function in coculture conditions.
    Conservation: Information not available.


### References

* ref1: Johnson A, O'Donnell M. (2005). Cellular DNA Replicases: Components and Dynamics at the Replication Fork. Annual Review of Biochemistry, 74:283-315.

* ref2: Robinson A, et al. (2013). Structure and mechanism of the β-sliding clamp from E. coli. Nucleic Acids Research, 41(6):3812-3824.

* ref3: Partensky F, et al. (1999). Prochlorococcus: Advantages and Limits of Minimalism. Annual Review of Marine Science, 1:227-252.


# Build gradio app

In [157]:
import gradio as gr

df_to_display = gff_named_df[['gene', 'product']]
def filter_records(query, df=None):
    """Return the rows whose gene name or product contains `query`.

    The match is a case-insensitive *literal* substring match. Characters such
    as '(', '[' or '.' in the query are taken literally, so a filter like
    'epoxyqueuosine(34)' works and an unbalanced '[' cannot raise a regex error.

    Parameters
    ----------
    query : str or None
        Text typed into the filter box. Empty or None matches every row that
        has a gene name or a product.
    df : pandas.DataFrame, optional
        Table with 'gene' and 'product' columns. Defaults to the notebook-level
        `df_to_display`.

    Returns
    -------
    pandas.DataFrame
        The matching rows of `df`, in their original order.
    """
    if df is None:
        df = df_to_display
    needle = query or ''
    in_gene = df['gene'].str.contains(needle, case=False, na=False, regex=False)
    in_product = df['product'].str.contains(needle, case=False, na=False, regex=False)
    return df.loc[in_gene | in_product]

def gene_desc2text(gene, df=None):
    """Build the Markdown heading line '# <gene>:    <product>' for `gene`.

    Parameters
    ----------
    gene : str
        Gene name to look up in the 'gene' column.
    df : pandas.DataFrame, optional
        Table with 'gene' and 'product' columns. Defaults to the notebook-level
        `df_to_display`.

    Returns
    -------
    str
        The heading for the first matching row, or just '# <gene>' when the
        gene is not in the table.
    """
    if df is None:
        df = df_to_display
    matches = df.loc[df['gene'].isin([gene])]
    if matches.empty:
        # No product is known for this gene; still give a usable heading.
        return f'# {gene}\n'
    # Take the first match explicitly: .squeeze() only yields a single row
    # when there is exactly one match.
    first_match = matches.iloc[0]
    return f'# {first_match["gene"]}:    {first_match["product"]}\n'

def on_select(evt: gr.SelectData):
    """Return the first cell of the row that was selected in the table.

    With the gene table built from `df_to_display` the first column is 'gene',
    so this is the gene name of the selected row.
    """
    # Other attributes seen on the event: 'col_value', 'index', 'row_value',
    # 'selected', 'target', 'value'.
    selected_row = evt.row_value
    return selected_row[0]


In [158]:
gene_desc2text(gene)

'# dnaN:    DNA polymerase III subunit beta\n'

In [159]:
def get_gene_db_results(gene):
    """Return the three Markdown panels shown in the app for `gene`.

    Parameters
    ----------
    gene : str or None
        Gene name as typed or selected in the UI; surrounding whitespace is
        ignored.

    Returns
    -------
    (description, literature_review, database_entry) : tuple of str
        Markdown for the gene heading, the literature review and the database
        entry. When the gene has no batch result, the first element is a short
        notice and the other two are empty strings.
    """
    gene = (gene or '').strip()
    if gene not in batch_result_dict:
        # Free-text input: report the miss in the UI rather than letting a
        # bare KeyError reach the user.
        shown = gene if gene else '(empty)'
        return f'**No database entry found for gene `{shown}`.**', '', ''

    desc = gene_desc2text(gene)
    literature_review, database_entry = parse_database_batch_message(gene)
    return desc, literature_review2text(literature_review), database_entry2text(database_entry)

In [164]:
# Gradio app: look up a gene and show its heading, database entry and
# literature review; a collapsible, filterable gene table helps pick a name.
with gr.Blocks() as demo:
    output_gene_desc = gr.Markdown()
    with gr.Row(equal_height=True):
        output_database_entry = gr.Markdown(label="Database Entry", container=True, show_label=True, show_copy_button=True)
        with gr.Accordion("Literature Review"):
            output_lit_review = gr.Markdown(label="Literature Review", container=True, show_label=True, show_copy_button=True)
    with gr.Row():
        genename = gr.Textbox(label="Gene Name")
        submit_btn = gr.Button("Submit")
    examples = gr.Examples(
        examples=sorted(batch_result_dict), inputs=genename, examples_per_page=50)
    with gr.Accordion("List of Available Genes", open=False):
        filter_textbox = gr.Textbox(label="Filter")

        # The table is refreshed by the filter_textbox.submit listener below.
        # The `inputs` constructor argument was dropped: it only matters when
        # the component's value is a function.
        gene_dataframe = gr.Dataframe(df_to_display, max_height=400, wrap=True)
        # Clicking a row copies its gene name into the "Gene Name" box.
        gene_dataframe.select(on_select, outputs=genename)
        filter_textbox.submit(fn=filter_records, inputs=filter_textbox, outputs=gene_dataframe)

    # One listener for both Enter-in-textbox and the Submit button, so the
    # "Gene DB" API name is registered once instead of twice.
    gr.on(
        triggers=[genename.submit, submit_btn.click],
        fn=get_gene_db_results,
        inputs=genename,
        outputs=[output_gene_desc, output_lit_review, output_database_entry],
        api_name="Gene DB",
    )

# NOTE(review): share=True publishes a temporary public URL for this app —
# confirm that exposing the gene DB publicly is intended.
demo.launch(share=True)


* Running on local URL:  http://127.0.0.1:7898
* Running on public URL: https://710734d2af5ad6113f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [165]:

literature_review, database_entry = parse_database_batch_message(gene)


In [166]:
database_entry

{'primary_function': "The dnaN gene encodes the β-sliding clamp subunit of DNA polymerase III, essential for processive DNA replication in Prochlorococcus. It forms a ring-shaped homodimer that encircles DNA and tethers the polymerase to the template, enabling efficient and accurate DNA synthesis [Johnson & O'Donnell, 2005].\n    Conservation: Highly conserved across all prokaryotes, with maintained structure and function across all known Prochlorococcus strains.",
 'physiological_contribution': 'Functions as a central coordinator of DNA replication and repair processes, essential for cell division and genome maintenance. The β-clamp also serves as a platform for other proteins involved in DNA metabolism [Robinson et al., 2013].\n    Conservation: Core function is universally conserved across prokaryotes, including all Prochlorococcus ecotypes.',
 'stress_responses': {'response1': 'Limited direct data available for Prochlorococcus-specific stress responses.\n  Conservation: Information

In [248]:
def database_section2list(section_name, section_content):
    """Flatten one database section into a list of (name, text) pairs.

    - A plain (non-dict) section gives a single pair `(section_name, content)`.
    - The 'uptake_exudation' dict gives one pair per sub-entry, named after the
      sub-entry key (e.g. 'uptake', 'exudation').
    - Any other dict section gives one pair per value, all named
      `section_name`; the sub-entry keys are dropped.

    A list is returned in every case, so the result can be iterated more than
    once and compared directly.
    """
    if not isinstance(section_content, dict):
        return [(section_name, section_content)]
    if section_name == 'uptake_exudation':
        return list(section_content.items())
    return [(section_name, value) for value in section_content.values()]

def get_database_entry_df(gene):
    """Build a long-format DataFrame of the database entry for `gene`.

    Each row has the columns 'gene', 'section_name' and 'section_content'.
    Sections are flattened with `database_section2list`, so a section with
    several sub-entries contributes several rows. The literature review part of
    the reply is not used.
    """
    _, database_entry = parse_database_batch_message(gene)
    rows = []
    for top_name, top_content in database_entry.items():
        for row_name, row_content in database_section2list(top_name, top_content):
            rows.append(dict(gene=gene, section_name=row_name, section_content=row_content))
    return pd.DataFrame(rows)


In [249]:

df_database = pd.concat([get_database_entry_df(gene) for gene in batch_result_dict], ignore_index=True)


Missing literature_review (trpA)
Missing literature_review (lpxD)
Missing literature_review (ispE)
Missing literature_review (bchI)
Missing literature_review (lysA)


In [276]:
df_database.head(10)

Unnamed: 0,gene,section_name,section_content
0,dnaN,primary_function,The dnaN gene encodes the β-sliding clamp subu...
1,dnaN,physiological_contribution,Functions as a central coordinator of DNA repl...
2,dnaN,stress_responses,Limited direct data available for Prochlorococ...
3,dnaN,stress_responses,May be involved in DNA damage response pathway...
4,dnaN,uptake,No direct evidence for involvement in uptake p...
5,dnaN,exudation,No known role in exudation processes.\n Conse...
6,dnaN,phylogenetic_persistence,The dnaN gene is present in all sequenced Proc...
7,dnaN,coculture_role,No specific studies available on dnaN expressi...
8,dnaN,references,"Johnson A, O'Donnell M. (2005). Cellular DNA R..."
9,dnaN,references,"Robinson A, et al. (2013). Structure and mecha..."


# References

In [235]:
refereces_series = df_database.loc[df_database.section_name.isin(['references']), 'section_content']

In [240]:
# Split each reference string into Author / Year / Title / Journal / Pages with a single regex.
# Rows that do not match the pattern come back as all-NaN (row 30 in the table below is one).
# The Title group stops at the first '.', so a title containing a period is cut short and its
# tail ends up in Journal (see the "E. coli" reference, row 9 in the table below).
reference_df = refereces_series.str.extract(r'^([^\(]+) (\(\d+\))\.? ([^\.]+)\.([^,]+),(.*)$')
reference_df = reference_df.rename(columns={0: 'Author', 1:'Year', 2:'Title', 3:'Journal', 4:'Pages'})

In [245]:
reference_df.drop_duplicates(subset='Title').head(40)

Unnamed: 0,Author,Year,Title,Journal,Pages
8,"Johnson A, O'Donnell M.",(2005),Cellular DNA Replicases: Components and Dynami...,Annual Review of Biochemistry,74:283-315.
9,"Robinson A, et al.",(2013),Structure and mechanism of the β-sliding clamp...,coli. Nucleic Acids Research,41(6):3812-3824.
10,"Partensky F, et al.",(1999),Prochlorococcus: Advantages and Limits of Mini...,Annual Review of Marine Science,1:227-252.
19,"Kettler, G.C., et al.",(2007),Patterns and Implications of Gene Gain and Los...,PLoS Genetics,3(12): e231.
20,"Casey, J.R., et al.",(2016),Carbon fate in marine systems: partitioning be...,ISME Journal,10: 1827-1840.
21,"Rangel, O.A., et al.",(2019),Purine biosynthesis in marine prokaryotes: eco...,Environmental Microbiology,21(5): 1778-1793.
30,,,,,
76,"Dufresne, A., et al.",(2003),Genome sequence of the cyanobacterium Prochlor...,PNAS,"100(17), 10020-10025."
77,"García-Fernández, J.M., et al.",(2004),Streamlined regulation and gene loss as adapti...,Microbiol Mol Biol Rev,"68(4), 630-638."
78,"Kettler, G.C., et al.",(2007),Patterns and implications of gene gain and los...,PLoS Genetics,"3(12), e231."


In [244]:
reference_df.Title.value_counts().head(40)

Title
Patterns and Implications of Gene Gain and Loss in the Evolution of Prochlorococcus                                            62
Ecological genomics of marine picocyanobacteria                                                                                35
Genome sequence of the cyanobacterium Prochlorococcus marinus SS120, a nearly minimal oxyphototrophic genome                   30
Bacterial vesicles in marine ecosystems                                                                                        26
Global gene expression of Prochlorococcus ecotypes in response to changes in nitrogen availability                             22
Transcriptome and proteome dynamics of a light-dark synchronized bacterial cell cycle                                          17
Prochlorococcus: the structure and function of collective diversity                                                            13
Ecotypic variation in phosphorus-acquisition mechanisms within marine picocyanobacte

In [230]:
# Normalise the reference strings so that differently formatted citations of
# the same paper collapse to one value. Steps, in order:
#   1. shorten a leading author list ending in "et al. (" to its first word
#   2. drop the trailing run of punctuation/digits (volume, issue, pages)
#   3. collapse any whitespace run to a single space
#   4. remove the period that directly follows a closing parenthesis
#   5. trim leading/trailing whitespace
refereces_series = (
    refereces_series
    .str.replace(r'^(\w+).* et al. \(', r'\1 (', regex=True)
    .str.replace(r',? (\W|\d)+(\W|\d|e)*$', r'', regex=True)
    .str.replace(r'\s+', r' ', regex=True)
    .str.replace(').', ')', regex=False)
    .str.strip()
)

#refereces_series = refereces_series.str.replace(r',?\s*\d+\(\d+\)[,:]?\s*e\d+\.?', r'', regex=True)

In [231]:
refereces_series.value_counts()

section_content
Kettler (2007) Patterns and Implications of Gene Gain and Loss in the Evolution of Prochlorococcus. PLoS Genetics                              71
Dufresne (2003) Genome sequence of the cyanobacterium Prochlorococcus marinus SS120, a nearly minimal oxyphototrophic genome. PNAS             34
Biller (2014) Bacterial vesicles in marine ecosystems. Science                                                                                 27
Berube (2015) Physiology and evolution of nitrate acquisition in Prochlorococcus. ISME J                                                       12
Waldbauer (2012) Transcriptome and proteome dynamics of a light-dark synchronized bacterial cell cycle. PLoS One                               12
                                                                                                                                               ..
Thompson (2011) Science                                                                                     

In [234]:
refereces_series.value_counts().head(40)

section_content
Kettler (2007) Patterns and Implications of Gene Gain and Loss in the Evolution of Prochlorococcus. PLoS Genetics                                                         71
Dufresne (2003) Genome sequence of the cyanobacterium Prochlorococcus marinus SS120, a nearly minimal oxyphototrophic genome. PNAS                                        34
Biller (2014) Bacterial vesicles in marine ecosystems. Science                                                                                                            27
Berube (2015) Physiology and evolution of nitrate acquisition in Prochlorococcus. ISME J                                                                                  12
Waldbauer (2012) Transcriptome and proteome dynamics of a light-dark synchronized bacterial cell cycle. PLoS One                                                          12
Scanlan (2009) Ecological genomics of marine picocyanobacteria. Microbiology and Molecular Biology Reviews             

# Uptake

In [251]:
uptake_series = df_database.loc[df_database.section_name.isin(['uptake']), 'section_content']


In [252]:
uptake_series.value_counts()

section_content
No direct evidence for involvement in uptake processes.\n  Conservation: Not applicable.                                                                        77
No direct evidence for involvement in uptake processes\n  Conservation: Not applicable                                                                          51
No direct role in uptake processes identified\n  Conservation: Not applicable                                                                                   46
No direct role in uptake processes identified.\n  Conservation: Not applicable.                                                                                 36
No direct evidence for involvement in uptake processes.\n  Conservation: Information not available.                                                             27
                                                                                                                                                                ..
No dir

In [387]:
# Keep only the 'uptake' rows whose text does NOT open with one of the stock
# "nothing known" phrasings, i.e. the rows that describe an actual uptake role.
# str.startswith accepts a tuple of prefixes and is True if any of them matches.
uptake_df = df_database.loc[
    df_database.section_name.isin(['uptake']) &
    ~df_database.section_content.str.startswith((
        'No direct evidence',
        'No direct role',
        'No direct involvement in uptake processes',
        'No specific information available about uptake',
        'No specific data available on uptake processes',
        'No direct information available on uptake pro',
        'No direct information available about uptake',
        'No specific data available regarding uptake',
        'No specific information available for uptake',
        'Information not available for specific uptake processes',
        'No specific information available',
        'Insufficient data available on specific uptake processes',
        'Insufficient data available for specific uptake',
        '[No direct role in uptake processes identified]',
    ))
]


In [388]:
with pd.option_context('display.max_colwidth', None):
    display(uptake_df[['gene', 'section_content']].style.set_properties(**{'text-align': 'left'}))    

Unnamed: 0,gene,section_content
37,queG,"Requires cobalamin as cofactor, suggesting dependence on external vitamin B12 sources.  Conservation: Likely conserved across Prochlorococcus strains due to general B12 auxotrophy."
502,ndhI,"Critical component of the CO2 uptake system through NDH-1 complex activity  Conservation: Conserved mechanism but with strain-specific variations in efficiency [Wang et al., 2016]"
810,sat,"Facilitates sulfate uptake through activation of sulfate for subsequent metabolism  Conservation: Mechanism conserved across cyanobacteria [Casey et al., 2009]"
888,ntcA,"Regulates expression of nitrogen transport systems including ammonium and urea transporters  Conservation: Core transport systems are conserved but with strain-specific variations [Berube, 2015]."
978,amt,"AMT facilitates high-affinity NH4+ transport with Km values in the nanomolar range [García-Fernández, 2004].  Conservation: Transport kinetics are conserved but show some strain-specific optimization [Martiny, 2009]."
1225,pip,"May be involved in the processing of external peptides for nitrogen acquisition, though direct evidence is limited.  Conservation: Similar roles observed in other marine bacteria, but specific conservation patterns unknown."
1246,ntrB,Facilitates active transport of nitrate across cell membrane as part of ABC transporter complex  Conservation: Function conserved in strains with complete nitrate transport systems.
1257,cynS,"Cyanate uptake mechanisms are linked to cynS expression, though specific transporters are not fully characterized.  Conservation: Data insufficient to determine conservation status of uptake mechanisms."
1617,larC,"May be involved in nickel utilization pathways, but specific mechanisms not characterized in Prochlorococcus.  Conservation: Information not available."
1924,glk,Works in conjunction with glucose transporters to facilitate glucose utilization.  Conservation: Mechanism conserved across strains but efficiency varies.


In [389]:
uptake_df.shape

(37, 3)

# Exudation

In [277]:
exudation_series = df_database.loc[df_database.section_name.isin(['exudation']), 'section_content']


In [278]:
exudation_series.value_counts()

section_content
No direct evidence for involvement in exudation processes.\n  Conservation: Not applicable.                                 39
No direct role in exudation processes identified\n  Conservation: Not applicable                                            33
No direct role in exudation processes identified.\n  Conservation: Not applicable.                                          29
No known role in exudation processes.\n  Conservation: Not applicable.                                                      28
No direct evidence for involvement in exudation processes\n  Conservation: Not applicable                                   21
                                                                                                                            ..
No known exudation processes associated with nadA.\n  Conservation: Not applicable                                           1
No specific data available on exudation processes\n  Conservation: Unable to determine conserva

In [382]:
# Keep only the 'exudation' rows whose text does NOT open with one of the stock
# "nothing known" phrasings, i.e. the rows that describe an actual exudation role.
# str.startswith accepts a tuple of prefixes and is True if any of them matches.
exudation_df = df_database.loc[
    df_database.section_name.isin(['exudation']) &
    ~df_database.section_content.str.startswith((
        'No direct evidence',
        'No direct role',
        'No known role',
        'No specific data available',
        'No specific information available',
        'No documented role',
        'No known exudation processes',
        'Insufficient data available regarding exudation processes',
        'No known direct role',
        'Insufficient data available',
        'No known direct role ',
        'Limited data available on ',
        'No documented exudation processes',
        'No documented',
        'No information available',
        'No information available regarding exudation processes',
        'No direct involvement in ',
        'No specific information available about',
        'No specific data available on',
        'No direct information available on',
        'No direct information available about',
        'No specific data available regarding ',
        'No specific information available for ',
        'Information not available for specific ',
        'No specific information available',
        'Insufficient data available on specific ',
        'Insufficient data available for specific',
        '[No direct role in exudation processes',
    ))
]


In [383]:
# (rows, columns) of exudation_df.
exudation_df.shape

(27, 3)

In [384]:
# Render gene + section text with no column-width truncation, left-aligned.
with pd.option_context('display.max_colwidth', None):
    display(
        exudation_df.loc[:, ['gene', 'section_content']]
        .style
        .set_properties(**{'text-align': 'left'})
    )

Unnamed: 0,gene,section_content
1574,rplS,"Protein has been detected in membrane vesicles [Biller et al., 2014].  Conservation: Vesicle-associated presence appears conserved across marine bacteria."
1695,lepB,"Involved in processing of secreted proteins, though specific substrates in Prochlorococcus are not well-characterized.  Conservation: General role in protein secretion is conserved across bacteria."
2173,chrA,Active efflux of chromate ions across cell membrane  Conservation: Mechanism appears conserved across ChrA family transporters
2184,arsJ,"Facilitates export of arsenical compounds through MFS transport mechanism  Conservation: Transport mechanism appears conserved across species expressing arsJ [Chen, 2016]"
2195,arsB,Active efflux of arsenite compounds  Conservation: Mechanism appears conserved across cyanobacterial arsB homologs
2364,lptB,Indirect role in membrane vesicle formation and release  Conservation: Process appears conserved but insufficient data available
2659,lptC,"May influence exudation through membrane organization, but direct evidence lacking.  Conservation: Not applicable due to lack of data."
2844,rpmB,"Present in membrane vesicles, suggesting potential role in intercellular communication [Biller et al., 2014].  Conservation: Vesicle-associated presence appears conserved across marine bacteria."
2951,secF,"Involved in protein secretion pathway, though specific secreted proteins not well characterized in Prochlorococcus  Conservation: Basic mechanism conserved but specific targets unknown"
2962,secD,"Involved in protein secretion pathway, though specific secreted proteins in Prochlorococcus are not well characterized.  Conservation: Basic secretion mechanism appears conserved, but substrate specificity may vary."


# Stress response

In [298]:
# section_content of every row whose section_name is 'stress_responses'.
stress_responses_series = df_database.loc[
    df_database.section_name.isin(['stress_responses']),
    'section_content',
]


In [299]:
# Count identical section texts, most frequent first.
stress_responses_series.value_counts()


section_content
Limited data available on specific stress responses.\n  Conservation: Information not available.                                                                                             14
No direct studies available on stress responses in Prochlorococcus.\n  Conservation: Information not available.                                                                               5
No specific data available for other stress responses.\n  Conservation: Information not available.                                                                                            5
Limited data available on other stress responses.\n  Conservation: Insufficient data to determine conservation of other stress responses.                                                     4
Limited direct evidence for stress response role.\n  Conservation: Information not available.                                                                                                 4
                        

In [316]:
# Keep only the 'stress_responses' rows whose text carries real information,
# i.e. does not open with one of the boilerplate "nothing known" phrasings.
# Prefixes are matched case-sensitively at the very start of section_content;
# trailing spaces inside a prefix are significant.
stress_placeholder_prefixes = (
    'Limited data available',
    'No direct studies available',
    'No specific data available',
    'Limited direct evidence',
    'Data not available',
    'Insufficient data available ',
    'Limited direct data available',
    'No direct experimental evidence',
    'No direct evidence',
)

# A single vectorised pass: str.startswith accepts a tuple and matches any
# member. na=True counts missing content as a placeholder, so such rows are
# dropped instead of yielding a non-boolean mask that cannot be negated.
stress_is_placeholder = df_database.section_content.str.startswith(
    stress_placeholder_prefixes, na=True
)

stress_responses_df = df_database.loc[
    df_database.section_name.isin(['stress_responses']) & ~stress_is_placeholder
]


In [319]:
# (rows, columns) of the filtered frame, and the number of distinct genes those rows cover.
stress_responses_df.shape, stress_responses_df.gene.nunique()

((718, 3), 424)

In [337]:
# Render the first 40 filtered rows (gene + section text) with no
# column-width truncation, left-aligned.
with pd.option_context('display.max_colwidth', None):
    display(
        stress_responses_df.loc[:, ['gene', 'section_content']]
        .head(40)
        .style
        .set_properties(**{'text-align': 'left'})
    )

Unnamed: 0,gene,section_content
3,dnaN,May be involved in DNA damage response pathways based on studies in other bacteria.  Conservation: Likely conserved but requires experimental validation.
13,purL,"Nitrogen limitation leads to increased purL expression to maintain purine synthesis efficiency.  Conservation: Response pattern observed across multiple strains, suggesting conserved regulatory mechanism."
24,purF,"Expression levels show moderate increase under nitrogen limitation conditions [Casey et al., 2016].  Conservation: Response appears conserved across high-light adapted strains, less studied in low-light ecotypes."
35,queG,"May be involved in translation optimization under nutrient stress, particularly in relation to cobalamin availability.  Conservation: Response pattern unknown in Prochlorococcus specifically."
36,queG,Potential role in maintaining translation fidelity under various environmental conditions.  Conservation: Insufficient data for conservation assessment in Prochlorococcus.
46,nusB,May be involved in general stress response regulation through modulation of transcription termination/antitermination decisions.  Conservation: Limited data available specific to Prochlorococcus; role inferred from other bacterial systems.
47,nusB,Potential role in cold stress response through regulation of RNA structure formation.  Conservation: Insufficient data to confirm conservation in Prochlorococcus.
57,ftsY,Maintains membrane protein targeting under high light stress  Conservation: Response conserved across high-light adapted strains
58,ftsY,Involved in temperature stress response through regulation of membrane protein insertion  Conservation: Temperature response mechanisms appear conserved but with strain-specific variations
70,argH,"Nitrogen limitation response: ArgH expression may be modulated under nitrogen stress conditions.  Conservation: Response appears conserved across strains, though detailed studies are limited."


# Coculture role

In [338]:
# section_content of every row whose section_name is 'coculture_role'.
coculture_role_series = df_database.loc[
    df_database.section_name.isin(['coculture_role']),
    'section_content',
]


In [339]:
# Count identical section texts, most frequent first.
coculture_role_series.value_counts()

section_content
Limited data available on expression changes in coculture conditions.\n    Conservation: Unable to assess conservation due to limited data.                                                                                                                                 4
Limited data available on expression changes in coculture conditions.\n    Conservation: Unable to assess conservation due to insufficient data.                                                                                                                            3
Limited direct evidence for specific roles in coculture conditions\n    Conservation: Information not available                                                                                                                                                             2
Limited data available on expression changes in coculture conditions.\n    Conservation: Unable to assess conservation due to lack of data.                                   

In [379]:
# Keep only the 'coculture_role' rows whose text carries real information,
# i.e. is not one of the boilerplate "nothing known" phrasings.
# Prefixes are matched case-sensitively at the very start of section_content;
# trailing spaces inside a prefix are significant. A few entries are covered
# by a shorter one further down; they are kept as a record of the phrasings seen.
coculture_placeholder_prefixes = (
    'Limited data available on expression changes in coculture conditions',
    'Limited direct evidence ',
    'Insufficient data available ',
    'No specific data available',
    'No specific studies available ',
    'No direct studies available ',
    'No specific studies ',
    'No studies have directly examined ',
    'No significant changes ',
    'Limited data available on ',
    'Limited direct research on ',
    'Limited direct studies on ',
    'No direct evidence available',
    'No direct studies of ',
    'No specific information available ',
    'No studies available ',
    'No direct studies on ',
    'No published research available ',
    'No direct experimental evidence available ',
    'Limited direct studies available ',
    'Limited direct information available',
    'No direct evidence for specific role ',
    '[No specific information available ',
)

# A row is a placeholder if it opens with any prefix above, or mentions the
# stock phrase anywhere in its text (literal match, hence regex=False).
# na=True counts missing content as a placeholder, so such rows are dropped
# instead of yielding a non-boolean mask that cannot be negated.
coculture_is_placeholder = (
    df_database.section_content.str.startswith(coculture_placeholder_prefixes, na=True)
    | df_database.section_content.str.contains(
        'not available in current literature', regex=False, na=True
    )
)

coculture_role_df = df_database.loc[
    df_database.section_name.isin(['coculture_role']) & ~coculture_is_placeholder
]


In [380]:
# (rows, columns) of the filtered coculture_role frame.
coculture_role_df.shape

(17, 3)

In [381]:
# Render gene + section text with no column-width truncation, left-aligned.
with pd.option_context('display.max_colwidth', None):
    display(
        coculture_role_df.loc[:, ['gene', 'section_content']]
        .style
        .set_properties(**{'text-align': 'left'})
    )

Unnamed: 0,gene,section_content
208,cbiD,"While specific coculture studies focusing on cbiD are limited, the gene's role in B12 synthesis suggests it may influence interactions with B12-dependent heterotrophic bacteria.  Conservation: Information about conservation in coculture conditions is not available."
1442,coxB,"Expression levels of coxB show moderate changes in coculture with heterotrophic bacteria, suggesting potential adaptation of respiratory metabolism in response to bacterial presence [Casey, 2016].  Conservation: Response patterns appear similar across strains but with varying magnitude."
1453,groL,"Expression levels of groL show moderate increases when Prochlorococcus is grown in coculture with heterotrophic bacteria, particularly under oxidative stress conditions [Morris et al., 2010].  Conservation: This response appears consistent across multiple strains tested in coculture conditions, though magnitude of response varies."
1872,petE,"Limited data available. Some evidence suggests modified expression patterns in presence of heterotrophic bacteria [Morris et al., 2012].  Conservation: Insufficient data to determine conservation of coculture responses."
2046,acs,"Limited direct studies available. Some evidence suggests increased expression in presence of heterotrophic bacteria, possibly related to acetate availability in coculture conditions.  Conservation: Insufficient data to determine conservation of coculture response."
2108,nrdJ,"Expression of nrdJ may be influenced by the presence of B12-producing heterotrophic bacteria, though direct studies are limited.  Conservation: Insufficient data to determine conservation of coculture responses."
2715,metH,"Expression levels increase in coculture with B12-producing heterotrophic bacteria. Forms part of the molecular basis for bacterial-algal symbiosis in marine environments.  Conservation: Coculture response appears conserved across studied strains, though detailed studies are limited."
2823,dnaK,"Expression levels change during coculture with heterotrophic bacteria, particularly under oxidative stress conditions [Morris et al., 2010].  Conservation: Response appears conserved across studied Prochlorococcus-heterotroph interactions, though detailed studies are limited."
3096,ureE,"Expression levels increase in coculture with heterotrophic bacteria, suggesting enhanced urea utilization in mixed communities [Christie-Oleza et al., 2017]  Conservation: Response observed across multiple Prochlorococcus strains, though magnitude varies"
3118,urtA,"Expression levels change in response to heterotrophic bacterial presence, suggesting integration in bacterial community interactions. Studies indicate potential upregulation when cocultured with specific heterotrophic bacteria that may provide urea through degradation of organic matter.  Conservation: Coculture responses appear conserved at the genus level but with strain-specific variations in magnitude."
