In [1]:
import pandas as pd
from pathlib import Path

def process_files_pandas(input_path):
    """
    Add sequential identifiers to a sequence table and split it into FASTA + CSV.

    Reads the CSV at ``input_path``, prepends an ``identifier`` column
    (``phage_0001``, ``phage_0002``, ... in row order), writes every row's
    ``sequence`` value to a FASTA file under that identifier, and writes the
    remaining columns (without ``sequence``) to a second CSV.

    Parameters:
    input_path (str or pathlib.Path): CSV file containing a 'sequence' column

    Files written (both next to the input file):
    <stem>.fasta        : a ">identifier" line followed by the sequence line, per row
    <stem>_with_ids.csv : the table with 'identifier' added and 'sequence' removed

    Raises:
    KeyError: if the input has no 'sequence' column (raised before any file is written)
    """
    # Accept plain strings as well as Path objects.
    input_path = Path(input_path)
    output_csv = input_path.parent / f"{input_path.stem}_with_ids.csv"
    output_fasta = input_path.with_suffix(".fasta")

    # Read CSV and add identifiers
    df = pd.read_csv(input_path)
    if 'sequence' not in df.columns:
        # Fail before opening the FASTA so a bad input leaves no empty file behind.
        raise KeyError('sequence')
    df.insert(0, 'identifier', [f"phage_{i:04d}" for i in range(1, len(df) + 1)])

    # Create FASTA; walking the two columns directly avoids building a Series per row.
    with open(output_fasta, 'w') as f:
        f.writelines(
            f">{identifier}\n{sequence}\n"
            for identifier, sequence in zip(df['identifier'], df['sequence'])
        )

    # Remove sequence column and save CSV
    df_clean = df.drop('sequence', axis=1)
    df_clean.to_csv(output_csv, index=False)

# Choose which version to run: the input is selected by name, so the
# alternative path stays visible without a silently overwritten assignment.
INPUT_CSVS = {
    "all_members": Path("raw/all_members_7k_function_500_hyp.csv"),
    "foldseek_clusterreps": Path("raw/foldseek_clusterreps_7k_function_500_hyp.csv"),
}
csv_path = INPUT_CSVS["foldseek_clusterreps"]
process_files_pandas(csv_path)

In [11]:
import pandas as pd

# Load processed/foldseek.csv and print its schema (columns, non-null counts, dtypes).
df = pd.read_csv(filepath_or_buffer="processed/foldseek.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   identifier  7500 non-null   object
 1   phrog       7500 non-null   int64 
 2   product     7002 non-null   object
 3   function    7500 non-null   object
dtypes: int64(1), object(3)
memory usage: 234.5+ KB


In [12]:
# Distinct values of the 'product' column, in order of first appearance (NaN included).
df.loc[:, 'product'].unique()

array(['head-tail adaptor', 'FmdB-like transcriptional regulator',
       'Rz-like spanin', 'minor head protein', 'HNH endonuclease',
       'major tail protein', nan, 'DNA binding protein',
       'tail length tape measure protein', 'DNA polymerase',
       'tail fiber protein', 'transcriptional repressor',
       'replication initiation protein', 'homing endonuclease', 'holin',
       'polymerase', 'major head protein', 'integrase',
       'DNA methyltransferase', 'aminotransferase', 'endolysin',
       'RecT-like ssDNA annealing protein',
       'anaerobic ribonucleoside reductase large subunit',
       'phosphoheptose isomerase', 'minor tail protein', 'tail protein',
       'virion structural protein', 'Tail assembly chaperone',
       'tail collar fiber protein', 'terminase small subunit',
       'RNA replicase beta subunit', 'tail completion or Neck1 protein',
       'head closure Hc1', 'sigma factor', '2OG-Fe(II) oxygenase',
       'Arc-like repressor', 'ADP-ribosyltransferase e

### Classification system for "product"

In [15]:
import pandas as pd
import numpy as np
import re

def create_phage_protein_mapping():
    """
    Build the coarse category table: category name -> list of regex patterns.

    The result has the shape accepted by the ``categories`` argument of
    classify_phage_protein. All patterns are written in lower case. A new
    dictionary is built on every call, with the 17 categories in the order
    they are listed below.

    Returns:
    dict: category name (str) -> list of regex pattern strings
    """
    # Structural proteins
    virion_parts = {
        'major_capsid': [
            r'major head protein', r'major coat protein', r'head protein',
            r'major head and protease protein', r'head morphogenesis',
            r'head decoration', r'head closure', r'head assembly',
            r'head fiber protein', r'minor head protein', r'internal head protein',
        ],
        'tail_proteins': [
            r'tail protein', r'major tail protein', r'minor tail protein',
            r'tail fiber protein', r'tail assembly', r'tail sheath', r'tail tube',
            r'tail spike protein', r'tail completion', r'tail collar',
            r'tail terminator', r'tail length tape measure protein',
            r'tail associated', r'tail tip', r'tail chaperone',
            r'tail assembly chaperone',
        ],
        'baseplate': [r'baseplate', r'base plate'],
        'portal_connector': [
            r'portal protein', r'head-tail connector', r'head-tail adaptor',
            r'head-tail joining',
        ],
    }

    # DNA/RNA processing
    nucleic_acid = {
        'dna_replication': [
            r'dna polymerase', r'dna helicase', r'dna primase', r'replication',
            r'replicative', r'single strand.*binding', r'ssb', r'clamp loader',
        ],
        'dna_packaging': [r'terminase', r'packaging'],
        'transcription': [
            r'rna polymerase', r'transcription', r'transcriptional', r'sigma factor',
        ],
        'nucleases': [r'nuclease', r'endonuclease', r'exonuclease'],
    }

    # Host interaction
    host_interaction = {
        'lysis': [
            r'holin', r'lysin', r'endolysin', r'amidase', r'peptidase',
            r'lysis', r'spanin',
        ],
        'host_manipulation': [
            r'toxin', r'antitoxin', r'immunity', r'infection',
            r'host.*inhibitor', r'superinfection', r'anti-restriction',
        ],
    }

    # Enzymes
    enzymes = {
        'transferases': [r'transferase', r'methyltransferase', r'glycosyltransferase'],
        'hydrolases': [
            r'hydrolase', r'phosphatase', r'phosphohydrolase', r'pyrophosphatase',
        ],
        'ligases': [r'ligase', r'synthetase'],
    }

    # Regulatory proteins
    regulatory = {
        'regulators': [
            r'regulator', r'repressor', r'activator', r'regulatory', r'regulation',
        ],
    }

    # Assembly and morphogenesis
    assembly = {
        'morphogenesis': [r'scaffolding', r'assembly', r'morphogenesis', r'chaperone'],
    }

    # Other important categories
    other = {
        'structural': [r'structural protein', r'virion.*protein'],
        'membrane_associated': [
            r'membrane protein', r'membrane associated', r'lipoprotein',
        ],
    }

    # Merge the groups left to right so the category order matches the listing above.
    return {
        **virion_parts,
        **nucleic_acid,
        **host_interaction,
        **enzymes,
        **regulatory,
        **assembly,
        **other,
    }


def create_detailed_phage_mapping():
    """
    Build the fine-grained category table: category name -> list of regex patterns.

    This is the table map_phage_proteins hands to classify_phage_protein, which
    returns the first category (in dictionary order) that has a matching
    pattern. The order of the 31 categories below therefore decides which
    category wins when a product name matches more than one of them.

    Returns:
    dict: category name (str) -> list of regex pattern strings
    """
    mapping = {}

    # Head/Capsid components
    mapping['major_capsid'] = [
        r'major head protein', r'major coat protein',
        r'major head and protease protein', r'head morphogenesis',
    ]
    mapping['minor_capsid'] = [
        r'minor head protein', r'head decoration', r'head fiber protein',
        r'internal head protein',
    ]
    mapping['head_assembly'] = [
        r'head assembly', r'head closure', r'prohead', r'scaffolding protein',
        r'head maturation',
    ]

    # Tail components
    mapping['major_tail'] = [r'major tail protein', r'tail tube protein', r'tail sheath']
    mapping['minor_tail'] = [
        r'minor tail protein', r'tail fiber protein', r'tail spike protein',
        r'tail needle protein', r'tail tip', r'tail associated lysin',
    ]
    mapping['tail_assembly'] = [
        r'tail assembly', r'tail chaperone', r'tail assembly chaperone',
        r'tail completion', r'tail terminator',
    ]
    mapping['tail_tape_measure'] = [r'tail length tape measure protein', r'tape measure']

    # Connection components
    mapping['portal'] = [r'portal protein']
    mapping['connector'] = [
        r'head-tail connector', r'head-tail adaptor', r'head-tail joining',
        r'neck protein',
    ]
    mapping['baseplate'] = [
        r'baseplate', r'base plate', r'baseplate wedge', r'baseplate hub',
    ]

    # DNA processing
    mapping['dna_replication'] = [
        r'dna polymerase', r'replication initiation', r'replication protein',
        r'dna primase', r'helicase', r'clamp loader', r'single strand.*binding',
        r'ssb',
    ]
    mapping['dna_packaging'] = [
        r'terminase large subunit', r'terminase small subunit', r'packaging',
        r'dna packaging',
    ]
    mapping['dna_modification'] = [
        r'methyltransferase', r'dna methyltransferase', r'modification protein',
    ]
    mapping['recombination'] = [
        r'recombinase', r'integrase', r'excisionase', r'resolvase', r'invertase',
        r'site-specific recombination',
    ]

    # RNA processing
    mapping['transcription'] = [
        r'rna polymerase', r'sigma factor', r'transcription factor',
    ]
    mapping['transcription_regulation'] = [
        r'transcriptional regulator', r'transcriptional activator',
        r'transcriptional repressor', r'anti-terminator',
    ]

    # Nucleases
    mapping['endonucleases'] = [
        r'endonuclease', r'restriction endonuclease', r'homing endonuclease',
    ]
    mapping['exonucleases'] = [r'exonuclease', r'exonuclease V', r'exonuclease VIII']

    # Host interaction
    mapping['lysis'] = [r'holin', r'endolysin', r'lysin', r'spanin', r'lysis protein']
    mapping['peptidases'] = [r'peptidase', r'protease', r'amidase']
    mapping['host_takeover'] = [
        r'host transcription', r'host manipulation', r'host killing',
        r'host range', r'host specificity',
    ]
    mapping['immunity'] = [
        r'immunity protein', r'superinfection', r'anti-restriction',
        r'restriction alleviation',
    ]

    # Various enzymes
    mapping['transferases'] = [
        r'glycosyltransferase', r'acetyltransferase', r'nucleotidyltransferase',
        r'phosphotransferase', r'aminotransferase',
    ]
    mapping['hydrolases'] = [
        r'hydrolase', r'phosphatase', r'phosphohydrolase', r'pyrophosphatase',
    ]
    mapping['oxidoreductases'] = [
        r'oxidoreductase', r'dehydrogenase', r'reductase', r'oxidase',
    ]
    mapping['ligases'] = [r'ligase', r'synthetase', r'dna ligase', r'rna ligase']

    # Membrane and structural
    mapping['membrane_proteins'] = [r'membrane protein', r'transmembrane', r'lipoprotein']
    mapping['structural'] = [
        r'structural protein', r'virion structural protein', r'virion protein',
    ]

    # Regulatory
    mapping['transcriptional_control'] = [r'repressor', r'activator', r'regulator']
    mapping['regulatory_proteins'] = [
        r'regulatory protein', r'regulation', r'modulator', r'control protein',
    ]

    # Toxin-antitoxin
    mapping['toxin_systems'] = [r'toxin', r'antitoxin', r'toxin-antitoxin']

    return mapping

def classify_phage_protein(product_name, categories):
    """
    Classifies a phage protein name into detailed categories.

    Matching is case-insensitive and uses ``re.search``, so a pattern may match
    anywhere inside the product name. Categories are tried in the dictionary's
    iteration order and the first category with a matching pattern wins.

    Parameters:
    product_name (str): Original protein product name (missing values allowed)
    categories (dict): Maps category name -> iterable of regex patterns

    Returns:
    str or pd.NA: Category name, or pd.NA if the name is missing or nothing matches
    """
    if pd.isna(product_name):
        return pd.NA

    product_name = str(product_name)

    for category, patterns in categories.items():
        # Use IGNORECASE rather than lower-casing the pattern text: pattern.lower()
        # would rewrite escapes such as \D, \S, \W or \B into \d, \s, \w, \b and
        # silently change what the regex matches.
        if any(re.search(pattern, product_name, re.IGNORECASE) for pattern in patterns):
            return category

    return pd.NA

def map_phage_proteins(df, product_column, categories=None):
    """
    Maps phage proteins in a DataFrame to functional categories.

    Parameters:
    df (pandas.DataFrame): DataFrame containing protein information (not modified)
    product_column (str): Name of the column containing protein product names
    categories (dict, optional): Category name -> regex patterns, in the form
        accepted by classify_phage_protein. Defaults to
        create_detailed_phage_mapping(); pass create_phage_protein_mapping()
        to classify with the coarse scheme instead.

    Returns:
    pandas.DataFrame: Copy of df with new 'product_category' column
    """
    if categories is None:
        categories = create_detailed_phage_mapping()
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    df['product_category'] = df[product_column].apply(
        lambda x: classify_phage_protein(x, categories)
    )

    return df

# Add a 'product_category' column to a copy of df, then list the categories assigned.
df_new = map_phage_proteins(df, product_column="product")
df_new.loc[:, "product_category"].unique()

array(['connector', 'transcription_regulation', 'lysis', 'minor_capsid',
       'endonucleases', 'major_tail', <NA>, 'tail_tape_measure',
       'dna_replication', 'minor_tail', 'major_capsid', 'recombination',
       'dna_modification', 'transferases', 'oxidoreductases',
       'structural', 'tail_assembly', 'dna_packaging', 'head_assembly',
       'transcription', 'transcriptional_control', 'toxin_systems',
       'portal', 'baseplate', 'membrane_proteins', 'ligases',
       'exonucleases', 'immunity', 'peptidases', 'hydrolases',
       'host_takeover', 'regulatory_proteins'], dtype=object)

In [17]:
# Save the categorised table; index=False keeps the row index out of the file.
df_new.to_csv(path_or_buf="processed/foldseek_new2.csv", index=False)