# Parse and extract a consistent set of required columns

In [9]:
import pandas as pd
from pathlib import Path

## Parse Kim Excel

In [82]:
# Locations of the raw workbook and of the interim CSV derived from it
DATA_PATH = Path("../data")
INPUT_PATH = DATA_PATH / "raw" / "2023_Kim.xlsx"
OUTPUT_PATH = DATA_PATH / "interim" / "toxprot_2017_old.csv"

# Worksheet to read and the columns kept from it
SHEET_NAME = "ToxProt11.2017"
COLUMNS_TO_EXTRACT = [
    "Entry",
    "Organism",
    "Protein families",
    "Length (aa)",
    "Fragments",
    "Toxic dose",
    "PTM",
]

# Load only the selected columns, then align the two headers that differ
# from the names used in the TSV-based tables ('Length', 'Fragment')
df = pd.read_excel(
    INPUT_PATH,
    sheet_name=SHEET_NAME,
    usecols=COLUMNS_TO_EXTRACT,
).rename(columns={"Length (aa)": "Length", "Fragments": "Fragment"})

# Create the destination folder on first use, then write without the index
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(OUTPUT_PATH, index=False)

# Short report so the result can be checked at a glance
print(f"Data extracted and saved to {OUTPUT_PATH}")
print(f"Shape of the extracted data: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print("First few rows:")
display(df.head())


Data extracted and saved to ../data/interim/toxprot_2017_old.csv
Shape of the extracted data: (6658, 7)
Columns: ['Entry', 'Organism', 'Protein families', 'Length', 'Fragment', 'Toxic dose', 'PTM']
First few rows:


Unnamed: 0,Entry,Organism,Protein families,Length,Fragment,Toxic dose,PTM
0,Q26292,Leiurus quinquestriatus hebraeus (Yellow scorp...,"Long (4 C-C) scorpion toxin superfamily, Sodiu...",85,,,
1,P30431,Bothrops jararaca (Jararaca) (Bothrops jajaraca),"Venom metalloproteinase (M12B) family, P-III s...",571,fragment,,The N-terminus of Jararhagin is blocked.
2,P60266,Centruroides suffusus suffusus (Mexican scorpion),"Long (4 C-C) scorpion toxin superfamily, Sodiu...",66,,LD(50) is 0.12 ug/kg in mouse by intracerebrov...,
3,P00626,Vipera ammodytes ammodytes (Western sand viper),"Phospholipase A2 family, Group II subfamily, D...",138,,LD(50) is 0.021 mg/kg by intravenous injection...,
4,P60274,Conus geographus (Geography cone) (Nubecula ge...,Conotoxin A superfamily,66,,,Gamma-carboxyglutamation of Glu-48 seems to be...


## Parse SwissProt 2017-11 (extracted from [FTP](https://ftp.uniprot.org/pub/databases/uniprot/previous_major_releases/release-2017_11/))

In [100]:
# Input TSV for the 2017-11 release and the trimmed CSV written next to it.
# (The output variable was previously misspelled `CAV_OUTPUT_PATH`; it now
# uses the same `CSV_OUTPUT_PATH` name as the 2025-03 cell.)
TSV_INPUT_PATH = DATA_PATH / "interim" / "toxprot_2017.tsv"
CSV_OUTPUT_PATH = DATA_PATH / "interim" / "toxprot_2017.csv"

# Columns to keep. The TSV spells the PTM column out in full, unlike the
# Excel sheet parsed above.
COLUMNS_TO_EXTRACT = [
    "Entry",
    "Organism",
    "Organism (ID)",
    "Protein families",
    "Length",
    "Fragment",
    "Toxic dose",
    "Post-translational modification",  # Column name as it appears in the TSV file
]

# Read only the selected columns from the tab-separated file
df_tsv = pd.read_csv(TSV_INPUT_PATH, sep='\t', usecols=COLUMNS_TO_EXTRACT)

# Rename 'Post-translational modification' to 'PTM' for consistency
df_tsv = df_tsv.rename(columns={"Post-translational modification": "PTM"})

# Save as CSV (index omitted)
df_tsv.to_csv(CSV_OUTPUT_PATH, index=False)

# Display information about the processed data
print(f"TSV data extracted and saved to {CSV_OUTPUT_PATH}")
print(f"Shape of the extracted TSV data: {df_tsv.shape}")
print(f"Columns: {df_tsv.columns.tolist()}")
print("First few rows:")
display(df_tsv.head())

TSV data extracted and saved to ../data/interim/toxprot_2017.csv
Shape of the extracted TSV data: (6703, 8)
Columns: ['Entry', 'Organism', 'Organism (ID)', 'Protein families', 'Length', 'Fragment', 'Toxic dose', 'PTM']
First few rows:


Unnamed: 0,Entry,Organism,Organism (ID),Protein families,Length,Fragment,Toxic dose,PTM
0,P84001,Ancylometes sp. (South American fishing spider),280265,,50,,,
1,P84027,Ancylometes sp. (South American fishing spider),280265,Omega-agatoxin superfamily. Type II/III omega-...,37,,,
2,Q7M3V1,Chelonus sp. nr. curvimaculatus (Parasitic wasp),132888,,246,,,
3,F8J2B3,Drysdalia coronoides (White-lipped snake) (Hop...,66186,Snake three-finger toxin family. Long- chain s...,108,,,
4,Q53B54,Ophiophagus hannah (King cobra) (Naja hannah),8665,Snake three-finger toxin family. Long- chain s...,72,,LD(50) is 210 ug/kg by intraperitoneal injecti...,


## Parse SwissProt 2025-03

In [None]:
# Raw TSV for the 2025-03 release and the interim CSV derived from it
TSV_INPUT_PATH = DATA_PATH / "raw" / "202503_ToxProt.tsv"
CSV_OUTPUT_PATH = DATA_PATH / "interim" / "toxprot_2025.csv"

# Columns to keep. The TSV spells the PTM column out in full.
COLUMNS_TO_EXTRACT = [
    "Entry",
    "Organism",
    "Organism (ID)",
    "Protein families",
    "Length",
    "Fragment",
    "Toxic dose",
    "Post-translational modification",  # Column name as it appears in the TSV file
]

# Read only the selected columns from the tab-separated file
df_tsv = pd.read_csv(TSV_INPUT_PATH, sep='\t', usecols=COLUMNS_TO_EXTRACT)

# Rename 'Post-translational modification' to 'PTM' for consistency
df_tsv = df_tsv.rename(columns={"Post-translational modification": "PTM"})

# Ensure the output directory exists before writing; without this the cell
# fails with an OSError when it is run before anything has created 'interim'
CSV_OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
df_tsv.to_csv(CSV_OUTPUT_PATH, index=False)

# Display information about the processed data
print(f"TSV data extracted and saved to {CSV_OUTPUT_PATH}")
print(f"Shape of the extracted TSV data: {df_tsv.shape}")
print(f"Columns: {df_tsv.columns.tolist()}")
print("First few rows:")
display(df_tsv.head())


TSV data extracted and saved to ../data/interim/toxprot_2025.csv
Shape of the extracted TSV data: (8055, 8)
Columns: ['Entry', 'Organism', 'Organism (ID)', 'Protein families', 'Length', 'Fragment', 'Toxic dose', 'PTM']
First few rows:


Unnamed: 0,Entry,Organism,Organism (ID),Protein families,Length,Fragment,Toxic dose,PTM
0,A0A068B6Q6,Conus betulinus (Beech cone),89764,Conotoxin A superfamily,37,fragment,,
1,A0A088MIT0,Physalaemus nattereri (Cuyaba dwarf frog) (Eup...,248869,"Frog skin active peptide (FSAP) family, Bradyk...",134,,,
2,A0A0A1I6E7,Androctonus crassicauda (Arabian fat-tailed sc...,122909,Non-disulfide-bridged peptide (NDBP) superfami...,74,,,
3,A0A0A1I6N9,Androctonus crassicauda (Arabian fat-tailed sc...,122909,Non-disulfide-bridged peptide (NDBP) superfami...,74,,,
4,A0A0B4U9L8,Vipera ammodytes ammodytes (Western sand viper),8705,"Venom metalloproteinase (M12B) family, P-III s...",614,,,PTM: N-glycosylated. {ECO:0000269|PubMed:25549...


# Get taxonomy info
Add taxonomic information (scientific name, order, family, genus) to the extracted CSV files by looking up each `Organism (ID)` with taxopy.

In [101]:
import pandas as pd
import taxopy
from pathlib import Path


def setup_db_paths():
    """Create the taxopy cache directory if needed and return its paths.

    Returns:
        tuple: ``(db_dir, nodes_file, names_file, merged_file)`` where
        ``db_dir`` is ``~/.cache/taxopy_db`` and the remaining entries are
        the ``nodes.dmp``, ``names.dmp`` and ``merged.dmp`` paths inside it.
        Only the directory is created; the files themselves may not exist.
    """
    cache_root = Path.home().joinpath(".cache", "taxopy_db")
    cache_root.mkdir(parents=True, exist_ok=True)

    dump_paths = tuple(
        cache_root / file_name
        for file_name in ("nodes.dmp", "names.dmp", "merged.dmp")
    )
    return (cache_root, *dump_paths)


def initialize_taxdb():
    """Load the taxopy database from the local cache, downloading it if absent.

    The cache location comes from ``setup_db_paths()``. When both
    ``nodes.dmp`` and ``names.dmp`` are present they are loaded directly;
    otherwise taxopy is asked to download the taxonomy dump into the cache
    directory and keep the files for later runs.

    Returns:
        taxopy.TaxDb: The initialized taxonomy database.
    """
    db_dir, nodes_file, names_file, merged_file = setup_db_paths()

    if nodes_file.exists() and names_file.exists():
        print(f"Loading existing taxopy database from {db_dir}")
        # Only nodes.dmp and names.dmp are checked above, so merged.dmp may
        # be missing from the cache. Pass it only when it is really there
        # instead of handing taxopy a path that does not exist.
        merged_dmp = str(merged_file) if merged_file.exists() else None
        return taxopy.TaxDb(
            nodes_dmp=str(nodes_file),
            names_dmp=str(names_file),
            merged_dmp=merged_dmp,
        )

    print(f"Downloading taxopy database to {db_dir}")
    return taxopy.TaxDb(taxdb_dir=str(db_dir), keep_files=True)


def get_taxonomy_info(taxon_id, taxdb):
    """Look up name, order, family and genus for a single taxon ID.

    Args:
        taxon_id: Taxon identifier. Any value accepted by ``int()`` works,
            so numpy integers and whole-number floats (which pandas produces
            when the ID column contains missing values) are handled.
        taxdb: The ``taxopy.TaxDb`` instance to query.

    Returns:
        dict: Keys ``taxon_name``, ``order``, ``family`` and ``genus``.
        A rank that is absent from the taxon's lineage is returned as ``""``.
    """
    # Normalise to a plain int so the lookup does not depend on the dtype
    # pandas happened to infer for the ID column.
    taxon = taxopy.Taxon(int(taxon_id), taxdb)

    # Mapping of rank name -> taxon name along this taxon's lineage
    ranks = taxon.rank_name_dictionary

    return {
        "taxon_name": taxon.name,
        "order": ranks.get("order", ""),
        "family": ranks.get("family", ""),
        "genus": ranks.get("genus", ""),
    }


def build_taxonomy_cache(df, taxdb):
    """Map each distinct, non-missing ``Organism (ID)`` to its taxonomy record.

    Every unique ID is resolved once via ``get_taxonomy_info``, so repeated
    organisms in ``df`` cost a single lookup.
    """
    distinct_ids = df["Organism (ID)"].unique()
    return {
        organism_id: get_taxonomy_info(organism_id, taxdb)
        for organism_id in distinct_ids
        if pd.notna(organism_id)
    }


def add_taxonomy_columns(df, taxonomy_cache):
    """Attach Scientific_Name, Order, Family and Genus columns to ``df``.

    Values are looked up in ``taxonomy_cache`` by ``Organism (ID)``; an ID
    missing from the cache, or a record missing a field, yields ``""``.
    The frame is modified in place and also returned.
    """
    # (new column name, key inside each cached taxonomy record)
    column_to_field = (
        ("Scientific_Name", "taxon_name"),
        ("Order", "order"),
        ("Family", "family"),
        ("Genus", "genus"),
    )

    organism_ids = df["Organism (ID)"]
    for column, field in column_to_field:
        # `field=field` binds the current key to this iteration's lambda
        df[column] = organism_ids.map(
            lambda organism_id, field=field: taxonomy_cache.get(organism_id, {}).get(field, "")
        )

    return df


def process_dataframe(input_path, output_path, taxdb):
    """Load a CSV, add taxonomy columns, drop 'Organism', and save the result.

    Args:
        input_path: Path of the CSV to read; it must contain an
            ``Organism (ID)`` column.
        output_path: Path of the CSV to write. Missing parent directories
            are created.
        taxdb: The ``taxopy.TaxDb`` instance used for the lookups.

    Returns:
        None. The processed table is written to ``output_path``.
    """
    # Load the dataframe
    print(f"Loading data from {input_path}")
    df = pd.read_csv(input_path)

    # Resolve each distinct organism ID once
    print("Building taxonomy cache...")
    taxonomy_cache = build_taxonomy_cache(df, taxdb)

    # Add taxonomy columns
    print("Adding taxonomy columns...")
    df = add_taxonomy_columns(df, taxonomy_cache)

    # Remove the Organism column if it exists
    if "Organism" in df.columns:
        print("Removing 'Organism' column...")
        df = df.drop(columns=["Organism"])

    # Create the destination folder first: to_csv does not create missing
    # directories and would otherwise fail on a fresh checkout.
    print(f"Saving processed data to {output_path}")
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"Processing complete. Data saved to {output_path}")

In [102]:
print("Initializing taxonomy database...")
taxdb = initialize_taxdb()

# Annotate each release in turn: (interim CSV in, processed CSV out).
# ToxProt 2017-11 first, then ToxProt 2025-03.
for input_path, output_path in (
    ("../data/interim/toxprot_2017.csv", "../data/processed/toxprot_2017.csv"),
    ("../data/interim/toxprot_2025.csv", "../data/processed/toxprot_2025.csv"),
):
    process_dataframe(input_path, output_path, taxdb)

Initializing taxonomy database...
Loading existing taxopy database from /Users/tsenoner/.cache/taxopy_db
Loading data from ../data/interim/toxprot_2017.csv
Building taxonomy cache...
Adding taxonomy columns...
Removing 'Organism' column...
Saving processed data to ../data/processed/toxprot_2017.csv
Processing complete. Data saved to ../data/processed/toxprot_2017.csv
Loading data from ../data/interim/toxprot_2025.csv
Building taxonomy cache...
Adding taxonomy columns...
Removing 'Organism' column...
Saving processed data to ../data/processed/toxprot_2025.csv
Processing complete. Data saved to ../data/processed/toxprot_2025.csv
