In [None]:
import pandas as pd
from pathlib import Path

# ---------------------------------------------------------------------------
# Extract a subset of columns from one sheet of the raw Excel workbook and
# store it as a CSV under the processed-data folder.
# ---------------------------------------------------------------------------

# File locations, all relative to the directory the notebook is run from.
DATA_PATH = Path("..", "data")
INPUT_PATH = DATA_PATH.joinpath("raw", "2023_Kim.xlsx")
OUTPUT_PATH = DATA_PATH.joinpath("processed", "toxprot_2017.csv")

# Which sheet to open and which of its columns to keep.
SHEET_NAME = "ToxProt11.2017"
COLUMNS_TO_EXTRACT = [
    "Entry",
    "Organism",
    "Protein families",
    "Length (aa)",
    "Fragments",
    "Toxic dose",
    "PTM",
]

# Load only the requested columns, then shorten 'Length (aa)' to 'Length'.
# Both steps are chained so `df` is bound once, already in its final shape.
df = (
    pd.read_excel(INPUT_PATH, sheet_name=SHEET_NAME, usecols=COLUMNS_TO_EXTRACT)
    .rename(columns={"Length (aa)": "Length"})
)

# Create the target folder if it is missing, then write the CSV without the
# DataFrame index.
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(OUTPUT_PATH, index=False)

# Short report of what was written, followed by a preview of the first rows.
print(
    f"Data extracted and saved to {OUTPUT_PATH}",
    f"Shape of the extracted data: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    "First few rows:",
    sep="\n",
)
# `display` is not imported here; it is expected to come from the
# IPython/Jupyter session the notebook runs in.
display(df.head())


Data extracted and saved to ../data/processed/toxprot_2017.csv
Shape of the extracted data: (6658, 7)
Columns: ['Entry', 'Organism', 'Protein families', 'Length', 'Fragments', 'Toxic dose', 'PTM']
First few rows:


Unnamed: 0,Entry,Organism,Protein families,Length,Fragments,Toxic dose,PTM
0,Q26292,Leiurus quinquestriatus hebraeus (Yellow scorp...,"Long (4 C-C) scorpion toxin superfamily, Sodiu...",85,,,
1,P30431,Bothrops jararaca (Jararaca) (Bothrops jajaraca),"Venom metalloproteinase (M12B) family, P-III s...",571,fragment,,The N-terminus of Jararhagin is blocked.
2,P60266,Centruroides suffusus suffusus (Mexican scorpion),"Long (4 C-C) scorpion toxin superfamily, Sodiu...",66,,LD(50) is 0.12 ug/kg in mouse by intracerebrov...,
3,P00626,Vipera ammodytes ammodytes (Western sand viper),"Phospholipase A2 family, Group II subfamily, D...",138,,LD(50) is 0.021 mg/kg by intravenous injection...,
4,P60274,Conus geographus (Geography cone) (Nubecula ge...,Conotoxin A superfamily,66,,,Gamma-carboxyglutamation of Glu-48 seems to be...
