In [None]:
# Download PTM-related UniProtKB annotations for all human proteins
import time

import pandas as pd
import requests

# UniProtKB search endpoint. The download function below requests this URL
# first (with the query parameters) and then follows the "next" link that
# comes back with each response.
BASE_URL = "https://rest.uniprot.org/uniprotkb/search"


def download_uniprot_ptm_human_tsv(
    output_path: str,
    size: int = 500,
    max_pages: int | None = None,
    sleep_seconds: float = 0.2,
    accessions: list[str] | None = None,
    gene_names: list[str] | None = None,
) -> int:
    base_query = (
        "taxonomy_id:9606 AND ("
        "ft_mod_res:* OR ft_lipid:* OR ft_carbohyd:* OR ft_crosslnk:* OR ft_disulfid:*"
        ")"
    )
    filters = []
    if accessions:
        filters.append("accession:" + " OR accession:".join(accessions))
    if gene_names:
        filters.append("gene:" + " OR gene:".join(gene_names))

    if filters:
        query = f"({base_query}) AND ({' OR '.join(filters)})"
    else:
        query = base_query

    fields = [
        "accession",
        "id",
        "gene_primary",
        "protein_name",
        "sequence",
        "ft_mod_res",
        "ft_lipid",
        "ft_carbohyd",
        "ft_crosslnk",
        "ft_disulfid",
    ]

    params = {
        "query": query,
        "format": "tsv",
        "fields": ",".join(fields),
        "size": size,
    }

    next_url = BASE_URL
    pages = 0
    rows_written = 0
    header_written = False

    with open(output_path, "w", encoding="utf-8") as handle:
        while next_url:
            response = requests.get(
                next_url, params=params if next_url == BASE_URL else None, timeout=60
            )
            response.raise_for_status()

            lines = response.text.strip().splitlines()
            if not lines:
                break

            if not header_written:
                handle.write(lines[0] + "\n")
                header_written = True

            for line in lines[1:]:
                handle.write(line + "\n")
                rows_written += 1

            pages += 1
            if max_pages is not None and pages >= max_pages:
                break

            next_url = response.links.get("next", {}).get("url")
            if next_url:
                time.sleep(sleep_seconds)

    return rows_written



Saved 16451 rows to uniprot_human_ptm.tsv


Unnamed: 0,Entry,Entry Name,Gene Names (primary),Protein names,Modified residue,Lipidation,Glycosylation,Cross-link,Disulfide bond
0,P20700,LMNB1_HUMAN,LMNB1,Lamin-B1,"MOD_RES 2; /note=""N-acetylalanine""; /evidence=...","LIPID 583; /note=""S-farnesyl cysteine""; /evide...","CARBOHYD 399; /note=""O-linked (GlcNAc) threoni...","CROSSLNK 102; /note=""Glycyl lysine isopeptide ...","DISULFID 317; /note=""Interchain""; /evidence=""E..."
1,P56817,BACE1_HUMAN,BACE1,Beta-secretase 1 (EC 3.4.23.46) (Aspartyl prot...,"MOD_RES 126; /note=""N6-acetyllysine""; /evidenc...","LIPID 474; /note=""S-palmitoyl cysteine""; /evid...","CARBOHYD 153; /note=""N-linked (GlcNAc...) aspa...","CROSSLNK 501; /note=""Glycyl lysine isopeptide ...","DISULFID 216..420; /evidence=""ECO:0000269|PubM..."
2,O75581,LRP6_HUMAN,LRP6,Low-density lipoprotein receptor-related prote...,"MOD_RES 1420; /note=""Phosphoserine; by CK1""; /...","LIPID 1394; /note=""S-palmitoyl cysteine""; /evi...","CARBOHYD 42; /note=""N-linked (GlcNAc...) aspar...","CROSSLNK 1403; /note=""Glycyl lysine isopeptide...","DISULFID 286..297; /evidence=""ECO:0000255|PROS..."
3,P00533,EGFR_HUMAN,EGFR,Epidermal growth factor receptor (EC 2.7.10.1)...,"MOD_RES 229; /note=""Phosphoserine""; /evidence=...","LIPID 1049; /note=""S-palmitoyl cysteine""; /evi...","CARBOHYD 56; /note=""N-linked (GlcNAc...) (comp...","CROSSLNK 716; /note=""Glycyl lysine isopeptide ...","DISULFID 31..58; /evidence=""ECO:0000269|PubMed..."
4,P17181,INAR1_HUMAN,IFNAR1,Interferon alpha/beta receptor 1 (IFN-R-1) (IF...,"MOD_RES 466; /note=""Phosphotyrosine; by TYK2"";...","LIPID 463; /note=""S-palmitoyl cysteine""; /evid...","CARBOHYD 50; /note=""N-linked (GlcNAc...) aspar...","CROSSLNK 501; /note=""Glycyl lysine isopeptide ...","DISULFID 79..87; /evidence=""ECO:0000269|PubMed..."


In [None]:

# Destination file for the UniProt PTM export
output_path = "uniprot_human_ptm.tsv"

# To restrict the download to your MS-identified proteins, pass either list
# below to the call through `accessions=` / `gene_names=`:
# accessions = ["P00533", "P20700"]
# gene_names = ["EGFR", "LMNB1"]

# No filters are supplied, so both keep their default of None.
rows = download_uniprot_ptm_human_tsv(output_path)
print(f"Saved {rows} rows to {output_path}")

# Show the first five records of the saved file
preview = pd.read_table(output_path, nrows=5)
preview