# Update Protein Identifiers to UniProt

Note: Requires internet connection to download information from the UniProt.

## Setup
### Import packages

In [1]:
import numpy as np
import pandas as pd
from rbc_gem_utils import (
    build_string,
    check_database_release_online,
    get_dirpath,
    show_versions,
)
from rbc_gem_utils.database.uniprot import (
    UNIPROT_DB_TAG,
    UNIPROT_ID_RE,
    get_release_UniProt,
    query_UniProt,
)

# Display versions of last time notebook ran and worked
show_versions()


Package Information
-------------------
rbc-gem-utils 0.0.3

Dependency Information
----------------------
beautifulsoup4                       4.13.4
bio                                   1.8.0
cobra                                0.29.1
depinfo                               2.2.0
gurobipy                             12.0.3
matplotlib                           3.10.3
matplotlib-venn                       1.1.2
memote                               0.17.0
networkx                                3.5
notebook                              7.4.4
openpyxl                              3.1.5
pandas                                2.3.1
pre-commit                            4.2.0
rbc-gem-utils[database,network,vis] missing
requests                             2.32.4
scikit-learn                          1.7.0
scipy                                1.16.0
seaborn                              0.13.2

Build Tools Information
-----------------------
pip          25.1
setuptools 78.1.1
wheel      0.45

## Check UniProt release
If the release does not match the expected release, it is because database has been updated since the last time this code was utilized. 

* According to [UniProt](https://www.uniprot.org/help/downloads), updates to the database are made every eight weeks.
* If the current release does not match the expected release, it is because database has been updated since the last time this code was utilized.
    * If the notebook works without needing any significant modifications, the only update needed is to the release in the [uniprot.py](../../src/rbc_gem_utils/database/uniprot.py) source code file to resolve the issue.

In [2]:
release = get_release_UniProt()

use_interim = not check_database_release_online(UNIPROT_DB_TAG, verbose=True, **{})
# Use different directory paths for unexpected behavior
if use_interim:
    warn(
        "Online release of database has been updated since the last time notebook was used."
    )


database_dirpath = get_dirpath(
    "database", UNIPROT_DB_TAG, use_temp="interim" if use_interim else None
)
annotation_dirpath = get_dirpath(
    "annotation", use_temp="interim" if use_interim else None
)

# Ensure directories exist
database_dirpath.mkdir(exist_ok=True, parents=True)
annotation_dirpath.mkdir(exist_ok=True, parents=True)

Current and expected releases match. Current release: 2025_03


## Load aggregated proteomic data, raw

In [3]:
overwrite = True

excel_filepath_raw = (
    get_dirpath("proteomics", use_temp="external") / "proteomics_aggregated_raw.xlsx"
)

### Load obsolete identifier mapping
Meant to get all IDs into UniProt identifiers. Does not account for obselete/deleted/old UniProt IDs. Those must be manually checked

In [4]:
df_obsolete = pd.read_csv(
    get_dirpath("proteomics", use_temp="external") / "proteomics_obsolete.tsv",
    sep="\t",
    dtype=str,
    index_col=None,
)
df_obsolete

Unnamed: 0,UniProt,IPI,GI,EntryNames,UniParc,Other,Known issue?
0,A0A024QZ64,,78070601,,UPI0000000C53,,
1,A0A024R0L6,,4505587;14043830,,UPI0000131074,,
2,A0A024R1A3,,23510338,,UPI0000137946,,
3,A0A024R1I3,,10092677;40674427,,UPI000006D362,,
4,A0A024R4T4,,4507791,,UPI0000020ECC,,
...,...,...,...,...,...,...,...
3559,E9PM46,,,,,,Merged with other UniProt
3560,P27824,,,,,,Merged with other UniProt
3561,Q9BQA1,,,,,,Merged with other UniProt
3562,Q9P035,,,,,,Merged with other UniProt


### Load table of contents, raw

In [5]:
df_contents_raw = pd.read_excel(
    excel_filepath_raw, sheet_name="Table of Contents", dtype=str
).fillna("")
df_contents_raw

Unnamed: 0,PubMed/Sheet Name,ID type,Year,Publication,Notes
0,12362340,UniProt,2002,"Low TY, Seow TK, Chung MC. Separation of human...",Used for iAB-RBC-283
1,14963112,GI,2004,"Kakhniashvili DG, Bulla LA Jr, Goodman SR. The...",
2,16861337,IPI,2006,"Pasini EM, Kirkegaard M, Mortensen P, Lutz HU,...",Used for iAB-RBC-283
3,18399644,IPI,2008,"Simó C, Bachi A, Cattaneo A, Guerrier L, Forti...",
4,18494517,IPI,2008,"Ringrose JH, van Solinge WW, Mohammed S, O'Fla...",
5,18614565,IPI,2008,"Roux-Dalvai F, Gonzalez de Peredo A, Simó C, G...",Used for iAB-RBC-283
6,19778645,GI,2009,"van Gestel RA, van Solinge WW, van der Toorn H...",
7,22157974,GI,2011,"D'Amici GM, Rinalducci S, Zolla L. Depletion o...",
8,22954596,UniProt,2012,"Pesciotta EN, Sriswasdi S, Tang HY, Mason PJ, ...",Disease conditions
9,23781972,GI,2013,"Pallotta V, D'Alessandro A, Rinalducci S, Zoll...",


### Map other identifiers to UniProt

In [6]:
updated_table_dict = {"Table of Contents": df_contents_raw.copy()}
obsolete_dict = {}
problems = {}
index_name = "Uniprot"
for idx, (sheet_name, id_type) in df_contents_raw[
    ["PubMed/Sheet Name", "ID type"]
].iterrows():
    df = pd.read_excel(
        excel_filepath_raw, sheet_name=sheet_name, usecols=[0], dtype=str
    )
    df.columns = [id_type]
    if id_type != "UniProt":
        # Map to obsolete identifiers
        df_obsolete_mapping = (
            df_obsolete[[id_type, "UniProt"]].dropna(how="all").drop_duplicates().copy()
        )
        if id_type == "GI":
            df[id_type] = df[id_type].apply(
                lambda x: x.split("|")[-1] if str(x).startswith("gi|") else x
            )
        df_obsolete_mapping = df.merge(
            df_obsolete_mapping, left_on=id_type, right_on=id_type, how="left"
        )
        obsolete_dict[sheet_name] = df_obsolete_mapping.copy()
        counts = df_obsolete_mapping.nunique()
        counts["Obsolete"] = (
            df_obsolete_mapping["UniProt"].isna().value_counts()[True].item()
        )

        counts = {k: v for (k, v) in sorted(counts.to_dict().items())}
        updated_table_dict["Table of Contents"].loc[idx, "ID type"] = "UniProt"
        note_str = build_string([f"{v} {k}" for k, v in counts.items() if v != 0])
        updated_table_dict["Table of Contents"].loc[idx, "Notes"] = note_str
        updated_table_dict[sheet_name] = list(
            df_obsolete_mapping["UniProt"].dropna().unique()
        )
    else:
        check_mixed_ids = df["UniProt"][
            ~df["UniProt"].apply(lambda x: True if UNIPROT_ID_RE.search(x) else False)
        ]
        counts = {"UniProt": 0}
        if not check_mixed_ids.empty:
            problems[sheet_name] = df
        df["UniProt"] = df["UniProt"].apply(
            lambda x: (
                UNIPROT_ID_RE.search(x).group()
                if isinstance(x, str) and UNIPROT_ID_RE.search(x)
                else x
            )
        )
        counts.update({"UniProt": df["UniProt"].nunique()})
        counts = {k: v for (k, v) in sorted(counts.items())}
        note_str = build_string([f"{v} {k}" for k, v in counts.items() if v != 0])
        updated_table_dict["Table of Contents"].loc[idx, "Notes"] = note_str
        updated_table_dict[sheet_name] = list(df["UniProt"].dropna().unique())

updated_table_dict["Table of Contents"]

Unnamed: 0,PubMed/Sheet Name,ID type,Year,Publication,Notes
0,12362340,UniProt,2002,"Low TY, Seow TK, Chung MC. Separation of human...",84 UniProt
1,14963112,UniProt,2004,"Kakhniashvili DG, Bulla LA Jr, Goodman SR. The...",183 GI;128 Obsolete;66 UniProt
2,16861337,UniProt,2006,"Pasini EM, Kirkegaard M, Mortensen P, Lutz HU,...",590 IPI;197 Obsolete;393 UniProt
3,18399644,UniProt,2008,"Simó C, Bachi A, Cattaneo A, Guerrier L, Forti...",153 IPI;42 Obsolete;111 UniProt
4,18494517,UniProt,2008,"Ringrose JH, van Solinge WW, Mohammed S, O'Fla...",699 IPI;310 Obsolete;389 UniProt
5,18614565,UniProt,2008,"Roux-Dalvai F, Gonzalez de Peredo A, Simó C, G...",1577 IPI;364 Obsolete;1214 UniProt
6,19778645,UniProt,2009,"van Gestel RA, van Solinge WW, van der Toorn H...",561 GI;180 Obsolete;426 UniProt
7,22157974,UniProt,2011,"D'Amici GM, Rinalducci S, Zolla L. Depletion o...",134 GI;214 Obsolete;44 UniProt
8,22954596,UniProt,2012,"Pesciotta EN, Sriswasdi S, Tang HY, Mason PJ, ...",831 UniProt
9,23781972,UniProt,2013,"Pallotta V, D'Alessandro A, Rinalducci S, Zoll...",140 GI;47 Obsolete;118 UniProt


### Map UniProt IDs to current UniProt

In [7]:
all_failed = {}
all_unmapped = {}

In [8]:
# Extract all relevant information for now and save
query_parameters = {
    "query": " && ".join(
        [
            "(organism_id:9606)",  # Homo sapiens (Human)
        ]
    ),
    "format": "tsv",
    "size": 500,
    "compressed": True,
    "fields": ",".join(
        [
            "reviewed",
            "accession",
            "gene_primary",
        ]
    ),
}

idx = -1
# Use redo sheets to remap specific sheets that failed
redo_sheets = set([])
issues = ['28689405']
for sheet_name, query_ids in updated_table_dict.copy().items():
    if sheet_name == "Table of Contents" or (
        redo_sheets and sheet_name not in redo_sheets
    ):
        idx += 1
        continue
    print(f"{idx}) {sheet_name}\n{(4 + len(sheet_name)) * '-'}")
    try:
        df_results, uniparc, failed_ids, unmapped_ids = query_UniProt(
            query_ids,
            query_parameters=query_parameters,
            from_db="UniProtKB",
            to_db="UniProtKB",
            return_failed=True,
        )
    except:
        # Try additional time before continuing
        print(f"Issue with {sheet_name}, retry after extraction is finished\n")
        idx += 1
        issues += [sheet_name]
        continue
    if failed_ids:
        print(f"Failed IDS: {failed_ids}\n")
        all_failed[sheet_name] = set(failed_ids)
    if unmapped_ids:
        print(f"Unmmaped IDS: {unmapped_ids}\n")
        all_unmapped[sheet_name] = set(unmapped_ids)
    df_uniprot = df_results["Entry"].drop_duplicates().dropna().reset_index(drop=True)
    df_uniprot.name = index_name
    updated_table_dict[sheet_name] = df_uniprot
    updated_table_dict["Table of Contents"].loc[
        idx, "Notes"
    ] += f" -->  {len(df_uniprot)} UniProt {release}"
    idx += 1
    print()
print(issues)
updated_table_dict["Table of Contents"]

0) 12362340
------------


Number of failed query IDs : 5


Fetched: 79 / 79


Number of failed IDs : 3


Failed IDS: {'182074', '7710117', '4503581'}

Unmmaped IDS: {'Q64689', 'P80315'}


1) 14963112
------------
Fetched: 66 / 66

2) 16861337
------------
Fetched: 396 / 396

3) 18399644
------------
Fetched: 112 / 112

4) 18494517
------------
Fetched: 389 / 389

5) 18614565
------------
Fetched: 500 / 1217
Fetched: 1000 / 1217
Fetched: 1217 / 1217

6) 19778645
------------
Fetched: 427 / 427

7) 22157974
------------
Fetched: 44 / 44

8) 22954596
------------
Fetched: 500 / 765


Number of failed query IDs : 67


Fetched: 765 / 765


Number of failed IDs : 8
Number of obsolete IDs : 54


Failed IDS: {'1A85', 'UPI0001B1', 'UPI0001E6', 'A4B8', 'UPI000188', 'UPI000041', 'F69B', '20FF'}

Unmmaped IDS: {'D9YZV2', 'E7EQ55', 'E9PC36', 'E7EUY0', 'E9PEC0', 'B1AKJ5', 'E9PCW5', 'E9PBB7', 'C9JCD7', 'B5ME63', 'P35241', 'Q5RA57', 'D3DPK5', 'C9J436', 'D3DR22', 'E7ESQ4', 'B0UX83', 'Q86U75', 'A8MTC3', 'E2RB38', 'B7WPI6', 'Q6PIK3', 'D3DUP1', 'A8MZ22', 'C9JLU2', 'D3DTE6', 'C9JNF5', 'A8MZ76', 'E9PDP1', 'A8MW68', 'A8MVJ7', 'A8MWX0', 'E7ENR8', 'E9PAN3', 'A8MTG8', 'C9K0W8', 'E7EQR4', 'C9JUN6', 'A6NDN4', 'E2QS59', 'A8MYA6', 'E9PIZ3', 'D3DP46', 'D3DQ69', 'E7EVT0', 'E9PC14', 'E7EVU8', 'D6R985', 'A1L1K4', 'A7E2Y5', 'E7ETK2', 'E7EMJ6', 'E9PAR6', 'B9ZVX0', 'E9PBD4', 'A5GFU6', 'E9PCQ3', 'E7EVN0', 'C9IZE4'}


9) 23781972
------------
Fetched: 119 / 119

10) 24555563
------------
Fetched: 500 / 1214
Fetched: 1000 / 1214


Number of failed query IDs : 9


Fetched: 1214 / 1214


Number of obsolete IDs : 5


Unmmaped IDS: {'B5MCY6', 'F6RGB9', 'H0Y7U1', 'Q6PK30', 'B4E3P0', 'E9PEW9', 'F8VRQ1', 'B4DXJ1', 'J3KPV7'}


11) 26078478
------------


Number of failed query IDs : 6


Fetched: 416 / 416


Number of obsolete IDs : 6


Unmmaped IDS: {'H0Y5J2', 'H0YLA4', 'Q5VSJ9', 'I3NI00', 'E7ETU3', 'B1ALS2'}


12) 26271157
------------


Number of failed query IDs : 4


Fetched: 72 / 72


Number of obsolete IDs : 4


Unmmaped IDS: {'K7EJT8', 'Q5VSJ9', 'D3DRP5', 'D6W5X8'}


13) 26474164
------------
Fetched: 500 / 1148
Fetched: 1000 / 1148


Number of failed query IDs : 98


Fetched: 1148 / 1148


Number of failed IDs : 48
Number of obsolete IDs : 22


Failed IDS: {'31747338', '114051856', '2370337', '51247812', '3BPQ4', '10835849', '73996461', '0Y990', '73996342', '205055', '56710317', '999627', '73996350', '0Y5R6', '2NNT7', '126308130', '149714766', '9FQK7', '0Y3V4', '15341208', '2194089', '57012432', '56847618', '109119997', '73996332', '81170669', 'UPI0002AF517B', '73996336', '2R572', '50233797', '50582982', '82995623', '20911031', '73965965', '126308120', '148235126', '149714762', '12345', '1942352', '67549', '73996316', '3318722', '116004057', '16303309', '73996314', '6754450', '51247813', '126308122'}

Unmmaped IDS: {'P62937', 'E2QRB9', 'P29562', 'G5B5L5', 'P31939', 'L5JU82', 'F2Z3F8', 'P02024', 'J3KRC6', 'F6YXG7', 'G3S502', 'Q3TWV4', 'Q59EU8', 'G1S738', 'F6UEB3', '12345;10835849;3318722;51247813;1942352', 'F5H698', 'F5H6Y7', 'Q7Z474', 'F7FMZ1', 'L5K2B1', 'G1QXD5', 'D3Z794', 'G1QJZ1', 'G1RZN3', 'K7C0T3', 'K7B5G6', 'Q5R507', 'Q9TT33', 'F7FCV2', '149714766;149714762;73996336;73996350;73996342;73996314;81170669;20911031;16303309;

Number of failed query IDs : 130


Fetched: 1187 / 1187


Number of obsolete IDs : 117


Unmmaped IDS: {'E7EWT1', 'F5GXH1', 'F5GZ16', 'A8MUU4', 'F5H0N0', 'E7EPC9', 'E7EWR0', 'F5H0H0', 'C9JB13', 'F5GWE5', 'F5H0R5', 'F5H1I1', 'F5H6Z3', 'F5GWQ7', 'E9PBW0', 'F2Z3D0', 'B0V2L0', 'E7ESM6', 'E9PD66', 'E9PHI9', 'B0UX83', 'E9PGR0', 'Q3B7A4', 'C9JSL3', 'E9PH51', 'P63010', 'F5H3T8', 'Q5SYT8', 'B1AH89', 'E7ERV9', 'A8MVD5', 'C9JGQ9', 'E7EQZ3', 'A8MZ71', 'F5H153', 'F2Z3A5', 'B8ZZT7', 'Q5HY54', 'E9PBC7', 'F5H4D6', 'E2QC29', 'F5H4G7', 'E9PNT8', 'F5H223', 'F5GXC8', 'F5GWY2', 'E9PEA8', 'Q567Q5', 'A8MSH7', 'Q5T1I6', 'Q5T6W5', 'E9PDI1', 'F5H4I0', 'E7EVE3', 'F6WSP1', 'A8MYA6', 'E7EP96', 'E7EWX4', 'E9PIZ3', 'E9PAQ6', 'F5H3X9', 'A6NJ11', 'E7EWT6', 'E9PEX0', 'F5H7U0', 'F5H754', 'F5GZB1', 'F5H6K0', 'E9PCQ3', 'Q5JUV3', 'F5H3J5', 'D6RFZ8', 'E9PG85', 'E9PEC0', 'Q5TCU6', 'B7Z7A9', 'F5GX07', 'E7EMJ8', 'E9PB22', 'F5H2W0', 'F5H2A5', 'C9JKD6', 'F5GYQ0', 'E5RJ56', 'F5GWW0', 'E9PC09', 'F5GY14', 'F5H335', 'C8KIM0', 'F5GZJ1', 'F5H569', 'B0V043', 'F5H2Q7', 'O14744', 'F5GZU7', 'A8MZ22', 'E7EWF1', 'F5H328', 'E9PE

Number of failed query IDs : 56


Fetched: 1226 / 1226


Number of obsolete IDs : 53


Unmmaped IDS: {'A0A087X1S2', 'A0A0C4DH83', 'J9JID7', 'E2QRB9', 'A0A0A0MTS2', 'P01609', 'Q5T993', 'A0A0B4J2G9', 'A0A087X2I2', 'A0A087WWS7', 'A0A087WUD7', 'G5E963', 'A0A087WZW8', 'A0A087X0E2', 'F8W914', 'A0A087WUQ6', 'A0A087X232', 'A0A087WX08', 'H0Y6Q0', 'A0A0D9SFK2', 'A0A087WWL9', 'A0A087X0K1', 'F5GWE5', 'A0A087WWJ2', 'A0A087X0C8', 'X6RGJ2', 'E7EQR4', 'A0A087WUL0', 'S4R460', 'H7BXK9', 'A0A087X079', 'A0A0C4DGX4', 'A0A087WZR3', 'A0A087WTB8', 'A0A087WY82', 'A0A087WZE4', 'A0A087WXL8', 'A0A087WVF8', 'J3KR44', 'A0A0A0MT60', 'A0A087X2C0', 'Q8WZ42', 'A0A0A0MSW4', 'A0A087WXS7', 'A0A087WYC5', 'F2Z2X4', 'K7ELI3', 'A0A087WVV2', 'A0A087WUZ3', 'A0A075B6J4', 'I3L2G5', 'B1ANM7', 'Q5HY54', 'A0A0A0MSK3', 'A0A087X0Q9', 'A0A0C4DGZ8'}


16) 28263177
------------
Fetched: 500 / 1815
Fetched: 1000 / 1815
Fetched: 1500 / 1815


Number of failed query IDs : 14


Fetched: 1815 / 1815


Number of obsolete IDs : 2


Unmmaped IDS: {'P01621', 'P04220', 'P18135', 'P01769', 'P01598', 'P01772', 'P04434', 'P01608', 'P01779', 'P04206', 'P06326', 'A6NJ16', 'P01620', 'P01602'}


17) 28689405
------------
Fetched: 500 / 2556
Fetched: 1000 / 2556
Fetched: 1500 / 2556
Fetched: 2000 / 2556
Fetched: 2500 / 2556


Number of failed query IDs : 23


Fetched: 2556 / 2556


Number of obsolete IDs : 22


Unmmaped IDS: {'S4R460', 'A0A0A0MQS8', 'A0A0A6YYB0', 'H7C2F2', 'A0A087WUW9', 'V9GY79', 'A0A087WV23', 'A9Z1Z3', 'J3KN10', 'A0A087WZW8', 'K7EQW8', 'A0A087WTI1', 'H0Y8T6', 'A0A087WY61', 'Q5T699', 'C9JV50', 'A0A0G2JNK4', 'D6RD96', 'H0Y905', 'D6RHC5', 'P01891', 'F6QW41', 'F5H0Y5'}


18) 30271928
------------


Number of failed query IDs : 1


Fetched: 262 / 262
Unmmaped IDS: {'P13746'}


19) 30327373
------------
Fetched: 500 / 1884
Fetched: 1000 / 1884
Fetched: 1500 / 1884
Fetched: 1884 / 1884

20) 31552303
------------
Fetched: 267 / 267

21) 33103907
------------
Fetched: 500 / 921
Fetched: 921 / 921

22) 33341364
------------
Fetched: 500 / 841


Number of failed query IDs : 35


Fetched: 841 / 841


Number of obsolete IDs : 29


Unmmaped IDS: {'E7EPC6', 'Q5TCU6', 'K7EJT8', 'F5H223', 'F5H012', 'H7BY58', 'E7EW52', 'G5EA52', 'J3KN69', 'M0QX55', 'E7ESI6', 'Q5T6W5', 'F5H4R9', 'E9PM46', 'F5GWE5', 'F5GZD0', 'J3KNL3', 'Q9P035', 'Q9BQA1', 'E7ETU3', 'E7EU23', 'Q53H96', 'B4DDP6', 'B8ZZP3', 'F5H569', 'J3KR44', 'P27824', 'G3XAN9', 'F2Z2X4', 'F5GXV9', 'F8WAQ9', 'I7HJJ0', 'B1ANM7', 'J3QK90', 'C9IZE4'}


23) 33676898
------------
Fetched: 500 / 735


Number of failed query IDs : 1


Fetched: 735 / 735
Unmmaped IDS: {'P01765'}


24) 33806028
------------
Fetched: 500 / 1339
Fetched: 1000 / 1339
Fetched: 1339 / 1339

25) 35858567
------------
Fetched: 500 / 1530
Fetched: 1000 / 1530
Fetched: 1500 / 1530
Fetched: 1530 / 1530

26) 36346805
------------
Fetched: 500 / 659
Fetched: 659 / 659

27) 37760001
------------
Fetched: 500 / 878
Fetched: 878 / 878

28) 37942280
------------
Fetched: 500 / 1812
Fetched: 1000 / 1812
Fetched: 1500 / 1812


Number of failed query IDs : 2


Fetched: 1812 / 1812


Number of obsolete IDs : 2


Unmmaped IDS: {'P0DN79', 'A0A0B4J2D5'}


29) 38147558
------------
Fetched: 500 / 1208
Fetched: 1000 / 1208
Fetched: 1208 / 1208

30) 38964323
------------
Fetched: 500 / 2601
Fetched: 1000 / 2601
Fetched: 1500 / 2601
Fetched: 2000 / 2601
Fetched: 2500 / 2601
Fetched: 2601 / 2601

31) 40043591
------------
Fetched: 346 / 346

['28689405']


Unnamed: 0,PubMed/Sheet Name,ID type,Year,Publication,Notes
0,12362340,UniProt,2002,"Low TY, Seow TK, Chung MC. Separation of human...",84 UniProt --> 79 UniProt 2025_03
1,14963112,UniProt,2004,"Kakhniashvili DG, Bulla LA Jr, Goodman SR. The...",183 GI;128 Obsolete;66 UniProt --> 66 UniProt...
2,16861337,UniProt,2006,"Pasini EM, Kirkegaard M, Mortensen P, Lutz HU,...",590 IPI;197 Obsolete;393 UniProt --> 396 UniP...
3,18399644,UniProt,2008,"Simó C, Bachi A, Cattaneo A, Guerrier L, Forti...",153 IPI;42 Obsolete;111 UniProt --> 112 UniPr...
4,18494517,UniProt,2008,"Ringrose JH, van Solinge WW, Mohammed S, O'Fla...",699 IPI;310 Obsolete;389 UniProt --> 389 UniP...
5,18614565,UniProt,2008,"Roux-Dalvai F, Gonzalez de Peredo A, Simó C, G...",1577 IPI;364 Obsolete;1214 UniProt --> 1217 U...
6,19778645,UniProt,2009,"van Gestel RA, van Solinge WW, van der Toorn H...",561 GI;180 Obsolete;426 UniProt --> 427 UniPr...
7,22157974,UniProt,2011,"D'Amici GM, Rinalducci S, Zolla L. Depletion o...",134 GI;214 Obsolete;44 UniProt --> 44 UniProt...
8,22954596,UniProt,2012,"Pesciotta EN, Sriswasdi S, Tang HY, Mason PJ, ...",831 UniProt --> 765 UniProt 2025_03
9,23781972,UniProt,2013,"Pallotta V, D'Alessandro A, Rinalducci S, Zoll...",140 GI;47 Obsolete;118 UniProt --> 119 UniPro...


#### Quick check unmapped against known deleted entries

In [9]:
known_issues = df_obsolete[["UniProt", "Known issue?"]].dropna(subset="Known issue?")
known_issues = known_issues.set_index("UniProt")["Known issue?"].to_dict()

for k, values in all_unmapped.items():
    print(f"Sheet: {k}")
    for v in values:
        if not UNIPROT_ID_RE.search(v):
            # print(f"{v}\tNot a UniProt ID")
            continue

        elif v in known_issues:
            # print(f"{v}\t{known_issues[v]}")
            continue
        else:
            print(f"{v}\tUnclear")
    print()
    print()

Sheet: 12362340


Sheet: 22954596
P35241	Unclear


Sheet: 24555563


Sheet: 26078478


Sheet: 26271157


Sheet: 26474164
P62937	Unclear
P31939	Unclear
P11586	Unclear


Sheet: 27006477
P63010	Unclear
O14744	Unclear
P78371	Unclear
P06748	Unclear
P62495	Unclear


Sheet: 27777340
Q8WZ42	Unclear


Sheet: 28263177
P01772	Unclear


Sheet: 28689405


Sheet: 30271928
P13746	Unclear


Sheet: 33341364


Sheet: 33676898


Sheet: 37942280




### Export aggregated proteomic data, updated IDs


In [10]:
if overwrite:
    with pd.ExcelWriter(
        get_dirpath("proteomics", use_temp="external") / "proteomics_aggregated.xlsx"
    ) as writer:
        for sheet_name, df in updated_table_dict.items():
            df.to_excel(writer, sheet_name=sheet_name, index=False)