# Update Protein Identifiers to UniProt

Note: Requires an internet connection to download information from UniProt.

## Setup
### Import packages

In [1]:
import pandas as pd
import numpy as np
from rbc_gem_utils import (
    ANNOTATION_PATH,
    DATABASE_PATH,
    EXTERNAL_PATH,
    INTERIM_PATH,
    ROOT_PATH,
    build_string,
    check_version,
    show_versions,
)
from rbc_gem_utils.database.uniprot import (
    UNIPROT_ID_RE,
    UNIPROT_VERSION_EXPECTED,
    get_version_UniProt,
    query_UniProt,
)

# Print the package, dependency, build-tool and platform versions so that the
# environment in which this notebook last ran successfully is recorded with its outputs
show_versions()


Package Information
-------------------
rbc-gem-utils 0.0.1

Dependency Information
----------------------
beautifulsoup4                       4.12.3
bio                                   1.6.2
cobra                                0.29.0
depinfo                               2.2.0
kaleido                               0.2.1
matplotlib                            3.8.2
memote                               0.17.0
networkx                              3.2.1
notebook                              7.0.7
openpyxl                              3.1.2
pandas                                2.2.0
pre-commit                            3.6.0
pyvis                                 0.3.2
rbc-gem-utils[database,network,vis] missing
requests                             2.31.0
scipy                                1.12.0
seaborn                              0.13.2

Build Tools Information
-----------------------
pip        23.3.1
setuptools 68.2.2
wheel      0.41.2

Platform Information
-------------------

## Check UniProt version
If the version does not match the expected version, it is because the database has been updated since the last time this code was run.

### Expected UniProt version: 2025_01
* According to [UniProt](https://www.uniprot.org/help/downloads), updates to the database are made every eight weeks. 

In [2]:
# Compare the UniProt release reported by the project helper with the release
# this notebook was last validated against (verbose=True prints the outcome).
version = get_version_UniProt()
version_matches = check_version(version, UNIPROT_VERSION_EXPECTED, verbose=True)

# On a match, results belong in the regular database/annotation directories;
# otherwise both are redirected to the interim directory for unexpected behavior.
database_dirpath = ROOT_PATH / (DATABASE_PATH if version_matches else INTERIM_PATH)
annotation_dirpath = ROOT_PATH / (ANNOTATION_PATH if version_matches else INTERIM_PATH)

Current and expected versions match.


## Load aggregated proteomic data, raw

In [3]:
# Workbook holding the aggregated proteomic data with the original (raw) identifiers
proteomics_dirpath = ROOT_PATH / EXTERNAL_PATH / "proteomics"
excel_filepath_raw = proteomics_dirpath / "proteomics_aggregated_raw.xlsx"

# True: the export step writes into the external proteomics directory;
# False: the export step writes into the interim directory instead
overwrite = True

### Load obsolete identifier mapping
Meant to convert all IDs into UniProt identifiers. Does not account for obsolete/deleted/old UniProt IDs; those must be checked manually.

In [4]:
# Tab-separated table relating legacy identifiers to UniProt accessions.
# Every column is read as text so identifiers are never coerced to numbers.
obsolete_filepath = (
    ROOT_PATH / EXTERNAL_PATH / "proteomics" / "proteomics_obsolete.tsv"
)
obsolete_read_options = dict(sep="\t", dtype=str, index_col=None)
df_obsolete = pd.read_csv(obsolete_filepath, **obsolete_read_options)
df_obsolete

Unnamed: 0,UniProt,IPI,GI,EntryNames,UniParc,Other,Known issue?
0,A0A024QZ64,,78070601,,UPI0000000C53,,
1,A0A024R0L6,,4505587;14043830,,UPI0000131074,,
2,A0A024R1A3,,23510338,,UPI0000137946,,
3,A0A024R1I3,,10092677;40674427,,UPI000006D362,,
4,A0A024R4T4,,4507791,,UPI0000020ECC,,
...,...,...,...,...,...,...,...
3558,B4DDP6,,,,,,Merged with other UniProt
3559,E9PM46,,,,,,Merged with other UniProt
3560,P27824,,,,,,Merged with other UniProt
3561,Q9BQA1,,,,,,Merged with other UniProt


### Load table of contents, raw

In [5]:
# Read the "Table of Contents" sheet as text, then replace missing cells with
# empty strings so columns such as Notes hold strings throughout.
contents_with_gaps = pd.read_excel(
    excel_filepath_raw,
    sheet_name="Table of Contents",
    dtype=str,
)
df_contents_raw = contents_with_gaps.fillna("")
df_contents_raw

Unnamed: 0,PubMed/Sheet Name,ID type,Year,Publication,Notes
0,12362340,UniProt,2002,"Low TY, Seow TK, Chung MC. Separation of human...",Used for iAB-RBC-283
1,14963112,GI,2004,"Kakhniashvili DG, Bulla LA Jr, Goodman SR. The...",
2,16861337,IPI,2006,"Pasini EM, Kirkegaard M, Mortensen P, Lutz HU,...",Used for iAB-RBC-283
3,18399644,IPI,2008,"Simó C, Bachi A, Cattaneo A, Guerrier L, Forti...",
4,18494517,IPI,2008,"Ringrose JH, van Solinge WW, Mohammed S, O'Fla...",
5,18614565,IPI,2008,"Roux-Dalvai F, Gonzalez de Peredo A, Simó C, G...",Used for iAB-RBC-283
6,19778645,GI,2009,"van Gestel RA, van Solinge WW, van der Toorn H...",
7,22157974,GI,2011,"D'Amici GM, Rinalducci S, Zolla L. Depletion o...",
8,22954596,UniProt,2012,"Pesciotta EN, Sriswasdi S, Tang HY, Mason PJ, ...",Disease conditions
9,23781972,GI,2013,"Pallotta V, D'Alessandro A, Rinalducci S, Zoll...",


### Map other identifiers to UniProt

In [6]:
# Containers filled while walking the table of contents:
#   updated_table_dict : sheet name -> list of unique UniProt IDs, plus the edited
#                        copy of the table of contents under "Table of Contents"
#   obsolete_dict      : sheet name -> merged mapping table (non-UniProt sheets only)
#   problems           : sheet name -> raw IDs of UniProt sheets holding entries
#                        that do not match UNIPROT_ID_RE
updated_table_dict = {"Table of Contents": df_contents_raw.copy()}
obsolete_dict = {}
problems = {}
# Name given to the per-sheet ID series built after the UniProt query
index_name = "Uniprot"


def _is_uniprot_id(value):
    """Return True when ``value`` is a string containing a UNIPROT_ID_RE match."""
    return isinstance(value, str) and UNIPROT_ID_RE.search(value) is not None


def _extract_uniprot_id(value):
    """Return the first UNIPROT_ID_RE match in ``value``, or ``value`` unchanged if none."""
    match = UNIPROT_ID_RE.search(value) if isinstance(value, str) else None
    return match.group() if match else value


for idx, (sheet_name, id_type) in df_contents_raw[
    ["PubMed/Sheet Name", "ID type"]
].iterrows():
    # Only the first column of each sheet is read; it is relabelled with the
    # identifier type declared in the table of contents.
    df = pd.read_excel(
        excel_filepath_raw, sheet_name=sheet_name, usecols=[0], dtype=str
    )
    df.columns = [id_type]
    # Blank cells carry no identifier: drop them so they are neither merged nor
    # counted, and so the regex helpers only ever see strings.
    df = df.dropna(subset=[id_type]).reset_index(drop=True)

    if id_type != "UniProt":
        # Lookup table from this identifier type to UniProt. Rows without a key
        # are removed because pandas matches NaN keys against each other in a
        # merge, which would attach unrelated UniProt IDs to blank inputs.
        mapping_table = (
            df_obsolete[[id_type, "UniProt"]]
            .dropna(subset=[id_type])
            .drop_duplicates()
        )
        if id_type == "GI":
            # Values starting with "gi|" are reduced to the text after the last "|"
            df[id_type] = df[id_type].apply(
                lambda x: x.split("|")[-1] if str(x).startswith("gi|") else x
            )
        df_obsolete_mapping = df.merge(mapping_table, on=id_type, how="left")
        obsolete_dict[sheet_name] = df_obsolete_mapping.copy()

        # Unique identifiers per column, plus the number of rows left without a
        # UniProt ID. Using sum() keeps this valid when every row maps (the
        # previous value_counts()[True] lookup raised KeyError in that case).
        counts = df_obsolete_mapping.nunique().to_dict()
        counts["Obsolete"] = int(df_obsolete_mapping["UniProt"].isna().sum())

        updated_table_dict["Table of Contents"].loc[idx, "ID type"] = "UniProt"
        uniprot_ids = df_obsolete_mapping["UniProt"]
    else:
        # Record sheets holding entries that are not recognisable UniProt IDs.
        # A copy is stored so the raw values survive the clean-up below.
        is_uniprot = df["UniProt"].map(_is_uniprot_id).astype(bool)
        if not is_uniprot.all():
            problems[sheet_name] = df.copy()
        # Keep only the matched accession; unmatched values pass through as-is
        df["UniProt"] = df["UniProt"].apply(_extract_uniprot_id)
        counts = {"UniProt": df["UniProt"].nunique()}
        uniprot_ids = df["UniProt"]

    # Summarise the non-zero counts, in alphabetical key order, in the Notes column
    note_str = build_string(
        [f"{v} {k}" for k, v in sorted(counts.items()) if v != 0]
    )
    updated_table_dict["Table of Contents"].loc[idx, "Notes"] = note_str
    updated_table_dict[sheet_name] = list(uniprot_ids.dropna().unique())

updated_table_dict["Table of Contents"]

Unnamed: 0,PubMed/Sheet Name,ID type,Year,Publication,Notes
0,12362340,UniProt,2002,"Low TY, Seow TK, Chung MC. Separation of human...",84 UniProt
1,14963112,UniProt,2004,"Kakhniashvili DG, Bulla LA Jr, Goodman SR. The...",183 GI;128 Obsolete;66 UniProt
2,16861337,UniProt,2006,"Pasini EM, Kirkegaard M, Mortensen P, Lutz HU,...",590 IPI;197 Obsolete;393 UniProt
3,18399644,UniProt,2008,"Simó C, Bachi A, Cattaneo A, Guerrier L, Forti...",153 IPI;42 Obsolete;111 UniProt
4,18494517,UniProt,2008,"Ringrose JH, van Solinge WW, Mohammed S, O'Fla...",699 IPI;310 Obsolete;389 UniProt
5,18614565,UniProt,2008,"Roux-Dalvai F, Gonzalez de Peredo A, Simó C, G...",1577 IPI;364 Obsolete;1214 UniProt
6,19778645,UniProt,2009,"van Gestel RA, van Solinge WW, van der Toorn H...",561 GI;180 Obsolete;426 UniProt
7,22157974,UniProt,2011,"D'Amici GM, Rinalducci S, Zolla L. Depletion o...",134 GI;214 Obsolete;44 UniProt
8,22954596,UniProt,2012,"Pesciotta EN, Sriswasdi S, Tang HY, Mason PJ, ...",831 UniProt
9,23781972,UniProt,2013,"Pallotta V, D'Alessandro A, Rinalducci S, Zoll...",140 GI;47 Obsolete;118 UniProt


### Map UniProt IDs to current UniProt

In [7]:
# Extract all relevant information for now and save
query_parameters = {
    "query": " && ".join(
        [
            "(organism_id:9606)",  # Homo sapiens (Human)
        ]
    ),
    "format": "tsv",
    "size": 500,
    "compressed": True,
    "fields": ",".join(
        [
            "reviewed",
            "accession",
            "gene_primary",
        ]
    ),
}
all_failed = {}
all_unmapped = {}
idx = 0
for sheet_name, query_ids in updated_table_dict.copy().items():
    if sheet_name == "Table of Contents":
        continue
    print(f"{sheet_name}\n{len(sheet_name) * '-'}")
    df_results, uniparc, failed_ids, unmapped_ids = query_UniProt(
        query_ids,
        query_parameters=query_parameters,
        from_db="UniProtKB",
        to_db="UniProtKB",
        return_failed=True,
    )
    if failed_ids:
        print(f"Failed IDS: {failed_ids}\n")
        all_failed[sheet_name] = set(failed_ids)
    if unmapped_ids:
        print(f"Unmmaped IDS: {unmapped_ids}\n")
        all_unmapped[sheet_name] = set(unmapped_ids)
    df_uniprot = df_results["Entry"].drop_duplicates().dropna().reset_index(drop=True)
    df_uniprot.name = index_name
    updated_table_dict[sheet_name] = df_uniprot
    updated_table_dict["Table of Contents"].loc[
        idx, "Notes"
    ] += f" -->  {len(df_uniprot)} UniProt {version}"
    idx += 1
    print()

updated_table_dict["Table of Contents"]

12362340
--------


Number of failed query IDs : 5


Fetched: 79 / 79


Number of failed IDs : 3


Failed IDS: {'7710117', '4503581', '182074'}

Unmmaped IDS: {'P80315', 'Q64689'}


14963112
--------
Fetched: 66 / 66

16861337
--------
Fetched: 396 / 396

18399644
--------
Fetched: 112 / 112

18494517
--------
Fetched: 389 / 389

18614565
--------
Fetched: 500 / 1217
Fetched: 1000 / 1217
Fetched: 1217 / 1217

19778645
--------
Fetched: 427 / 427

22157974
--------
Fetched: 44 / 44

22954596
--------
Fetched: 500 / 765


Number of failed query IDs : 67


Fetched: 765 / 765


Number of failed IDs : 8
Number of obsolete IDs : 54


Failed IDS: {'UPI000041', '20FF', 'F69B', 'UPI0001B1', 'A4B8', 'UPI0001E6', '1A85', 'UPI000188'}

Unmmaped IDS: {'C9JUN6', 'C9K0W8', 'E9PC36', 'A8MZ76', 'D3DTE6', 'E7ESQ4', 'C9IZE4', 'E7EQ55', 'D3DP46', 'B7WPI6', 'E7ETK2', 'P35241', 'D3DR22', 'E9PCW5', 'E7EQR4', 'C9JLU2', 'D6R985', 'E9PC14', 'A8MZ22', 'Q6PIK3', 'A1L1K4', 'A6NDN4', 'E7EVU8', 'Q5RA57', 'C9J436', 'B5ME63', 'Q86U75', 'B0UX83', 'B1AKJ5', 'D3DUP1', 'E9PEC0', 'B9ZVX0', 'E9PBD4', 'E9PCQ3', 'E9PAN3', 'E7EUY0', 'A8MYA6', 'E7ENR8', 'E9PAR6', 'C9JNF5', 'A8MWX0', 'E7EMJ6', 'A7E2Y5', 'A8MW68', 'A5GFU6', 'D9YZV2', 'E7EVT0', 'D3DPK5', 'A8MTC3', 'E9PDP1', 'C9JCD7', 'E2QS59', 'D3DQ69', 'E2RB38', 'E7EVN0', 'A8MTG8', 'E9PBB7', 'E9PIZ3', 'A8MVJ7'}


23781972
--------
Fetched: 119 / 119

24555563
--------
Fetched: 500 / 1214
Fetched: 1000 / 1214


Number of failed query IDs : 9


Fetched: 1214 / 1214


Number of obsolete IDs : 5


Unmmaped IDS: {'F8VRQ1', 'F6RGB9', 'J3KPV7', 'B5MCY6', 'Q6PK30', 'B4DXJ1', 'B4E3P0', 'H0Y7U1', 'E9PEW9'}


26078478
--------


Number of failed query IDs : 6


Fetched: 416 / 416


Number of obsolete IDs : 6


Unmmaped IDS: {'B1ALS2', 'I3NI00', 'Q5VSJ9', 'H0YLA4', 'H0Y5J2', 'E7ETU3'}


26271157
--------


Number of failed query IDs : 4


Fetched: 72 / 72


Number of obsolete IDs : 4


Unmmaped IDS: {'K7EJT8', 'Q5VSJ9', 'D6W5X8', 'D3DRP5'}


26474164
--------
Fetched: 500 / 1148
Fetched: 1000 / 1148


Number of failed query IDs : 98


Fetched: 1148 / 1148


Number of failed IDs : 48
Number of obsolete IDs : 22


Failed IDS: {'1942352', '6754450', '73996350', '67549', '51247812', '0Y990', '9FQK7', '126308120', '73996336', '73996316', '126308122', '16303309', '3BPQ4', '126308130', '149714766', '73996461', '2370337', '109119997', '56847618', '149714762', '50233797', '116004057', '82995623', '73965965', '0Y5R6', '15341208', '73996314', '73996342', '50582982', '148235126', '10835849', '57012432', '20911031', '205055', '0Y3V4', '114051856', '73996332', '3318722', '51247813', '2194089', '31747338', '999627', 'UPI0002AF517B', '12345', '56710317', '2NNT7', '81170669', '2R572'}

Unmmaped IDS: {'12345;10835849;3318722;999627;51247812', 'G1RZN3', 'J3KMX9', '149714766;149714762;116004057;114051856', 'L5LCL6', 'P31939', 'L5KK29', 'K7C0T3', 'K7C6C0', 'G5B5L5', 'E9PFH8', 'F5H6T1', 'B0V0T3', 'F7FMZ1', 'G3H5V0', 'G3R4Y6', 'E9PC74', 'F7BTR0', 'I0FWE8', '149714766;149714762;73996336;73996350;73996342;73996314;81170669;20911031;16303309;56710317;73996332;73996316;50233797;205055', '73965965;126308130;109119997', '

Number of failed query IDs : 130


Fetched: 1187 / 1187


Number of obsolete IDs : 117


Unmmaped IDS: {'E9PPD9', 'A6NJ11', 'F5H3P5', 'P63010', 'F5GZ76', 'F5H4D6', 'E7ESM6', 'F5H6Z3', 'A8MSH7', 'F2Z3A5', 'E9PH51', 'B2REB8', 'F5H4I0', 'F5GWQ7', 'E9PNT8', 'F5GWY2', 'E7ERW8', 'F5GYQ0', 'F5H6K0', 'F5H328', 'C9JSL3', 'E9PB22', 'B0UX83', 'Q5SYT8', 'Q567Q5', 'F5GWE5', 'B7Z7A9', 'E7EWT1', 'F5H335', 'F6WSP1', 'F5GXD5', 'E7EWF1', 'B4DW94', 'E9PGR0', 'F5H0H0', 'E7EVE3', 'F5H754', 'F5H7U0', 'F5H223', 'C9JGQ9', 'E7EQZ3', 'F5H8M4', 'P62495', 'C8KIM0', 'D6RFZ8', 'A8MQ60', 'B0V2L0', 'B0V043', 'E9PHI9', 'Q5HY54', 'A8MW06', 'P78371', 'F5H4G7', 'B7WPF4', 'E7EMJ8', 'E9PD66', 'E9PBW0', 'F5H5Y5', 'F5GXG3', 'F5H1I1', 'F5H7U6', 'E9PBC7', 'Q5T6W5', 'F5GWW0', 'F5H569', 'E9PGE6', 'Q5JUV3', 'E7EWX4', 'E9PDI1', 'F5GZU7', 'F5GX52', 'E7EQR4', 'E7EWT6', 'F5GZJ1', 'A8MZ22', 'F5H7M3', 'E7EWR0', 'E9PER8', 'F5H0R5', 'F5GXC8', 'E5RJ56', 'C9JB13', 'Q3B7A4', 'Q5T1I6', 'F5H3J5', 'C4AM82', 'E9PCW3', 'E9PBQ1', 'F5GY14', 'E7ERV9', 'E7EP96', 'B8ZZT7', 'E7EMY5', 'E7EPC9', 'F5H2W0', 'F5H2Q7', 'B1AHC7', 'E9PC09', 'E9PE

Number of failed query IDs : 55


Fetched: 1227 / 1227


Number of obsolete IDs : 53


Unmmaped IDS: {'G5E963', 'A0A087WUQ6', 'K7ELI3', 'F5GWE5', 'I3L2G5', 'H0Y6Q0', 'A0A087WTB8', 'A0A087WWJ2', 'A0A087WXL8', 'E2QRB9', 'A0A087X232', 'A0A087WUL0', 'A0A0D9SFK2', 'A0A087X2I2', 'X6RGJ2', 'A0A087WY82', 'A0A087X1S2', 'F2Z2X4', 'A0A087WVF8', 'A0A087WYC5', 'Q5T993', 'A0A087WZE4', 'J9JID7', 'A0A075B6J4', 'A0A0C4DH83', 'E7EQR4', 'A0A087X0Q9', 'A0A087WWL9', 'J3KR44', 'A0A087WWS7', 'A0A087X079', 'A0A0B4J2G9', 'A0A087WUZ3', 'F8W914', 'A0A0A0MTS2', 'S4R460', 'A0A087WVV2', 'H7BXK9', 'A0A087X2C0', 'A0A087X0C8', 'A0A087WX08', 'A0A087WUD7', 'A0A0C4DGZ8', 'Q5HY54', 'A0A087WXS7', 'A0A087WZR3', 'A0A0C4DGX4', 'B1ANM7', 'A0A087WZW8', 'A0A087X0K1', 'A0A0A0MSW4', 'A0A0A0MT60', 'P01609', 'A0A087X0E2', 'A0A0A0MSK3'}


28263177
--------
Fetched: 500 / 1815
Fetched: 1000 / 1815
Fetched: 1500 / 1815


Number of failed query IDs : 14


Fetched: 1815 / 1815


Number of obsolete IDs : 2


Unmmaped IDS: {'A6NJ16', 'P01608', 'P01769', 'P18135', 'P06326', 'P01621', 'P01602', 'P01598', 'P04206', 'P01772', 'P01620', 'P04220', 'P04434', 'P01779'}


28689405
--------
Fetched: 500 / 2556
Fetched: 1000 / 2556
Fetched: 1500 / 2556
Fetched: 2000 / 2556
Fetched: 2500 / 2556


Number of failed query IDs : 23


Fetched: 2556 / 2556


Number of obsolete IDs : 22


Unmmaped IDS: {'P01891', 'C9JV50', 'A0A087WV23', 'A9Z1Z3', 'J3KN10', 'H7C2F2', 'A0A087WTI1', 'S4R460', 'A0A087WY61', 'K7EQW8', 'A0A087WUW9', 'H0Y8T6', 'H0Y905', 'V9GY79', 'A0A0G2JNK4', 'A0A0A6YYB0', 'D6RD96', 'Q5T699', 'A0A087WZW8', 'F6QW41', 'F5H0Y5', 'A0A0A0MQS8', 'D6RHC5'}


30327373
--------
Fetched: 500 / 1884
Fetched: 1000 / 1884
Fetched: 1500 / 1884
Fetched: 1884 / 1884

31552303
--------
Fetched: 267 / 267

33103907
--------
Fetched: 500 / 921
Fetched: 921 / 921

33341364
--------
Fetched: 500 / 841


Number of failed query IDs : 35


Fetched: 841 / 841


Number of obsolete IDs : 29


Unmmaped IDS: {'B8ZZP3', 'F5GWE5', 'M0QX55', 'G3XAN9', 'E7ETU3', 'Q53H96', 'E7EPC6', 'P27824', 'Q5T6W5', 'F5H569', 'C9IZE4', 'Q9P035', 'I7HJJ0', 'F2Z2X4', 'G5EA52', 'H7BY58', 'F5H4R9', 'Q5TCU6', 'F5H223', 'K7EJT8', 'J3KR44', 'J3QK90', 'E7ESI6', 'F8WAQ9', 'F5H012', 'Q9BQA1', 'J3KNL3', 'E7EW52', 'E9PM46', 'B4DDP6', 'E7EU23', 'B1ANM7', 'F5GZD0', 'F5GXV9', 'J3KN69'}


33806028
--------
Fetched: 500 / 1339
Fetched: 1000 / 1339
Fetched: 1339 / 1339

35858567
--------
Fetched: 500 / 1530
Fetched: 1000 / 1530
Fetched: 1500 / 1530
Fetched: 1530 / 1530

36346805
--------
Fetched: 500 / 659
Fetched: 659 / 659

37760001
--------
Fetched: 500 / 878
Fetched: 878 / 878

37942280
--------
Fetched: 500 / 1812
Fetched: 1000 / 1812
Fetched: 1500 / 1812


Number of failed query IDs : 2


Fetched: 1812 / 1812


Number of obsolete IDs : 2


Unmmaped IDS: {'P0DN79', 'A0A0B4J2D5'}


38147558
--------
Fetched: 500 / 1208
Fetched: 1000 / 1208
Fetched: 1208 / 1208

38964323
--------
Fetched: 500 / 2601
Fetched: 1000 / 2601
Fetched: 1500 / 2601
Fetched: 2000 / 2601
Fetched: 2500 / 2601
Fetched: 2601 / 2601

40043591
--------
Fetched: 346 / 346



Unnamed: 0,PubMed/Sheet Name,ID type,Year,Publication,Notes
0,12362340,UniProt,2002,"Low TY, Seow TK, Chung MC. Separation of human...",84 UniProt --> 79 UniProt 2025_01
1,14963112,UniProt,2004,"Kakhniashvili DG, Bulla LA Jr, Goodman SR. The...",183 GI;128 Obsolete;66 UniProt --> 66 UniProt...
2,16861337,UniProt,2006,"Pasini EM, Kirkegaard M, Mortensen P, Lutz HU,...",590 IPI;197 Obsolete;393 UniProt --> 396 UniP...
3,18399644,UniProt,2008,"Simó C, Bachi A, Cattaneo A, Guerrier L, Forti...",153 IPI;42 Obsolete;111 UniProt --> 112 UniPr...
4,18494517,UniProt,2008,"Ringrose JH, van Solinge WW, Mohammed S, O'Fla...",699 IPI;310 Obsolete;389 UniProt --> 389 UniP...
5,18614565,UniProt,2008,"Roux-Dalvai F, Gonzalez de Peredo A, Simó C, G...",1577 IPI;364 Obsolete;1214 UniProt --> 1217 U...
6,19778645,UniProt,2009,"van Gestel RA, van Solinge WW, van der Toorn H...",561 GI;180 Obsolete;426 UniProt --> 427 UniPr...
7,22157974,UniProt,2011,"D'Amici GM, Rinalducci S, Zolla L. Depletion o...",134 GI;214 Obsolete;44 UniProt --> 44 UniProt...
8,22954596,UniProt,2012,"Pesciotta EN, Sriswasdi S, Tang HY, Mason PJ, ...",831 UniProt --> 765 UniProt 2025_01
9,23781972,UniProt,2013,"Pallotta V, D'Alessandro A, Rinalducci S, Zoll...",140 GI;47 Obsolete;118 UniProt --> 119 UniPro...


#### Quick check unmapped against known deleted entries

In [8]:
# UniProt accession -> documented issue, for rows where an issue is recorded
documented_rows = df_obsolete.dropna(subset="Known issue?")
known_issues = dict(zip(documented_rows["UniProt"], documented_rows["Known issue?"]))

for sheet, unmapped in all_unmapped.items():
    print(f"Sheet: {sheet}")
    # Report only entries that match the UniProt ID pattern and have no
    # documented issue; everything else is skipped silently.
    unexplained = [
        uid
        for uid in unmapped
        if UNIPROT_ID_RE.search(uid) and uid not in known_issues
    ]
    for uid in unexplained:
        print(f"{uid}\tUnclear")
    # Two blank lines between sheets
    print("\n")

Sheet: 12362340


Sheet: 22954596
P35241	Unclear


Sheet: 24555563


Sheet: 26078478


Sheet: 26271157


Sheet: 26474164
P31939	Unclear
P62937	Unclear
P11586	Unclear


Sheet: 27006477
P63010	Unclear
P62495	Unclear
P78371	Unclear
P06748	Unclear
O14744	Unclear


Sheet: 27777340


Sheet: 28263177
P01772	Unclear


Sheet: 28689405


Sheet: 33341364


Sheet: 37942280




### Export aggregated proteomic data, updated IDs


In [9]:
# Destination: the external proteomics directory when overwriting, otherwise the
# interim directory. Only the base directory differs between the two cases.
output_dirpath = ROOT_PATH / (EXTERNAL_PATH if overwrite else INTERIM_PATH) / "proteomics"
# Create the directory if needed; the writer does not create missing parent directories
output_dirpath.mkdir(parents=True, exist_ok=True)

# One worksheet per entry of updated_table_dict, written without the index column
with pd.ExcelWriter(output_dirpath / "proteomics_aggregated.xlsx") as writer:
    for sheet_name, sheet_data in updated_table_dict.items():
        sheet_data.to_excel(writer, sheet_name=sheet_name, index=False)