# Update Protein Identifiers to UniProt

Note: Requires an internet connection to download information from UniProt.

## Setup
### Import packages

In [1]:
import pandas as pd
from rbc_gem_utils import (
    ANNOTATION_PATH,
    DATABASE_PATH,
    EXTERNAL_PATH,
    INTERIM_PATH,
    ROOT_PATH,
    build_string,
    check_version,
    show_versions,
)
from rbc_gem_utils.database.uniprot import (
    UNIPROT_ID_RE,
    UNIPROT_VERSION_EXPECTED,
    get_version_UniProt,
    query_UniProt,
)

# Print the package, dependency, build-tool, and platform versions so the
# environment of the last successful run is recorded alongside the notebook.
show_versions()


Package Information
-------------------
rbc-gem-utils 0.0.1

Dependency Information
----------------------
beautifulsoup4                       4.12.3
bio                                   1.6.2
cobra                                0.29.0
depinfo                               2.2.0
kaleido                               0.2.1
matplotlib                            3.8.2
memote                               0.17.0
networkx                              3.2.1
notebook                              7.0.7
openpyxl                              3.1.2
pandas                                2.2.0
pre-commit                            3.6.0
pyvis                                 0.3.2
rbc-gem-utils[database,network,vis] missing
requests                             2.31.0
scipy                                1.12.0
seaborn                              0.13.2

Build Tools Information
-----------------------
pip        23.3.1
setuptools 68.2.2
wheel      0.41.2

Platform Information
-------------------

## Check UniProt version
If the version does not match the expected version, it is because the database has been updated since the last time this code was run.

### Expected UniProt version: 2024_06
* According to [UniProt](https://www.uniprot.org/help/downloads), updates to the database are made every eight weeks. 

In [2]:
# Compare the live UniProt release with the release this notebook expects.
version = get_version_UniProt()
versions_match = check_version(version, UNIPROT_VERSION_EXPECTED, verbose=True)

# When the release differs from the expected one, both directories fall back
# to the interim location instead of the database/annotation locations.
database_dirpath = ROOT_PATH / (DATABASE_PATH if versions_match else INTERIM_PATH)
annotation_dirpath = ROOT_PATH / (ANNOTATION_PATH if versions_match else INTERIM_PATH)

Current and expected versions match.


## Load aggregated proteomic data, raw

In [3]:
# When True, the export step writes the updated workbook next to the raw one
# (external directory); when False it writes to the interim directory instead.
overwrite = True

# Location of the raw, aggregated proteomics workbook.
proteomics_dirpath_raw = ROOT_PATH / EXTERNAL_PATH / "proteomics"
excel_filepath_raw = proteomics_dirpath_raw / "proteomics_aggregated_raw.xlsx"

### Load obsolete identifier mapping

In [4]:
# Tab-separated table relating obsolete identifiers (IPI, GI, ...) to UniProt
# and UniParc identifiers. Everything is read as text; the first column is
# used as the row index.
obsolete_filepath = ROOT_PATH / EXTERNAL_PATH / "proteomics" / "proteomics_obsolete.tsv"
df_obsolete = pd.read_csv(obsolete_filepath, sep="\t", dtype=str, index_col=0)
df_obsolete

Unnamed: 0,UniProt,IPI,GI,UniParc,EntryNames
0,A0A024QZ64,,78070601,,
1,A0A024R0L6,,4505587;14043830,,
2,A0A024R1A3,,23510338,,
3,A0A024R1I3,,10092677;40674427,,
4,A0A024R4T4,,4507791,,
...,...,...,...,...,...
3181,,IPI00843857,,,
3182,,IPI00843921,,,
3183,,IPI00844038,,,
3184,,IPI00844133,,,


### Load table of contents, raw

In [5]:
# Read the "Table of Contents" sheet as text, then blank out missing cells so
# that later string operations on the table never encounter NaN.
df_contents_raw = pd.read_excel(
    excel_filepath_raw,
    sheet_name="Table of Contents",
    dtype=str,
)
df_contents_raw = df_contents_raw.fillna("")
df_contents_raw

Unnamed: 0,PubMed/Sheet Name,ID type,Year,Publication,Notes
0,12362340,UniProt,2002,"Low TY, Seow TK, Chung MC. Separation of human...",Used for iAB-RBC-283
1,14963112,GI,2004,"Kakhniashvili DG, Bulla LA Jr, Goodman SR. The...",
2,16861337,IPI,2006,"Pasini EM, Kirkegaard M, Mortensen P, Lutz HU,...",Used for iAB-RBC-283
3,18399644,IPI,2008,"Simó C, Bachi A, Cattaneo A, Guerrier L, Forti...",
4,18494517,IPI,2008,"Ringrose JH, van Solinge WW, Mohammed S, O'Fla...",
5,18614565,IPI,2008,"Roux-Dalvai F, Gonzalez de Peredo A, Simó C, G...",Used for iAB-RBC-283
6,19778645,GI,2009,"van Gestel RA, van Solinge WW, van der Toorn H...",
7,22157974,GI,2011,"D'Amici GM, Rinalducci S, Zolla L. Depletion o...",
8,22954596,UniProt,2012,"Pesciotta EN, Sriswasdi S, Tang HY, Mason PJ, ...",Disease conditions
9,23781972,GI,2013,"Pallotta V, D'Alessandro A, Rinalducci S, Zoll...",


### Map other identifiers to UniProt

In [6]:
def _extract_uniprot_id(value):
    """Return the UniProt accession found in ``value``, else ``value`` unchanged.

    Missing values (NaN/None) and strings that ``UNIPROT_ID_RE`` does not match
    are passed through untouched, so blank cells do not raise and can still be
    dropped afterwards.
    """
    if pd.isna(value):
        return value
    match = UNIPROT_ID_RE.search(value)
    return match.group() if match else value


# Sheet name -> list of UniProt IDs, plus an updated copy of the table of contents.
updated_table_dict = {"Table of Contents": df_contents_raw.copy()}
df_contents_updated = updated_table_dict["Table of Contents"]
# Sheet name -> per-sheet merge of reported identifiers with the obsolete mapping.
obsolete_dict = {}
index_name = "Uniprot"
for idx, (sheet_name, id_type) in df_contents_raw[
    ["PubMed/Sheet Name", "ID type"]
].iterrows():
    # Only the first column of each sheet (the reported identifiers) is needed.
    df = pd.read_excel(
        excel_filepath_raw, sheet_name=sheet_name, usecols=[0], dtype=str
    )
    df.columns = [id_type]
    if id_type != "UniProt":
        # Map the sheet's identifiers through the obsolete-identifier table.
        df_obsolete_mapping = (
            df_obsolete[[id_type, "UniProt", "UniParc"]]
            .dropna(how="all")
            .drop_duplicates()
        )
        if id_type == "GI":
            # Keep only the trailing field of "gi|"-prefixed identifiers.
            df[id_type] = df[id_type].apply(
                lambda x: x.split("|")[-1] if str(x).startswith("gi|") else x
            )
        # Left merge keeps every reported identifier, mapped or not.
        df_obsolete_mapping = df.merge(df_obsolete_mapping, on=id_type, how="left")
        obsolete_dict[sheet_name] = df_obsolete_mapping.copy()

        counts = df_obsolete_mapping.nunique()
        # Number of merged rows without a UniProt mapping. Summing the boolean
        # mask yields 0 when every identifier mapped, whereas indexing
        # ``value_counts()[True]`` would raise a KeyError in that case.
        counts["Obsolete"] = int(df_obsolete_mapping["UniProt"].isna().sum())
        counts = dict(sorted(counts.to_dict().items()))

        df_contents_updated.loc[idx, "ID type"] = "UniProt"
        df_contents_updated.loc[idx, "Notes"] = build_string(
            [f"{v} {k}" for k, v in counts.items()]
        )
        updated_table_dict[sheet_name] = list(
            df_obsolete_mapping["UniProt"].dropna().unique()
        )
    else:
        # Already UniProt: reduce each cell to the accession the regex matches.
        df["UniProt"] = df["UniProt"].apply(_extract_uniprot_id)
        updated_table_dict[sheet_name] = list(df["UniProt"].dropna().unique())
        df_contents_updated.loc[idx, "Notes"] = "{} UniProt".format(
            df["UniProt"].nunique()
        )

updated_table_dict["Table of Contents"]

Unnamed: 0,PubMed/Sheet Name,ID type,Year,Publication,Notes
0,12362340,UniProt,2002,"Low TY, Seow TK, Chung MC. Separation of human...",84 UniProt
1,14963112,UniProt,2004,"Kakhniashvili DG, Bulla LA Jr, Goodman SR. The...",183 GI;128 Obsolete;7 UniParc;66 UniProt
2,16861337,UniProt,2006,"Pasini EM, Kirkegaard M, Mortensen P, Lutz HU,...",590 IPI;197 Obsolete;0 UniParc;393 UniProt
3,18399644,UniProt,2008,"Simó C, Bachi A, Cattaneo A, Guerrier L, Forti...",153 IPI;42 Obsolete;0 UniParc;111 UniProt
4,18494517,UniProt,2008,"Ringrose JH, van Solinge WW, Mohammed S, O'Fla...",699 IPI;310 Obsolete;0 UniParc;389 UniProt
5,18614565,UniProt,2008,"Roux-Dalvai F, Gonzalez de Peredo A, Simó C, G...",1577 IPI;364 Obsolete;0 UniParc;1214 UniProt
6,19778645,UniProt,2009,"van Gestel RA, van Solinge WW, van der Toorn H...",561 GI;180 Obsolete;22 UniParc;426 UniProt
7,22157974,UniProt,2011,"D'Amici GM, Rinalducci S, Zolla L. Depletion o...",134 GI;214 Obsolete;0 UniParc;44 UniProt
8,22954596,UniProt,2012,"Pesciotta EN, Sriswasdi S, Tang HY, Mason PJ, ...",831 UniProt
9,23781972,UniProt,2013,"Pallotta V, D'Alessandro A, Rinalducci S, Zoll...",140 GI;47 Obsolete;5 UniParc;118 UniProt


### Map UniProt IDs to current UniProt

In [7]:
# Query UniProt for the current accession of every collected UniProt ID.
# NOTE: this cell mutates `updated_table_dict` in place (ID lists are replaced
# by Series and the "Notes" column is appended to), so re-run the cell that
# builds `updated_table_dict` before re-running this one.
query_parameters = {
    "query": " && ".join(
        [
            "(organism_id:9606)",  # Homo sapiens (Human)
        ]
    ),
    "format": "tsv",
    "size": 500,
    "compressed": True,
    "fields": ",".join(
        [
            "reviewed",
            "accession",
            "gene_primary",
        ]
    ),
}
df_contents_updated = updated_table_dict["Table of Contents"]
# Iterate over a shallow copy because entries are replaced inside the loop.
for sheet_name, query_ids in updated_table_dict.copy().items():
    if sheet_name == "Table of Contents":
        continue
    print(f"{sheet_name}\n{len(sheet_name) * '-'}")
    df_results, uniparc, failed_ids, unmapped_ids = query_UniProt(
        query_ids,
        query_parameters=query_parameters,
        from_db="UniProtKB",
        to_db="UniProtKB",
        return_failed=True,
    )

    # Unique, non-null accessions returned in the "Entry" column.
    df_uniprot = df_results["Entry"].drop_duplicates().dropna().reset_index(drop=True)
    df_uniprot.name = index_name
    updated_table_dict[sheet_name] = df_uniprot

    # Locate the table-of-contents row by sheet name rather than by a running
    # counter, so the note lands on the right row regardless of row order or
    # the table's index labels.
    is_sheet_row = df_contents_updated["PubMed/Sheet Name"] == sheet_name
    df_contents_updated.loc[
        is_sheet_row, "Notes"
    ] += f" -->  {len(df_uniprot)} UniProt {version}"
    print()

updated_table_dict["Table of Contents"]

12362340
--------
Retrying in 3s
Retrying in 3s


Number of failed query IDs : 5


Fetched: 79 / 79


Number of failed IDs : 3



14963112
--------
Retrying in 3s
Retrying in 3s
Fetched: 66 / 66

16861337
--------
Retrying in 3s
Retrying in 3s
Fetched: 396 / 396

18399644
--------
Retrying in 3s
Fetched: 112 / 112

18494517
--------
Retrying in 3s
Retrying in 3s
Fetched: 389 / 389

18614565
--------
Retrying in 3s
Retrying in 3s
Retrying in 3s
Fetched: 500 / 1217
Fetched: 1000 / 1217
Fetched: 1217 / 1217

19778645
--------
Retrying in 3s
Retrying in 3s
Fetched: 426 / 426

22157974
--------
Retrying in 3s
Retrying in 3s
Fetched: 44 / 44

22954596
--------
Retrying in 3s
Retrying in 3s
Fetched: 500 / 765


Number of failed query IDs : 67


Fetched: 765 / 765


Number of failed IDs : 8
Number of obsolete IDs : 54



23781972
--------
Retrying in 3s
Fetched: 118 / 118

24555563
--------
Retrying in 3s
Retrying in 3s
Retrying in 3s
Fetched: 500 / 1214
Fetched: 1000 / 1214


Number of failed query IDs : 9


Fetched: 1214 / 1214


Number of obsolete IDs : 5



26078478
--------
Retrying in 3s
Retrying in 3s


Number of failed query IDs : 6


Fetched: 416 / 416


Number of obsolete IDs : 6



26271157
--------
Retrying in 3s
Retrying in 3s


Number of failed query IDs : 4


Fetched: 72 / 72


Number of obsolete IDs : 4



26474164
--------
Retrying in 3s
Retrying in 3s
Retrying in 3s
Retrying in 3s
Fetched: 500 / 1148
Fetched: 1000 / 1148


Number of failed query IDs : 98


Fetched: 1148 / 1148


Number of failed IDs : 48
Number of obsolete IDs : 22



27006477
--------
Retrying in 3s
Retrying in 3s
Retrying in 3s
Retrying in 3s
Fetched: 500 / 1187
Fetched: 1000 / 1187


Number of failed query IDs : 130


Fetched: 1187 / 1187


Number of obsolete IDs : 117



27777340
--------
Retrying in 3s
Retrying in 3s
Retrying in 3s
Retrying in 3s
Fetched: 500 / 1227
Fetched: 1000 / 1227


Number of failed query IDs : 55


Fetched: 1227 / 1227


Number of obsolete IDs : 53



28263177
--------
Retrying in 3s
Retrying in 3s
Retrying in 3s
Fetched: 500 / 1815
Fetched: 1000 / 1815
Fetched: 1500 / 1815


Number of failed query IDs : 14


Fetched: 1815 / 1815


Number of obsolete IDs : 2



28689405
--------
Retrying in 3s
Retrying in 3s
Retrying in 3s
Retrying in 3s
Retrying in 3s
Fetched: 500 / 2556
Fetched: 1000 / 2556
Fetched: 1500 / 2556
Fetched: 2000 / 2556
Fetched: 2500 / 2556


Number of failed query IDs : 23


Fetched: 2556 / 2556


Number of obsolete IDs : 22



30327373
--------
Retrying in 3s
Retrying in 3s
Retrying in 3s
Fetched: 500 / 1884
Fetched: 1000 / 1884
Fetched: 1500 / 1884
Fetched: 1884 / 1884

31552303
--------
Retrying in 3s
Retrying in 3s
Fetched: 267 / 267

33103907
--------
Retrying in 3s
Retrying in 3s
Fetched: 500 / 921
Fetched: 921 / 921

33341364
--------
Retrying in 3s
Retrying in 3s
Fetched: 500 / 841


Number of failed query IDs : 35


Fetched: 841 / 841


Number of obsolete IDs : 29



33806028
--------
Retrying in 3s
Retrying in 3s
Retrying in 3s
Fetched: 500 / 1341
Fetched: 1000 / 1341
Fetched: 1341 / 1341

35858567
--------
Retrying in 3s
Retrying in 3s
Retrying in 3s
Fetched: 500 / 1530
Fetched: 1000 / 1530
Fetched: 1500 / 1530
Fetched: 1530 / 1530

36346805
--------
Retrying in 3s
Retrying in 3s
Fetched: 500 / 659
Fetched: 659 / 659

37760001
--------
Retrying in 3s
Retrying in 3s
Fetched: 500 / 878
Fetched: 878 / 878

37942280
--------
Retrying in 3s
Retrying in 3s
Retrying in 3s
Fetched: 500 / 1812
Fetched: 1000 / 1812
Fetched: 1500 / 1812


Number of failed query IDs : 2


Fetched: 1812 / 1812


Number of obsolete IDs : 2



38147558
--------
Retrying in 3s
Retrying in 3s
Retrying in 3s
Fetched: 500 / 1210
Fetched: 1000 / 1210
Fetched: 1210 / 1210

38964323
--------
Retrying in 3s
Retrying in 3s
Retrying in 3s
Retrying in 3s
Fetched: 500 / 2601
Fetched: 1000 / 2601
Fetched: 1500 / 2601
Fetched: 2000 / 2601
Fetched: 2500 / 2601
Fetched: 2601 / 2601



Unnamed: 0,PubMed/Sheet Name,ID type,Year,Publication,Notes
0,12362340,UniProt,2002,"Low TY, Seow TK, Chung MC. Separation of human...",84 UniProt --> 79 UniProt 2024_06
1,14963112,UniProt,2004,"Kakhniashvili DG, Bulla LA Jr, Goodman SR. The...",183 GI;128 Obsolete;7 UniParc;66 UniProt --> ...
2,16861337,UniProt,2006,"Pasini EM, Kirkegaard M, Mortensen P, Lutz HU,...",590 IPI;197 Obsolete;0 UniParc;393 UniProt -->...
3,18399644,UniProt,2008,"Simó C, Bachi A, Cattaneo A, Guerrier L, Forti...",153 IPI;42 Obsolete;0 UniParc;111 UniProt --> ...
4,18494517,UniProt,2008,"Ringrose JH, van Solinge WW, Mohammed S, O'Fla...",699 IPI;310 Obsolete;0 UniParc;389 UniProt -->...
5,18614565,UniProt,2008,"Roux-Dalvai F, Gonzalez de Peredo A, Simó C, G...",1577 IPI;364 Obsolete;0 UniParc;1214 UniProt -...
6,19778645,UniProt,2009,"van Gestel RA, van Solinge WW, van der Toorn H...",561 GI;180 Obsolete;22 UniParc;426 UniProt -->...
7,22157974,UniProt,2011,"D'Amici GM, Rinalducci S, Zolla L. Depletion o...",134 GI;214 Obsolete;0 UniParc;44 UniProt --> ...
8,22954596,UniProt,2012,"Pesciotta EN, Sriswasdi S, Tang HY, Mason PJ, ...",831 UniProt --> 765 UniProt 2024_06
9,23781972,UniProt,2013,"Pallotta V, D'Alessandro A, Rinalducci S, Zoll...",140 GI;47 Obsolete;5 UniParc;118 UniProt --> ...


### Export aggregated proteomic data, updated IDs


In [8]:
# Write every table to one workbook, one sheet per entry. The workbook goes to
# the external directory when `overwrite` is set and to the interim directory
# otherwise; file name and sheet contents are the same in both cases.
export_base_path = EXTERNAL_PATH if overwrite else INTERIM_PATH
export_filepath = (
    ROOT_PATH / export_base_path / "proteomics" / "proteomics_aggregated.xlsx"
)
with pd.ExcelWriter(export_filepath) as writer:
    for sheet_name, df in updated_table_dict.items():
        df.to_excel(writer, sheet_name=sheet_name, index=False)