# Update Protein Identifiers to UniProt

Note: Requires an internet connection to download information from UniProt.

## Setup
### Import packages

In [None]:
import numpy as np
import pandas as pd
from rbc_gem_utils import (
    build_string,
    check_database_release_online,
    get_dirpath,
    show_versions,
)
from rbc_gem_utils.database.uniprot import (
    UNIPROT_DB_TAG,
    UNIPROT_ID_RE,
    get_release_UniProt,
    query_UniProt,
)

# Display package versions; the saved cell output documents the versions in use
# the last time this notebook ran and worked.
show_versions()

## Check UniProt release
If the release does not match the expected release, it is because the database has been updated since the last time this code was used.

* According to [UniProt](https://www.uniprot.org/help/downloads), updates to the database are made every eight weeks.
* If the current release does not match the expected release, it is because the database has been updated since the last time this code was used.
    * If the notebook works without needing any significant modifications, the only update needed is to the release in the [uniprot.py](../../src/rbc_gem_utils/database/uniprot.py) source code file to resolve the issue.

In [None]:
# `warn` is used below when the online release differs from the expected one
from warnings import warn

release = get_release_UniProt()

# NOTE(review): `check_database_release_online` presumably returns True when the
# online release matches the expected release -- confirm in rbc_gem_utils.
use_interim = not check_database_release_online(UNIPROT_DB_TAG, verbose=True)
if use_interim:
    warn(
        "Online release of database has been updated since the last time notebook was used."
    )

# Use "interim" directory paths when the release is not the expected one, so that
# results from an unexpected release are kept apart from the regular outputs.
temp_dir = "interim" if use_interim else None
database_dirpath = get_dirpath("database", UNIPROT_DB_TAG, use_temp=temp_dir)
annotation_dirpath = get_dirpath("annotation", use_temp=temp_dir)

# Ensure directories exist
for dirpath in (database_dirpath, annotation_dirpath):
    dirpath.mkdir(exist_ok=True, parents=True)

## Load aggregated proteomic data, raw

In [None]:
# Set to False to skip writing the updated workbook in the export cell
overwrite = True

# Path to the workbook holding the raw, aggregated proteomic data
proteomics_dirpath = get_dirpath("proteomics", use_temp="external")
excel_filepath_raw = proteomics_dirpath / "proteomics_aggregated_raw.xlsx"

### Load obsolete identifier mapping
Meant to map all IDs to UniProt identifiers. Does not account for obsolete/deleted/old UniProt IDs; those must be checked manually.

In [None]:
# Tab-separated mapping table; every column is read as a string
obsolete_filepath = (
    get_dirpath("proteomics", use_temp="external") / "proteomics_obsolete.tsv"
)
df_obsolete = pd.read_csv(obsolete_filepath, sep="\t", dtype=str, index_col=None)
df_obsolete

### Load table of contents, raw

In [None]:
# Read the "Table of Contents" sheet as strings, then blank out missing cells
df_contents_raw = pd.read_excel(
    excel_filepath_raw,
    sheet_name="Table of Contents",
    dtype=str,
)
df_contents_raw = df_contents_raw.fillna("")
df_contents_raw

### Map other identifiers to UniProt

In [None]:
def _extract_uniprot_id(value):
    """Return the UniProt ID found in ``value``, or ``value`` unchanged if none."""
    if not isinstance(value, str):
        # Blank cells are read as NaN (a float), which the regex cannot search
        return value
    match = UNIPROT_ID_RE.search(value)
    return match.group() if match else value


# Sheets to export, keyed by sheet name. The table of contents is copied so the
# raw version loaded above is left untouched.
updated_table_dict = {"Table of Contents": df_contents_raw.copy()}
# Per-sheet merge results for sheets that needed mapping to UniProt
obsolete_dict = {}
# Sheets labelled "UniProt" that contain entries not matching UNIPROT_ID_RE
problems = {}
index_name = "Uniprot"
for idx, (sheet_name, id_type) in df_contents_raw[
    ["PubMed/Sheet Name", "ID type"]
].iterrows():
    # Only the first column of each sheet is read; it holds the identifiers
    df = pd.read_excel(
        excel_filepath_raw, sheet_name=sheet_name, usecols=[0], dtype=str
    )
    df.columns = [id_type]
    if id_type != "UniProt":
        # Map the sheet's identifiers to UniProt using the obsolete-ID table
        id_to_uniprot = (
            df_obsolete[[id_type, "UniProt"]].dropna(how="all").drop_duplicates().copy()
        )
        if id_type == "GI":
            # Keep only the last "|"-separated field of "gi|..." entries
            df[id_type] = df[id_type].apply(
                lambda x: x.split("|")[-1] if str(x).startswith("gi|") else x
            )
        df_obsolete_mapping = df.merge(id_to_uniprot, on=id_type, how="left")
        obsolete_dict[sheet_name] = df_obsolete_mapping.copy()

        counts = df_obsolete_mapping.nunique().to_dict()
        # Rows left without a UniProt ID after the merge. Summing the boolean mask
        # avoids the KeyError that `value_counts()[True]` raises when every
        # identifier was mapped (no `True` entry to look up).
        counts["Obsolete"] = int(df_obsolete_mapping["UniProt"].isna().sum())

        updated_table_dict["Table of Contents"].loc[idx, "ID type"] = "UniProt"
        unique_ids = list(df_obsolete_mapping["UniProt"].dropna().unique())
    else:
        # Flag sheets with entries that do not contain a UniProt ID. Blank cells
        # (NaN) are dropped first because the regex can only search strings.
        uniprot_ids = df["UniProt"].dropna()
        is_not_uniprot = uniprot_ids.map(
            lambda x: UNIPROT_ID_RE.search(x) is None
        ).astype(bool)
        check_mixed_ids = uniprot_ids[is_not_uniprot]
        if not check_mixed_ids.empty:
            # Copy so the raw entries stay visible after the in-place cleanup below
            problems[sheet_name] = df.copy()
        # Reduce each entry to the bare UniProt ID where one is present
        df["UniProt"] = df["UniProt"].apply(_extract_uniprot_id)
        counts = {"UniProt": df["UniProt"].nunique()}
        unique_ids = list(df["UniProt"].dropna().unique())

    # Summarize the non-zero counts, ordered by label, in the "Notes" column
    note_str = build_string([f"{v} {k}" for k, v in sorted(counts.items()) if v != 0])
    updated_table_dict["Table of Contents"].loc[idx, "Notes"] = note_str
    updated_table_dict[sheet_name] = unique_ids

updated_table_dict["Table of Contents"]

### Map UniProt IDs to current UniProt

In [None]:
# Per-sheet sets of failed / unmapped IDs, filled in by the UniProt query loop
all_failed, all_unmapped = {}, {}

In [None]:
# Parameters passed to `query_UniProt` for every sheet
query_parameters = {
    "query": " && ".join(
        [
            "(organism_id:9606)",  # Homo sapiens (Human)
        ]
    ),
    "format": "tsv",
    "size": 500,
    "compressed": True,
    "fields": ",".join(
        [
            "reviewed",
            "accession",
            "gene_primary",
        ]
    ),
}

# Use redo sheets to remap specific sheets that failed
redo_sheets = set()
# Sheets whose query raised an error are appended here and printed at the end
issues = ["28689405"]
# `idx` is the row of the sheet in the table of contents. The table of contents
# is the first entry of the dict, hence the count starts at -1 so that the first
# real sheet gets row 0.
for idx, (sheet_name, query_ids) in enumerate(
    updated_table_dict.copy().items(), start=-1
):
    if sheet_name == "Table of Contents" or (
        redo_sheets and sheet_name not in redo_sheets
    ):
        continue
    print(f"{idx}) {sheet_name}\n{(4 + len(sheet_name)) * '-'}")
    try:
        df_results, uniparc, failed_ids, unmapped_ids = query_UniProt(
            query_ids,
            query_parameters=query_parameters,
            from_db="UniProtKB",
            to_db="UniProtKB",
            return_failed=True,
        )
    except Exception as err:
        # Best effort: report the error, record the sheet, and move on. No retry
        # happens here; list the sheet in `redo_sheets` and re-run the cell.
        # `Exception` (not a bare `except`) lets KeyboardInterrupt stop the loop.
        print(f"Issue with {sheet_name}, retry after extraction is finished\n")
        print(f"Error: {err!r}\n")
        issues += [sheet_name]
        continue
    if failed_ids:
        print(f"Failed IDS: {failed_ids}\n")
        all_failed[sheet_name] = set(failed_ids)
    if unmapped_ids:
        print(f"Unmmaped IDS: {unmapped_ids}\n")
        all_unmapped[sheet_name] = set(unmapped_ids)
    # Unique, non-null entries of the "Entry" column replace the sheet's ID list
    df_uniprot = df_results["Entry"].drop_duplicates().dropna().reset_index(drop=True)
    df_uniprot.name = index_name
    updated_table_dict[sheet_name] = df_uniprot
    # Append the number of mapped entries and the release to the sheet's note
    updated_table_dict["Table of Contents"].loc[
        idx, "Notes"
    ] += f" -->  {len(df_uniprot)} UniProt {release}"
    print()
print(issues)
updated_table_dict["Table of Contents"]

#### Quick check unmapped against known deleted entries

In [None]:
# UniProt ID -> note, for every row that has a "Known issue?" entry
known_issues = (
    df_obsolete[["UniProt", "Known issue?"]]
    .dropna(subset="Known issue?")
    .set_index("UniProt")["Known issue?"]
    .to_dict()
)

for sheet, unmapped in all_unmapped.items():
    print(f"Sheet: {sheet}")
    for unmapped_id in unmapped:
        # Entries that are not UniProt IDs, or that already have a known issue
        # recorded, are skipped silently; only the rest are reported.
        if not UNIPROT_ID_RE.search(unmapped_id) or unmapped_id in known_issues:
            continue
        print(f"{unmapped_id}\tUnclear")
    # Two blank lines between sheets
    print("\n")

### Export aggregated proteomic data, updated IDs


In [None]:
# Write every sheet of the updated workbook; skipped entirely unless `overwrite`
if overwrite:
    excel_filepath_updated = (
        get_dirpath("proteomics", use_temp="external") / "proteomics_aggregated.xlsx"
    )
    with pd.ExcelWriter(excel_filepath_updated) as writer:
        for sheet_name, sheet_data in updated_table_dict.items():
            # Sheets whose UniProt query failed or was skipped are still plain
            # lists of IDs, which have no `to_excel`; wrap them in a Series so
            # the export does not stop with an AttributeError.
            if not isinstance(sheet_data, (pd.DataFrame, pd.Series)):
                sheet_data = pd.Series(sheet_data, name=index_name, dtype=object)
            sheet_data.to_excel(writer, sheet_name=sheet_name, index=False)