# Extract annotations and data from UniProtKB

The purpose of this notebook is to extract and format UniProt data for subsequent model annotation.

Additionally, the purpose of this notebook is to extract data and other annotations related to model proteins.

## Notebook Requirements:
*  Model genes **must** have at least one of the following annotations stored in the `object.annotation`. Values are expected to be separated by semicolons. Accepted keys currently include:
    * `"uniprot"`
* Note: Requires internet connection to download information from the [Universal Protein Resource (UniProt)](https://www.uniprot.org/).
    *  [UniProt Knowledgebase (UniProtKB)](https://www.uniprot.org/help/uniprotkb)

### Citation
UniProt Consortium. UniProt: the Universal Protein Knowledgebase in 2023. Nucleic Acids Res. 2023 Jan 6;51(D1):D523-D531. doi: 10.1093/nar/gkac1052. PMID: 36408920; PMCID: PMC9825514.

## Setup
### Import packages

In [None]:
import re
from warnings import warn

import matplotlib.pyplot as plt
import pandas as pd
from rbc_gem_utils import (
    GEM_NAME,
    build_string,
    check_database_release_online,
    compare_tables,
    get_annotation_df,
    get_dirpath,
    read_cobra_model,
    show_versions,
    split_string,
    visualize_comparison,
)
from rbc_gem_utils.database.uniprot import (
    UNIPROT_DB_TAG,
    UNIPROT_RELEASE_EXPECTED,
    get_annotation_to_from_db_UniProt,
    get_isoform_value_from_entry_UniProt,
    get_label_miriam_mapping_UniProt,
    get_query_fields_UniProt,
    parse_chains_UniProt,
    parse_isoforms_UniProt,
    query_UniProt,
)

# Display the versions in use the last time this notebook ran and worked
# (output of `show_versions` imported from `rbc_gem_utils`).
show_versions()

## Set notebook options

In [None]:
# Database tag and expected-release constant taken from the UniProt helper module.
db_tag, expected_release = UNIPROT_DB_TAG, UNIPROT_RELEASE_EXPECTED

# Figure size used for the main comparison plots.
compare_figsize = (5, 20)
# Switches read by later cells:
#   compare         -> compare new tables against previously saved files
#   display_nunique -> print unique-value counts per column
#   overwrite       -> write the new tables to disk
compare, display_nunique, overwrite = True, True, True

## Check UniProt release
If the release does not match the expected release, it is because the database has been updated since the last time this code was utilized.

* According to [UniProt](https://www.uniprot.org/help/downloads), updates to the database are made every eight weeks.
* If the current release does not match the expected release, it is because the database has been updated since the last time this code was utilized.
    * If the notebook works without needing any significant modifications, the only update needed is to the release in the [uniprot.py](../../src/rbc_gem_utils/database/uniprot.py) source code file to resolve the issue.


In [None]:
# A falsy result from the online release check switches all output to "interim" paths.
release_is_expected = check_database_release_online(db_tag, verbose=True)
use_interim = not release_is_expected
if use_interim:
    warn(
        "Online release of database has been updated since the last time notebook was used."
    )

# Use different directory paths for unexpected behavior
temp_dirname = "interim" if use_interim else None
database_dirpath = get_dirpath("database", db_tag, use_temp=temp_dirname)
annotation_dirpath = get_dirpath("annotation", use_temp=temp_dirname)

# Ensure directories exist
for dirpath in (database_dirpath, annotation_dirpath):
    dirpath.mkdir(exist_ok=True, parents=True)

## Load RBC-GEM model

In [None]:
# Read the model from "<GEM_NAME>.xml" inside the model directory.
model_dirpath = get_dirpath("model")
model_filepath = model_dirpath / f"{GEM_NAME}.xml"
model = read_cobra_model(filename=model_filepath)
model

## Download data from UniProt

### Get IDs for query
#### Using an existing annotation

In [None]:
annotation_type = "genes"
annotation_cols = ["uniprot"]
mapping_key = "uniprot"

# Annotation table for the model genes, keeping only rows that have a mapping value.
df_gene_annotations = get_annotation_df(model.genes, annotation_cols)
df_gene_annotations = df_gene_annotations.rename({"id": annotation_type}, axis=1)
df_model_mappings = df_gene_annotations.dropna(subset=[mapping_key])

# Split each column's string values and expand them to one value per row.
for col in list(df_model_mappings.columns):
    df_model_mappings[col] = df_model_mappings[col].apply(split_string)
    df_exploded = df_model_mappings.explode(col)
    df_model_mappings = df_exploded.drop_duplicates().dropna()
df_model_mappings = df_model_mappings.sort_values(annotation_type)

print(df_model_mappings.nunique(dropna=True))
df_model_mappings = df_model_mappings.reset_index(drop=True)
df_model_mappings

In [None]:
annotation_to_from_db = get_annotation_to_from_db_UniProt(miriam_only=True)

# Source database name for the chosen mapping key, and the unique IDs to query.
from_db = annotation_to_from_db[mapping_key]
query_ids = df_model_mappings[mapping_key].dropna().unique()
assert len(set(query_ids)) == len(query_ids), "Duplicate IDs in list to query"

# Model object ID -> queried ID (a later row overrides an earlier one for the same object).
id_series = df_model_mappings.set_index(annotation_type)[mapping_key]
model_search_mapping = id_series.to_dict()

n_model_genes = len(model_search_mapping)
n_query_ids = len(query_ids)
print(f"Number of model genes associated with query: {n_model_genes}")
print(f"Number of unique IDs to query: {n_query_ids}")
df_model_mappings[[annotation_type, mapping_key]].drop_duplicates()

## Run queries
### Set universal query parameters

In [None]:
# Collects the result table of each query, keyed by a query label.
all_query_results = dict()

In [None]:
miriam_query_fields = get_query_fields_UniProt(miriam_only=True)

# Add additional non-miriam fields if desired
additional_query_fields = [
    "cc_subunit",  # Complex composition
    "cc_tissue_specificity",  # Specific isoforms to include/avoid
    "cc_subcellular_location",  # Specific isoforms to include/avoid
    "xref_proteomes",  # Chromosome
]
query_fields = miriam_query_fields + additional_query_fields

In [None]:
# Extract all relevant information for now and save
query_parameters = {
    "query": " && ".join(
        [
            "(reviewed:true)",
            "(organism_id:9606)",  # Homo sapiens (Human)
        ]
    ),
    "format": "tsv",
    "size": 500,
    "compressed": True,
    "fields": ",".join(query_fields),
}

### Initial query

In [None]:
query_key = "initial"

# Query UniProtKB with the model IDs; failed IDs are returned alongside the results.
initial_query_output = query_UniProt(
    list(query_ids),
    query_parameters=query_parameters,
    to_db="UniProtKB",
    from_db=from_db,
    return_failed=True,
)
df_results, uniparc, failed_ids, obselete_counts = initial_query_output
if failed_ids:
    print(failed_ids)

# Store the result table under the query label for later concatenation.
all_query_results[query_key] = df_results
df_results

### Address failed IDs

In [None]:
# retry_ids = {}
# query_key = "retry_1"
# df_results, failed_ids = query_UniProt(
#     list(sorted(retry_ids.values())),
#     from_db="UniProtKB",
#     query_parameters=query_parameters
# )
# if failed_ids:
#     print(failed_ids)
# all_query_results[query_key] = df_results
# model_search_mapping.update({
#     k: retry_ids[v] for k, v in model_search_mapping.items()
#     if v in retry_ids and v not in failed_ids
# })
# df_results

## Concat, cleanup, and save query results

In [None]:
print(f"Number of unique queries: {len(all_query_results)}")

# Stack all query tables, index by the queried ID ("From"), drop repeated rows,
# and treat empty strings as missing values.
df_query_results = (
    pd.concat(tuple(all_query_results.values()))
    .set_index("From")
    .drop_duplicates()
    .replace("", pd.NA)
)
df_query_results

### Save extracted data to database

In [None]:
# Save query results to external database
database_filepath = database_dirpath / f"{db_tag}_{GEM_NAME}.tsv"
df_database = df_query_results.reset_index(drop=True).drop_duplicates()
if compare:
    compare_on_index = ["Entry"]
    # Load the previously saved table; fall back to an empty one if none exists yet.
    try:
        df_previous = pd.read_csv(
            database_filepath, sep="\t", index_col=None, dtype=str
        )
    except FileNotFoundError:
        df_previous = pd.DataFrame([], columns=compare_on_index)
    else:
        df_previous = df_previous.replace(float("nan"), pd.NA)

    previous_indexed = df_previous.set_index(compare_on_index)
    current_indexed = df_database.set_index(compare_on_index)
    df_comparision = compare_tables(previous_indexed, current_indexed)

    fig, ax = plt.subplots(1, 1, figsize=compare_figsize)
    ax.yaxis.set_tick_params(labelsize=8)
    ax = visualize_comparison(df_comparision)


if overwrite:
    df_database.to_csv(database_filepath, sep="\t", index=False)

df_database

## Format UniProt information for annotation files
### Genes
#### Map to chosen MIRIAMs
As formatting may be needed for some MIRIAMS, keep it simple for now until formatting methods are developed. 

In [None]:
annotation_type = "genes"

# Keeping it simple for now, group items regardless of isoforms for the time being
miriam_fields = get_query_fields_UniProt(miriam_only=True)
uniprot_miriam_mapping = get_label_miriam_mapping_UniProt(miriam_fields)
uniprot_miriam_mapping["Proteomes"] = "chromosome"

# Invert the label -> MIRIAM mapping for the columns present in the model mappings
# to find which UniProt column to merge on.
miriam_to_label = {}
for label, miriam in uniprot_miriam_mapping.items():
    if miriam in df_model_mappings.columns:
        miriam_to_label[miriam] = label
merge_key = miriam_to_label[mapping_key]

# Left merge keeps every model mapping row, even without a database match.
df_annotations = df_model_mappings.set_index(mapping_key).merge(
    df_database, left_index=True, right_on=merge_key, how="left"
)

# Index by model object, keep only the mapped columns, and rename them to MIRIAM keys.
df_annotations = df_annotations.set_index(annotation_type)
df_annotations = df_annotations.loc[:, list(uniprot_miriam_mapping)]
df_annotations = df_annotations.rename(uniprot_miriam_mapping, axis=1)
uniprot_columns = ["uniprot", "uniprot.isoform", "uniprot.chain"]
# For the most part, these columns do not require any reformatting or are easy to work with.
# The list is assembled from per-category groups; order is significant.
annotation_columns = (
    ["hgnc.symbol", "ec-code", "taxonomy", "uniparc"]
    # Reactions
    + ["rhea"]
    # Gene Ontology (GO)
    + ["go"]
    # Sequence
    + ["ccds", "ena.embl", "refseq"]
    # 3D Structure
    + ["bmrb", "pdb", "sasbdb", "smr"]
    # Protein-protein interaction
    + ["biogrid", "complexportal", "dip", "intact"]
    # Chemistry databases
    + ["chembl.target", "drugbank", "iuphar.receptor"]
    # Protein family/group databases
    + ["cazy", "ideal", "merops", "peroxibase", "tcdb"]
    # Genetic variation/Polymorphism and mutation databases
    + ["dbsnp"]
    # Proteomic databases
    + ["proteomicsdb.protein"]
    # Genome annotation databases
    + ["ensembl", "ncbigene"]
    ## Organism-specific
    + [
        "kegg.genes",
        "genecards",
        "hgnc",
        "hpa",
        "mim",
        "nextprot",
        "orphanet",
        "pharmgkb.gene",
    ]
    # Phylogenomic databases
    + ["eggnog", "genetree", "hogenom", "oma.grp", "orthodb", "treefam"]
    # Enzyme and pathway databases
    + ["biocyc", "brenda", "reactome"]
    # Miscellaneous databases
    + ["genewiki"]
    # Gene expression databases
    + ["bgee.gene"]
    ## Family and domain databases
    + [
        "cdd",
        "disprot",
        "hamap",
        "interpro",
        "panther.family",
        "pfam",
        "pirsf",
        "prints",
        "prosite",
        "smart",
        "supfam",
    ]
    + ["chromosome"]
)
# Keep only the last space-separated token of the proteome value.
# The preceding left merge can leave missing values (NaN/NA) for rows without a
# database match; those are passed through instead of raising on `.split`.
df_annotations["chromosome"] = df_annotations["chromosome"].apply(
    lambda x: x.split(" ")[-1] if isinstance(x, str) else x
)
df_annotations = df_annotations.loc[:, uniprot_columns + annotation_columns]
print(f"Fields searched: {df_annotations.shape[1]}")

# Columns in which every value is missing are dropped from both the table and
# the list of annotation columns used by later cells.
all_na = df_annotations.columns[df_annotations.isna().all(axis=0).to_numpy()]
annotation_columns = [x for x in annotation_columns if x not in all_na]
df_annotations = df_annotations.dropna(how="all", axis=1)
print(f"Empty dropped: {len(all_na)}")
print(f"Remaining: {df_annotations.shape[1]}")

# Restore the model object ID as a column and use empty strings for missing values.
df_annotations = df_annotations.reset_index(drop=False).replace(pd.NA, "")
df_annotations

In [None]:
# Parse the isoform column into one row per isoform, flagging canonical ones.
isoform_input = df_annotations.loc[:, ["uniprot", "uniprot.isoform"]].copy()
df_isoforms = parse_isoforms_UniProt(isoform_input, add_canonical=True)

# For canonical rows: the isoform ID when present, otherwise the accession (row name).
is_canonical = df_isoforms["uniprot.canonical"].apply(bool)
df_canonical = df_isoforms[is_canonical].set_index("uniprot")
df_canonical = df_canonical.apply(
    lambda x: x["uniprot.isoform"] or x.name, axis=1
)

# Collapse isoform IDs back to one joined string per accession.
df_isoforms = df_isoforms.groupby("uniprot")[["uniprot.isoform"]].agg(
    lambda x: build_string(x)
)

# Parse chains and attach the per-accession isoform string.
chain_input = df_annotations.loc[:, ["uniprot", "uniprot.chain"]].copy()
df_chains = parse_chains_UniProt(chain_input)
df_isoforms_chains = df_chains.merge(df_isoforms, right_index=True, left_on="uniprot")
df_isoforms_chains

In [None]:
def _strip_prefix(value, prefix):
    """Return `value` without a leading `prefix`; unchanged if the prefix is absent."""
    # `str.lstrip(prefix)` would strip any leading run of the prefix's characters,
    # not the prefix itself, so slice explicitly instead.
    return value[len(prefix) :] if value.startswith(prefix) else value


def _isoform_value_or_entry(entry, isoform_id):
    """Return the isoform-specific value of `entry`, or `entry` itself if that is blank."""
    value = get_isoform_value_from_entry_UniProt(entry, isoform_id)
    return value if value.strip() else entry


# Overwrite the isoform/chain columns with their parsed versions (aligned on index).
for col, series in df_isoforms_chains.items():
    df_annotations[col] = series

for idx, row in df_annotations.loc[:, annotation_columns].iterrows():
    isoform_id = df_isoforms_chains.loc[idx, "uniprot.isoform"]
    if isoform_id and len(isoform_id.split(";")) != 1:
        isoform_id = None
    # No isoform ID set, just aggregate all without regards to isoform.
    row = row.apply(lambda x: _isoform_value_or_entry(x, isoform_id))
    row = row.apply(lambda x: x.strip().rstrip(";"))
    # A duplicate reindexing error here may mean duplicate columns in annotation column values
    df_annotations.loc[idx, annotation_columns] = row.values

# Clean up other annotations: remove the "<KEY>:" prefix from each identifier.
keys = ["rhea", "go", "hgnc"]
for key in keys:
    if key in df_annotations.columns:
        prefix = f"{key.upper()}:"
        df_annotations[key] = (
            df_annotations[key]
            .fillna("")
            .apply(
                lambda x: build_string(
                    [_strip_prefix(s, prefix) for s in split_string(x)]
                )
            )
        )


# Normalize missing values before comparing/saving.
df_annotations = df_annotations.replace(float("nan"), pd.NA).replace("", pd.NA)
if compare:
    compare_on_index = [annotation_type]
    # Load the previously saved annotation table; use an empty one if none exists.
    try:
        df_previous = pd.read_csv(
            annotation_dirpath / f"{annotation_type}_{db_tag}.tsv",
            sep="\t",
            index_col=None,
            dtype=str,
        )
        df_previous = df_previous.replace(float("nan"), pd.NA).replace("", pd.NA)
    except FileNotFoundError:
        df_previous = pd.DataFrame([], columns=compare_on_index)
    df_comparision = compare_tables(
        df_previous.set_index(compare_on_index),
        df_annotations.set_index(compare_on_index),
    )

    fig, ax = plt.subplots(1, 1, figsize=compare_figsize)
    ax.yaxis.set_tick_params(labelsize=8)
    ax = visualize_comparison(df_comparision)

if display_nunique:
    # Count unique individual values per column after splitting joined strings.
    for col in df_annotations.columns:
        df = (
            df_annotations[col]
            .apply(lambda x: split_string(x))
            .explode()
            .drop_duplicates()
        )
        print(f"{df.name}: {df.nunique()}")
if overwrite:
    df_annotations.to_csv(
        annotation_dirpath / f"{annotation_type}_{db_tag}.tsv", sep="\t", index=False
    )

df_annotations

## Format Complex Table
### Complexes and stoichiometry
Information extracted here can be useful for protein constrained modeling

In [None]:
# UniProt result columns kept for the complex table, with their new names.
rename_mapping = {
    "Entry": "uniprot",
    "Gene Names (primary)": "hgnc.symbol",
    "Organism (ID)": "taxonomy",
    "Subunit structure": "subunit_text",
}
df_complex_results = df_query_results.loc[:, list(rename_mapping)]
df_complex_results = df_complex_results.rename(rename_mapping, axis=1).copy()

complexes_filepath = database_dirpath / "uniprot_complexes.tsv"
if compare:
    compare_on_index = ["uniprot"]
    # Load the previously saved table; fall back to an empty one if none exists yet.
    try:
        df_previous = pd.read_csv(
            complexes_filepath, sep="\t", index_col=None, dtype=str
        )
    except FileNotFoundError:
        df_previous = pd.DataFrame([], columns=compare_on_index)
    else:
        df_previous = df_previous.replace(float("nan"), pd.NA).replace("", pd.NA)

    previous_indexed = df_previous.set_index(compare_on_index)
    current_indexed = df_complex_results.set_index(compare_on_index)
    df_comparision = compare_tables(previous_indexed, current_indexed)

    fig, ax = plt.subplots(1, 1, figsize=(5, 5))
    ax.yaxis.set_tick_params(labelsize=8)
    ax = visualize_comparison(df_comparision)

if display_nunique:
    print(f"{df_complex_results.nunique()}")

if overwrite:
    df_complex_results.to_csv(complexes_filepath, sep="\t", index=False)

#### Parse complexes keywords

In [None]:
# TODO finish automation component
# subunit_terms_mapping_dict = {
#     # Contains terms for subunit mapping.
#     # Additional parsing needed for terms associated with more than one key and/or heteromultimeric complexes
#     # May occur for complexes that have overlapping terms (e.g., decamer and undecamer)
#     # A value of `0` indicates specific parsing or manual mappping is needed and can be determined from just the term.
# #     0: {"polypeptide", "oligomer", "complex", "catalytic subunit", "multisubunit", "regulatory subunit", "auxillary subunit", "alpha subunit", "beta subunit", "gamma subunit", "delta subunit", "proteasome", "multimer", "polymers"},
# #     1: {"1 subunit", "monomer"},
# #     2: {"2 subunits", "dimer"},
# #     3: {"3 subunits", "trimer"},
# #     4: {"4 subunits", "tetramer"},
# #     5: {"5 subunits", "pentamer"},
# #     6: {"6 subunits", "hexamer"},
# #     7: {"7 subunits", "heptamer"},
# #     8: {"8 subunits", "octamer"},
# #     9: {"9 subunits", "nonomer"},
# #     10: {"10 subunits", "decamer"},
# #     11: {"11 subunits", "undecamer"},
# #     12: {"12 subunits", "dodecamer"},
# #     13: {"13 subunits", "tridecamer"},
# #     14: {"14 subunits", "tetradecamer"},
# #     15: {"15 subunits", "pentadecamer"},
# #     16: {"16 subunits", "hexadecamer"},
# #     17: {"17 subunits", "heptadecamer"},
# #     18: {"18 subunits", "octaadecamer"},
# #     19: {"19 subunits", "nonadecamer"},
# #     20: {"20 subunits", "didecamer"},
# #     22: {"22 subunits"},
# #     24: {"24 subunits"},
# #     26: {"26 subunits"},
# # }

# data = {}
# for uniprot_id, subunit_text in df_complex_results["subunit_text"].dropna().items():
#     data[uniprot_id] = {
#         "subunit_text": subunit_text
#     }
#     # First determine if any subunit matches can be made.
#     matches = set()
#     for n_subunits, search_terms in subunit_terms_mapping_dict.items():
#         matches.update([match.lower() for term in search_terms for match in re.findall(term, subunit_text, re.IGNORECASE)])
#     if not matches:
#         data[uniprot_id].update({
#             "matches": pd.NA,
#             "manual": True,
#         })
#         continue
#     elif matches:

#     break


# # df_model_complexes = pd.DataFrame.from_dict(data, orient="index")


# # # if overwrite:
# # #     df_isoforms_final.to_csv(database_dirpath / "uniprot_isoforms.tsv", sep="\t", index=False)
# # #     df_erythroid.to_csv(database_dirpath / "uniprot_isoforms_erythroid.tsv", sep="\t", index=False)
# # # else:
# # #     df_isoforms_final.to_csv(ROOT_PATH / INTERIM_PATH / "uniprot_isoforms.tsv", sep="\t", index=False)
# # #     df_erythroid.to_csv(ROOT_PATH / INTERIM_PATH / "uniprot_isoforms_erythroid.tsv", sep="\t", index=False)
# # # df_erythroid
# # df_model_complexes = df_model_mappings.merge(df_model_complexes, left_on="uniprot", right_index=True, how="inner")
# # df_model_complexes

## Load Isoforms and Sequences
### Isoforms
#### Parse data into initial table of isoforms

In [None]:
# Keyword lists searched (case-insensitively, as regex patterns) in later steps.
# Matches mark an isoform as erythroid-related.
erythro_keywords = ["erythro", "erythrocyte", "erythroid", "red blood cell"]
erythro_keywords += ["rbc", "R-type", "P5N-I", "reticulocyte"]
# Matches mark an isoform as an acceptable fallback.
backup_keywords = ["cyto", "retic", "cell membrane"]
# Matches mark an isoform as one to avoid.
avoid_keywords = ["non-erythro", "mito", "not detected", "synaptic", "testis"]

# UniProt result column -> column name used in the isoform tables.
rename_mapping = dict(
    [
        ("Entry", "uniprot"),
        ("Gene Names (primary)", "hgnc.symbol"),
        ("Organism (ID)", "taxonomy"),
        ("Alternative products (isoforms)", "uniprot.isoform"),
        ("Tissue specificity", "tissue_specificity"),
        ("Subcellular location [CC]", "subcellular_location"),
    ]
)
# Free-text columns (renamed) that are scanned for the keywords above.
columns_to_search = [
    rename_mapping[label]
    for label in ("Tissue specificity", "Subcellular location [CC]")
]

# Select and rename the UniProt columns needed for isoform selection.
df_tissue_specificity = df_query_results.loc[:, list(rename_mapping)]
df_tissue_specificity = df_tissue_specificity.rename(rename_mapping, axis=1).copy()

# One row per isoform, with canonical isoforms flagged.
isoform_input = df_tissue_specificity.loc[:, ["uniprot", "uniprot.isoform"]].copy()
df_isoforms = parse_isoforms_UniProt(isoform_input, add_canonical=True)

# Attach the free-text columns, matching each isoform's accession against the
# index of the query results.
df_search_text = df_tissue_specificity[columns_to_search]
df_isoforms = df_isoforms.merge(
    df_search_text, left_on="uniprot", right_index=True, how="left"
)

# Flag and keyword columns start out empty and are filled in below / in later steps.
df_isoforms["erythroid"] = pd.NA
df_isoforms["backup"] = pd.NA
df_isoforms["avoid"] = pd.NA
df_isoforms["keywords.erythroid"] = pd.NA
df_isoforms["keywords.backup"] = pd.NA
df_isoforms["keywords.avoid"] = pd.NA
all_names = sorted(
    df_isoforms["uniprot.isoform.name"].replace("", pd.NA).dropna().unique()
)
all_synonyms = sorted(
    df_isoforms["uniprot.isoform.synonyms"].replace("", pd.NA).dropna().unique()
)

# Captures the text following "[Isoform " up to the closing bracket.
# Compiled once instead of on every row/entry.
isoform_tag_pattern = re.compile(r"\[Isoform (.+?(?=\]))")

# Scan the free-text columns for isoform-specific statements ("[Isoform X] ...")
# and record which keywords occur in the statement belonging to each row's isoform.
for col_to_search in columns_to_search:
    for idx, value_string in df_isoforms[col_to_search].dropna().items():
        if not isoform_tag_pattern.search(value_string):
            continue
        for isoform_entry in value_string.split(";"):
            match = isoform_tag_pattern.search(isoform_entry)
            if not match:
                continue

            # Only use statements that refer to this row's isoform name or synonym.
            isoform_name_or_synonym = match.group(1)
            if not (
                df_isoforms.loc[idx, "uniprot.isoform.name"] == isoform_name_or_synonym
                or df_isoforms.loc[idx, "uniprot.isoform.synonyms"]
                == isoform_name_or_synonym
            ):
                continue

            for col, keywords in zip(
                ["erythroid", "backup", "avoid"],
                [erythro_keywords, backup_keywords, avoid_keywords],
            ):
                found_keywords = set()
                for k in keywords:
                    found_keywords.update(
                        re.findall(k, isoform_entry.strip(), re.IGNORECASE)
                    )
                # Merge with keywords already stored for this row. Reading the single
                # cell avoids copying the whole table (`fillna`) on every iteration.
                existing_keywords = df_isoforms.loc[idx, f"keywords.{col}"]
                if isinstance(existing_keywords, str) and existing_keywords:
                    found_keywords.update(split_string(existing_keywords))
                df_isoforms.loc[idx, f"keywords.{col}"] = build_string(found_keywords)

# Extend the keyword columns with matches found in the isoform name and synonyms,
# then derive the boolean erythroid/backup/avoid flags from them.
for col, keywords in zip(
    ["erythroid", "backup", "avoid"],
    [erythro_keywords, backup_keywords, avoid_keywords],
):
    # Convert the stored keyword strings to sets (empty set for non-string values)
    # so new matches can be merged in.
    df_isoforms[f"keywords.{col}"] = df_isoforms[f"keywords.{col}"].apply(
        lambda x: set(split_string(x)) if isinstance(x, str) else set()
    )
    for k in keywords:
        # Case-insensitive regex matches of the keyword in the isoform name ...
        df_isoforms[f"keywords.{col}"] = (
            df_isoforms[[f"keywords.{col}", "uniprot.isoform.name"]]
            .fillna("")
            .apply(
                lambda x: x[f"keywords.{col}"].union(
                    set(re.findall(k, x["uniprot.isoform.name"], re.IGNORECASE))
                ),
                axis=1,
            )
        )
        # ... and in the isoform synonyms.
        df_isoforms[f"keywords.{col}"] = (
            df_isoforms[[f"keywords.{col}", "uniprot.isoform.synonyms"]]
            .fillna("")
            .apply(
                lambda x: x[f"keywords.{col}"].union(
                    set(re.findall(k, x["uniprot.isoform.synonyms"], re.IGNORECASE))
                ),
                axis=1,
            )
        )
    # Back to a single joined string per row, skipping empty matches.
    df_isoforms[f"keywords.{col}"] = df_isoforms[f"keywords.{col}"].apply(
        lambda x: build_string([s for s in x if s])
    )
# Empty strings become missing values so that `notna` below means "keywords found".
df_isoforms = df_isoforms.replace(float("nan"), pd.NA).replace("", pd.NA)
df_isoforms["erythroid"] = df_isoforms["keywords.erythroid"].notna()
df_isoforms["backup"] = df_isoforms["keywords.backup"].notna()
df_isoforms["avoid"] = df_isoforms["keywords.avoid"].notna()
# Remove those found in both categories from "erythroid", usually caused by words like `non-erythro`
# NOTE(review): rows are selected by index label, so this assumes unique index labels -- confirm.
df_isoforms.loc[
    df_isoforms[df_isoforms[["erythroid", "avoid"]].all(axis=1)].index,
    "erythroid",
] = False
# Erythroid easily serves as a backup option
df_isoforms.loc[
    df_isoforms[df_isoforms[["erythroid"]].all(axis=1)].index,
    "backup",
] = True

df_isoforms["canonical"] = df_isoforms["uniprot.canonical"]
# Sequence ID is the isoform ID when one exists, otherwise the accession itself.
df_isoforms["sequence.id"] = df_isoforms.fillna("").apply(
    lambda x: x["uniprot.isoform"] if x["uniprot.isoform"] else x["uniprot"], axis=1
)
df_isoforms = df_isoforms.replace(float("nan"), pd.NA).replace("", pd.NA)
# Keep only the final columns, drop repeated rows, and renumber the index.
df_isoforms = (
    df_isoforms.loc[
        :,
        [
            "uniprot",
            "uniprot.isoform",
            "sequence.id",
            "canonical",
            "erythroid",
            "backup",
            "avoid",
            "keywords.erythroid",
            "keywords.backup",
            "keywords.avoid",
        ],
    ]
    .drop_duplicates()
    .reset_index(drop=True)
)

### Sequences
#### Extract sequences using UniParc Mapping

In [None]:
# Keep only rows whose sequence ID starts with the row's own accession.
id_starts_with_accession = df_isoforms.apply(
    lambda x: x["sequence.id"].startswith(x["uniprot"]), axis=1
)
df_isoforms_sequences = df_isoforms[id_starts_with_accession].copy()
query_ids = df_isoforms_sequences["sequence.id"].unique()

# Only the sequence and its length are requested for this query.
sequence_fields = ["sequence", "length"]
query_parameters = dict(
    format="tsv",
    size=500,
    compressed=True,
    fields=",".join(sequence_fields),
)

query_key = "initial"
sequence_query_output = query_UniProt(
    list(query_ids),
    query_parameters=query_parameters,
    from_db="UniProtKB_AC-ID",
    to_db="UniParc",
    return_failed=True,
)
df_results, uniparc, failed_ids, obselete_counts = sequence_query_output

if failed_ids:
    print(failed_ids)

# Attach the query results to the isoform rows, matching the queried ID ("From")
# against each row's sequence ID.
sequence_column_names = {
    "Sequence": "sequence",
    "Length": "sequence.length",
    "mass": "sequence.mass",
}
df_sequences = df_results.set_index("From").rename(sequence_column_names, axis=1)
df_isoforms_sequences = df_isoforms_sequences.merge(
    df_sequences, left_on="sequence.id", right_index=True
)

# #  The following code can be used to address entries with issues in extractions. LDHB example.
# df_isoforms_sequences = pd.concat(
#     (
#         df_isoforms_sequences,
#         pd.DataFrame.from_dict(
#             {
#                 "uniprot": "P07195",
#                 "sequence.id": "P07195",
#                 "canonical": True,
#                 "erythroid": False,
#                 "backup": False,
#                 "avoid": False,
#                 "sequence": "MATLKEKLIAPVAEEEATVPNNKITVVGVGQVGMACAISILGKSLADELALVDVLEDKLKGEMMDLQHGSLFLQTPKIVADKDYSVTANSKIVVVTAGVRQQEGESRLNLVQRNVNVFKFIIPQIVKYSPDCIIIVVSNPVDILTYVTWKLSGLPKHRVIGSGCNLDSARFRYLMAEKLGIHPSSCHGWILGEHGDSSVAVWSGVNVAGVSLQELNPEMGTDNDSENWKEVHKMVVESAYEVIKLKGYTNWAIGLSVADLIESMLKNLSRIHPVSTMVKGMYGIENEVFLSLPCILNARGLTSVINQKLKDDEVAQLKKSADTLWDIQKDLKDL",
#                 "sequence.length": "334",
#             },
#             orient="index",
#         ).T,
#     ),
#     axis=0,
# ).convert_dtypes()

# Renumber rows, then optionally write the isoform/sequence table to the database directory.
df_isoforms_sequences = df_isoforms_sequences.reset_index(drop=True)

if overwrite:
    sequences_filepath = database_dirpath / f"{UNIPROT_DB_TAG}_isoforms_sequences.tsv"
    df_isoforms_sequences.to_csv(sequences_filepath, sep="\t", index=False)

df_isoforms_sequences

In [None]:
# Model mappings joined (on shared columns) with the isoforms flagged as erythroid.
df_erythroid_sequences = df_isoforms_sequences[df_isoforms_sequences["erythroid"]]
df_model_mappings.merge(df_erythroid_sequences)