# Extract annotations and data from Mouse Genome Informatics database

The purpose of this notebook is to extract and format data for subsequent model annotation. 

Additionally, the purpose of this notebook is to extract relevant data to map between mouse and human homologs.

## Notebook Requirements:
*  Model genes **must** have at least one of the following annotations stored in `object.annotation`. Values are expected to be separated by semicolons. Accepted keys currently include:
    * `"uniprot"`
*  Note: Requires internet connection to download information from [MGI](https://www.informatics.jax.org/).

### Citations
Baldarelli RM, Smith CL, Ringwald M, Richardson JE, Bult CJ; Mouse Genome Informatics Group. Mouse Genome Informatics: an integrated knowledgebase system for the laboratory mouse. Genetics. 2024 May 7;227(1):iyae031. doi: 10.1093/genetics/iyae031. PMID: 38531069; PMCID: PMC11075557.

## Setup
### Import packages

In [None]:
import re
from warnings import warn

import matplotlib.pyplot as plt
import pandas as pd
from cobra.core.gene import GPR
from cobra.manipulation import remove_genes
from rbc_gem_utils import (
    GEM_NAME,
    build_string,
    check_database_release_online,
    compare_tables,
    get_annotation_df,
    get_dirpath,
    read_cobra_model,
    show_versions,
    split_string,
    visualize_comparison,
    write_cobra_model,
)
from rbc_gem_utils.database.mgi import (
    MGI_DB_TAG,
    MGI_RELEASE_EXPECTED,
    download_database_MGI,
)
from sympy import parse_expr

# Show versions of notebook
show_versions()

## Set notebook options

In [None]:
# Database tag (used for directory and file names) and the release this notebook expects
db_tag = MGI_DB_TAG
expected_release = MGI_RELEASE_EXPECTED
# When True, the MGI report files are downloaded before being loaded
download_database = True

# Figure size for the table comparison plots
compare_figsize = (5, 5)
# When True, new tables are compared against previously saved files
compare = True
# When True, the number of unique values per annotation column is printed
display_nunique = True
# When True, resulting tables are written to disk
overwrite = True

## Check MGI release
* If the current release does not match the expected release, it is because the database has been updated since the last time this code was utilized.
    * If the notebook works without needing any significant modifications, the only update needed is to the release in the [mgi.py](../../src/rbc_gem_utils/database/mgi.py) source code file to resolve the issue.

In [None]:
# Directory paths are requested with use_temp="interim" whenever the release check does not pass
release_is_expected = check_database_release_online(db_tag, verbose=True)
use_interim = not release_is_expected
if use_interim:
    warn(
        "Online release of database has been updated since the last time notebook was used."
    )

interim_tag = "interim" if use_interim else None
database_dirpath = get_dirpath("database", db_tag, use_temp=interim_tag)
annotation_dirpath = get_dirpath("annotation", use_temp=interim_tag)

# Create both output directories, including any missing parents
for output_dirpath in (database_dirpath, annotation_dirpath):
    output_dirpath.mkdir(exist_ok=True, parents=True)

## Load Human-Mouse mapping tables
### Download from database

In [None]:
# MGI report files needed to build the human-mouse mapping tables
mgi_report_files = [
    "HGNC_AllianceHomology.rpt",
    "MRK_SwissProt.rpt",  # Only reviewed proteins included
    "MRK_SwissProt_TrEMBL.rpt",  # Unreviewed proteins included
]
if download_database:
    download_database_MGI(mgi_report_files)

### Set MIRIAM keys

In [None]:
# Annotation keys used as column names throughout the notebook
mgi_key = "mgi"
hgnc_key = "hgnc"
hgnc_sym_key = f"{hgnc_key}.symbol"
mgnc_sym_key = "mgnc.symbol"  # Not currently a true miriam mapping
uniprot_key = "uniprot"
# Selects which protein table is loaded: reviewed-only (True) or reviewed + unreviewed (False)
only_reviewed_proteins = True

# Width of the "=" rule printed under summary headings
header_len = 30
# MGI report column label -> annotation key; only these columns are kept from the loaded tables
column_map = {
    "MGI Accession ID": mgi_key,
    "Marker Symbol": mgnc_sym_key,
    "HGNC ID": hgnc_key,
    "UniProt": f"{uniprot_key}.mouse",  # To distinguish from human,
}

### Load homology table

In [None]:
homology_filepath = database_dirpath / "HGNC_AllianceHomology.tsv"
# TODO map additional columns for annotations
df_homology = pd.read_csv(homology_filepath, sep="\t", index_col=False)

# Keep only the columns that have an annotation key, then relabel them with that key
homology_columns = [label for label in column_map if label in df_homology.columns]
df_homology = df_homology[homology_columns].rename(columns=column_map)
print(df_homology.nunique())
df_homology.head()

### Load protein table

In [None]:
# Choose between the reviewed-only table and the reviewed + unreviewed table
prot_file = (
    "MRK_SwissProt.tsv" if only_reviewed_proteins else "MRK_SwissProt_TrEMBL.tsv"
)

# TODO map additional columns for annotations
df_proteins = pd.read_csv(
    database_dirpath / prot_file, sep="\t", index_col=False, header=None
)
# The file has no header row, so the columns are labeled here using the
# column descriptions given on the website the file comes from
protein_file_headers = [
    "MGI Accession ID",
    "Marker Symbol",
    "Status",
    "Marker Name",
    "cM position",
    "Chromosome",
    "UniProt",  # SWISS-PROT Protein Accession IDs
]
df_proteins.columns = protein_file_headers

# Keep only the columns that have an annotation key, then relabel them with that key
protein_columns = [label for label in column_map if label in df_proteins.columns]
df_proteins = df_proteins[protein_columns].rename(columns=column_map)

print(df_proteins.nunique())
df_proteins.head()

### Create mapping table for all mouse genes/proteins to human genes

In [None]:
# Attach mouse UniProt accessions to every homology entry. The left join keeps
# mouse genes without a protein entry; columns duplicated by the merge get the
# "_drop" suffix and are discarded.
df_mouse_prots = df_homology.merge(
    df_proteins, left_on=mgi_key, right_on=mgi_key, how="left", suffixes=[None, "_drop"]
)
df_mouse_prots = df_mouse_prots.drop(
    [c for c in df_mouse_prots.columns if c.endswith("_drop")], axis=1
)

# Multiple accessions are space-separated; expand to one accession per row
df_mouse_prots[f"{uniprot_key}.mouse"] = df_mouse_prots[
    f"{uniprot_key}.mouse"
].str.split(" ")
df_mouse_prots = df_mouse_prots.explode(f"{uniprot_key}.mouse")
print(
    "\n".join(
        ("Initial data table", "=" * header_len, str(df_mouse_prots.nunique()), "")
    )
)

# Original mapping is 1 mouse gene to 1+ human genes
# Drop entries without human gene mappings. An explicit copy is taken so the
# column assignments below operate on an independent frame instead of a
# filtered view (avoids pandas' SettingWithCopyWarning).
df_mouse_prots = df_mouse_prots[df_mouse_prots[hgnc_key].notna()].copy()

# Remove prefixes from the IDs (literal match, not a regular expression)
df_mouse_prots[hgnc_key] = df_mouse_prots[hgnc_key].str.replace(
    "HGNC:", "", regex=False
)
df_mouse_prots[mgi_key] = df_mouse_prots[mgi_key].str.replace(
    "MGI:", "", regex=False
)
# Multiple human IDs are "|"-separated; expand to one human ID per row
df_mouse_prots[hgnc_key] = df_mouse_prots[hgnc_key].str.split("|")
df_mouse_prots = df_mouse_prots.explode(hgnc_key).sort_values(hgnc_key)
print(
    "\n".join(
        (
            "Genes with human homologs",
            "=" * header_len,
            str(df_mouse_prots.nunique()),
            "",
        )
    )
)
df_mouse_prots

#### Summarize general mapping results

In [None]:
# Unique (human gene, mouse symbol) pairs; a human ID that appears in more
# than one pair maps to several mouse symbols
human_mouse_pairs = df_mouse_prots[[hgnc_key, mgnc_sym_key]].drop_duplicates()
duplicated_count = (
    human_mouse_pairs[hgnc_key]
    .duplicated(keep=False)
    .value_counts()
    .rename({False: "One to One map ", True: "One to Many map "}, axis=0)
)
duplicated_count.index.name = None
duplicated_count.name = None
print("Human to Mouse mapping", "=" * header_len, duplicated_count, "", sep="\n")

## Map to RBC-GEM
### Load RBC-GEM model

In [None]:
# Read the model file named after GEM_NAME from the model directory
model_dirpath = get_dirpath("model")
model_filepath = model_dirpath / f"{GEM_NAME}.xml"
model = read_cobra_model(filename=model_filepath)
model

### Create model mapping table from annotations

In [None]:
model_annotation_keys = [uniprot_key, hgnc_key, hgnc_sym_key]
df_model = get_annotation_df(model.genes, model_annotation_keys).rename(
    columns={"id": "genes"}
)
# Expand semicolon-separated HGNC IDs to one per row, then attach the mouse
# mapping on the HGNC ID (left join keeps model genes without a mouse match)
df_model[hgnc_key] = df_model[hgnc_key].str.split(";")
df_model = (
    df_model.explode(hgnc_key)
    .sort_values(hgnc_key)
    .merge(df_mouse_prots, left_on=hgnc_key, right_on=hgnc_key, how="left")
)
print("Mapped to model", "=" * header_len, df_model.nunique(), "", sep="\n")

# Unique (human gene, mouse symbol) pairs; a human ID that appears in more
# than one pair maps to several mouse symbols
human_mouse_pairs = df_model[[hgnc_key, mgnc_sym_key]].drop_duplicates()
duplicated_count = (
    human_mouse_pairs[hgnc_key]
    .duplicated(keep=False)
    .value_counts()
    .rename({False: "One to One map ", True: "One to Many map "}, axis=0)
)
duplicated_count.index.name = None
duplicated_count.name = None
print("Human to Mouse mapping", "=" * header_len, duplicated_count, "", sep="\n")

### Format for human model annotations

In [None]:
human_annotation_keys = [uniprot_key, hgnc_key, hgnc_sym_key, mgi_key]
# Missing values are swapped for empty strings while the values of each gene
# are sorted and joined, then restored to NA afterwards
df_annotations = (
    df_model.replace(pd.NA, "")
    .groupby("genes", as_index=False)[human_annotation_keys]
    .agg(lambda values: build_string(sorted(values)))
    .replace("", pd.NA)
)
df_annotations

In [None]:
annotation_type = "genes"
annotation_filepath = annotation_dirpath / f"{annotation_type}_{db_tag}.tsv"
if compare:
    # Compare against the previously saved table; an empty table is used when
    # no previous file exists
    compare_on_index = [annotation_type]
    try:
        df_previous = pd.read_csv(
            annotation_filepath,
            sep="\t",
            index_col=None,
            dtype=str,
        )
        df_previous = df_previous.replace(float("nan"), pd.NA).replace("", pd.NA)
    except FileNotFoundError:
        df_previous = pd.DataFrame([], columns=compare_on_index)
    df_comparision = compare_tables(
        df_previous.set_index(compare_on_index),
        df_annotations.set_index(compare_on_index),
    )

    fig, ax = plt.subplots(1, 1, figsize=compare_figsize)
    ax.yaxis.set_tick_params(labelsize=8)
    ax = visualize_comparison(df_comparision)

if display_nunique:
    for col in df_annotations.columns:
        # Series.explode() takes no column argument; its only parameter is
        # `ignore_index`, so the column label must not be passed here.
        unique_values = (
            df_annotations[col].apply(split_string).explode().drop_duplicates()
        )
        print(f"{col}: {unique_values.nunique()}")
if overwrite:
    df_annotations.to_csv(annotation_filepath, sep="\t", index=False)

df_annotations

In [None]:
mouse_symbol_and_protein = [mgnc_sym_key, f"{uniprot_key}.mouse"]
report_columns = ["genes"] + mouse_symbol_and_protein

# Model genes without a mouse gene symbol
lacks_mouse_gene = df_model[mgnc_sym_key].isna()
map_to_na_gene = (
    df_model.loc[lacks_mouse_gene, report_columns]
    .drop_duplicates()
    .sort_values(mouse_symbol_and_protein)
)
print(
    "No identified mouse gene",
    "=" * header_len,
    map_to_na_gene.nunique(),
    "",
    sep="\n",
)

# Model genes without a mouse protein accession
lacks_mouse_protein = df_model[f"{uniprot_key}.mouse"].isna()
map_to_na_protein = (
    df_model.loc[lacks_mouse_protein, report_columns]
    .drop_duplicates()
    .sort_values(mouse_symbol_and_protein)
)
print(
    "No identified mouse protein",
    "=" * header_len,
    map_to_na_protein.nunique(),
    "",
    sep="\n",
)

# Keep only rows with both a mouse symbol and a mouse protein, then collect
# the mouse symbols of each model gene into a list
df_model = df_model.dropna(subset=mouse_symbol_and_protein)
gene_mapping = (
    df_model.groupby(["genes"])[mgnc_sym_key]
    .agg(lambda symbols: list(symbols))
    .to_dict()
)

### Update gene reaction rules

In [None]:
mouse_model = model.copy()
# Characters that may be part of a gene identifier; used to reject partial matches
gene_id_chars = "[a-zA-Z0-9_]"
for reaction in mouse_model.reactions.query(lambda x: x.gene_reaction_rule):
    # Replacement text for every gene of this reaction
    substitutions = {}
    for gene in reaction.genes:
        # De-duplicate while preserving order: gene_mapping may list the same
        # mouse symbol several times
        homologs = list(dict.fromkeys(gene_mapping.get(gene.id, [])))
        if not homologs:
            # No mouse homolog: placeholder gene that is removed after the loop
            substitutions[gene.id] = "TO_REMOVE"
        elif len(homologs) == 1:
            substitutions[gene.id] = homologs[0]
        else:
            # Parentheses are required: "&" binds tighter than "|", so without
            # them "A and B" with A -> "a1 or a2" would be parsed as
            # "a1 or (a2 and B)" instead of "(a1 or a2) and B"
            substitutions[gene.id] = "(" + " or ".join(homologs) + ")"

    gene_reaction_rule = reaction.gene_reaction_rule
    if substitutions:
        # All genes are substituted in one pass so that text inserted for one
        # gene is never re-matched by the ID of another. IDs are escaped,
        # longer IDs are tried first, and matches must not be embedded in a
        # longer identifier on either side.
        gene_alternatives = "|".join(
            re.escape(gene_id)
            for gene_id in sorted(substitutions, key=len, reverse=True)
        )
        gene_pattern = re.compile(
            rf"(?<!{gene_id_chars})(?:{gene_alternatives})(?!{gene_id_chars})"
        )
        gene_reaction_rule = gene_pattern.sub(
            lambda match: substitutions[match.group(0)], gene_reaction_rule
        )

    # Convert to a symbolic boolean expression: hyphens are not valid in symbol
    # names, and "or"/"and" become the "|"/"&" operators
    # NOTE(review): mouse symbols that start with a digit or coincide with a
    # name predefined by sympy may not be parsed as plain symbols -- confirm.
    gene_reaction_rule = (
        gene_reaction_rule.replace("-", "_")
        .replace(" or ", "|")
        .replace(" and ", " & ")
    )
    reaction.gpr = GPR.from_symbolic(parse_expr(gene_reaction_rule))

# Remove old model genes
remove_genes(
    mouse_model,
    gene_list=["TO_REMOVE"] + model.genes.list_attr("id"),
    remove_reactions=False,
)
mouse_model

### Update annotations for new genes

In [None]:
mouse_annotation_keys = [mgi_key, f"{uniprot_key}.mouse", hgnc_key, hgnc_sym_key]
# One row per mouse symbol with the values of each key joined into one string
df_mouse_annotations = (
    df_model.groupby(mgnc_sym_key)[mouse_annotation_keys]
    .agg(lambda values: build_string(values))
    .rename(columns={f"{uniprot_key}.mouse": uniprot_key})
)
# Hyphens are replaced by underscores so the index can be looked up by model gene ID
df_mouse_annotations.index = df_mouse_annotations.index.str.replace("-", "_")
for gene in mouse_model.genes:
    gene.annotation.update(
        {mgnc_sym_key: gene.id, **df_mouse_annotations.loc[gene.id].to_dict()}
    )
df_mouse_annotations

## Export model and data mapping tables

In [None]:
# Write the mouse model and the human-mouse mapping table to the database directory
mouse_model_filepath = database_dirpath / f"{GEM_NAME}.xml"
mapping_filepath = database_dirpath / "HumanMouseMapping.tsv"
write_cobra_model(mouse_model, filename=mouse_model_filepath)
df_model.to_csv(mapping_filepath, sep="\t", index=False)
df_model

## Download sequence data from UniProt

In [None]:
from rbc_gem_utils import GEM_NAME
from rbc_gem_utils.database.uniprot import (
    UNIPROT_API_URL,
    UNIPROT_DB_TAG,
    UNIPROT_PATH,
    UNIPROT_RELEASE_EXPECTED,
    get_annotation_to_from_db_UniProt,
    get_isoform_value_from_entry_UniProt,
    get_label_miriam_mapping_UniProt,
    get_query_fields_UniProt,
    get_release_UniProt,
    parse_chains_UniProt,
    parse_isoforms_UniProt,
    query_UniProt,
)

### Get IDs for query
#### Using an existing annotation

In [None]:
annotation_type = "genes"
annotation_cols = ["uniprot"]
mapping_key = "uniprot"

df_model_mappings = get_annotation_df(mouse_model.genes, annotation_cols)
df_model_mappings = df_model_mappings.rename(columns={"id": annotation_type})
df_model_mappings = df_model_mappings.dropna(subset=[mapping_key])
# Split every column into its individual values and expand to one value per row
for col in df_model_mappings.columns:
    df_model_mappings[col] = df_model_mappings[col].apply(split_string)
    df_model_mappings = df_model_mappings.explode(col).drop_duplicates().dropna()
df_model_mappings = df_model_mappings.sort_values(annotation_type).reset_index(
    drop=True
)

print(df_model_mappings.nunique(dropna=True))
df_model_mappings

In [None]:
annotation_to_from_db = get_annotation_to_from_db_UniProt(miriam_only=True)
from_db = annotation_to_from_db[mapping_key]

# Unique, non-missing IDs to send to UniProt
query_ids = df_model_mappings[mapping_key].dropna().unique()
assert len(query_ids) == len(set(query_ids)), "Duplicate IDs in list to query"

# Model gene -> queried ID
gene_to_query_id = df_model_mappings.set_index(annotation_type)[mapping_key]
model_search_mapping = gene_to_query_id.to_dict()
print(f"Number of model genes associated with query: {len(model_search_mapping)}")
print(f"Number of unique IDs to query: {len(query_ids)}")
df_model_mappings[[annotation_type, mapping_key]].drop_duplicates()

### Run queries
#### Set universal query parameters

In [None]:
# Results of each query are collected here, keyed by a label for the query
all_query_results = {}
miriam_query_fields = get_query_fields_UniProt(miriam_only=True)
query_fields = miriam_query_fields + [
    # Add additional non-miriam fields if desired
    # Complex composition
    "cc_subunit",
    # Specific isoforms to include/avoid
    "cc_tissue_specificity",
    "cc_subcellular_location",
    # Chromosome
    "xref_proteomes",
]

# Extract all relevant information for now and save
query_parameters = {
    "query": " && ".join(
        [
            "(reviewed:true)",
            "(organism_id:10090)",  # Mus musculus (Mouse)
        ]
    ),
    "format": "tsv",
    "size": 500,
    "compressed": True,
    "fields": ",".join(query_fields),
}

#### Initial query

In [None]:
query_key = "initial"
# Query UniProtKB with the model IDs, also requesting the IDs that failed
query_options = dict(
    query_parameters=query_parameters,
    to_db="UniProtKB",
    from_db=from_db,
    return_failed=True,
)
df_results, uniparc, failed_ids, obselete_counts = query_UniProt(
    list(query_ids), **query_options
)
if failed_ids:
    print(failed_ids)
all_query_results[query_key] = df_results
df_results

### Address failed IDs

In [None]:
# retry_ids = {}
# query_key = "retry_1"
# df_results, failed_ids = query_UniProt(
#     list(sorted(retry_ids.values())),
#     from_db="UniProtKB",
#     query_parameters=query_parameters
# )
# if failed_ids:
#     print(failed_ids)
# all_query_results[query_key] = df_results
# model_search_mapping.update({
#     k: retry_ids[v] for k, v in model_search_mapping.items()
#     if v in retry_ids and v not in failed_ids
# })
# df_results

## Concat, cleanup, and save query results

In [None]:
print(f"Number of unique queries: {len(all_query_results)}")
# Stack all query results, index by the queried ID, and mark empty strings as missing
df_query_results = (
    pd.concat(tuple(all_query_results.values()))
    .set_index("From")
    .drop_duplicates()
    .replace("", pd.NA)
)
df_query_results

#### Save extracted data to database

In [None]:
# Save query results to external database
df_database = df_query_results.reset_index(drop=True).drop_duplicates()
if compare:
    compare_on_index = ["Entry"]
    try:
        df_previous = pd.read_csv(
            database_dirpath / f"{db_tag}_{GEM_NAME}.tsv",
            sep="\t",
            index_col=None,
            dtype=str,
        )
        df_previous = df_previous.replace(float("nan"), pd.NA)
    except FileNotFoundError:
        df_previous = pd.DataFrame([], columns=compare_on_index)
    df_comparision = compare_tables(
        df_previous.set_index(compare_on_index), df_database.set_index(compare_on_index)
    )

    fig, ax = plt.subplots(1, 1, figsize=compare_figsize)
    ax.yaxis.set_tick_params(labelsize=8)
    ax = visualize_comparison(df_comparision)


if overwrite:
    df_database.to_csv(
        database_dirpath / f"{db_tag}_{GEM_NAME}.tsv", sep="\t", index=False
    )

df_database

## Format UniProt information for annotation files
### Genes
#### Map to chosen MIRIAMs
As formatting may be needed for some MIRIAMS, keep it simple for now until formatting methods are developed. 

In [None]:
annotation_type = "genes"

# Keeping it simple for now, group items regardless of isoforms for the time being
uniprot_miriam_mapping = get_label_miriam_mapping_UniProt(
    get_query_fields_UniProt(miriam_only=True)
)
uniprot_miriam_mapping["Proteomes"] = "chromosome"

# Invert the label -> key mapping to find the result column that holds the
# values of `mapping_key`, then join the model mappings to the query results on it
merge_key = {
    v: k for k, v in uniprot_miriam_mapping.items() if v in df_model_mappings.columns
}[mapping_key]
df_annotations = df_model_mappings.set_index(mapping_key).merge(
    df_database, left_index=True, right_on=merge_key, how="inner"
)
df_annotations = (
    df_annotations.set_index(annotation_type)
    .loc[:, list(uniprot_miriam_mapping)]
    .rename(uniprot_miriam_mapping, axis=1)
)
uniprot_columns = ["uniprot", "uniprot.isoform", "uniprot.chain"]
# For the most part, these columns do not require any reformatting or are easy to work with.
annotation_columns = [
    "hgnc.symbol",
    "ec-code",
    "taxonomy",
    "uniparc",
    # Reactions
    "rhea",
    # Gene Ontology (GO)
    "go",
    # Sequence
    "ccds",
    "ena.embl",
    "refseq",
    # 3D Structure
    "bmrb",
    "pdb",
    "sasbdb",
    "smr",
    # Protein-protein interaction
    "biogrid",
    "complexportal",
    "dip",
    "intact",
    # Chemistry databases
    "chembl.target",
    "drugbank",
    "iuphar.receptor",
    # Protein family/group databases
    "cazy",
    "ideal",
    "merops",
    "peroxibase",
    "tcdb",
    # Genetic variation/Polymorphism and mutation databases
    "dbsnp",
    # Proteomic databases
    "proteomicsdb.protein",
    # Genome annotation databases
    "ensembl",
    "ncbigene",
    ## Organism-specific
    "kegg.genes",
    "genecards",
    "hgnc",
    "hpa",
    "mim",
    "nextprot",
    "orphanet",
    "pharmgkb.gene",
    # Phylogenomic databases
    "eggnog",
    "genetree",
    "hogenom",
    "oma.grp",
    "orthodb",
    "treefam",
    # Enzyme and pathway databases
    "biocyc",
    "brenda",
    "reactome",
    # Miscellaneous databases
    "genewiki",
    # Gene expression databases
    "bgee.gene",
    ## Family and domain databases
    "cdd",
    "disprot",
    "hamap",
    "interpro",
    "panther.family",
    "pfam",
    "pirsf",
    "prints",
    "prosite",
    "smart",
    "supfam",
    "chromosome",
]
# Keep only the last space-separated token of the "Proteomes" value. The
# `.str` accessor propagates missing values, whereas calling `.split` on each
# element raises AttributeError for entries without a proteome.
df_annotations["chromosome"] = df_annotations["chromosome"].str.split(" ").str[-1]
df_annotations = df_annotations.loc[:, uniprot_columns + annotation_columns].rename(
    {
        "hgnc.symbol": "mgnc.symbol",  # Rename for mouse
    },
    axis=1,
)
annotation_columns[annotation_columns.index("hgnc.symbol")] = "mgnc.symbol"
print(f"Fields searched: {df_annotations.shape[1]}")
# Columns without a single value are dropped from the table and the column list
all_na = df_annotations.columns[df_annotations.isna().all(axis=0)]
annotation_columns = [x for x in annotation_columns if x not in all_na]
df_annotations = df_annotations.dropna(how="all", axis=1)
print(f"Empty dropped: {len(all_na)}")
print(f"Remaining: {df_annotations.shape[1]}")
df_annotations = df_annotations.reset_index(drop=False).replace(pd.NA, "")
df_annotations

In [None]:
isoform_input = df_annotations[["uniprot", "uniprot.isoform"]].copy()
df_isoforms = parse_isoforms_UniProt(isoform_input, add_canonical=True)

# For rows flagged as canonical: the isoform ID when present, otherwise the accession
is_canonical = df_isoforms["uniprot.canonical"].apply(bool)
df_canonical = (
    df_isoforms[is_canonical]
    .set_index("uniprot")
    .apply(lambda entry: entry["uniprot.isoform"] or entry.name, axis=1)
)
# Join the isoform IDs of each accession into a single string
df_isoforms = df_isoforms.groupby("uniprot")[["uniprot.isoform"]].agg(
    lambda values: build_string(values)
)

chain_input = df_annotations[["uniprot", "uniprot.chain"]].copy()
df_chains = parse_chains_UniProt(chain_input)
df_isoforms_chains = df_chains.merge(df_isoforms, right_index=True, left_on="uniprot")
df_isoforms_chains

In [None]:
def _select_isoform_value(entry, isoform_id):
    """Return the isoform-specific part of `entry`, or `entry` itself when that part is blank."""
    isoform_value = get_isoform_value_from_entry_UniProt(entry, isoform_id)
    return isoform_value if isoform_value.strip() else entry


def _strip_prefix(value, prefix):
    """Remove `prefix` from the start of `value` when present.

    Unlike ``str.lstrip``, which removes any leading characters found in its
    argument, only the exact prefix is removed.
    """
    return value[len(prefix) :] if value.startswith(prefix) else value


# Overwrite the raw isoform/chain columns with their parsed versions
for col, series in df_isoforms_chains.items():
    df_annotations[col] = series

for idx, row in df_annotations.loc[:, annotation_columns].iterrows():
    uniprot_id, isoform_id = df_isoforms_chains.loc[idx, ["uniprot", "uniprot.isoform"]]
    if isoform_id and len(isoform_id.split(";")) != 1:
        isoform_id = None
    # No isoform ID set, just aggregate all without regards to isoform.
    row = row.apply(lambda x: _select_isoform_value(x, isoform_id))
    row = row.apply(lambda x: x.strip().rstrip(";"))
    # A duplicate reindexing error here may mean duplicate columns in annotation column values
    df_annotations.loc[idx, annotation_columns] = row.values
# Clean up other annotations: drop the "<KEY>:" prefix from each identifier
keys = ["rhea", "go", "hgnc"]
for key in keys:
    if key in df_annotations.columns:
        prefix = f"{key.upper()}:"
        df_annotations[key] = (
            df_annotations[key]
            .fillna("")
            .apply(
                lambda x: build_string(
                    [_strip_prefix(s, prefix) for s in split_string(x)]
                )
            )
        )


df_annotations = df_annotations.replace(float("nan"), pd.NA).replace("", pd.NA)
annotation_filepath = database_dirpath / f"{annotation_type}_{db_tag}.tsv"
if compare:
    # Compare against the previously saved table; an empty table is used when
    # no previous file exists
    compare_on_index = [annotation_type]
    try:
        df_previous = pd.read_csv(
            annotation_filepath,
            sep="\t",
            index_col=None,
            dtype=str,
        )
        df_previous = df_previous.replace(float("nan"), pd.NA).replace("", pd.NA)
    except FileNotFoundError:
        df_previous = pd.DataFrame([], columns=compare_on_index)
    df_comparision = compare_tables(
        df_previous.set_index(compare_on_index),
        df_annotations.set_index(compare_on_index),
    )

    fig, ax = plt.subplots(1, 1, figsize=compare_figsize)
    ax.yaxis.set_tick_params(labelsize=8)
    ax = visualize_comparison(df_comparision)

if display_nunique:
    for col in df_annotations.columns:
        # Series.explode() takes no column argument; its only parameter is
        # `ignore_index`, so the column label must not be passed here.
        unique_values = (
            df_annotations[col].apply(split_string).explode().drop_duplicates()
        )
        print(f"{col}: {unique_values.nunique()}")
if overwrite:
    df_annotations.to_csv(annotation_filepath, sep="\t", index=False)

df_annotations

## Load Isoforms and Sequences
### Isoforms
#### Parse data into initial table of isoforms

In [None]:
# Keyword lists (matched as case-insensitive regular expressions) used to
# flag each isoform as "erythroid", "backup", or "avoid"
erythro_keywords = [
    "erythro",
    "erythrocyte",
    "erythroid",
    "red blood cell",
    "rbc",
    "R-type",
    "P5N-I",
    "reticulocyte",
]
backup_keywords = ["cyto", "retic", "cell membrane"]
avoid_keywords = ["non-erythro", "mito", "not detected", "synaptic", "testis"]

# UniProt result column label -> column name used in this cell
rename_mapping = {
    "Entry": "uniprot",
    "Gene Names (primary)": "hgnc.symbol",
    "Organism (ID)": "taxonomy",
    "Alternative products (isoforms)": "uniprot.isoform",
    "Tissue specificity": "tissue_specificity",
    "Subcellular location [CC]": "subcellular_location",
}
# Free-text columns searched for isoform-specific statements
columns_to_search = ["tissue_specificity", "subcellular_location"]

df_tissue_specificity = (
    df_query_results.loc[:, list(rename_mapping)].rename(rename_mapping, axis=1).copy()
)
df_isoforms = parse_isoforms_UniProt(
    df_tissue_specificity.loc[:, ["uniprot", "uniprot.isoform"]].copy(),
    add_canonical=True,
)
# Attach the free-text columns to each isoform row, matching the "uniprot"
# column against the index of the query results
df_isoforms = df_isoforms.merge(
    df_tissue_specificity[columns_to_search],
    left_on="uniprot",
    right_index=True,
    how="left",
)

# Placeholder columns for the flags and the keywords that triggered them
df_isoforms["erythroid"] = pd.NA
df_isoforms["backup"] = pd.NA
df_isoforms["avoid"] = pd.NA
df_isoforms["keywords.erythroid"] = pd.NA
df_isoforms["keywords.backup"] = pd.NA
df_isoforms["keywords.avoid"] = pd.NA
# NOTE(review): `all_names` and `all_synonyms` are not used again in this cell -- confirm whether they are still needed
all_names = sorted(
    df_isoforms["uniprot.isoform.name"].replace("", pd.NA).dropna().unique()
)
all_synonyms = sorted(
    df_isoforms["uniprot.isoform.synonyms"].replace("", pd.NA).dropna().unique()
)
# Pass 1: search the free-text columns for "[Isoform <name>]" statements and
# record the keywords found in statements that refer to the row's own isoform
for col_to_search in columns_to_search:
    for idx, value_string in df_isoforms[col_to_search].dropna().items():
        # Skip text without any isoform-specific statement
        if not re.search(r"\[Isoform (.+?(?=\]))", value_string):
            continue
        for isoform_entry in value_string.split(";"):
            match = re.search(r"\[Isoform (.+?(?=\]))", isoform_entry)
            if not match:
                continue

            # Only use statements naming this row's isoform (by name or by synonyms value)
            isoform_name_or_synonym = match.group(1)
            if not (
                df_isoforms.loc[idx, "uniprot.isoform.name"] == isoform_name_or_synonym
                or df_isoforms.loc[idx, "uniprot.isoform.synonyms"]
                == isoform_name_or_synonym
            ):
                continue

            for col, keywords in zip(
                ["erythroid", "backup", "avoid"],
                [erythro_keywords, backup_keywords, avoid_keywords],
            ):
                found_keywords = set()
                for k in keywords:
                    found_keywords.update(
                        re.findall(k, isoform_entry.strip(), re.IGNORECASE)
                    )
                # Merge with keywords already stored for this row
                # NOTE(review): `fillna("")` is applied to the whole table on every iteration
                if df_isoforms.fillna("").loc[idx, f"keywords.{col}"]:
                    found_keywords.update(
                        split_string(df_isoforms.loc[idx, f"keywords.{col}"])
                    )
                df_isoforms.loc[idx, f"keywords.{col}"] = build_string(found_keywords)

# Pass 2: also search the isoform name and synonyms themselves for keywords
for col, keywords in zip(
    ["erythroid", "backup", "avoid"],
    [erythro_keywords, backup_keywords, avoid_keywords],
):
    # Work with sets of keywords while collecting; non-string (missing) values become empty sets
    df_isoforms[f"keywords.{col}"] = df_isoforms[f"keywords.{col}"].apply(
        lambda x: set(split_string(x)) if isinstance(x, str) else set()
    )
    for k in keywords:
        df_isoforms[f"keywords.{col}"] = (
            df_isoforms[[f"keywords.{col}", "uniprot.isoform.name"]]
            .fillna("")
            .apply(
                lambda x: x[f"keywords.{col}"].union(
                    set(re.findall(k, x["uniprot.isoform.name"], re.IGNORECASE))
                ),
                axis=1,
            )
        )
        df_isoforms[f"keywords.{col}"] = (
            df_isoforms[[f"keywords.{col}", "uniprot.isoform.synonyms"]]
            .fillna("")
            .apply(
                lambda x: x[f"keywords.{col}"].union(
                    set(re.findall(k, x["uniprot.isoform.synonyms"], re.IGNORECASE))
                ),
                axis=1,
            )
        )
    # Back to a single string per row, skipping empty matches
    df_isoforms[f"keywords.{col}"] = df_isoforms[f"keywords.{col}"].apply(
        lambda x: build_string([s for s in x if s])
    )
# A flag is set when at least one keyword of that category was found
df_isoforms = df_isoforms.replace(float("nan"), pd.NA).replace("", pd.NA)
df_isoforms["erythroid"] = df_isoforms["keywords.erythroid"].notna()
df_isoforms["backup"] = df_isoforms["keywords.backup"].notna()
df_isoforms["avoid"] = df_isoforms["keywords.avoid"].notna()
# Remove those found in both categories from "erythroid", usually caused by words like `non-erythro`
df_isoforms.loc[
    df_isoforms[df_isoforms[["erythroid", "avoid"]].all(axis=1)].index,
    "erythroid",
] = False
# Erythroid easily serves as a backup option
df_isoforms.loc[
    df_isoforms[df_isoforms[["erythroid"]].all(axis=1)].index,
    "backup",
] = True

df_isoforms["canonical"] = df_isoforms["uniprot.canonical"]
# Sequence ID: the isoform ID when present, otherwise the UniProt accession
df_isoforms["sequence.id"] = df_isoforms.fillna("").apply(
    lambda x: x["uniprot.isoform"] if x["uniprot.isoform"] else x["uniprot"], axis=1
)
df_isoforms = df_isoforms.replace(float("nan"), pd.NA).replace("", pd.NA)
# Keep the identifier, flag, and keyword columns only
df_isoforms = (
    df_isoforms.loc[
        :,
        [
            "uniprot",
            "uniprot.isoform",
            "sequence.id",
            "canonical",
            "erythroid",
            "backup",
            "avoid",
            "keywords.erythroid",
            "keywords.backup",
            "keywords.avoid",
        ],
    ]
    .drop_duplicates()
    .reset_index(drop=True)
)

### Sequences
#### Extract sequences using UniParc Mapping

In [None]:
# Keep the rows whose sequence ID starts with their UniProt accession
sequence_id_matches_entry = df_isoforms.apply(
    lambda x: x["sequence.id"].startswith(x["uniprot"]), axis=1
)
df_isoforms_sequences = df_isoforms[sequence_id_matches_entry].copy()
query_ids = df_isoforms_sequences["sequence.id"].unique()

sequence_fields = ["sequence", "length"]
query_parameters = {
    "format": "tsv",
    "size": 500,
    "compressed": True,
    "fields": ",".join(sequence_fields),
}

query_key = "initial"
query_options = dict(
    query_parameters=query_parameters,
    from_db="UniProtKB_AC-ID",
    to_db="UniParc",
    return_failed=True,
)
df_results, uniparc, failed_ids, obselete_counts = query_UniProt(
    list(query_ids), **query_options
)

if failed_ids:
    print(failed_ids)

# Relabel the result columns and attach them to the isoforms by sequence ID
sequence_column_names = {
    "Sequence": "sequence",
    "Length": "sequence.length",
    "mass": "sequence.mass",
}
df_sequences = df_results.set_index("From").rename(columns=sequence_column_names)
df_isoforms_sequences = df_isoforms_sequences.merge(
    df_sequences, left_on="sequence.id", right_index=True
)

# #  The following code can be used to address entries with issues in extractions. LDHB example.
# df_isoforms_sequences = pd.concat(
#     (
#         df_isoforms_sequences,
#         pd.DataFrame.from_dict(
#             {
#                 "uniprot": "P07195",
#                 "sequence.id": "P07195",
#                 "canonical": True,
#                 "erythroid": False,
#                 "backup": False,
#                 "avoid": False,
#                 "sequence": "MATLKEKLIAPVAEEEATVPNNKITVVGVGQVGMACAISILGKSLADELALVDVLEDKLKGEMMDLQHGSLFLQTPKIVADKDYSVTANSKIVVVTAGVRQQEGESRLNLVQRNVNVFKFIIPQIVKYSPDCIIIVVSNPVDILTYVTWKLSGLPKHRVIGSGCNLDSARFRYLMAEKLGIHPSSCHGWILGEHGDSSVAVWSGVNVAGVSLQELNPEMGTDNDSENWKEVHKMVVESAYEVIKLKGYTNWAIGLSVADLIESMLKNLSRIHPVSTMVKGMYGIENEVFLSLPCILNARGLTSVINQKLKDDEVAQLKKSADTLWDIQKDLKDL",
#                 "sequence.length": "334",
#             },
#             orient="index",
#         ).T,
#     ),
#     axis=0,
# ).convert_dtypes()

df_isoforms_sequences = df_isoforms_sequences.reset_index(drop=True)

if overwrite:
    sequences_filepath = database_dirpath / f"{UNIPROT_DB_TAG}_isoforms_sequences.tsv"
    df_isoforms_sequences.to_csv(sequences_filepath, sep="\t", index=False)

df_isoforms_sequences

In [None]:
# Display the model gene mappings joined to the isoforms flagged as erythroid
df_model_mappings.merge(df_isoforms_sequences[df_isoforms_sequences["erythroid"]])

In [None]:
# Display the final mouse model
mouse_model