# Extract annotations and data from OMIM

The purpose of this notebook is to extract and format OMIM data for subsequent model annotation.

Additionally, the purpose of this notebook is to extract relevant data about morbid SNPs and phenotypes from the database.

## Notebook Requirements:
*  Model genes **must** have at least one of the following annotations stored in `object.annotation`. Values are expected to be separated by semicolons. Accepted keys currently include:
    * `"ncbigene"`
    * `"hgnc.symbol"` 
    * `"ensembl"`
    * `"mim"`

## OMIM® - Online Mendelian Inheritance in Man®
To utilize this notebook: 

1. Go to [OMIM database](https://omim.org/) and submit a request to download files.
2. Follow the instructions to obtain download access for MIM files.
3. After receiving access (check email), download the following files:
    * `mim2gene.txt`: A tab-delimited file linking MIM numbers with NCBI Gene IDs, Ensembl Gene IDs, and HGNC Approved Gene Symbols.
    * `mimTitles.txt`: A tab-delimited file of MIM numbers and titles.
    * `genemap2.txt`: A tab-delimited file containing OMIM's Synopsis of the Human Gene Map including additional information such as genomic coordinates and inheritance.
    * `morbidmap.txt`: A tab-delimited file of OMIM's Synopsis of the Human Gene Map (same as `genemap2.txt` above) sorted alphabetically by disorder.
4. Remember to clear out any personal account information and ensure the downloaded OMIM file(s) remain local!

### Citations
Hamosh A, Scott AF, Amberger JS, Bocchini CA, McKusick VA. Online Mendelian Inheritance in Man (OMIM), a knowledgebase of human genes and genetic disorders. Nucleic Acids Res. 2005 Jan 1;33(Database issue):D514-7. doi: 10.1093/nar/gki033. PMID: 15608251; PMCID: PMC539987.


### Import packages

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from rbc_gem_utils import (
    GEM_NAME,
    build_string,
    compare_tables,
    get_annotation_df,
    get_dirpath,
    read_cobra_model,
    show_versions,
    split_string,
    visualize_comparison,
)
from rbc_gem_utils.database.mim import (
    MIM_DB_TAG,
    MIM_NUMBER_RE,
    get_last_updated_dates_MIM,
    load_data_MIM,
)

# Report the environment used for this run.
# NOTE(review): `show_versions` comes from `rbc_gem_utils`; presumably it prints
# the versions of the relevant packages — confirm against the helper.
show_versions()

## Set notebook options

In [None]:
# Database tag; used below to build the database directory and output file names
db_tag = MIM_DB_TAG

# Annotation key used to link model genes to MIM entries.
# Best mapping keys are hgnc.symbol or mim
mapping_key = "hgnc.symbol"

# Comparison against the previously saved annotation file
compare = True
compare_figsize = (5, 5)

# Reporting switches
display_nunique = True
print_footer_notes = False

# Write (and replace) the output files produced by this notebook
overwrite = True

## Check OMIM version
If the version does not match the expected version, it is because the database has been updated since the last time this code was utilized.
* Major updates appear to be monthly. See last update [here](https://omim.org/statistics/update).
* Data files are updated nightly.

In [None]:
# Set to True to work in the "interim" directories instead of the standard ones
use_interim = False
temp_kind = "interim" if use_interim else None

database_dirpath = get_dirpath("database", db_tag, use_temp=temp_kind)
annotation_dirpath = get_dirpath("annotation", use_temp=temp_kind)

# Create any missing directories, including parents
for dirpath in (database_dirpath, annotation_dirpath):
    dirpath.mkdir(exist_ok=True, parents=True)

In [None]:
# Look up the "last updated" information for the MIM files in `database_dirpath`.
# NOTE(review): the structure returned by `get_last_updated_dates_MIM` is not
# visible here, and `last_updated` is not read again in the cells shown — confirm
# whether the call is made for its printed output or the value is needed later.
last_updated = get_last_updated_dates_MIM(database_dirpath)

## Load RBC-GEM model

In [None]:
# The model file is the XML named after GEM_NAME inside the "model" directory
model_dirpath = get_dirpath("model")
model_filepath = model_dirpath / f"{GEM_NAME}.xml"
model = read_cobra_model(filename=model_filepath)
model

### Determine mapping method
#### Load model annotations of relevance

In [None]:
annotation_type = "genes"
annotation_cols = ["ncbigene", "hgnc.symbol", "ensembl", "mim"]

# Annotation table for the model genes; genes without the mapping key are dropped
df_model_mappings = get_annotation_df(model.genes, annotation_cols)
df_model_mappings = df_model_mappings.rename(columns={"id": annotation_type})
df_model_mappings = df_model_mappings.dropna(subset=[mapping_key])

# Split every column with `split_string` and expand the pieces so that each row
# carries a single value per column
for col in df_model_mappings.columns:
    df_model_mappings[col] = df_model_mappings[col].apply(split_string)
    df_model_mappings = df_model_mappings.explode(col).drop_duplicates()

df_model_mappings = df_model_mappings.sort_values(annotation_type).reset_index(
    drop=True
)
# Number of distinct values per column
print(df_model_mappings.nunique(dropna=True))
df_model_mappings

#### Get query IDs

In [None]:
print(f"Mapping key: {mapping_key}")

# Distinct, non-missing identifiers to look up in the MIM tables
query_ids = df_model_mappings[mapping_key].dropna().unique()
n_query_ids = len(query_ids)
assert len(set(query_ids)) == n_query_ids, "Duplicate IDs in list to query"

# Lookup from model gene to its mapping-key value.
# NOTE(review): a dict keeps one value per key, so a gene listed with several
# mapping-key values would retain only one of them here — confirm this is intended.
gene_to_key = df_model_mappings.set_index(annotation_type)[mapping_key]
model_search_mapping = gene_to_key.to_dict()

print(f"Number of model genes associated with query: {len(model_search_mapping)}")
print(f"Number of unique IDs to query: {n_query_ids}")
df_model_mappings[[annotation_type, mapping_key]].drop_duplicates()

## Load MIM data
### MIM Numbers and types

In [None]:
# Column headers of the `mim2gene` table mapped to the annotation keys used here
mim2gene_columns = {
    "# MIM Number": "mim",
    "Entrez Gene ID (NCBI)": "ncbigene",
    "Approved Gene Symbol (HGNC)": "hgnc.symbol",
    "Ensembl Gene ID (Ensembl)": "ensembl",
    "MIM Entry Type (see FAQ 1.3 at https://omim.org/help/faq)": "type",
}

# Missing values become empty strings before the columns are renamed
df_mim = load_data_MIM("mim2gene", print_footer_notes=print_footer_notes)
df_mim = df_mim.fillna("").rename(columns=mim2gene_columns)
df_mim.head()

### Morbid Map

In [None]:
df_morbidmap = load_data_MIM("morbidmap", print_footer_notes=print_footer_notes)
df_morbidmap = df_morbidmap.fillna("")

# Keep only the first entry of the comma-separated symbol list
symbol_col = "Gene/Locus And Other Related Symbols"
df_morbidmap[symbol_col] = df_morbidmap[symbol_col].apply(
    lambda symbols: symbols.split(", ")[0]
)

# Columns to keep, mapped to the names used in this notebook
rename_mapping = {
    "MIM Number": "mim",
    "# Phenotype": "phenotype",
    "Cyto Location": "cyto.location",
    "Gene/Locus And Other Related Symbols": "morbid.symbol",
}
df_morbidmap = df_morbidmap[list(rename_mapping)].rename(columns=rename_mapping)

# Collect every MIM number matched in the space-separated tokens of the
# phenotype text; each token is searched once
df_morbidmap["mim.phenotype"] = df_morbidmap["phenotype"].apply(
    lambda text: build_string(
        [
            match.group()
            for match in map(MIM_NUMBER_RE.search, split_string(text, sep=" "))
            if match
        ]
    )
)
df_morbidmap.head()

### Gene Map

In [None]:
# Load OMIM's gene map; missing values become empty strings so the string
# operations below can be applied to every row.
df_genemap = load_data_MIM("genemap2", print_footer_notes=print_footer_notes).fillna("")

# Mouse gene IDs: take the last space-separated token of "Mouse Gene Symbol/ID",
# strip the surrounding parentheses, and separate multiple IDs with semicolons.
# Any "MGI:" prefix is left in place.
df_genemap["Mouse Gene ID"] = (
    df_genemap["Mouse Gene Symbol/ID"]
    .apply(lambda x: x.split(" ")[-1].lstrip("(").rstrip(")"))
    .str.replace(",", ";")
)

# Columns to keep, mapped to the annotation keys used throughout this notebook.
# The Ensembl key must be spelled "ensembl": the misspelled "ensemble" left a
# stray column in the merged table and made `df_genemap[mapping_key]` below
# raise a KeyError when `mapping_key = "ensembl"`.
rename_mapping = {
    "# Chromosome": "chromosome",
    "Cyto Location": "cyto.location",
    "MIM Number": "mim",
    "Approved Gene Symbol": "hgnc.symbol",
    "Entrez Gene ID": "ncbigene",
    "Ensembl Gene ID": "ensembl",
    "Phenotypes": "phenotype",
    "Mouse Gene ID": "mgi",
}
df_genemap = df_genemap.loc[:, list(rename_mapping)].rename(rename_mapping, axis=1)

# Collect every MIM number matched in the space-separated tokens of the
# phenotype text; each token is searched once.
df_genemap["mim.phenotype"] = df_genemap["phenotype"].apply(
    lambda phenotype: build_string(
        [
            match.group()
            for match in map(MIM_NUMBER_RE.search, split_string(phenotype, sep=" "))
            if match
        ]
    )
)

# Restrict the gene map to entries whose mapping key belongs to a model gene
df_genemap = df_genemap[
    df_genemap[mapping_key].isin(list(model_search_mapping.values()))
].reset_index(drop=True)
df_genemap

### Titles

In [None]:
df_mimtitles = load_data_MIM("mimTitles", print_footer_notes=print_footer_notes)
df_mimtitles = df_mimtitles.fillna("")

# Columns to keep, mapped to the names used in this notebook
rename_mapping = {
    "MIM Number": "mim",
    "Preferred Title; symbol": "title",
}
df_mimtitles = df_mimtitles[list(rename_mapping)].rename(columns=rename_mapping)

# Split "Preferred Title; symbol" into its parts: the first part is the title,
# and the last part is the symbol whenever more than one part is present
title_parts = df_mimtitles["title"].apply(split_string)
df_mimtitles["title.symbol"] = title_parts.apply(
    lambda parts: parts[-1] if len(parts) > 1 else ""
)
df_mimtitles["title"] = title_parts.apply(lambda parts: parts[0])

df_mimtitles.head()

## Map to model

In [None]:
# Build a table linking model genes to their MIM entries, phenotypes and titles.
#
# NOTE(review): the following cell recomputes `df_model_mim` and `df_removed_mim`
# from scratch with nearly the same steps, so the values assigned here are
# overwritten. Unlike that cell, this one neither converts empty strings to NaN
# nor fills NaN before the final groupby, and it does not write a file. With
# pandas' default `groupby(dropna=True)`, rows whose `mim.phenotype` is NaN are
# presumably excluded from the result here — confirm whether this cell is still needed.

# Left-join the MIM tables onto the model genes: entry types via the mapping key,
# then morbid map, gene map and titles via the MIM number. Clashing column names
# from the right-hand table receive a "_drop" suffix.
df_model_mim = (
    pd.merge(
        df_model_mappings[[annotation_type, mapping_key]].drop_duplicates(),
        df_mim,
        left_on=mapping_key,
        right_on=mapping_key,
        how="left",
        suffixes=("", "_drop"),
    )
    .merge(
        df_morbidmap, left_on="mim", right_on="mim", how="left", suffixes=("", "_drop")
    )
    .merge(
        df_genemap, left_on="mim", right_on="mim", how="left", suffixes=("", "_drop")
    )
    .merge(
        df_mimtitles, left_on="mim", right_on="mim", how="left", suffixes=("", "_drop")
    )
)
# Set aside entries whose type is "moved/removed" and continue with the rest
df_removed_mim = df_model_mim[df_model_mim["type"] == "moved/removed"]
df_model_mim = df_model_mim[df_model_mim["type"] != "moved/removed"].reset_index(
    drop=True
)
# Discard the duplicated right-hand columns created by the merges
df_model_mim = df_model_mim.drop(
    [c for c in df_model_mim.columns if c.endswith("_drop")], axis=1
).drop_duplicates()

# Combine the gene MIM number and the phenotype MIM number(s) of each row
df_model_mim["mim.all"] = df_model_mim[["mim", "mim.phenotype"]].apply(
    lambda x: build_string(sorted(set(x.dropna().unique()))), axis=1
)
# Keep only entries whose type starts with "gene"
df_model_mim = df_model_mim[df_model_mim["type"].str.find("gene") == 0]
df_model_mim = df_model_mim.sort_values(annotation_type).reset_index(drop=True)

# Titles of the phenotype MIM entries, renamed so they can be joined on "mim.phenotype"
df = df_mimtitles[
    df_mimtitles["mim"].isin(df_model_mim["mim.phenotype"].dropna())
].rename(
    {
        "mim": "mim.phenotype",
        "title": "mim.phenotype.title",
        "title.symbol": "mim.phenotype.title.symbol",
    },
    axis=1,
)

# Attach the phenotype titles and collapse to one row per gene/MIM/phenotype,
# combining the unique non-missing values of the remaining columns
df_model_mim = (
    df_model_mim.merge(
        df,
        left_on="mim.phenotype",
        right_on="mim.phenotype",
        how="left",
    )
    .groupby(["genes", "mim", "mim.phenotype"], as_index=False)
    .agg(lambda x: build_string(x.dropna().unique()))
)
df_model_mim

In [None]:
# Model genes paired with their mapping-key values, one row per unique pair
model_keys = df_model_mappings[[annotation_type, mapping_key]].drop_duplicates()

# Left-join the MIM tables: entry types via the mapping key first, then the
# morbid map, gene map and titles via the MIM number. Clashing column names from
# the right-hand table receive a "_drop" suffix so they can be discarded below.
df_model_mim = pd.merge(
    model_keys,
    df_mim,
    left_on=mapping_key,
    right_on=mapping_key,
    how="left",
    suffixes=("", "_drop"),
)
for mim_table in (df_morbidmap, df_genemap, df_mimtitles):
    df_model_mim = df_model_mim.merge(
        mim_table, left_on="mim", right_on="mim", how="left", suffixes=("", "_drop")
    )

# Set aside entries whose type is "moved/removed" and continue with the rest
df_removed_mim = df_model_mim[df_model_mim["type"] == "moved/removed"]
df_model_mim = df_model_mim[df_model_mim["type"] != "moved/removed"]
df_model_mim = df_model_mim.reset_index(drop=True)

# Discard the duplicated right-hand columns created by the merges
drop_cols = [c for c in df_model_mim.columns if c.endswith("_drop")]
df_model_mim = df_model_mim.drop(columns=drop_cols).drop_duplicates()

# Treat empty strings as missing values
df_model_mim = df_model_mim.replace("", float("nan"))

# Combine the gene MIM number and the phenotype MIM number(s) of each row
df_model_mim["mim.all"] = df_model_mim[["mim", "mim.phenotype"]].apply(
    lambda row: build_string(sorted(set(row.dropna().unique()))), axis=1
)

# Keep only entries whose type starts with "gene"
is_gene_entry = df_model_mim["type"].str.find("gene") == 0
df_model_mim = df_model_mim[is_gene_entry]
df_model_mim = df_model_mim.sort_values(annotation_type).reset_index(drop=True)

# Titles of the phenotype MIM entries, renamed so they can be joined on "mim.phenotype"
phenotype_mims = df_model_mim["mim.phenotype"].dropna()
phenotype_titles = df_mimtitles[df_mimtitles["mim"].isin(phenotype_mims)].rename(
    columns={
        "mim": "mim.phenotype",
        "title": "mim.phenotype.title",
        "title.symbol": "mim.phenotype.title.symbol",
    }
)

# Attach the phenotype titles, fill missing values with empty strings, and
# collapse to one row per gene/MIM/phenotype, combining the unique values of
# the remaining columns
df_model_mim = df_model_mim.merge(
    phenotype_titles,
    left_on="mim.phenotype",
    right_on="mim.phenotype",
    how="left",
)
df_model_mim = (
    df_model_mim.fillna("")
    .groupby(["genes", "mim", "mim.phenotype"], as_index=False)
    .agg(lambda x: build_string(x.unique()))
)

if overwrite:
    output_filepath = database_dirpath / f"{db_tag}_{GEM_NAME}.tsv"
    df_model_mim.to_csv(output_filepath, sep="\t", index=False)

df_model_mim

In [None]:
# Select the annotation columns to export; "mim.all" (gene plus phenotype MIM
# numbers) is exported under the "mim" key.
df_annotations = df_model_mim.loc[
    :, ["genes", "mim.all", "ncbigene", "hgnc.symbol", "ensembl", "mgi"]
]
df_annotations = df_annotations.rename({"mim.all": "mim"}, axis=1)
df_annotations = df_annotations.drop_duplicates()
# Collapse to one row per gene, combining the sorted non-missing values of
# every other column with `build_string`
df_annotations = df_annotations.groupby("genes", as_index=False).agg(
    lambda x: build_string(sorted(x.dropna()))
)
# Represent missing and empty entries uniformly as pd.NA
df_annotations = df_annotations.replace(float("nan"), pd.NA).replace("", pd.NA)
if compare:
    compare_on_index = [annotation_type]
    try:
        # Previously saved annotations, read as strings and normalized the same way
        df_previous = pd.read_csv(
            annotation_dirpath / f"{annotation_type}_{db_tag}.tsv",
            sep="\t",
            index_col=None,
            dtype=str,
        )
        df_previous = df_previous.replace(float("nan"), pd.NA).replace("", pd.NA)
    except FileNotFoundError:
        # No earlier file: compare against an empty table
        df_previous = pd.DataFrame([], columns=compare_on_index)
    df_comparision = compare_tables(
        df_previous.set_index(compare_on_index),
        df_annotations.set_index(compare_on_index),
    )

    fig, ax = plt.subplots(1, 1, figsize=compare_figsize)
    ax.yaxis.set_tick_params(labelsize=8)
    # NOTE(review): the axes created above is not passed to `visualize_comparison`;
    # presumably the helper draws on the current axes — confirm its signature.
    ax = visualize_comparison(df_comparision)

if display_nunique:
    # Count the distinct individual values held in each annotation column
    for col in df_annotations.columns:
        # `Series.explode` takes no column name; its only parameter is
        # `ignore_index`, which the column name used to be passed into.
        unique_values = (
            df_annotations[col].apply(split_string).explode().drop_duplicates()
        )
        print(f"{col}: {unique_values.nunique()}")
if overwrite:
    df_annotations.to_csv(
        annotation_dirpath / f"{annotation_type}_{db_tag}.tsv", sep="\t", index=False
    )

df_annotations

In [None]:
# One row per (gene, MIM number) pair taken from the exported annotations
df_model_mim = df_annotations[["genes", "mim"]].copy()
df_model_mim["mim"] = df_model_mim["mim"].apply(split_string)
df_model_mim = df_model_mim.explode("mim").drop_duplicates()

# Attach the entry type and the title of each MIM number. "mim" is the only
# column the tables share, so it is named explicitly as the join key.
df_model_mim = df_model_mim.merge(df_mim[["mim", "type"]], on="mim")
df_model_mim = (
    df_model_mim.merge(df_mimtitles, on="mim").drop_duplicates().reset_index(drop=True)
)

# Rows repeating a (mim, type) pair that already appeared earlier in the table
df_duplicated = df_model_mim[df_model_mim[["mim", "type"]].duplicated()].reset_index(
    drop=True
)
# Number of distinct MIM entries per entry type
print(df_model_mim[["mim", "type"]].drop_duplicates()["type"].value_counts())
print("Duplicated:\t", df_duplicated["mim"].count())
df_model_mim

In [None]:
# Columns reported for each gene/phenotype association
morbidmap_columns = [
    "genes",
    "hgnc.symbol",
    "ncbigene",
    "mim",
    "mim.phenotype",
    "phenotype",
    "cyto.location",
]

# One row per (gene, MIM number): split the semicolon-separated "mim" strings
df_model_mim = (
    df_annotations.assign(mim=df_annotations["mim"].str.split(";"))
    .explode("mim")
    .drop_duplicates()
)

# Join with the morbid map on the columns the two tables have in common
df_model_morbidmap = df_model_mim.merge(df_morbidmap)[morbidmap_columns]

if overwrite:
    # NOTE(review): unlike the other files written by this notebook, this call
    # does not pass `index=False`, so the row index is written as an extra
    # leading column — confirm whether that is intended before changing it.
    morbidmap_filepath = database_dirpath / f"{db_tag}_{GEM_NAME}_morbidmap.tsv"
    df_model_morbidmap.to_csv(morbidmap_filepath, sep="\t")

df_model_morbidmap