# Extract annotations and data from EBI Complex Portal

The purpose of this notebook is to extract and format ComplexPortal data for subsequent model annotation. 

It also extracts the complex stoichiometry data from the database that is relevant to the model.

## Notebook Requirements:
*  Model genes **must** have at least one of the following annotations stored in `object.annotation`. Multiple values are expected to be separated by semicolons. Accepted keys currently include:
    * `"complexportal"`
    * `"uniprot"`
*  Note: Requires internet connection to download information from [Complex Portal](https://www.ebi.ac.uk/complexportal/home).

### Citations
Meldal BHM, Bye-A-Jee H, Gajdoš L, Hammerová Z, Horácková A, Melicher F, Perfetto L, Pokorný D, Lopez MR, Türková A, Wong ED, Xie Z, Casanova EB, Del-Toro N, Koch M, Porras P, Hermjakob H, Orchard S. Complex Portal 2018: extended content and enhanced visualization tools for macromolecular complexes. Nucleic Acids Res. 2019 Jan 8;47(D1):D550-D558. doi: 10.1093/nar/gky1001. PMID: 30357405; PMCID: PMC6323931.

## Setup
### Import packages

In [None]:
import re
from warnings import warn

import matplotlib.pyplot as plt
import pandas as pd
from rbc_gem_utils import (
    GEM_NAME,
    build_string,
    check_database_release_online,
    compare_tables,
    explode_column,
    get_annotation_df,
    get_dirpath,
    read_cobra_model,
    show_versions,
    split_string,
    visualize_comparison,
)
from rbc_gem_utils.database.complexportal import (
    COMPLEXPORTAL_DB_TAG,
    COMPLEXPORTAL_RELEASE_EXPECTED,
    COMPLEXPORTAL_URL,
    parse_complex_participants,
)
from rbc_gem_utils.database.uniprot import UNIPROT_ISOFORM_ID_RE
from rbc_gem_utils.util import ensure_iterable

show_versions()

## Set notebook options

In [None]:
# --- Database identity (values come from the ComplexPortal helper module) ---
db_tag = COMPLEXPORTAL_DB_TAG
database_url = COMPLEXPORTAL_URL
expected_release = COMPLEXPORTAL_RELEASE_EXPECTED

# True  -> fetch the table from `database_url` and store a local copy.
# False -> read the previously stored local copy.
download_database = False

# --- Comparison with previously saved annotation tables ---
compare = True
compare_figsize = (5, 5)

# --- Reporting / output ---
# Print the number of unique values per annotation column.
display_nunique = True
# Write the generated tables to disk.
overwrite = True

## Check Complex Portal release
* According to [Complex Portal](https://www.ebi.ac.uk/complexportal/download), database is released monthly. 
* All files are available from the FTP site and are updated with new complexes at every data release.
* If the current release does not match the expected release, it is because database has been updated since the last time this code was utilized.
    * If the notebook works without needing any significant modifications, the only update needed is to the expected release in the [complexportal.py](../../src/rbc_gem_utils/database/complexportal.py) source code file.

In [None]:
# NCBI taxonomy identifier (9606 = Homo sapiens). The spelling of
# `taxomony_int` is kept as-is because it is also the keyword name
# forwarded to the release check below.
taxomony_int = 9606

release_is_expected = check_database_release_online(
    db_tag, verbose=True, taxomony_int=taxomony_int
)
use_interim = not release_is_expected
if use_interim:
    warn(
        "Online release of database has been updated since the last time notebook was used."
    )

# When the online release differs from the expected one, results are
# written to "interim" directories instead of the regular ones.
interim_dir = "interim" if use_interim else None
database_dirpath = get_dirpath("database", db_tag, use_temp=interim_dir)
annotation_dirpath = get_dirpath("annotation", use_temp=interim_dir)

# Ensure directories exist
for dirpath in (database_dirpath, annotation_dirpath):
    dirpath.mkdir(exist_ok=True, parents=True)

## Load ComplexPortal data
### Download new files and update database
If an argument is not provided (`arg=None`), its default value for the repository is used.

In [None]:
# Local copy of the organism-specific ComplexPortal table.
local_table_path = database_dirpath / f"{taxomony_int}.tsv"

# Read either from the remote database or from the local copy.
if download_database:
    table_source = f"{database_url}/{taxomony_int}.tsv"
else:
    table_source = local_table_path
df_complexportal_data = pd.read_csv(table_source, sep="\t")

# A freshly downloaded table replaces the local copy.
if download_database:
    df_complexportal_data.to_csv(local_table_path, sep="\t", index=False)
df_complexportal_data

## Load RBC-GEM model

In [None]:
# The model file is named after the GEM and lives in the "model" directory.
model_dirpath = get_dirpath("model")
model_filepath = model_dirpath / f"{GEM_NAME}.xml"
model = read_cobra_model(filename=model_filepath)
model

### Extract current annotations from model

In [None]:
annotation_type = "genes"
annotation_cols = ["uniprot", "complexportal"]
mapping_key = "complexportal"

# Pull the requested annotations off the model genes and keep only genes
# that carry a value for the mapping key.
df_model_mappings = get_annotation_df(model.genes, annotation_cols)
df_model_mappings = df_model_mappings.rename(columns={"id": annotation_type})
df_model_mappings = df_model_mappings.dropna(subset=[mapping_key])

# Annotation values are ";"-separated; expand every column to one value per row.
for column_name in df_model_mappings.columns:
    df_model_mappings = explode_column(df_model_mappings, name=column_name, sep=";")

df_model_mappings = df_model_mappings.sort_values(annotation_type).reset_index(
    drop=True
)
print(df_model_mappings.nunique(dropna=True))

df_model_mappings

### Map to model genes


In [None]:
col = "Expanded participant list"

# One row per (complex, participant), with the complex accession as a column.
participants_by_complex = df_complexportal_data.set_index("#Complex ac")[col]
df = (
    parse_complex_participants(participants_by_complex)
    .reset_index(drop=False)
    .rename(columns={"#Complex ac": "complexportal"})
)

# The "uniprot" value is the participant ID up to the first "-".
df["uniprot"] = df["participants"].apply(
    lambda participant: ensure_iterable(participant.split("-")[0])
)
df = df.explode("uniprot")

# Keep only rows that can be linked to the model through the mapping key.
if mapping_key == "complexportal":
    in_model = df["complexportal"].isin(df_model_mappings["complexportal"].values)
else:
    df["participants"] = df["participants"].apply(
        lambda participant: participant.split("-")[0]
    )
    in_model = df["participants"].isin(df_model_mappings[mapping_key].values)
df = df[in_model]

df_complex_participants = (
    df.copy().sort_values(["complexportal", "uniprot"]).reset_index(drop=True)
)
print(df_complex_participants.nunique())
df_complex_participants

## Format ComplexPortal information for all files

In [None]:
# ComplexPortal column header -> column name used in the annotation tables.
# Only the columns listed here are kept, in this order.
rename_mapping = {
    "#Complex ac": "complexportal",
    "Recommended name": "complexportal.name",
    "Aliases for complex": "complexportal.alias",
    "Identifiers (and stoichiometry) of molecules in complex": "complexportal.stoichiometry.all",
    "Experimental evidence": "complexportal.evidence",
    "Cross references": "complexportal.cross_references",
    "Description": "complexportal.description",
    "Complex properties": "complexportal.properties",
    "Complex assembly": "complexportal.assembly",
    "Ligand": "complexportal.ligand",
    "Disease": "complexportal.disease",
    "Agonist": "complexportal.agonist",
    "Antagonist": "complexportal.antagonist",
    "Comment": "complexportal.comment",
    "Source": "complexportal.source",
    "Taxonomy identifier": "taxomony",
    "Evidence Code": "eco",
    "Go Annotations": "go",
    "Expanded participant list": "uniprot",
}

# Restrict the database table to complexes that have a participant mapped
# to the model, blank out cells whose whole value is "-", then select and
# rename the columns of interest.
is_model_complex = df_complexportal_data["#Complex ac"].isin(
    df_complex_participants["complexportal"]
)
df_model_complex_annotations = (
    df_complexportal_data[is_model_complex]
    .copy()
    .reset_index(drop=True)
    .replace("-", "")
    .loc[:, list(rename_mapping)]
    .rename(columns=rename_mapping)
)
# Post-process every selected column and derive additional identifier
# columns from a few of them. The loop runs over the column index obtained
# when the loop starts; columns created inside the loop are added to the
# frame but are not themselves visited by this loop.
for key in df_model_complex_annotations.columns:
    # String cells are split on "|" and re-assembled with `build_string`;
    # non-string cells are passed through unchanged.
    # (presumably `build_string` joins with ";" since the code below splits
    # on ";" -- TODO confirm against rbc_gem_utils)
    df_model_complex_annotations[key] = df_model_complex_annotations[key].apply(
        lambda x: build_string(x.split("|")) if isinstance(x, str) else x
    )

    if key == "complexportal.cross_references":
        # One new column per cross-reference prefix: keep the entries that
        # start with the prefix, drop everything from the first "(" onwards,
        # remove spaces and then remove the "<prefix>:" tag.
        # NOTE(review): for "complex portal" the spaces are removed before
        # the tag replacement, so "complex portal:" can no longer match and
        # the tag appears to remain in the value -- confirm this is intended.
        for subkey in {"reactome", "complex portal", "pubmed", "intenz"}:
            df_model_complex_annotations[subkey] = df_model_complex_annotations[
                key
            ].apply(
                lambda x: build_string(
                    [
                        s.split("(")[0].replace(" ", "").replace(f"{subkey}:", "")
                        for s in x.split(";")
                        if s.startswith(subkey)
                    ]
                )
            )
        # Same idea for "rhea" and "efo", but the value is additionally
        # left-stripped, lower-cased and then cleaned of "<prefix>:".
        # NOTE(review): `str.lstrip` removes any leading characters found in
        # its argument (a character set), not a literal prefix.
        for subkey in {"rhea", "efo"}:
            df_model_complex_annotations[subkey] = df_model_complex_annotations[
                key
            ].apply(
                lambda x: build_string(
                    [
                        s.split("(")[0]
                        .replace(" ", "")
                        .lstrip(f":{subkey}")
                        .lower()
                        .replace(f"{subkey}:", "")
                        for s in x.split(";")
                        if s.startswith(subkey)
                    ]
                )
            )
            if subkey == "efo":
                # Split the "efo" column by the prefix each entry starts
                # with. "efo" is processed last because that iteration
                # overwrites the very column both iterations read from.
                # NOTE(review): entries whose "efo:" tag was removed above no
                # longer start with "efo" and are filtered out here -- verify
                # that the resulting "efo" column is populated as intended.
                for subsubkey in ["orphanet", "efo"]:  # EFO must be last
                    df_model_complex_annotations[
                        subsubkey
                    ] = df_model_complex_annotations[subkey].apply(
                        lambda x: build_string(
                            [s for s in x.split(";") if s.startswith(subsubkey)]
                        )
                    )

    if key == "complexportal.evidence":
        # Evidence entries starting with "intact" / "wwpdb" become their own
        # columns; the "wwpdb" entries are stored under the column "pdb".
        for subkey in {"intact", "wwpdb"}:
            df_model_complex_annotations[
                subkey.replace("wwpdb", "pdb")
            ] = df_model_complex_annotations[key].apply(
                lambda x: build_string(
                    [
                        s.split("(")[0].replace(f"{subkey}:", "")
                        for s in x.split(";")
                        if s.startswith(subkey)
                    ]
                )
            )
    if key in {"eco", "go", "uniprot"}:
        if key == "uniprot":
            # Text between the first "(" and a trailing ")" of each entry is
            # stored as "uniprot.stoichiometry", joined with ";" directly
            # rather than through `build_string`.
            # NOTE(review): `s.split("(")[1]` raises IndexError for an entry
            # without "(" -- assumes every participant entry has one.
            df_model_complex_annotations[
                f"{key}.stoichiometry"
            ] = df_model_complex_annotations[key].apply(
                lambda x: ";".join([s.split("(")[1].rstrip(")") for s in x.split(";")])
            )
        # Keep only the part of each entry that precedes the first "(".
        df_model_complex_annotations[key] = df_model_complex_annotations[key].apply(
            lambda x: build_string([s.split("(")[0] for s in x.split(";")])
        )
    if key in {"complexportal.disease"}:
        # Case-insensitive capture of "[<subkey>...]" occurrences; the
        # captured text excludes the surrounding brackets.
        for subkey in ["mondo", "mim"]:
            df_model_complex_annotations[subkey] = df_model_complex_annotations[
                key
            ].apply(
                lambda x: build_string(
                    re.findall(rf"\[({subkey}.+?(?=]))", x, re.IGNORECASE)
                )
            )
    if key == "complexportal.stoichiometry.all":
        # Entries matching "urs..." / "chebi..." (case-insensitive) are
        # extracted; "urs" matches are stored under the column "rnacentral".
        # NOTE(review): the pattern only matches text that is followed by
        # ";", so a matching entry at the very end of the string (with no
        # trailing ";") would not be captured -- confirm against the data.
        for subkey in ["urs", "chebi"]:
            search_re = re.compile(f"({subkey}.+?(?=;))", re.IGNORECASE)
            if subkey == "urs":
                subkey = "rnacentral"
            df_model_complex_annotations[subkey] = df_model_complex_annotations[
                key
            ].apply(lambda x: build_string(search_re.findall(x)))
            # Stoichiometry is read from the parentheses before the
            # identifier column is reduced to the part preceding "(".
            df_model_complex_annotations[
                f"{subkey}.stoichiometry"
            ] = df_model_complex_annotations[subkey].apply(
                lambda x: build_string(
                    [s.split("(")[1].rstrip(")") for s in x.split(";") if s]
                )
            )
            df_model_complex_annotations[f"{subkey}"] = df_model_complex_annotations[
                subkey
            ].apply(
                lambda x: build_string([s.split("(")[0] for s in x.split(";") if s])
            )
# Normalise missing values (NaN and empty strings both become pd.NA), then
# order the table by complex and participant list.
# The former additional `.replace("", float("nan"))` pass was removed: after
# `.replace("", pd.NA)` no empty strings remain, so it never changed anything.
df_model_complex_annotations = (
    df_model_complex_annotations.replace(float("nan"), pd.NA)
    .replace("", pd.NA)
    .sort_values(["complexportal", "uniprot"], ascending=[True, True])
    .reset_index(drop=True)
)
df_model_complex_annotations

## Format Complex Tables
### Complexes and stoichiometry
Information extracted here can be useful for protein-constrained modeling. All possible complexes are extracted here. Data is further formatted and refined in the curation notebook [HarmonizeProteinsComplexes](../curation/HarmonizeProteinsComplexes.ipynb#Complex-Portal).

In [None]:
def _isoform_id(participant):
    """Return the UniProt isoform ID matched in ``participant``, or "" if none."""
    # Search once and reuse the match object (previously searched twice per row).
    match = UNIPROT_ISOFORM_ID_RE.search(participant)
    return match.group() if match else ""


# Explicit copy so that adding a column below never touches (or warns about)
# `df_complex_participants`.
df_model_complex_data = df_complex_participants.loc[
    :, ["complexportal", "participants", "stoichiometry", "uniprot"]
].copy()
df_model_complex_data["uniprot.isoform"] = df_model_complex_data["participants"].apply(
    _isoform_id
)

# Attach per-complex descriptive columns to every participant row.
df_model_complex_data = df_model_complex_data.merge(
    df_model_complex_annotations.loc[
        :,
        [
            "complexportal",
            "complexportal.name",
            "complexportal.alias",
            "complexportal.stoichiometry.all",
            "reactome",
        ],
    ],
    on="complexportal",
    how="left",
    suffixes=("", "_drop"),
)
# Any column duplicated by the merge carries the "_drop" suffix; discard it.
df_model_complex_data = df_model_complex_data.drop(
    columns=[c for c in df_model_complex_data.columns if c.endswith("_drop")]
)
if overwrite:
    df_model_complex_data.to_csv(
        database_dirpath / f"{db_tag}_{GEM_NAME}_complexes.tsv", sep="\t", index=False
    )
df_model_complex_data

### Annotation tables

In [None]:
# Identifier columns carried over into the annotation tables.
df_annotations = df_model_complex_annotations.loc[
    :,
    [
        "complexportal",
        "eco",
        "go",
        "uniprot",
        "rnacentral",
        "chebi",
        "pdb",
        "intact",
        "reactome",
        "pubmed",
        "efo",
        "rhea",
        "orphanet",
        "mondo",
        "mim",
    ],
].copy()

# One row per participant, reduced to the ID part before the first "-".
df_annotations["uniprot"] = df_annotations["uniprot"].apply(
    lambda x: [s.split("-")[0] for s in split_string(x)]
)
df_annotations = df_annotations.explode("uniprot")

# Attach the complex annotations to the model mappings. Columns present on
# both sides (other than the key) get the "_drop" suffix on the right side.
drop_suffix = "_drop"
df_annotations = df_model_mappings.merge(
    df_annotations,
    on=mapping_key,
    how="left",
    suffixes=("", drop_suffix),
)

# Keep only rows where the model's value agrees with the database's value
# for each duplicated column, then discard the duplicate column.
to_drop = [c for c in df_annotations.columns if c.endswith(drop_suffix)]
for dup_col in to_drop:
    # Remove the suffix by slicing. `str.strip("_drop")` would strip a *set
    # of characters* from both ends (e.g. "pdb_drop" -> "b") and only worked
    # by coincidence for "uniprot_drop".
    base_col = dup_col[: -len(drop_suffix)]
    df_annotations = df_annotations[
        df_annotations[base_col] == df_annotations[dup_col]
    ].drop(dup_col, axis=1)
    df_annotations = df_annotations.drop_duplicates()

df_annotations = df_annotations.sort_values(
    ["genes", "complexportal", "uniprot"]
).reset_index(drop=True)

# Separate copies feed the per-gene and per-complex tables built below.
df_gene_annotations = df_annotations.copy()
df_cplx_annotations = df_annotations.copy()
# Complex identifiers use "_" in place of "-" in the ComplexPortal accession.
df_cplx_annotations["complexes"] = df_cplx_annotations["complexportal"].str.replace(
    "-", "_"
)

df_annotations

#### Complexes

In [None]:
annotation_type = "complexes"
annotation_filepath = annotation_dirpath / f"{annotation_type}_{db_tag}.tsv"

# Collapse to one row per complex: each column holds the sorted unique
# non-missing values combined by `build_string`; blanks become pd.NA.
df_annotations = (
    df_cplx_annotations.groupby(annotation_type, as_index=False)
    .agg(lambda values: build_string(sorted(values.dropna().unique())))
    .replace(float("nan"), pd.NA)
    .replace("", pd.NA)
)

if compare:
    compare_on_index = [annotation_type]
    try:
        df_previous = pd.read_csv(
            annotation_filepath, sep="\t", index_col=None, dtype=str
        )
    except FileNotFoundError:
        # Nothing saved yet: compare against an empty table.
        df_previous = pd.DataFrame([], columns=compare_on_index)
    else:
        df_previous = df_previous.replace(float("nan"), pd.NA).replace("", pd.NA)
    df_comparision = compare_tables(
        df_previous.set_index(compare_on_index),
        df_annotations.set_index(compare_on_index),
    )

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=compare_figsize)
    ax.yaxis.set_tick_params(labelsize=8)
    ax = visualize_comparison(df_comparision)

if display_nunique:
    # Count distinct individual values per column after splitting on ";".
    for column_name in df_annotations.columns:
        unique_values = explode_column(df_annotations, name=column_name, sep=";")[
            column_name
        ].drop_duplicates()
        print(f"{unique_values.name}: {unique_values.nunique()}")

if overwrite:
    df_annotations.to_csv(annotation_filepath, sep="\t", index=False)

df_annotations

#### Genes

In [None]:
annotation_type = "genes"
annotation_filepath = annotation_dirpath / f"{annotation_type}_{db_tag}.tsv"

# Collapse to one row per gene: each column holds the unique non-missing
# values (in order of appearance) combined by `build_string`; blanks
# become pd.NA.
df_annotations = (
    df_gene_annotations.groupby(annotation_type, as_index=False)
    .agg(lambda values: build_string(list(values.dropna().unique())))
    .replace(float("nan"), pd.NA)
    .replace("", pd.NA)
)

if compare:
    compare_on_index = [annotation_type]
    try:
        df_previous = pd.read_csv(
            annotation_filepath, sep="\t", index_col=None, dtype=str
        )
    except FileNotFoundError:
        # Nothing saved yet: compare against an empty table.
        df_previous = pd.DataFrame([], columns=compare_on_index)
    else:
        df_previous = df_previous.replace(float("nan"), pd.NA).replace("", pd.NA)
    df_comparision = compare_tables(
        df_previous.set_index(compare_on_index),
        df_annotations.set_index(compare_on_index),
    )

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=compare_figsize)
    ax.yaxis.set_tick_params(labelsize=8)
    ax = visualize_comparison(df_comparision)

if display_nunique:
    # Count distinct individual values per column after splitting on ";".
    for column_name in df_annotations.columns:
        unique_values = explode_column(df_annotations, name=column_name, sep=";")[
            column_name
        ].drop_duplicates()
        print(f"{unique_values.name}: {unique_values.nunique()}")

if overwrite:
    df_annotations.to_csv(annotation_filepath, sep="\t", index=False)

df_annotations