# Extract metadata information from PubMed

Utilize the [Bio.Entrez package](https://biopython.org/docs/latest/api/Bio.Entrez.html) to get metadata for PubMed articles.

### Import packages

In [None]:
from collections import defaultdict

import pandas as pd
from rbc_gem_utils import (
    GEM_NAME,
    explode_column,
    get_annotation_df,
    get_dirpath,
    read_cobra_model,
    show_versions,
)
from rbc_gem_utils.database.pubmed import (
    PUBMED_ERYTHROCYTE_KEYWORDS,
    fetch_batch_results_PubMed,
    get_mesh_terms,
    get_value_PubMed,
    search_erythrocyte_keywords_PubMed,
)
from rbc_gem_utils.util import build_string

# Call the rbc_gem_utils helper imported above; presumably it reports the
# versions of the packages in use for reproducibility — confirm in rbc_gem_utils.
show_versions()

## Load RBC-GEM model

In [None]:
# Resolve the model directory, then read the `.xml` model file named after GEM_NAME.
model_dirpath = get_dirpath("model")
model_filepath = model_dirpath / f"{GEM_NAME}.xml"
model = read_cobra_model(filename=model_filepath)
model

## Download metadata from PubMed
### Get IDs for query from model

In [None]:
# Gather the literature identifiers annotated on the model, split by source:
# PubMed IDs (prefix removed), DOI references, and everything else.
annotation_key = "references"
model_pmids, model_dois, model_other = set(), set(), set()

for attribute_type in ("reactions", "genes", "metabolites"):
    # One reference per row: annotation values are split on ";".
    df = get_annotation_df(getattr(model, attribute_type), annotation_key)
    df = explode_column(df, annotation_key, sep=";").dropna().drop_duplicates()
    references = df["references"].unique()

    # Classify each reference exactly once by its prefix.
    pmids, dois, other = set(), set(), set()
    for reference in references:
        if reference.startswith("pubmed"):
            pmids.add(reference.replace("pubmed:", ""))
        elif reference.startswith("doi"):
            dois.add(reference)
        else:
            other.add(reference)

    # Per-attribute summary with an underline as long as the header.
    header = f"From model {attribute_type}"
    print(f"{header}\n" + "-" * len(header))
    print(f"Number of PubMed IDs: {len(pmids)}")
    print(f"Number of DOI links: {len(dois)}")
    print(f"Number of other IDs: {len(other)}")
    print()

    model_pmids |= pmids
    model_dois |= dois
    model_other |= other

print(f"Number of unique PubMed IDs from model in total: {len(model_pmids)}")
print(f"Number of unique DOIs from model in total: {len(model_dois)}")
print(f"Number of unique IDs, other sources, from model in total: {len(model_other)}")

### Get IDs for query from omic data

In [None]:
# Start from an empty set; presumably this lets the later "combine" cell run
# even when the proteomics table in the next cell is not loaded.
proteomic_pmids = set()

In [None]:
# Load the proteomic evidence table; its column labels are treated as
# publication identifiers.
proteomics_dirpath = get_dirpath("proteomics", use_temp="external")
df_proteomic_evidence = pd.read_csv(
    proteomics_dirpath / "proteomic_evidence_table.tsv",
    sep="\t",
    index_col=0,
)

# Remove the "pubmed:" prefix where present; other labels are kept unchanged.
proteomic_pmids = set()
for column_label in df_proteomic_evidence.columns:
    if column_label.startswith("pubmed"):
        column_label = column_label.replace("pubmed:", "")
    proteomic_pmids.add(column_label)

print(f"Number of unique PubMed IDs from proteomics: {len(proteomic_pmids)}")
proteomic_pmids;

#### Combine PubMed IDs from all sources for query

In [None]:
# Report how many PubMed IDs each source contributes before merging them.
print(f"Number of unique PubMed IDs\n===========================")
print(f"Extracted from model: {len(model_pmids)}")
print(f"Proteomic data: {len(proteomic_pmids)}")

# Union of all sources, sorted so the query order is deterministic.
query_ids = sorted(set().union(model_pmids, proteomic_pmids))

print(f"\nTotal number of unique PubMed IDs (additional): {len(query_ids)}")

### Get results from PubMed

In [None]:
# When True, only keywords flagged MajorTopicYN == "Y" are kept when the
# DataFrame is built below. NOTE: the MeSH extraction in that same cell passes
# only_major=False explicitly, so this flag does not currently affect MeSH terms.
only_major = True  # Refers to mesh headings
# When True, the reference table and the histogram figure are written to disk.
overwrite = True

# Make sure to use your email
# (the value is passed as `email=` to fetch_batch_results_PubMed below).
email = "EMAILADDRESS"

In [None]:
# Query PubMed for every collected ID (batch size 500). With return_failed=True
# the helper also hands back the IDs it reports as failed.
fetch_options = {
    "email": email,
    "pubmed_ids": query_ids,
    "batch_size": 500,
    "return_failed": True,
}
all_results, failed_ids = fetch_batch_results_PubMed(**fetch_options)

# Show failed IDs, if any, so they can be followed up manually.
if failed_ids:
    print(failed_ids)

### Create DataFrame from results

In [None]:
def _format_author(author):
    """Return "LastName Initials" for a person, or the collective name for a group.

    Group/consortium authors have no "LastName"/"Initials"; formatting them
    naively would produce the literal text "None None".
    """
    last_name = author.get("LastName")
    if last_name is None:
        return str(author.get("CollectiveName", "")).strip()
    return f"{last_name} {author.get('Initials', '')}".strip()


def _find_doi(article):
    """Return the first DOI among the article's ELocationID entries, or None.

    Records may list other identifiers (e.g. a publisher item ID) before the
    DOI, so every entry is inspected rather than only the first one.
    """
    for elocation in article.get("ELocationID", []):
        if elocation.attributes.get("EIdType") == "doi":
            return str(elocation)
    return None


# Build one row per fetched record, keyed by its position in `all_results`.
data = defaultdict(dict)
for idx, result in enumerate(all_results):
    medline_citation = result["MedlineCitation"]

    article = medline_citation["Article"]
    data[idx]["#"] = idx
    data[idx]["PubMed ID"] = get_value_PubMed(medline_citation, "PMID")

    # Article-level fields
    data[idx]["Article/Section Title"] = get_value_PubMed(article, "ArticleTitle")
    data[idx]["Abstract"] = get_value_PubMed(article, "Abstract", "AbstractText")
    data[idx]["Language"] = get_value_PubMed(article, "Language", 0)
    data[idx]["Start Page"] = get_value_PubMed(article, "Pagination", "StartPage")
    data[idx]["End Page"] = get_value_PubMed(article, "Pagination", "EndPage")

    # Journal-level fields
    journal = article["Journal"]
    data[idx]["Journal"] = get_value_PubMed(journal, "Title")

    journal_issue = journal["JournalIssue"]
    data[idx]["Volume"] = get_value_PubMed(journal_issue, "Volume")
    data[idx]["Issue"] = get_value_PubMed(journal_issue, "Issue")
    data[idx]["Year"] = get_value_PubMed(journal_issue, "PubDate", "Year")
    data[idx]["Month"] = get_value_PubMed(journal_issue, "PubDate", "Month")
    data[idx]["Day"] = get_value_PubMed(journal_issue, "PubDate", "Day")

    # MeSH headings: IDs and terms are collected in parallel.
    # NOTE: `only_major` is intentionally not applied here (all headings are kept).
    if medline_citation.get("MeshHeadingList", []):
        mesh_ids, mesh_terms = [], []
        for mesh_heading in medline_citation["MeshHeadingList"]:
            mesh_ids.extend(
                get_mesh_terms(mesh_heading, only_major=False, use_ids=True)
            )
            mesh_terms.extend(
                get_mesh_terms(mesh_heading, only_major=False, use_ids=False)
            )
        data[idx]["Mesh IDs"] = build_string(mesh_ids)
        data[idx]["Mesh terms"] = build_string(mesh_terms)

    # Keywords from the first keyword list, optionally restricted to major topics.
    if medline_citation.get("KeywordList", []):
        data[idx]["Keywords"] = build_string(
            [
                str(keyword)
                for keyword in medline_citation["KeywordList"][0]
                if not only_major or keyword.attributes.get("MajorTopicYN") == "Y"
            ]
        )

    # Author list: tolerate records without an AuthorList and skip empty names.
    author_names = [
        _format_author(author) for author in article.get("AuthorList", [])
    ]
    data[idx]["Authors"] = build_string([name for name in author_names if name])

    # DOI (only set when one is present, as before)
    doi = _find_doi(article)
    if doi is not None:
        data[idx]["DOI"] = doi

    # Keep track of citation type; book-related columns stay empty for articles.
    data[idx]["Reference type"] = "Journal, Articles"
    data[idx]["Editors"] = ""
    data[idx]["Book"] = ""
    data[idx]["Publisher"] = ""
    data[idx]["IBSN"] = ""

# All values are stored as strings; fields missing from a record become "".
df_pubmed = pd.DataFrame.from_dict(data, orient="index", dtype=str).fillna("")
df_pubmed

### Add additional references not found on PubMed
* Some references do not have an associated PubMed ID, and require manual addition.
* Some updates about erythrocyte specificity need to be made manually as they could not be detected automatically.
* Updates to PubMed references that could not be found automatically.
* All manual additions should be formatted in the `additional_references_manual.tsv` file so that they are added to the final reference table.

In [None]:
# Manually curated references, read as strings with "" for missing values.
additional_filepath = get_dirpath("database") / "additional_references_manual.tsv"
df_additional = pd.read_csv(additional_filepath, sep="\t", index_col=0, dtype=str)
df_additional = df_additional.fillna("")

# The number of manual rows must equal the number of DOI and "other"
# references collected from the model.
assert len(df_additional) == len(model_dois) + len(model_other)
df_erythrocyte = pd.concat([df_pubmed, df_additional])

column_name = "RBC Keywords"
rbc_specific_key = "RBC Specific"

# Text columns scanned for erythrocyte-related keywords.
searchable_columns = [
    "Article/Section Title",
    "Abstract",
    "Mesh terms",
    "Keywords",
    "Book",
]
df_keyword_hits = search_erythrocyte_keywords_PubMed(
    df=df_erythrocyte,
    text_columns=searchable_columns,
    search_keywords=PUBMED_ERYTHROCYTE_KEYWORDS,
    column_name=column_name,
)
df_erythrocyte[column_name] = df_keyword_hits[column_name]

# A reference counts as RBC specific when at least one keyword was recorded.
df_erythrocyte[rbc_specific_key] = df_erythrocyte[column_name].ne("")
df_erythrocyte = df_erythrocyte.reset_index(drop=True)
df_erythrocyte

In [None]:
# Display the DOI references collected from the model; together with the
# "other" references these are counted against the manual additions table
# by the assert in the previous cell.
model_dois

In [None]:
# PubMed IDs flagged manually as RBC specific (studies that use human
# erythrocytes). Each ID is listed once.
manual_rbc_pmids = [
    # Uses human erythrocytes
    "13416279",
    "13654516",
    "13563527",
    "13800264",
    "5643703",
    "8043935",
    "9328029",
    "10477269",
    "11428554",
    "15987364",
]
indicies = df_erythrocyte[df_erythrocyte["PubMed ID"].isin(manual_rbc_pmids)].index
# Use the shared column key rather than repeating the literal label.
df_erythrocyte.loc[indicies, rbc_specific_key] = True
df_erythrocyte

In [None]:
# Summarize how many references are flagged as RBC specific.
rbc_count = df_erythrocyte[rbc_specific_key].sum()
total_count = len(df_erythrocyte)
# Guard against an empty table, and let the format spec do the percentage
# conversion: `round(x, 5) * 100` can print noise such as 12.345000000000001.
rbc_fraction = rbc_count / total_count if total_count else 0.0
print(f"RBC specific: {rbc_count} / {total_count} ({rbc_fraction:.3%})")

# RBC-specific references first, then by publication date.
# NOTE(review): the date columns hold strings (both source tables are built
# with dtype=str), so this ordering is lexicographic, e.g. Day "10" sorts
# before "2". Confirm whether a numeric/chronological order is wanted.
df_erythrocyte = df_erythrocyte.sort_values(
    [rbc_specific_key, "Year", "Month", "Day"],
    ascending=[False, True, True, True],
).reset_index(drop=True)

# Replace the old running number with the new sorted position as the index.
df_erythrocyte = df_erythrocyte.drop(["#"], axis=1)
df_erythrocyte.index.name = "#"
df_erythrocyte

### Export references

In [None]:
# Write the final reference table only when overwriting is enabled.
if overwrite:
    references_filepath = get_dirpath("database") / f"{annotation_key}_{GEM_NAME}.tsv"
    df_erythrocyte.to_csv(references_filepath, sep="\t")
df_erythrocyte

#### Visualize histogram of references

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Figure output options
imagetype = "svg"
transparent = False

# Histogram options: bin width and bin-edge offset, both in years
interval, shift = 4, 0  # years
fix_range = True
rbc_color = "xkcd:red"

# Publication years as integers (computed once and reused below)
df_years = df_erythrocyte["Year"].astype(int)
year_bounds = (int(df_years.min()), int(df_years.max()))
print(f"Year range:\t {year_bounds}")
if fix_range:
    # Snap the bounds outward to the bin grid {shift + k * interval}.
    # The lower bound is rounded down and the upper bound rounded UP, because
    # `ax.hist(range=...)` ignores values outside the range; flooring the upper
    # bound would drop the most recent publications.
    lower = (year_bounds[0] - shift) // interval * interval + shift
    upper = -((shift - year_bounds[1]) // interval) * interval + shift
    if upper == lower:
        # All years fall on a single grid point: keep at least one bin.
        upper = lower + interval
    year_bounds = (lower, upper)
    print(f"Via intervals:\t {year_bounds}")

# Plot histogram
fig, ax = plt.subplots(1, 1, figsize=(5, 2.5))
sns.despine(fig)

# Split years into RBC-specific and other references
is_rbc_specific = df_erythrocyte[rbc_specific_key]
years_rbc = df_years[is_rbc_specific].values
years_other = df_years[~is_rbc_specific].values

# At least one bin, so a narrow year range cannot request zero bins
nbins = max(1, int((year_bounds[-1] - year_bounds[0]) / interval))
rects = ax.hist(
    [years_rbc, years_other],
    bins=nbins,
    range=year_bounds,
    stacked=True,
    color=[rbc_color, "xkcd:light gray"],
    edgecolor="black",
)

# Label every second bin edge; stepping from the lower bound keeps ticks on
# bin edges even when the number of bins is odd.
xticks = np.arange(year_bounds[0], year_bounds[1] + 1, 2 * interval)
ax.set_xticks(
    xticks,
    labels=[str(int(x)) for x in xticks],
    rotation=45,
    ha="right",
)
ax.yaxis.set_label_coords(0.05, 1.05)
# Only the RBC-specific series gets a legend entry
ax.legend(
    handles=[mpl.patches.Patch(edgecolor="black", facecolor=rbc_color)],
    labels=[rbc_specific_key],
    frameon=False,
    handleheight=1,
    handlelength=1,
    fontsize="large",
    loc="upper left",
)
ax.set_yticks([0, 50, 100], minor=False)
ax.set_yticks([25, 75], minor=True)
ax.xaxis.set_tick_params(labelsize="large")
ax.yaxis.set_tick_params(labelsize="large")
ax.set_xlabel("Year published", fontdict={"size": "x-large"})
ax.set_ylabel("Count", fontdict={"size": "x-large"}, loc="top", rotation=0)
ax.set_title("Curated References", fontdict={"size": "x-large"})
fig.tight_layout()

# Save next to the other processed outputs for this model, if enabled
dirpath = get_dirpath(use_temp="processed") / model.id
dirpath.mkdir(exist_ok=True, parents=True)
if overwrite:
    fig.savefig(
        dirpath / f"{annotation_key}_Histogram.{imagetype}",
        transparent=transparent,
    )
fig;