# OntoMap Mondo

This project investigates the AI functions available in BigQuery and applies them to the task of mapping terms between disease ontologies.

In [13]:
# Imports

import time
import pandas as pd

## Prepare Google Cloud Environment

In [2]:
# Initialize Google Cloud Login. This only needs to be done once when running the notebook.

#! gcloud auth login

### Create BigQuery Cloud resource connection
You will need to create a [Cloud resource connection](https://cloud.google.com/bigquery/docs/create-cloud-resource-connection) to enable BigQuery to interact with Vertex AI services.

In [3]:
# This only needs to be done once so comment out if re-running the notebook.

# !bq mk --connection --location=us \
#     --connection_type=CLOUD_RESOURCE test_connection

### Set permissions for Service Account
The resource connection service account requires certain project-level permissions to interact with Vertex AI.

In [4]:
# SERVICE_ACCT = !bq show --format=prettyjson --connection us.test_connection | grep "serviceAccountId" | cut -d '"' -f 4
# SERVICE_ACCT_EMAIL = SERVICE_ACCT[-1]
# print(SERVICE_ACCT_EMAIL)

In [5]:
# PROJECT_ID="ontomap-mondo"
# SA_EMAIL="bqcx-1038190416641-ocdv@gcp-sa-bigquery-condel.iam.gserviceaccount.com"

# !gcloud projects add-iam-policy-binding "$PROJECT_ID" \
#   --member="serviceAccount:$SA_EMAIL" \
#   --role="roles/bigquery.connectionUser"

# !gcloud projects add-iam-policy-binding "$PROJECT_ID" \
#   --member="serviceAccount:$SA_EMAIL" \
#   --role="roles/aiplatform.user"

# # Wait 60 seconds to give the IAM updates time to propagate; otherwise the following cells will fail.
# time.sleep(60)

---
## Filter Mondo Mapping Gold Standard data

In [15]:
# Load the Mondo mappings from the tab-separated file `data/mondo_mappings.tsv`,
# which is the saved result of the SPARQL query `sparql/get_mappings.rq`.

df = pd.read_csv(
    'data/mondo_mappings.tsv',
    sep='\t',
)
df.head()

Unnamed: 0,?curie,?label,?xref,?source
0,MONDO:0000001,disease,DOID:4,EFO:0000408
1,MONDO:0000001,disease,DOID:4,MONDO:equivalentTo
2,MONDO:0000001,disease,ICD9:799.9,MONDO:i2s
3,MONDO:0000001,disease,ICD9:799.9,MONDO:relatedTo
4,MONDO:0000001,disease,MEDGEN:4347,MONDO:MEDGEN


In [17]:
# Keep only the rows whose `?xref` value begins with the `DOID:` prefix, i.e.
# rows in which a Mondo class is mapped to a Disease Ontology term.
# Missing `?xref` values are treated as non-matching (`na=False`), and the
# result gets a fresh 0..n-1 index.

mondo_do_df = (
    df.loc[df['?xref'].str.startswith('DOID:', na=False)]
    .reset_index(drop=True)
)

mondo_do_df.head()

Unnamed: 0,?curie,?label,?xref,?source
0,MONDO:0000001,disease,DOID:4,EFO:0000408
1,MONDO:0000001,disease,DOID:4,MONDO:equivalentTo
2,MONDO:0000004,adrenocortical insufficiency,DOID:10493,MONDO:equivalentTo
3,MONDO:0000009,"inherited bleeding disorder, platelet-type",DOID:2218,MONDO:equivalentTo
4,MONDO:0000023,infantile liver failure,DOID:0080716,MONDO:equivalentTo


In [18]:
# Summarize the filtered Mondo -> DOID frame: number of distinct (non-null)
# values in each column.
mondo_do_df.nunique(axis=0, dropna=True)

?curie     11554
?label     11554
?xref      11722
?source     1272
dtype: int64