The CINECA data has ICD codes for diseases rather than a proper ontology. As we are interested in ontology expansion when querying, I want to replace these with ontology terms from ontologies such as MONDO and/or HP which have a hierarchy and allow for ontology expansion from more general terms.

The ethnicity ontology terms are also somewhat problematic: they are quite generalised, and the labels are not consistent with the ontology CURIEs given. I will also attempt to re-curate these.

These terms are in the individuals sheet

I will use the EBI OLS API and maybe the ZOOMA one too.

https://www.ebi.ac.uk/ols/docs/api

In [6]:
import requests as rq
import pandas as pd
import json
import numpy as np
import openpyxl as op
from requests.structures import CaseInsensitiveDict
import pprint
from collections import OrderedDict


Load workbook, make individuals dataframe

In [7]:
# Load the CINECA UK1 workbook and build a DataFrame from its 'individuals' sheet.
excel_ss = op.load_workbook("/Users/marionfs/Documents/GitHub/sbeacon-exploration/data/CINECA_synthetic_cohort_EUROPE_UK1/Beacon-v2-Models_CINECA_UK1.xlsx", read_only=True)
ind_df = pd.DataFrame(excel_ss['individuals'].values)
# The first sheet row holds the column names. `values[:1]` is a 2-D slice, so
# `list(...)` below is a one-element list of arrays. That shape is kept on
# purpose: later cells treat a column selection as 2-D (they flatten
# `.values.tolist()` and call `.iterrows()` on it).
ind_df_columns = ind_df.values[:1]
# Reassign instead of using inplace=True: `set_axis(..., inplace=True)` is
# deprecated and no longer accepted by pandas 2.x.
ind_df = ind_df.set_axis(list(ind_df_columns), axis=1)
# Drop the header row now that it has been promoted to column names.
ind_df = ind_df.drop(index=ind_df.index[0])

  warn(msg)
  ind_df.set_axis(list(ind_df_columns), axis=1, inplace=True)


In [8]:
ind_df

Unnamed: 0,﻿diseases_ageOfOnset,diseases_diseaseCode.id,diseases_diseaseCode.label,diseases_familyHistory,diseases_notes,diseases_severity.id,diseases_severity.label,diseases_stage.id,diseases_stage.label,ethnicity.id,...,treatments_cumulativeDose.referenceRange.low,treatments_cumulativeDose.referenceRange.unit,treatments_cumulativeDose.unit.id,treatments_cumulativeDose.unit.label,treatments_cumulativeDose.value,treatments_doseIntervals,treatments_routeOfAdministration.id,treatments_routeOfAdministration.label,treatments_treatmentCode.id,treatments_treatmentCode.label
1,,,,,,,,,,NCIT:C42331,...,,,,,,,,,,
2,,,,,,,,,,NCIT:C41261,...,,,,,,,,,,
3,,,,,,,,,,NCIT:C41260,...,,,,,,,,,,
4,,,,,,,,,,NCIT:C67109,...,,,,,,,,,,
5,,,,,,,,,,NCIT:C67109,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2500,,"ICD10:D70,ICD10:E06","agranulocytosis,thyroiditis",,,,,,,NCIT:C43856,...,,,,,,,,,,
2501,,"ICD10:D86,ICD10:K74","sarcoidosis,fibrosis and cirrhosis of liver",,,,,,,NCIT:C67109,...,,,,,,,,,,
2502,,,,,,,,,,NCIT:C67109,...,,,,,,,,,,
2503,,"ICD10:E06,ICD10:K21","thyroiditis,gastro-oesophageal reflux disease",,,,,,,,...,,,,,,,,,,


## Change disease codes to mondo
Use the MONDO ontology, whose hierarchical structure allows more intuitive searching.

First, get a list of all unique disease labels.

In [13]:
# Collect every disease label cell. The column selection comes back 2-D
# (one list of cells per row), so flatten it to a plain list of cells.
label_list = [
    cell
    for row_cells in ind_df['diseases_diseaseCode.label'].values.tolist()
    for cell in row_cells
]

# A cell may hold several comma-separated labels; empty cells are None.
non_none = [cell.split(',') for cell in label_list if cell is not None]

non_none_flat = []
for labels_in_cell in non_none:
    non_none_flat.extend(labels_in_cell)

# Unique disease labels across all individuals.
distinct_diseases = set(non_none_flat)

In [14]:
distinct_diseases

{'acute bronchitis',
 'agranulocytosis',
 'asthma',
 'bipolar affective disorder',
 'cardiomyopathy',
 'dental caries',
 'eating disorders',
 'fibrosis and cirrhosis of liver',
 'gastro-oesophageal reflux disease',
 'haemorrhoids',
 'influenza due to certain identified influenza virus',
 'insulin-dependent diabetes mellitus',
 'iron deficiency anaemia',
 'multiple sclerosis',
 'obesity',
 'sarcoidosis',
 'schizophrenia',
 'thyroiditis',
 'varicose veins of lower extremities'}

Query MONDO through the EBI OLS for matches to each disease label.

In [15]:
# did not end up using Zooma, found OLS was better
# def zooma_lookup(ontology, query_term):
#    base_url = f"http://www.ebi.ac.uk/spot/zooma/v2/api/services/annotate?propertyValue={query_term}&required:[none]&ontologies:[mondo]"
#    headers = CaseInsensitiveDict()
#    headers['Accept'] = "application/json"
#    response = rq.get(base_url)
#    pprint.pprint(json.loads(response.content))

In [16]:
def _search_ols(ontology, query_term, max_results=5):
    """Search the EBI OLS and return at most ``max_results`` usable hits.

    Only the label and synonym fields of terms in ``ontology`` are searched.
    Every returned hit is a dict containing both 'obo_id' and 'label'.
    Raises requests' HTTPError if the service answers with an error status.
    """
    response = rq.get(
        "http://www.ebi.ac.uk/ols/api/search",
        # Passing the query as params lets requests URL-encode spaces, '&', etc.
        params={
            "q": query_term,
            "ontology": ontology.lower(),
            "queryFields": "label,synonym",
        },
        headers={"Accept": "application/json"},
        timeout=30,
    )
    # Fail here with the HTTP status rather than later with an opaque KeyError.
    response.raise_for_status()
    docs = response.json()['response']['docs']
    # A hit without an obo_id or label cannot be offered as a curation.
    usable = [doc for doc in docs if 'obo_id' in doc and 'label' in doc]
    return usable[:max_results]


def _prompt_for_choice(docs):
    """Show ``docs`` as a numbered list and return the user's pick.

    Returns {'obo_id': ..., 'label': ...} for the chosen hit, or None when
    there is nothing to choose from or the user enters 'n'. Any other input
    that is not a listed number is rejected and the prompt is repeated.
    """
    if not docs:
        print("No matching terms found.")
        return None
    print("Choose the most appropriate ontology below, if none suitable enter n")
    for number, doc in enumerate(docs, start=1):
        print(f"{number}.{doc['obo_id']} {doc['label']}")
    while True:
        answer = input("Enter the number of the chosen ontology curation: ").strip()
        if answer.lower() == "n":
            return None
        try:
            position = int(answer)
        except ValueError:
            position = 0
        # Only the numbers actually displayed are valid; in particular 0 must
        # not fall through to docs[-1].
        if 1 <= position <= len(docs):
            chosen = docs[position - 1]
            return {"obo_id": chosen['obo_id'], "label": chosen['label']}
        print(f"Please enter a number between 1 and {len(docs)}, or n")


def ontology_lookup(ontology: str, query_term: str):
    """ Ontology Lookup

    ontology: The name of the ontology, e.g. mondo
    query_term: the string to lookup in that ontology

    Return: {'obo_id': ontology_curie,
            'label': ontology_label}
            or None if the user declines every suggestion.

    Method that helps curate an ontology term in a semi-automated way.
    It first searches the EBI OLS in the specified ontology for the query_term
    and presents up to five numbered options for the user to choose from.
    If a suitable term is not found the user can enter 'n' and provide one
    alternate query term, which is searched and presented the same way.
    If still no term is suitable, entering 'n' again returns None.
    'n' is accepted in either case at every prompt.

    Raises requests' HTTPError if the OLS request fails.
    """
    selection = _prompt_for_choice(_search_ols(ontology, query_term))
    if selection is not None:
        return selection

    # First search gave nothing suitable: offer exactly one retry.
    new_term = input("If you would like to try to search a different term, enter it now, if not enter n").strip()
    if not new_term or new_term.lower() == "n":
        return None
    return _prompt_for_choice(_search_ols(ontology, new_term))

Iterate over diseases to get a dict of disease curations from mondo.

In [17]:
# Interactive pass: ask the curator to pick a MONDO term for every distinct
# disease label and remember the answer (None when nothing was chosen).
curation_dict = {}
for disease in distinct_diseases:
    print(f"The disease you are curating is: {disease}")
    curation_dict[disease] = ontology_lookup("mondo", disease)

The disease you are curating is: obesity
Choose the most appropriate ontology below, if none suitable enter n
1.HP:0001513 Obesity
2.MONDO:0019182 inherited obesity
3.MONDO:0005139 morbid obesity
4.MONDO:0011122 obesity disorder
5.MONDO:0009763 obesity-hypoventilation syndrome
Enter the number of the chosen ontology curation: 4
The disease you are curating is: cardiomyopathy
Choose the most appropriate ontology below, if none suitable enter n
1.HP:0001638 Cardiomyopathy
2.MONDO:0004994 cardiomyopathy
3.MONDO:0010771 histiocytoid cardiomyopathy
4.MONDO:0030701 autoimmune cardiomyopathy
5.MONDO:0002824 extrinsic cardiomyopathy
Enter the number of the chosen ontology curation: 2
The disease you are curating is: sarcoidosis
Choose the most appropriate ontology below, if none suitable enter n
1.MONDO:0019338 sarcoidosis
2.MONDO:0006611 skin sarcoidosis
3.MONDO:0001709 hypercalcemic sarcoidosis
4.MONDO:0001708 pulmonary sarcoidosis
5.MONDO:0001707 cardiac sarcoidosis
Enter the number of the 

Needed to add this manually

In [18]:
# Manual override: the MONDO curation for 'haemorrhoids' is set by hand here
# rather than taken from the interactive lookup.
curation_dict['haemorrhoids'] = {'obo_id': 'MONDO:0004872', 'label': 'hemorrhoid'}

In [19]:
pprint.pprint(curation_dict)

{'acute bronchitis': {'label': 'bronchitis', 'obo_id': 'MONDO:0003781'},
 'agranulocytosis': {'label': 'agranulocytosis', 'obo_id': 'MONDO:0001609'},
 'asthma': {'label': 'asthma', 'obo_id': 'MONDO:0004979'},
 'bipolar affective disorder': {'label': 'manic bipolar affective disorder',
                                'obo_id': 'MONDO:0024612'},
 'cardiomyopathy': {'label': 'cardiomyopathy', 'obo_id': 'MONDO:0004994'},
 'dental caries': {'label': 'dental caries', 'obo_id': 'MONDO:0005276'},
 'eating disorders': {'label': 'eating disorder', 'obo_id': 'MONDO:0005451'},
 'fibrosis and cirrhosis of liver': {'label': 'cirrhosis of liver',
                                     'obo_id': 'MONDO:0005155'},
 'gastro-oesophageal reflux disease': {'label': 'gastroesophageal reflux '
                                                'disease',
                                       'obo_id': 'MONDO:0007186'},
 'haemorrhoids': {'label': 'hemorrhoid', 'obo_id': 'MONDO:0004872'},
 'influenza due to certai

In [20]:
# Original disease label -> curated MONDO label.
label_dict = {
    original_label: curated['label']
    for original_label, curated in curation_dict.items()
}

MONDO doesn't have a single term for 'fibrosis and cirrhosis of liver'. I added the cirrhosis term automatically (via the lookup above) and will add the fibrosis term ad hoc below.

In [21]:
# One source label maps to two MONDO labels; they are stored comma-separated,
# the same way multiple labels are stored in a single cell of the sheet.
label_dict['fibrosis and cirrhosis of liver'] = 'cirrhosis of liver,fibrotic liver disease'

In [22]:
# Curated MONDO label -> MONDO CURIE.
curie_dict = {
    curated['label']: curated['obo_id']
    for curated in curation_dict.values()
}

In [23]:
# Hand-added CURIE for the extra 'fibrotic liver disease' label, which did not
# come out of the interactive curation.
curie_dict['fibrotic liver disease'] = 'MONDO:0100430'

In [24]:
# Swap the original disease labels for the curated MONDO labels. regex=True
# makes each dict key a pattern that is replaced wherever it occurs inside a
# cell, which is what lets comma-separated label lists be rewritten.
# NOTE(review): this runs over the whole frame, not only the disease-label
# column, and the keys are not regex-escaped -- confirm no other column or
# label is affected unintentionally.
test_df2 = ind_df.replace(label_dict, regex=True)

In [25]:
def make_ontology_string(label_string, curie_lookup=None):
    """Convert a comma-separated string of labels into comma-separated CURIEs.

    label_string: labels separated by ',' (surrounding whitespace is ignored,
        empty fragments are skipped). None, '' or any non-string value
        (e.g. NaN from a missing cell) yields None.
    curie_lookup: optional mapping of label -> CURIE. Defaults to the
        module-level ``curie_dict`` so existing single-argument calls behave
        as before.

    Return: the CURIEs joined by ',' in the same order as the labels, or None
        when there is nothing to convert.

    Raises KeyError, naming the offending label(s), if a label has no CURIE.
    """
    if not isinstance(label_string, str) or not label_string:
        return None
    if curie_lookup is None:
        curie_lookup = curie_dict

    labels = [part.strip() for part in label_string.split(',')]
    labels = [label for label in labels if label]

    # Report every unknown label at once instead of failing on the first.
    missing = [label for label in labels if label not in curie_lookup]
    if missing:
        raise KeyError(f"No CURIE curated for label(s): {missing}")

    joined_curies = ','.join(curie_lookup[label] for label in labels)
    return joined_curies or None

In [26]:
# Build one CURIE string (or None) per individual from the curated labels.
# The selection is iterated row by row; the label cell is the first (only)
# entry of each row.
disease_label_series = test_df2['diseases_diseaseCode.label']
curies_list = [
    make_ontology_string(cells[0]) if cells[0] else None
    for _, cells in disease_label_series.iterrows()
]

In [27]:
# Overwrite the disease id column with the CURIE strings built above
# (curies_list has one entry per row, in row order).
test_df2["diseases_diseaseCode.id"] = curies_list

In [28]:
test_df2

Unnamed: 0,﻿diseases_ageOfOnset,diseases_diseaseCode.id,diseases_diseaseCode.label,diseases_familyHistory,diseases_notes,diseases_severity.id,diseases_severity.label,diseases_stage.id,diseases_stage.label,ethnicity.id,...,treatments_cumulativeDose.referenceRange.low,treatments_cumulativeDose.referenceRange.unit,treatments_cumulativeDose.unit.id,treatments_cumulativeDose.unit.label,treatments_cumulativeDose.value,treatments_doseIntervals,treatments_routeOfAdministration.id,treatments_routeOfAdministration.label,treatments_treatmentCode.id,treatments_treatmentCode.label
1,,,,,,,,,,NCIT:C42331,...,,,,,,,,,,
2,,,,,,,,,,NCIT:C41261,...,,,,,,,,,,
3,,,,,,,,,,NCIT:C41260,...,,,,,,,,,,
4,,,,,,,,,,NCIT:C67109,...,,,,,,,,,,
5,,,,,,,,,,NCIT:C67109,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2500,,"MONDO:0001609,MONDO:0004126","agranulocytosis,thyroiditis",,,,,,,NCIT:C43856,...,,,,,,,,,,
2501,,"MONDO:0019338,MONDO:0005155,MONDO:0100430","sarcoidosis,cirrhosis of liver,fibrotic liver ...",,,,,,,NCIT:C67109,...,,,,,,,,,,
2502,,,,,,,,,,NCIT:C67109,...,,,,,,,,,,
2503,,"MONDO:0004126,MONDO:0007186","thyroiditis,gastroesophageal reflux disease",,,,,,,,...,,,,,,,,,,


## Fix ethnicity labels to match ontology curies

Currently, the ethnicity labels do not match the curated codes.

In [29]:
# Unique ethnicity labels currently present in the data.
# The column selection is 2-D (a list of cells per row), so flatten it first.
ethnicities = test_df2['ethnicity.label'].values.tolist()
ethnicities_label_list = []
for row_cells in ethnicities:
    ethnicities_label_list.extend(row_cells)
non_none_ethnicities = [
    label for label in ethnicities_label_list if label is not None
]
distinct_ethnicities = set(non_none_ethnicities)

In [30]:
distinct_ethnicities

{'African',
 'Any other Asian background',
 'Any other Black background',
 'Any other mixed background',
 'Any other white background',
 'Asian or Asian British',
 'Bangladeshi',
 'Black or Black British',
 'British',
 'Caribbean',
 'Chinese',
 'Indian',
 'Irish',
 'Mixed',
 'Other ethnic group',
 'Pakistani',
 'White',
 'White and Asian',
 'White and Black African',
 'White and Black Caribbean'}

In [31]:
# Unique ethnicity CURIEs currently present in the data. The intermediate
# names are reused from the label cell above, although here they hold ids.
ethnicities = test_df2['ethnicity.id'].values.tolist()
ethnicities_label_list = [
    curie for row_cells in ethnicities for curie in row_cells
]
non_none_ethnicities = [
    curie for curie in ethnicities_label_list if curie is not None
]
distinct_ethnicities_curies = set(non_none_ethnicities)

In [32]:
distinct_ethnicities_curies

{'NCIT:C16352',
 'NCIT:C41260',
 'NCIT:C41261',
 'NCIT:C42331',
 'NCIT:C43856',
 'NCIT:C67109',
 'NCIT:C77810'}

In [33]:
ethnic_dict = {}
for ethnic in distinct_ethnicities_curies:
    iri = f"http://purl.obolibrary.org/obo/{ethnic.replace(':', '_')}"
    base_url = f"http://www.ebi.ac.uk/ols/api/ontologies/ncit/terms?iri={iri}"
    headers = CaseInsensitiveDict()
    headers['Accept'] = "application/json"
    response = rq.get(base_url, headers=headers)
    content = json.loads(response.content)
    ethnic_dict[ethnic] = content['_embedded']['terms'][0]['label']

In [34]:
ethnic_dict

{'NCIT:C43856': 'Irish',
 'NCIT:C67109': 'Multiracial',
 'NCIT:C16352': 'Black or African American',
 'NCIT:C41260': 'Asian',
 'NCIT:C42331': 'African',
 'NCIT:C77810': 'Caribbean Indian',
 'NCIT:C41261': 'White'}

In [35]:
# For every individual, look up the label that belongs to its ethnicity CURIE;
# rows without a CURIE get None.
correct_labels = [
    ethnic_dict[cells[0]] if cells[0] else None
    for _, cells in test_df2['ethnicity.id'].iterrows()
]

In [36]:
# Replace the existing ethnicity labels with the ones looked up from the
# CURIEs, so that label and id agree (one entry per row, in row order).
test_df2['ethnicity.label'] = correct_labels

In [37]:
test_df2['ethnicity.label']

Unnamed: 0,ethnicity.label
1,African
2,White
3,Asian
4,Multiracial
5,Multiracial
...,...
2500,Irish
2501,Multiracial
2502,Multiracial
2503,


In [38]:
# Save the re-curated individuals table; the relative path resolves against
# the current working directory, and index=False leaves out the row index.
test_df2.to_csv("individuals_updated_ontologies.csv", index=False)