Download the MONDO ontology and save its terms to a file.

`curl -L 'http://www.ebi.ac.uk/ols/api/ontologies/mondo/terms' -i -H 'Accept: application/json'`

In [27]:
import requests


# Smoke test: request one small page of MONDO terms from the OLS API and
# collect the 'label' field of every term on that page.
terms = []
paging = {'page': 0, 'size': 5}
headers = {'Accept': 'application/json'}
r = requests.get('http://www.ebi.ac.uk/ols/api/ontologies/mondo/terms', params=paging, headers=headers)
try:
    # raise_for_status() returns None on success and raises on an HTTP error
    # status, so it has to be caught; testing its return value never fires.
    r.raise_for_status()
except requests.HTTPError:
    print("Error on page: " + str(paging))
    raise
response = r.json()
terms.extend(x['label'] for x in response['_embedded']['terms'])
terms

['extracellular ligand-gated ion channel activity',
 'ligand-gated ion channel activity',
 'excitatory extracellular ligand-gated ion channel activity',
 'L1CAM',
 'gene']

In [28]:
import requests
import pandas as pd

# Download every page of MONDO terms from the OLS API and write the term
# labels to a one-column CSV ('TERM') that the later cells read back.
page = 1
size = 200
paging = {'page': page, 'size': size}
headers = {'Accept': 'application/json'}
url = 'http://www.ebi.ac.uk/ols/api/ontologies/mondo/terms'

# First request is only used to learn how many pages exist for this page size.
r = requests.get(url, params=paging, headers=headers)
r.raise_for_status()
response = r.json()
total_pages = response['page']['totalPages']
print("Total number of pages: " + str(total_pages))

terms = []
for page_number in range(total_pages):
    paging = {'page': page_number, 'size': size}
    # Send the same Accept header as the first request and fail fast on an
    # HTTP error instead of tripping over a missing '_embedded' key later.
    r = requests.get(url, params=paging, headers=headers)
    r.raise_for_status()
    response = r.json()
    print("Page retrieved: " + str(page_number))
    terms.extend(term['label'] for term in response['_embedded']['terms'])

# Name the column explicitly: downstream cells access row['TERM'], whereas an
# unnamed single-column frame would be saved with the header "0".
mondo_terms = pd.DataFrame(terms, columns=['TERM'])
mondo_terms.to_csv('data/mondo_vocabulary.csv', index=False)

Total number of pages: 222
Page retrieved: 0
Page retrieved: 1
Page retrieved: 2
Page retrieved: 3
Page retrieved: 4
Page retrieved: 5
Page retrieved: 6
Page retrieved: 7
Page retrieved: 8
Page retrieved: 9
Page retrieved: 10
Page retrieved: 11
Page retrieved: 12
Page retrieved: 13
Page retrieved: 14
Page retrieved: 15
Page retrieved: 16
Page retrieved: 17
Page retrieved: 18
Page retrieved: 19
Page retrieved: 20
Page retrieved: 21
Page retrieved: 22
Page retrieved: 23
Page retrieved: 24
Page retrieved: 25
Page retrieved: 26
Page retrieved: 27
Page retrieved: 28
Page retrieved: 29
Page retrieved: 30
Page retrieved: 31
Page retrieved: 32
Page retrieved: 33
Page retrieved: 34
Page retrieved: 35
Page retrieved: 36
Page retrieved: 37
Page retrieved: 38
Page retrieved: 39
Page retrieved: 40
Page retrieved: 41
Page retrieved: 42
Page retrieved: 43
Page retrieved: 44
Page retrieved: 45
Page retrieved: 46
Page retrieved: 47
Page retrieved: 48
Page retrieved: 49
Page retrieved: 50
Page retrieved

In [39]:
import pandas as pd
import numpy as np
from difflib import SequenceMatcher

# Load the synthetic UKB extract and its "with noise" variant, then print, for
# every distinct noisy ethnic-background value, the closest reference label
# and the difflib similarity ratio between the two.
ukb_data = pd.read_csv('data/ukb_synthetic_data_tofu.csv')

ukb_data_noise = pd.read_csv('data/ukb_synthetic_data_with_noise.csv')
values_noise = ukb_data_noise['Ethnic background-0.0'].unique()

# Reference labels, hard-coded here (alternatively they could be taken from
# ukb_data['Ethnic background-0.0'].unique()).
values = [
    'African',
    'British',
    'Asian or Asian British',
    'White and Asian',
    'White and Black African',
    'Prefer not to answer',
    'Any other Asian background',
    'White and Black Caribbean',
    'Black or Black British',
    'Any other mixed background',
    'Bangladeshi',
    'Do not know',
    'Any other Black background',
    'Irish',
    'Mixed',
    'White',
    'Pakistani',
    'Caribbean',
    'Other ethnic group',
    'Indian',
    'Any other white background',
    'Chinese',
]

for noisy_value in values_noise:
    # One similarity score per reference label, in the same order as `values`.
    scores = [SequenceMatcher(None, noisy_value, candidate).ratio() for candidate in values]
    closest = values[np.argmax(scores)]
    print(noisy_value + ", " + closest + ", " + str(max(scores)))

African, African, 1.0
British, British, 1.0
Asian or Asian British, Asian or Asian British, 1.0
White and Asian, White and Asian, 1.0
White and Black African, White and Black African, 1.0
Prefer not to answer, Prefer not to answer, 1.0
Any other Asian background, Any other Asian background, 1.0
White and Black Caribbean, White and Black Caribbean, 1.0
Black or Black British, Black or Black British, 1.0
Any other mixed background, Any other mixed background, 1.0
Bangladeshi, Bangladeshi, 1.0
Do not know, Do not know, 1.0
Asian kr Asiao British, Asian or Asian British, 0.9090909090909091
Any other Black background, Any other Black background, 1.0
Irish, Irish, 1.0
Mixed, Mixed, 1.0
White, White, 1.0
Pakistani, Pakistani, 1.0
Caribjcan, Caribbean, 0.7777777777777778
White and Black Caribbeau, White and Black Caribbean, 0.96
Other ethnic group, Other ethnic group, 1.0
Indian, Indian, 1.0
Brktisr, British, 0.7142857142857143
Any other white background, Any other white background, 1.0
Chvnes

In [1]:
import numpy as np
from difflib import SequenceMatcher

# Replace each 'ethnic background' entry with the reference label that has
# the highest difflib similarity ratio to it.

# Missing values are excluded from matching: SequenceMatcher needs sequences,
# and a NaN (float) would raise TypeError. They remain NaN after .map() below.
ethnic_b_values_noise = ukb_data['ethnic background'].dropna().unique()

ethnic_b_values = ['African', 'British', 'Asian or Asian British', 'White and Asian',
       'White and Black African', 'Prefer not to answer',
       'Any other Asian background', 'White and Black Caribbean',
       'Black or Black British', 'Any other mixed background',
       'Bangladeshi', 'Do not know', 'Any other Black background',
       'Irish', 'Mixed', 'White', 'Pakistani', 'Caribbean',
       'Other ethnic group', 'Indian', 'Any other white background',
       'Chinese']

# Record only the values that are not an exact match (ratio < 1).
similarity_list = []
for val in ethnic_b_values_noise:
    similarity = [SequenceMatcher(None, val, x).ratio() for x in ethnic_b_values]
    best_index = int(np.argmax(similarity))
    best_match = ethnic_b_values[best_index]
    ratio = similarity[best_index]
    if ratio < 1:
        similarity_list.append({'value': val, 'best_match': best_match, 'ratio': ratio})

# Kept under a name for inspection: a bare expression in the middle of a cell
# is evaluated but never displayed.
similarity_preview = pd.DataFrame(similarity_list).head(20)

# Inexact values map to their best match; reference labels map to themselves.
ethnic_b_mappings = {match['value']: match['best_match'] for match in similarity_list}
ethnic_b_mappings = {**ethnic_b_mappings, **{i:i for i in ethnic_b_values}}

ukb_data['ethnic background'] = ukb_data['ethnic background'].map(ethnic_b_mappings)
pd.DataFrame(ukb_data['ethnic background'].unique(), columns=['values'])

{'a': 1, 'b': 2, 'c': 3}

In [7]:
import pandas as pd
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
# nltk.download('punkt')

# Demo: tokenize a few short phrases and print the list of adjacent token
# pairs (bigrams) for each one.
text = ['cant railway station', 'citadel hotel', 'police stn']
for phrase in text:
    words = word_tokenize(phrase)
    print(list(ngrams(words, 2)))

[('cant', 'railway'), ('railway', 'station')]
[('citadel', 'hotel')]
[('police', ','), (',', 's'), ('s', 'stn')]


In [37]:
import pandas as pd
from nltk import word_tokenize
from nltk.util import ngrams

# Read the saved MONDO vocabulary and collect the bigrams of every term.
mondo_terms = pd.read_csv('data/mondo_vocabulary.csv')
mondo_bigrams = [
    pair
    for _, record in mondo_terms.iterrows()
    for pair in ngrams(word_tokenize(str(record['TERM'])), 2)
]

# Columns 0 and 1 hold the two tokens; column 2 is a constant value assigned
# to every MONDO bigram.
bigram_df = pd.DataFrame(mondo_bigrams)
bigram_df[2] = 1000000
bigram_df

Unnamed: 0,0,1,2
0,extracellular,ligand-gated,1000000
1,ligand-gated,ion,1000000
2,ion,channel,1000000
3,channel,activity,1000000
4,ligand-gated,ion,1000000
...,...,...,...
113016,sensory,and,1000000
113017,and,autonomic,1000000
113018,autonomic,neuropathy,1000000
113019,neuropathy,type,1000000


In [38]:
# Append the MONDO bigrams (bigram_df) to the existing space-separated bigram
# frequency dictionary and write the combined table to a new file.
# keep_default_na=False: by default read_csv converts tokens such as "null",
# "nan" or "NA" to NaN, which would then be written back as empty fields and
# corrupt those dictionary entries.
bigrams = pd.read_csv('data/frequency_bigramdictionary_en_243_342.txt', sep=" ", header=None,
                      keep_default_na=False)
print(bigrams.shape)
all_bigrams = pd.concat([bigrams, bigram_df])

# No header row and no index column in the output file.
all_bigrams.to_csv('data/frequency_bigramdictionary_with_mondo.txt', sep=" ", header=False, index=False)

(242342, 3)


In [46]:
# Tokenize every MONDO term into single tokens, give each token a constant
# value in column 1, and append them to the existing space-separated unigram
# frequency dictionary, writing the combined table to a new file.
# keep_default_na=False on both reads: by default read_csv converts literal
# tokens such as "null", "nan" or "NA" to NaN, which would turn a term into
# the bogus token 'nan' or blank out a word in the frequency dictionary.
mondo_terms = pd.read_csv('data/mondo_vocabulary.csv', keep_default_na=False)

tokenized_terms = []
for term in mondo_terms['TERM']:
    # ngrams(..., 1) yields 1-tuples, so the frame below gets the token in column 0.
    tokenized_terms.extend(ngrams(word_tokenize(str(term)), 1))

mondo_terms = pd.DataFrame(tokenized_terms)
dic_terms = pd.read_csv('data/frequency_dictionary_en_82_765.txt', sep=" ", header=None,
                        keep_default_na=False)
mondo_terms[1] = 100000000
all_terms = pd.concat([dic_terms, mondo_terms])

# No header row and no index column in the output file.
all_terms.to_csv('data/frequency_dictionary_with_mondo.txt', sep=" ", header=False, index=False)