### Wordnet Datset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import json

In [2]:
wordnet_train_df = pd.read_json('./datasets/WN18RR/wordnet_train.json')
wordnet_train_df = wordnet_train_df.rename(columns={"ID": "original-entity", "term": "entity", "type": "label"})
print("All classes")
print(wordnet_train_df["label"].unique())

All classes
['noun' 'verb' 'adjective' 'adverb']


In [3]:
wordnet_train_df

Unnamed: 0,original-entity,entity,label,sentence
0,__land_reform_NN_1,land reform,noun,
1,__cover_VB_1,cover,verb,cover her face with a handkerchief
2,__phytology_NN_1,phytology,noun,
3,__kamet_NN_1,kamet,noun,
4,__question_NN_1,question,noun,there was a question about my training
...,...,...,...,...
40554,__silver_salmon_NN_1,silver salmon,noun,
40555,__predictability_NN_1,predictability,noun,
40556,__achillea_NN_1,achillea,noun,
40557,__private_property_NN_1,private property,noun,


In [4]:
train, test = train_test_split(wordnet_train_df, train_size=0.9, random_state=42)

In [32]:
#print(train.info(memory_usage='deep'))
#print(test.sample)

In [12]:
train = train.to_dict(orient='records')
test = test.to_dict(orient='records')

data = {
    "train": train,
    "test": test
}

#print(data)
json_data = json.dumps(data, indent=4)

In [13]:
with open('./datasets/WN18RR/wn18rr_entities.json', 'w') as file:
    file.write(json_data)

### Geonames Dataset

In [None]:
{
    "index": 0,
    "name": "Roc Meler",
    "asciname": "Roc Meler",
    "country_code": "AD",
    "country_name": "Andorra",
    "level1": "T",
    "level2": "PK",
    "type-label": "T.PK",
    "type-name": "peak",
    "status": "train"
},

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
import json
from tqdm.notebook import tqdm_notebook

In [16]:
tqdm_notebook.pandas()

In [36]:
allCoutries = pd.read_csv('./datasets/Geonames/processed-3/allCountries.csv')
country_mapper = pd.read_csv('./assets/CountryCodes/country_codes.csv')
feature_mapper = pd.read_csv('./datasets/Geonames/processed-3/featureCodes_en.csv')

In [37]:
feature_mapper

Unnamed: 0,feature-code,name,description
0,A.ADM1,first-order administrative division,a primary administrative division of a country...
1,A.ADM1H,historical first-order administrative division,a former first-order administrative division
2,A.ADM2,second-order administrative division,a subdivision of a first-order administrative ...
3,A.ADM2H,historical second-order administrative division,a former second-order administrative division
4,A.ADM3,third-order administrative division,a subdivision of a second-order administrative...
...,...,...,...
675,V.SCRB,scrubland,"an area of low trees, bushes, and shrubs stunt..."
676,V.TREE,tree(s),a conspicuous tree used as a landmark
677,V.TUND,tundra,"a marshy, treeless, high latitude plain, domin..."
678,V.VIN,vineyard,a planting of grapevines


In [38]:
allCoutries

Unnamed: 0,name,asciiname,country-code,level-1,level-2
0,Roc Meler,Roc Meler,AD,T,PK
1,Pic de les Abelletes,Pic de les Abelletes,AD,T,PK
2,Estany de les Abelletes,Estany de les Abelletes,AD,H,LK
3,Port Vieux de la Coume d’Ose,Port Vieux de la Coume d'Ose,AD,T,PASS
4,Port de la Cabanette,Port de la Cabanette,AD,T,PASS
...,...,...,...,...,...
9001159,Hebi Seamounts,Hebi Seamounts,,U,SMSU
9001160,Okkonen Canyon,Okkonen Canyon,,U,CNYU
9001161,G5 Sahel,G5 Sahel,,L,RGN
9001162,Long Bay,Long Bay,,H,BAY


In [39]:
train_df = pd.read_json('./datasets/Geonames/geonames_train.json')
train_df = train_df.rename(columns={"term": "name", "type": "type-name"})

In [40]:
train_df

Unnamed: 0,ID,name,type-name
0,0,Pic de Font Blanca,peak
1,1,Roc Mele,mountain
2,2,Pic des Langounelles,peak
3,3,Pic de les Abelletes,peak
4,4,Estany de les Abelletes,lake
...,...,...,...
8078860,8781370,Yuqing Seamount,seamount
8078861,8781371,Zelia Gattai Seamount,seamount
8078862,8781372,Zhenghe Ridge,ridge
8078863,8781373,Akkadian Empire,historical first-order administrative division


In [41]:
agg_func = {
    'country-code': 'first',
    'level-1': 'first',
    'level-2': 'first'
}

allCoutries_agg = allCoutries.groupby('name').agg(agg_func).reset_index()

merged_df = pd.merge(train_df, allCoutries_agg, how='left', left_on='name', right_on='name')

merged_df['combined'] = merged_df['level-1'].astype(str) + '.' + merged_df['level-2'].astype(str)
merged_df = merged_df.fillna({"country-code": "not identified country"})

In [42]:
merged_df

Unnamed: 0,ID,name,type-name,country-code,level-1,level-2,combined
0,0,Pic de Font Blanca,peak,FR,T,PK,T.PK
1,1,Roc Mele,mountain,not identified country,,,nan.nan
2,2,Pic des Langounelles,peak,FR,T,PK,T.PK
3,3,Pic de les Abelletes,peak,AD,T,PK,T.PK
4,4,Estany de les Abelletes,lake,AD,H,LK,H.LK
...,...,...,...,...,...,...,...
8078860,8781370,Yuqing Seamount,seamount,not identified country,U,SMU,U.SMU
8078861,8781371,Zelia Gattai Seamount,seamount,not identified country,,,nan.nan
8078862,8781372,Zhenghe Ridge,ridge,not identified country,U,RDGU,U.RDGU
8078863,8781373,Akkadian Empire,historical first-order administrative division,not identified country,A,ADM1H,A.ADM1H


In [43]:
na_df = merged_df[merged_df.isna().any(axis=1)]
merged_df = merged_df.dropna()

In [44]:
na_df = na_df.drop(columns=['level-1', 'level-2', 'combined'])

In [46]:
na_df

Unnamed: 0,ID,name,type-name,country-code
1,1,Roc Mele,mountain,not identified country
5,5,Port Vieux de la Coume d'Ose,pass,not identified country
23,26,Basers de Vicenc,cliff(s),not identified country
31,35,Sola Vell,slope(s),not identified country
53,59,Cami de la Vall del Riu,trail,not identified country
...,...,...,...,...
8078827,8781334,Putoto Seamount,seamount,not identified country
8078834,8781342,Sao Francisco Canyon,canyon,not identified country
8078850,8781359,Uruti Basin,basin,not identified country
8078861,8781371,Zelia Gattai Seamount,seamount,not identified country


In [47]:
agg_func = {
    'feature-code': 'first',
}

feature_mapper_agg = feature_mapper.groupby('name').agg(agg_func).reset_index()
feature_mapper_agg

Unnamed: 0,name,feature-code
0,State Exam Prep Centre,S.SECP
1,abandoned airfield,S.AIRQ
2,abandoned camp,S.CMPQ
3,abandoned canal,H.CNLQ
4,abandoned factory,S.MFGQ
...,...,...
659,wildlife reserve,L.RESW
660,windmill,S.MLWND
661,wreck,S.WRCK
662,zone,A.ZN


In [48]:
na_df = na_df.rename(columns={"name": "label"})

In [49]:
na_df = pd.merge(na_df, feature_mapper_agg, how='left', left_on='type-name', right_on='name')

In [50]:
na_df

Unnamed: 0,ID,label,type-name,country-code,name,feature-code
0,1,Roc Mele,mountain,not identified country,mountain,T.MT
1,5,Port Vieux de la Coume d'Ose,pass,not identified country,pass,T.PASS
2,26,Basers de Vicenc,cliff(s),not identified country,cliff(s),T.CLF
3,35,Sola Vell,slope(s),not identified country,slope(s),T.SLP
4,59,Cami de la Vall del Riu,trail,not identified country,trail,R.TRL
...,...,...,...,...,...,...
1699951,8781334,Putoto Seamount,seamount,not identified country,seamount,U.SMU
1699952,8781342,Sao Francisco Canyon,canyon,not identified country,canyon,T.CNYN
1699953,8781359,Uruti Basin,basin,not identified country,basin,U.BSNU
1699954,8781371,Zelia Gattai Seamount,seamount,not identified country,seamount,U.SMU


In [51]:
na_df["level-1"], na_df["level-2"] = na_df.progress_apply(lambda x: x["feature-code"].split(".")[0], axis = 1), na_df.progress_apply(lambda x: x["feature-code"].split(".")[1], axis = 1)
na_df['combined'] = na_df['level-1'].astype(str) + '.' + na_df['level-2'].astype(str)

  0%|          | 0/1699956 [00:00<?, ?it/s]

  0%|          | 0/1699956 [00:00<?, ?it/s]

In [52]:
na_df

Unnamed: 0,ID,label,type-name,country-code,name,feature-code,level1,level2
0,1,Roc Mele,mountain,not identified country,mountain,T.MT,T,MT
1,5,Port Vieux de la Coume d'Ose,pass,not identified country,pass,T.PASS,T,PASS
2,26,Basers de Vicenc,cliff(s),not identified country,cliff(s),T.CLF,T,CLF
3,35,Sola Vell,slope(s),not identified country,slope(s),T.SLP,T,SLP
4,59,Cami de la Vall del Riu,trail,not identified country,trail,R.TRL,R,TRL
...,...,...,...,...,...,...,...,...
1699951,8781334,Putoto Seamount,seamount,not identified country,seamount,U.SMU,U,SMU
1699952,8781342,Sao Francisco Canyon,canyon,not identified country,canyon,T.CNYN,T,CNYN
1699953,8781359,Uruti Basin,basin,not identified country,basin,U.BSNU,U,BSNU
1699954,8781371,Zelia Gattai Seamount,seamount,not identified country,seamount,U.SMU,U,SMU


In [53]:
merged_df = pd.concat([merged_df, na_df])

In [54]:
merged_df

Unnamed: 0,ID,name,type-name,country-code,level-1,level-2,combined,label,feature-code,level1,level2
0,0,Pic de Font Blanca,peak,FR,T,PK,T.PK,,,,
2,2,Pic des Langounelles,peak,FR,T,PK,T.PK,,,,
3,3,Pic de les Abelletes,peak,AD,T,PK,T.PK,,,,
4,4,Estany de les Abelletes,lake,AD,H,LK,H.LK,,,,
6,6,Port de la Cabanette,pass,AD,T,PASS,T.PASS,,,,
...,...,...,...,...,...,...,...,...,...,...,...
1699951,8781334,seamount,seamount,not identified country,,,,Putoto Seamount,U.SMU,U,SMU
1699952,8781342,canyon,canyon,not identified country,,,,Sao Francisco Canyon,T.CNYN,T,CNYN
1699953,8781359,basin,basin,not identified country,,,,Uruti Basin,U.BSNU,U,BSNU
1699954,8781371,seamount,seamount,not identified country,,,,Zelia Gattai Seamount,U.SMU,U,SMU


In [None]:
train, test = train_test_split(train_df, train_size=0.99, random_state=42)

In [None]:
train

In [None]:
test

In [17]:
train = train.to_dict(orient='records')
test = test.to_dict(orient='records')

data = {
    "train": train,
    "test": test
}

#print(data)
json_data = json.dumps(data, indent=4)

In [18]:
with open('./datasets/Geonames/geonames_entities.json', 'w') as file:
    file.write(json_data)

In [12]:
train_df = pd.read_json('./datasets/Geonames/geonames_entities.json')
train_df.head()

Unnamed: 0,ID,term,type
0,0,Pic de Font Blanca,peak
1,1,Roc Mele,mountain
2,2,Pic des Langounelles,peak
3,3,Pic de les Abelletes,peak
4,4,Estany de les Abelletes,lake


In [20]:
print(len(train_df['type'].unique()))

660


In [17]:
print(len(train_df['type'].unique()))
print(train_df.count())

660
ID      8078865
term    8078781
type    8078865
dtype: int64


### Medcin

In [66]:
import pandas as pd
from sklearn.model_selection import train_test_split
import json

In [67]:
f = open('./datasets-old/UMLS/label_mapper.json')
label_mapper = json.load(f)

In [68]:
train_df = pd.read_json('./datasets/UMLS/medcin_train.json')
train_df.head()

Unnamed: 0,ID,term,type
0,C0000294-0,mesna (medication),"[organic chemical, pharmacologic substance]"
1,C0000473-1,aminobenzoic acid (medication),"[organic chemical, pharmacologic substance, vi..."
2,C0000477-2,CNS stimulants dalfampridine,"[organic chemical, pharmacologic substance]"
3,C0000618-4,6-mercaptopurine (medication),"[nucleic acid, nucleoside, or nucleotide, phar..."
4,C0000727-5,acute abdomen (diagnosis),[sign or symptom]


In [69]:
def get_label_str(row):
    labels = []
    for label in label_mapper:
        for data in row["type"]:
            # print(data)
            mapper =  [x.lower() for x in label_mapper[label]]
            if data.lower() in mapper:
        # print(label_mapper[label])
                labels.append(label)
    return list(set(labels))

In [70]:
train_df["label-str"] = train_df.apply(get_label_str, axis = 1)

In [71]:
train_df

Unnamed: 0,ID,term,type,label-str
0,C0000294-0,mesna (medication),"[organic chemical, pharmacologic substance]","[A1.4.1.1.1, A1.4.1.2.1]"
1,C0000473-1,aminobenzoic acid (medication),"[organic chemical, pharmacologic substance, vi...","[A1.4.1.1.1, A1.4.1.1.3.4, A1.4.1.2.1]"
2,C0000477-2,CNS stimulants dalfampridine,"[organic chemical, pharmacologic substance]","[A1.4.1.1.1, A1.4.1.2.1]"
3,C0000618-4,6-mercaptopurine (medication),"[nucleic acid, nucleoside, or nucleotide, phar...","[A1.4.1.1.1, A1.4.1.2.1.5]"
4,C0000727-5,acute abdomen (diagnosis),[sign or symptom],[A2.2.2]
...,...,...,...,...
277023,C5685946-346280,WABC receptive testing standard score,[finding],[A2.2]
277024,C5685947-346281,xenograft implantation into articular surface ...,[therapeutic or preventive procedure],[B1.3.1.3]
277025,C5685948-346282,"x-ray of upper GI tract, including Scout abdom...",[diagnostic procedure],[B1.3.1.2]
277026,C5685949-346283,"x-ray of upper GI tract, including Scout abdom...",[diagnostic procedure],[B1.3.1.2]


In [72]:
train_df = train_df.rename(columns={"term": "concept"})

In [74]:
train_df

Unnamed: 0,ID,concept,type,label-str
0,C0000294-0,mesna (medication),"[organic chemical, pharmacologic substance]","[A1.4.1.1.1, A1.4.1.2.1]"
1,C0000473-1,aminobenzoic acid (medication),"[organic chemical, pharmacologic substance, vi...","[A1.4.1.1.1, A1.4.1.1.3.4, A1.4.1.2.1]"
2,C0000477-2,CNS stimulants dalfampridine,"[organic chemical, pharmacologic substance]","[A1.4.1.1.1, A1.4.1.2.1]"
3,C0000618-4,6-mercaptopurine (medication),"[nucleic acid, nucleoside, or nucleotide, phar...","[A1.4.1.1.1, A1.4.1.2.1.5]"
4,C0000727-5,acute abdomen (diagnosis),[sign or symptom],[A2.2.2]
...,...,...,...,...
277023,C5685946-346280,WABC receptive testing standard score,[finding],[A2.2]
277024,C5685947-346281,xenograft implantation into articular surface ...,[therapeutic or preventive procedure],[B1.3.1.3]
277025,C5685948-346282,"x-ray of upper GI tract, including Scout abdom...",[diagnostic procedure],[B1.3.1.2]
277026,C5685949-346283,"x-ray of upper GI tract, including Scout abdom...",[diagnostic procedure],[B1.3.1.2]


In [75]:
train, test = train_test_split(train_df, train_size=0.9, random_state=42)

In [76]:
train["status"] = "train"
test["status"] = "test"

In [77]:
df_concat = pd.concat([train, test])

In [78]:
df_concat

Unnamed: 0,ID,concept,type,label-str,status
101192,C2106775-126398,complete retinal detachment (physical finding),[finding],[A2.2],train
102907,C2108916-128503,knee joint pain only with jumping (symptom),[sign or symptom],[A2.2.2],train
40962,C2030211-51205,heat therapy of posterior surface of right elb...,[therapeutic or preventive procedure],[B1.3.1.3],train
61650,C2056385-77052,tenderness on palpation of dorsal aspect of di...,[finding],[A2.2],train
57447,C2051083-71822,patch on right antihelix (physical finding),[finding],[A2.2],train
...,...,...,...,...,...
156984,C2177051-196045,muscle spasm of infraspinatus region of left s...,[finding],[A2.2],test
163582,C2185389-204301,long-term: overhead reaching with supervision,[therapeutic or preventive procedure],[B1.3.1.3],test
79487,C2079273-99358,iris bombe of left eye (diagnosis),[disease or syndrome],[B2.2.1.2.1],test
191730,C2221120-239617,puncture of finger by knife (physical finding),[finding],[A2.2],test


In [83]:
data = df_concat.to_dict(orient='records')
json_data = json.dumps(data, indent=4)

In [84]:
with open('./datasets/UMLS/medcin_entities.json', 'w') as file:
    file.write(json_data)

### NCI

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import json

In [6]:
f = open('./datasets-old/UMLS/label_mapper.json')
label_mapper = json.load(f)

In [7]:
train_df = pd.read_json('./datasets/UMLS/nci_train.json')
train_df.head()

Unnamed: 0,ID,term,type
0,C0000325-1,"1,2-Dihydro-3-methyl-benz(j)aceanthrylene","[organic chemical, hazardous or poisonous subs..."
1,C0000598-2,"5-(o-Chlorobenzyl)-4,5,6,7-tetrahydrothieno(3,...","[organic chemical, pharmacologic substance]"
2,C0000699-3,Antibiotic A-23187,"[organic chemical, antibiotic]"
3,C0000742-4,A-MuLV,[virus]
4,C0000768-5,Congenital Anomaly or Birth Defect,[congenital abnormality]


In [8]:
def get_label_str(row):
    labels = []
    for label in label_mapper:
        for data in row["type"]:
            # print(data)
            mapper =  [x.lower() for x in label_mapper[label]]
            if data.lower() in mapper:
        # print(label_mapper[label])
                labels.append(label)
    return list(set(labels))

In [9]:
train_df["label-str"] = train_df.apply(get_label_str, axis = 1)

In [None]:
train_df

Unnamed: 0,ID,term,type,label-str
0,C0000294-0,mesna (medication),"[organic chemical, pharmacologic substance]","[A1.4.1.1.1, A1.4.1.2.1]"
1,C0000473-1,aminobenzoic acid (medication),"[organic chemical, pharmacologic substance, vi...","[A1.4.1.1.1, A1.4.1.1.3.4, A1.4.1.2.1]"
2,C0000477-2,CNS stimulants dalfampridine,"[organic chemical, pharmacologic substance]","[A1.4.1.1.1, A1.4.1.2.1]"
3,C0000618-4,6-mercaptopurine (medication),"[nucleic acid, nucleoside, or nucleotide, phar...","[A1.4.1.1.1, A1.4.1.2.1.5]"
4,C0000727-5,acute abdomen (diagnosis),[sign or symptom],[A2.2.2]
...,...,...,...,...
277023,C5685946-346280,WABC receptive testing standard score,[finding],[A2.2]
277024,C5685947-346281,xenograft implantation into articular surface ...,[therapeutic or preventive procedure],[B1.3.1.3]
277025,C5685948-346282,"x-ray of upper GI tract, including Scout abdom...",[diagnostic procedure],[B1.3.1.2]
277026,C5685949-346283,"x-ray of upper GI tract, including Scout abdom...",[diagnostic procedure],[B1.3.1.2]


In [10]:
train_df = train_df.rename(columns={"term": "concept"})

In [11]:
train_df

Unnamed: 0,ID,concept,type,label-str
0,C0000325-1,"1,2-Dihydro-3-methyl-benz(j)aceanthrylene","[organic chemical, hazardous or poisonous subs...","[A1.4.1.2.1, A1.4.1.1.5]"
1,C0000598-2,"5-(o-Chlorobenzyl)-4,5,6,7-tetrahydrothieno(3,...","[organic chemical, pharmacologic substance]","[A1.4.1.2.1, A1.4.1.1.1]"
2,C0000699-3,Antibiotic A-23187,"[organic chemical, antibiotic]","[A1.4.1.2.1, A1.4.1.1.1.1]"
3,C0000742-4,A-MuLV,[virus],"[A1.1.4, A1.1.2]"
4,C0000768-5,Congenital Anomaly or Birth Defect,[congenital abnormality],[A1.2.2.1]
...,...,...,...,...
96172,C5671067-120216,AR-v7 Measurement,[laboratory procedure],[B1.3.1.1]
96173,C5671068-120217,Epithelium Attenuation,[finding],[A2.2]
96174,C5671069-120218,Gene Level Copy Number Scores,[intellectual product],[A2.4]
96175,C5671070-120219,RPMA,[molecular biology research technique],[B1.3.2.1]


In [12]:
train, test = train_test_split(train_df, train_size=0.9, random_state=42)

In [13]:
train["status"] = "train"
test["status"] = "test"

In [14]:
df_concat = pd.concat([train, test])

In [15]:
df_concat

Unnamed: 0,ID,concept,type,label-str,status
3627,C0281303-4547,CBDCA/CDDP/TAX,[therapeutic or preventive procedure],[B1.3.1.3],train
16408,C1518157-20527,Major Groove,"[nucleic acid, nucleoside, or nucleotide, biol...","[A1.4.1.2.1.5, A1.4.1.1.3]",train
13955,C1514579-17444,Proximal Connecting Fiber,[cell component],[A1.2.3.4],train
11931,C1511535-14915,Cost Effective Management,[occupation or discipline],[A2.6],train
65005,C4331981-81106,Head and Neck Cancer Clinical Regional Lymph N...,[finding],[A2.2],train
...,...,...,...,...,...
47325,C3639894-59167,COMM - Seriously Thought of Hurting Self,[intellectual product],[A2.4],test
89136,C5447127-111370,Time Amount Answer,[intellectual product],[A2.4],test
57481,C3899909-71812,"2-Methoxy-5-(((2-(2,4,6-trimethoxyphenyl)ethen...","[organic chemical, pharmacologic substance]","[A1.4.1.2.1, A1.4.1.1.1]",test
51709,C3827852-64653,TDI - Change in Magnitude of Effort,[intellectual product],[A2.4],test


In [16]:
data = df_concat.to_dict(orient='records')
json_data = json.dumps(data, indent=4)

In [17]:
with open('./datasets/UMLS/nci_entities.json', 'w') as file:
    file.write(json_data)

### Snomedct

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
import json

In [19]:
f = open('./datasets-old/UMLS/label_mapper.json')
label_mapper = json.load(f)

In [20]:
train_df = pd.read_json('./datasets/UMLS/snomedct_us_train.json')
train_df.head()

Unnamed: 0,ID,term,type
0,C0000167-0,17-Oxosteroids (substance),"[organic chemical, hormone]"
1,C0000172-1,18-Hydroxycorticosterone (substance),"[organic chemical, pharmacologic substance]"
2,C0000714-2,Temephos (substance),"[organic chemical, hazardous or poisonous subs..."
3,C0000726-4,Abdominal structure (body structure),[body location or region]
4,C0000729-5,[D]Abdominal cramps (situation),[sign or symptom]


In [21]:
def get_label_str(row):
    labels = []
    for label in label_mapper:
        for data in row["type"]:
            # print(data)
            mapper =  [x.lower() for x in label_mapper[label]]
            if data.lower() in mapper:
        # print(label_mapper[label])
                labels.append(label)
    return list(set(labels))

In [22]:
train_df["label-str"] = train_df.apply(get_label_str, axis = 1)

In [23]:
train_df

Unnamed: 0,ID,term,type,label-str
0,C0000167-0,17-Oxosteroids (substance),"[organic chemical, hormone]","[A1.4.1.2.1, A1.4.1.1.3.2]"
1,C0000172-1,18-Hydroxycorticosterone (substance),"[organic chemical, pharmacologic substance]","[A1.4.1.2.1, A1.4.1.1.1]"
2,C0000714-2,Temephos (substance),"[organic chemical, hazardous or poisonous subs...","[A1.4.1.2.1, A1.4.1.1.5]"
3,C0000726-4,Abdominal structure (body structure),[body location or region],[A2.1.5.2]
4,C0000729-5,[D]Abdominal cramps (situation),[sign or symptom],[A2.2.2]
...,...,...,...,...
278369,C5699985-347962,Evacuated blood collection tube with silicone-...,[medical device],[A1.3.1]
278370,C5699986-347963,Evacuated blood collection tube with K2EDTA (d...,[medical device],[A1.3.1]
278371,C5699987-347964,Evacuated blood collection tube with heparin s...,[medical device],[A1.3.1]
278372,C5699988-347965,MAKOplasty of joint (procedure),[therapeutic or preventive procedure],[B1.3.1.3]


In [24]:
train_df = train_df.rename(columns={"term": "concept"})

In [25]:
train_df

Unnamed: 0,ID,concept,type,label-str
0,C0000167-0,17-Oxosteroids (substance),"[organic chemical, hormone]","[A1.4.1.2.1, A1.4.1.1.3.2]"
1,C0000172-1,18-Hydroxycorticosterone (substance),"[organic chemical, pharmacologic substance]","[A1.4.1.2.1, A1.4.1.1.1]"
2,C0000714-2,Temephos (substance),"[organic chemical, hazardous or poisonous subs...","[A1.4.1.2.1, A1.4.1.1.5]"
3,C0000726-4,Abdominal structure (body structure),[body location or region],[A2.1.5.2]
4,C0000729-5,[D]Abdominal cramps (situation),[sign or symptom],[A2.2.2]
...,...,...,...,...
278369,C5699985-347962,Evacuated blood collection tube with silicone-...,[medical device],[A1.3.1]
278370,C5699986-347963,Evacuated blood collection tube with K2EDTA (d...,[medical device],[A1.3.1]
278371,C5699987-347964,Evacuated blood collection tube with heparin s...,[medical device],[A1.3.1]
278372,C5699988-347965,MAKOplasty of joint (procedure),[therapeutic or preventive procedure],[B1.3.1.3]


In [26]:
train, test = train_test_split(train_df, train_size=0.9, random_state=42)

In [27]:
train["status"] = "train"
test["status"] = "test"

In [28]:
df_concat = pd.concat([train, test])

In [29]:
df_concat

Unnamed: 0,ID,concept,type,label-str,status
146825,C0730100-183371,Metatarsophalangeal joint synovium of lesser t...,"[body part, organ, or organ component]",[A1.2.3.1],train
63278,C0395229-79073,Protective suture of eyelid NOS (procedure),[therapeutic or preventive procedure],[B1.3.1.3],train
153326,C1068497-191504,Chromohalobacter canadensis (organism),[bacterium],[A1.1.2],train
100285,C0450885-125356,Spinal acupuncture T10 (body structure),[body location or region],[A2.1.5.2],train
40081,C0313179-50098,Blood group antibody Hut (substance),"[amino acid, peptide, or protein, immunologic ...","[A1.4.1.2.1.7, A1.4.1.1.3.5]",train
...,...,...,...,...,...
254924,C4750618-318577,Otitis externa of right external auditory cana...,[disease or syndrome],[B2.2.1.2.1],test
10671,C0185767-13277,Closed reduction of fracture of alveolar proce...,[therapeutic or preventive procedure],[B1.3.1.3],test
55463,C0341118-69324,Eosinophilic ulcer of esophagus (disorder),[disease or syndrome],[B2.2.1.2.1],test
81734,C0423042-102284,Divergence paralysis (disorder),[sign or symptom],[A2.2.2],test


In [30]:
data = df_concat.to_dict(orient='records')
json_data = json.dumps(data, indent=4)

In [31]:
with open('./datasets/UMLS/snomedct_us_entities.json', 'w') as file:
    file.write(json_data)

### Biological

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import json

In [2]:
train_df = pd.read_json('./datasets/GO/biological_process_train.json')
train_df.head()

Unnamed: 0,ID,term,type
0,0,Splicing factor 3a subunit 2,"[positive regulation of gene expression, mRNA ..."
1,1,im:7158514,[biological_process]
2,2,Glycerophosphodiester phosphodiesterase domain...,"[lipid metabolic process, positive regulation ..."
3,3,JUMONJI 13,"[regulation of DNA-templated transcription, ne..."
4,4,TLC domain-containing protein,"[retina development in camera-type eye, cerami..."


In [3]:
train, test = train_test_split(train_df, train_size=0.9, random_state=42)

In [4]:
train["status"] = "train"
test["status"] = "test"

In [5]:
df_concat = pd.concat([train, test])

In [6]:
df_concat

Unnamed: 0,ID,term,type,status
177019,177019,PITSLRE serine/threonine-protein kinase CDC2L1,[biological_process],train
145952,145952,"RL12, ribosomal protein 11 60S large ribosomal...",[translation],train
126132,126132,EMB2555,[biological_process],train
159038,159038,GTP-binding protein ryh1,"[retrograde vesicle-mediated transport, Golgi ...",train
43895,43895,olfactory receptor family 2 subfamily Y member 1G,"[sensory perception of smell, detection of che...",train
...,...,...,...,...
185552,185552,At3g46020,"[positive regulation of mRNA splicing, via spl...",test
170628,170628,Frizzled/smoothened-like sans CRD protein H,[biological_process],test
52950,52950,"gamma-aminobutyric acid (GABA) A receptor, alp...","[monoatomic ion transport, chloride transport,...",test
14107,14107,NT-3 growth factor receptor,"[phosphorylation, transmembrane receptor prote...",test


In [7]:
data = df_concat.to_dict(orient='records')
json_data = json.dumps(data, indent=4)

In [8]:
with open('./datasets/GO/biological_entities.json', 'w') as file:
    file.write(json_data)

### Cellular

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
import json

In [10]:
train_df = pd.read_json('./datasets/GO/cellular_component_train.json')
train_df.head()

Unnamed: 0,ID,term,type
0,0,neurofilament medium chain b,"[axon, cytoplasm]"
1,1,Cytochrome c mitochondrial import factor,[mitochondrion]
2,2,LIM domain-containing protein 2,"[actin cytoskeleton, cytosol, plasma membrane,..."
3,3,Large ribosomal subunit protein bL32m,"[mitochondrion, mitochondrial large ribosomal ..."
4,4,Fungal STAND N-terminal Goodbye domain-contain...,[cellular_component]


In [11]:
train, test = train_test_split(train_df, train_size=0.9, random_state=42)

In [12]:
train["status"] = "train"
test["status"] = "test"

In [13]:
df_concat = pd.concat([train, test])

In [14]:
df_concat

Unnamed: 0,ID,term,type,status
137635,137635,BnaC09g41740D protein,[nucleoplasm],train
147198,147198,tubulin-folding cofactor E-like,[cytoplasm],train
29637,29637,mediator complex subunit 10 L homeolog,[nucleus],train
109327,109327,YALI0D27170p,[cytosol],train
164569,164569,Canis lupus familiaris (dog) U6 spliceosomal R...,"[U4/U6 x U5 tri-snRNP complex, U6 snRNP]",train
...,...,...,...,...
195343,195343,olfactory receptor family 8 subfamily G member...,[plasma membrane],test
12142,12142,Zinc finger MYND domain-containing protein 11 ...,"[nucleus, nucleoplasm]",test
116244,116244,cyclophilin family peptidyl-prolyl cis-trans i...,"[cytosol, nucleus, cytoplasm]",test
14194,14194,Transport and Golgi organization 14,"[endoplasmic reticulum membrane, endoplasmic r...",test


In [15]:
data = df_concat.to_dict(orient='records')
json_data = json.dumps(data, indent=4)

In [16]:
with open('./datasets/GO/cellular_entities.json', 'w') as file:
    file.write(json_data)

# Molecular

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
import json

In [18]:
train_df = pd.read_json('./datasets/GO/molecular_function_train.json')
train_df.head()

Unnamed: 0,ID,term,type
0,0,Transcription activator of gluconeogenesis ERT1-1,"[DNA-binding transcription factor activity, RN..."
1,1,HECT domain containing E3 ubiquitin protein li...,"[ubiquitin-protein transferase activity, ubiqu..."
2,2,SET domain containing 1A,"[protein binding, methyltransferase activity, ..."
3,3,EPH receptor B3 S homeolog,"[protein tyrosine kinase activity, protein kin..."
4,4,"cyclin dependent kinase 5, regulatory subunit 1","[calcium ion binding, kinase activity, proteas..."


In [19]:
train, test = train_test_split(train_df, train_size=0.9, random_state=42)

In [20]:
train["status"] = "train"
test["status"] = "test"

In [21]:
df_concat = pd.concat([train, test])

In [22]:
df_concat

Unnamed: 0,ID,term,type,status
191611,191611,"sodium leak channel, non-selective",[monoatomic cation channel activity],train
110865,110865,AT3G05570,[molecular_function],train
181293,181293,RIKEN cDNA 1700023A20 gene,[molecular_function],train
183151,183151,AT5G28610,[molecular_function],train
148678,148678,DSC E3 ubiquitin ligase complex subunit 1,[ubiquitin protein ligase activity],train
...,...,...,...,...
147756,147756,EMB1827,[molecular_function],test
189204,189204,BnaAnng07540D protein,[GTP binding],test
191971,191971,Tubulin beta-7 chain,[structural constituent of cytoskeleton],test
144880,144880,olfactory receptor family 2 subfamily A member 54,[olfactory receptor activity],test


In [23]:
data = df_concat.to_dict(orient='records')
json_data = json.dumps(data, indent=4)

In [24]:
with open('./datasets/GO/molecular_entities.json', 'w') as file:
    file.write(json_data)