In [1]:
import json
import xmltodict
 
def convert_xml_to_json(xml_file, xml_attribs=True, store_output = False, output_path = ''):
    '''Parse an XML file with xmltodict and return its content as JSON text.

    Parameters
    ----------
    xml_file : path of the XML file to read (opened in binary mode).
    xml_attribs : forwarded unchanged to ``xmltodict.parse``.
    store_output : when truthy (and ``output_path`` is non-empty) the JSON
        text is also written to ``output_path``.
    output_path : destination file for the JSON text; ignored when empty.

    Returns
    -------
    str
        The parsed document serialised with ``json.dumps(..., indent=4)``.
        Note this is JSON *text*, not a dict; use ``json.loads`` to decode it.
    '''
    with open(xml_file, "rb") as f:
        d = xmltodict.parse(f, xml_attribs=xml_attribs)

    # Serialise after the input handle has been released; nothing below
    # needs the XML file any more.
    output = json.dumps(d, indent=4)

    if store_output and output_path:
        # The context manager guarantees the file is flushed and closed even
        # if the write fails (the handle used to be left open).
        with open(output_path, 'w') as output_file:
            output_file.write(output)
    return output

In [57]:
# Parse the XML export and keep the resulting JSON text in `data`; because
# store_output=True a copy is also written to data/database.json.
data = convert_xml_to_json('data/full database.xml',store_output=True,output_path='data/database.json')

In [66]:
# Decode the JSON text back into nested Python dicts/lists for the steps below.
d = json.loads(data)

In [78]:
from pymongo import MongoClient

# Connect with the driver defaults and address the database/collection by key.
client = MongoClient()
db = client['drugbank']
drugs = db['drugs']

# Load every parsed drug record; re-running this cell inserts them again.
drugs.insert_many(d['drugbank']['drug'])

<pymongo.results.InsertManyResult at 0x7f11161234c8>

In [8]:
from pymongo import MongoClient

# Handles used by the rest of the notebook: the raw collection, the projected
# subset, and the flattened ("expanded") records.
client = MongoClient()
db = client['drugbank']
drugs = db['drugs']
drugs_sub = db['drugs_sub']
drugs_exp = db['drugs_exp']

In [5]:
# Count every document in the raw collection. Cursor.count() was deprecated in
# PyMongo 3.7 and removed in 4.0; count_documents() is its replacement.
drugs.count_documents({})

11033

In [135]:
def get_dict_structure(d):
    """Summarise the shape of a nested dict/list structure.

    Leaves are replaced by ``str(type(value))``. Dicts are mapped key by key.
    A list is represented by the structure of its FIRST element only, so
    heterogeneous lists are under-reported; an empty list is reported as
    ``"<class 'list'>"`` instead of raising IndexError.

    ``isinstance`` is used so dict/list subclasses (for example OrderedDict)
    are descended into rather than treated as leaves.
    """
    if isinstance(d, list):
        if not d:
            # Nothing to sample from: describe the container itself.
            return str(type(d))
        return get_dict_structure(d[0])
    if isinstance(d, dict):
        return {key: get_dict_structure(value) for key, value in d.items()}
    return str(type(d))

In [136]:
# Pretty-print the schema of the drug records. get_dict_structure() samples
# only the first element of each list, so this reflects the first drug.
print(json.dumps(get_dict_structure(d['drugbank']['drug']), indent=4))

{
    "@type": "<class 'str'>",
    "@created": "<class 'str'>",
    "@updated": "<class 'str'>",
    "drugbank-id": {
        "@primary": "<class 'str'>",
        "#text": "<class 'str'>"
    },
    "name": "<class 'str'>",
    "description": "<class 'str'>",
    "cas-number": "<class 'str'>",
    "unii": "<class 'str'>",
    "state": "<class 'str'>",
    "groups": {
        "group": "<class 'str'>"
    },
    "general-references": {
        "articles": {
            "article": {
                "pubmed-id": "<class 'str'>",
                "citation": "<class 'str'>"
            }
        },
        "textbooks": "<class 'NoneType'>",
        "links": {
            "link": {
                "title": "<class 'str'>",
                "url": "<class 'str'>"
            }
        }
    },
    "synthesis-reference": "<class 'NoneType'>",
    "indication": "<class 'str'>",
    "pharmacodynamics": "<class 'str'>",
    "mechanism-of-action": "<class 'str'>",
    "toxicity": "<class 'str'>",
 

In [81]:
# Peek at a single stored document; limit(1) keeps the output small.
list(drugs.find({}).limit(1))

[{'_id': ObjectId('5b1c904f42f3cb224d853fbd'),
  '@type': 'biotech',
  '@created': '2005-06-13',
  '@updated': '2018-03-02',
  'drugbank-id': [{'@primary': 'true', '#text': 'DB00001'},
   'BTD00024',
   'BIOD00024'],
  'name': 'Lepirudin',
  'description': 'Lepirudin is identical to natural hirudin except for substitution of leucine for isoleucine at the N-terminal end of the molecule and the absence of a sulfate group on the tyrosine at position 63. It is produced via yeast cells. Bayer ceased the production of lepirudin (Refludan) effective May 31, 2012.',
  'cas-number': '138068-37-8',
  'unii': 'Y43GF64R34',
  'state': 'liquid',
  'groups': {'group': 'approved'},
  'general-references': {'articles': {'article': [{'pubmed-id': '16244762',
      'citation': 'Smythe MA, Stephens JL, Koerber JM, Mattson JC: A comparison of lepirudin and argatroban outcomes. Clin Appl Thromb Hemost. 2005 Oct;11(4):371-4.'},
     {'pubmed-id': '16690967',
      'citation': 'Tardy B, Lecompte T, Boelhen F

In [3]:
drugs_sub = db['drugs_sub']

# Copy only the fields needed downstream into `drugs_sub`. '_id' is excluded
# from the projection, so the copies receive fresh ids on insert.
drugs_sub.insert_many(
    list(
        drugs.find(
            {},
            {
                '_id': 0,
                'drugbank-id': 1,
                'name': 1,
                'classification': 1,
                'categories': 1,
                'targets': 1,
                'drug-interactions': 1,
            },
        )
    )
)

<pymongo.results.InsertManyResult at 0x7f0a5e68af88>

In [89]:
# Peek at a single projected document to confirm which fields were kept.
list(drugs_sub.find({}).limit(1))

[{'_id': ObjectId('5b1ee19f42f3cb35b5eed2ad'),
  'drugbank-id': [{'@primary': 'true', '#text': 'DB00001'},
   'BTD00024',
   'BIOD00024'],
  'name': 'Lepirudin',
  'classification': {'description': None,
   'direct-parent': 'Peptides',
   'kingdom': 'Organic Compounds',
   'superclass': 'Organic Acids',
   'class': 'Carboxylic Acids and Derivatives',
   'subclass': 'Amino Acids, Peptides, and Analogues'},
  'categories': {'category': [{'category': 'Amino Acids, Peptides, and Proteins',
     'mesh-id': 'D000602'},
    {'category': 'Anti-coagulant', 'mesh-id': None},
    {'category': 'Anticoagulants', 'mesh-id': 'D000925'},
    {'category': 'Antithrombin Proteins', 'mesh-id': 'D058833'},
    {'category': 'Antithrombins', 'mesh-id': 'D000991'},
    {'category': 'Blood and Blood Forming Organs', 'mesh-id': None},
    {'category': 'Cardiovascular Agents', 'mesh-id': 'D002317'},
    {'category': 'Chemical Actions and Uses', 'mesh-id': 'D020164'},
    {'category': 'Enzyme Inhibitors', 'mesh-i

In [353]:
# (output field, key inside the source 'classification' sub-document)
_CLASSIFICATION_FIELDS = (
    ('parent', 'direct-parent'),
    ('kingdom', 'kingdom'),
    ('superclass', 'superclass'),
    ('class', 'class'),
    ('subclass', 'subclass'),
)


def _as_list(value):
    """Normalise a "single item or list of items" value to a list (None -> [])."""
    if value is None:
        return []
    if isinstance(value, list):
        return value
    return [value]


def _target_rows(target):
    """Return one (target_action_id, target_id, target_name, action) tuple per action."""
    target_id = target['id']
    target_name = target['name']
    actions = _as_list((target.get('actions') or {}).get('action'))
    if not actions:
        # A target without actions still yields one row, with action None and
        # the literal suffix 'None' in its composite id.
        actions = [None]
    return [
        ('{}_{}'.format(target_id, action), target_id, target_name, action)
        for action in actions
    ]


def unnest_drug_data(d):
    """Flatten one nested drug document into a single-level record.

    Parameters
    ----------
    d : dict with the keys 'drugbank-id' and 'name' and, optionally,
        'classification', 'categories', 'targets' and 'drug-interactions'.
        Nested values may be a single item or a list of items.

    Returns
    -------
    dict
        ``{}`` when the drug id/name cannot be read or the drug has no
        targets. Otherwise a dict with the keys:

        * 'drug_id', 'drug_name'
        * 'parent', 'kingdom', 'superclass', 'class', 'subclass'
          (None when the classification or the field is missing)
        * 'category': list of category names, ``[None]`` when there are none
        * 'target': list of (target_action_id, target_id, target_name,
          target_action) tuples, one per (target, action) pair
        * 'ddi': list of (drugbank-id, name, description) tuples, possibly empty
    """
    try:
        # The first 'drugbank-id' entry is expected to be the dict carrying
        # the id under '#text'.
        primary_id = _as_list(d['drugbank-id'])[0]
        new_d = {'drug_id': primary_id['#text'], 'drug_name': d['name']}
    except (KeyError, IndexError, TypeError):
        return {}

    classification = d.get('classification')
    if not isinstance(classification, dict):
        classification = {}
    for field, source_key in _CLASSIFICATION_FIELDS:
        new_d[field] = classification.get(source_key)

    categories = d.get('categories')
    if categories is None:
        category_names = []
    else:
        category_names = [entry['category'] for entry in _as_list(categories.get('category'))]
    # Keep a single None placeholder so the record always has one category.
    new_d['category'] = category_names or [None]

    targets = d.get('targets')
    target_entries = _as_list(targets.get('target')) if targets is not None else []
    if not target_entries:
        # Drugs without targets are dropped.
        return {}

    # Build each tuple from its own target so ids, names and actions can never
    # drift out of alignment (parallel lists + zip() used to mis-pair them
    # whenever a target had several actions, and dropped single actions).
    target_rows = []
    for target in target_entries:
        target_rows.extend(_target_rows(target))
    new_d['target'] = target_rows

    interactions = d.get('drug-interactions') or {}
    new_d['ddi'] = [
        (interaction['drugbank-id'], interaction['name'], interaction['description'])
        for interaction in _as_list(interactions.get('drug-interaction'))
    ]

    return new_d

In [354]:
# Empty the collection so the load that follows starts from a clean slate,
# then confirm it is empty. Cursor.count() was removed in PyMongo 4.0, so the
# count uses count_documents() instead.
drugs_exp.delete_many({})
drugs_exp.count_documents({})

0

In [355]:
drugs_exp = db.drugs_exp

# unnest_drug_data() returns an empty dict for documents it rejects; skip
# those instead of storing empty documents, and insert the rest in one batch
# rather than one round-trip per document.
flattened_records = [
    record
    for record in map(unnest_drug_data, drugs_sub.find({}))
    if record
]
if flattened_records:
    # insert_many() rejects an empty list, hence the guard.
    drugs_exp.insert_many(flattened_records)

In [358]:
# Count the flattened records that carry a drug_id. Cursor.count() was removed
# in PyMongo 4.0; count_documents() takes the same filter.
drugs_exp.count_documents({'drug_id':{'$ne':None}})

7246

In [362]:
# Show the field names of one flattened record that has a drug_id
# ('_id' is projected out).
list(drugs_exp.find({'drug_id':{'$ne':None}},{'_id':0}).limit(1))[0].keys()

dict_keys(['drug_id', 'drug_name', 'parent', 'kingdom', 'superclass', 'class', 'subclass', 'category', 'target', 'ddi'])

In [None]:
# aggregate() returns a one-shot cursor: once insert_many() has iterated it,
# any later use of `extracted` would see nothing. Materialise the result so
# it can be both stored and reused.
# NOTE(review): three chained $unwind stages multiply rows per drug
# (categories x targets x interactions); confirm the result fits in memory.
extracted = list(drugs_exp.aggregate([
    {'$project': {'_id': 0}},
    {'$unwind': '$category'},
    {'$unwind': '$target'},
    {'$unwind': '$ddi'},
]))

drugs_unwd = db.drugs_unwd
if extracted:
    # insert_many() adds an '_id' to every dict it is handed; insert shallow
    # copies so the rows kept in `extracted` stay free of that extra field.
    drugs_unwd.insert_many([dict(row) for row in extracted])

In [None]:
import pandas as pd
# `extracted` has to be re-iterable here (e.g. a list): an aggregation cursor
# that was already consumed by an earlier insert would yield an empty frame.
df = pd.DataFrame(list(extracted))
df.head()

In [None]:
# Column dtypes, non-null counts and memory footprint of the unwound frame.
df.info()

In [None]:
# Split every `target` entry into its four positional fields:
# [0] target_action_id, [1] target_id, [2] target_name, [3] target_action.
list_target_action_ids = []
list_target_ids = []
list_target_names = []
list_target_actions = []
for target_row in df['target']:
    list_target_action_ids.append(target_row[0])
    list_target_ids.append(target_row[1])
    list_target_names.append(target_row[2])
    list_target_actions.append(target_row[3])

In [None]:
# One row per unwound record, with the target fields as separate columns.
df_targets = pd.DataFrame(dict(
    target_action_id=list_target_action_ids,
    target_id=list_target_ids,
    target_name=list_target_names,
    target_action=list_target_actions,
))

# Number of distinct actions recorded for each target, largest first.
(
    df_targets
    .groupby(by=['target_id'])['target_action']
    .nunique()
    .reset_index()
    .sort_values(by='target_action', ascending=False)
)

In [None]:
# Column dtypes and non-null counts of the per-target frame.
df_targets.info()

In [None]:
# Summary statistics for each column of the per-target frame.
df_targets.describe()