# Getting Started
This notebook uses some utility classes to examine the data in the workbench

In [21]:
#Imports
import spark_session_builder
from PWBdata import PWBdata
import ipywidgets as widgets

In [22]:
# Create a sparksession and sparkcontext
sparkSession, sc = spark_session_builder.session()
sc

In [23]:
#  PATH TO THE DATA
#  HDFS browser is at https://hdfs-ui.demo-full-load.shared1.nonprod.entellect.com/explorer.html#/test
hdfs_path = "hdfs://demo-full-load-hdfs-namenode:8020" 

### The PWB class wraps the tables in RWB

In [24]:
pwb = PWBdata(hdfs_path)
[x for x in dir(pwb) if not x.startswith('__')]

['chembl_data_path',
 'chembl_files',
 'get_dataframe',
 'get_table_list',
 'hdfs_path',
 'pp_data_path',
 'pp_files',
 'resnet_data_path',
 'resnet_files',
 'rmc_data_path',
 'rmc_files']

## Pharmapendium data

![Pharmapendium schema](Pharmapendium.PNG)

In [25]:
from IPython.core.display import HTML


def show_top(df, records=10):
    """
    display the top n records in a table
    """
    display(HTML(df.limit(records).toPandas().to_html(escape=False)))

In [41]:
pp_tbls = pwb.get_table_list('pp')
pp_w = widgets.Dropdown(
    options=[x['name'] for x in pp_tbls],
    description='Pharmapendium Tables:',
    disabled=False,
)

display(pp_w)

Dropdown(description='Pharmapendium Tables:', options=('Activitytarget', 'Activitytargetextended', 'Activityta…

In [42]:
print (pp_w.value)
pp_df = pwb.get_dataframe(sparkSession, 'pp', pp_w.value)
show_top(pp_df, 5)

Activitytarget


Unnamed: 0,hdfsIngestTime,offset,ActivityTarget,core_relatedMatch,core_closeMatch,core_changeNote,core_prefLabel,core_scopeNote,core_broadMatch,skos_xl_prefLabel,core_inScheme,core_hiddenLabel,core_altLabel,core_exactMatch,isLeaf,core_broader,core_narrower,core_topConceptOf,skos_xl_hiddenLabel,skos_xl_altLabel,core_related,core_editorialNote,core_example,core_definition,core_note,provenanceJob
0,2020-07-21T09:13:49.868768,94,https://data.elsevier.com/lifescience/taxonomy/ppplus/activitytarget/-ucvKv_6u5D,,,,Alpha Adrenoceptors,,,,https://data.elsevier.com/lifescience/taxonomy/ppplus/activitytarget/ActivityTargetScheme,,,,False,https://data.elsevier.com/lifescience/taxonomy/ppplus/activitytarget/wm0OEo6PEvB,,,,,,,,,,2b8f26b4-298a-4031-9af2-1de3e7b34cb4
1,2020-07-21T09:13:52.872806,116,https://data.elsevier.com/lifescience/taxonomy/ppplus/activitytarget/21rhw2G40v9,,,,Voltage-Gated Sodium Channel TTX-Sensitive,,,,https://data.elsevier.com/lifescience/taxonomy/ppplus/activitytarget/ActivityTargetScheme,,,,True,https://data.elsevier.com/lifescience/taxonomy/ppplus/activitytarget/ecCQT1PXpHF,,,,,,,,,,2b8f26b4-298a-4031-9af2-1de3e7b34cb4
2,2020-07-21T09:13:46.372271,127,https://data.elsevier.com/lifescience/taxonomy/ppplus/activitytarget/46StCDpmpSH,,,,Phosphodiesterase 4A1,,,,https://data.elsevier.com/lifescience/taxonomy/ppplus/activitytarget/ActivityTargetScheme,,,,True,https://data.elsevier.com/lifescience/taxonomy/ppplus/activitytarget/Zzo3DCLzO2-,,,,,,,,,,2b8f26b4-298a-4031-9af2-1de3e7b34cb4
3,2020-07-21T09:13:50.667433,114,https://data.elsevier.com/lifescience/taxonomy/ppplus/activitytarget/6qN6qID6kG9,,,,5'-AMP-Activated protein kinase subunit beta-1,,,,https://data.elsevier.com/lifescience/taxonomy/ppplus/activitytarget/ActivityTargetScheme,,,,True,https://data.elsevier.com/lifescience/taxonomy/ppplus/activitytarget/-NmHwTiDfcG,,,,,,,,,,2b8f26b4-298a-4031-9af2-1de3e7b34cb4
4,2020-07-21T09:13:48.483249,147,https://data.elsevier.com/lifescience/taxonomy/ppplus/activitytarget/6saDeTyJDVB,,,,Carbonic Anhydrase IV,,,,https://data.elsevier.com/lifescience/taxonomy/ppplus/activitytarget/ActivityTargetScheme,,,,True,https://data.elsevier.com/lifescience/taxonomy/ppplus/activitytarget/WsvL9Z2KdQ5,,,,,,,,,,2b8f26b4-298a-4031-9af2-1de3e7b34cb4


## RMC Data

![RMC Schema](RMC.PNG)

In [43]:
rmc_tbls = pwb.get_table_list('rmc')
rmc_w = widgets.Dropdown(
    options=[x['name'] for x in rmc_tbls],
    description='RMC Tables:',
    disabled=False,
)

display(rmc_w)

Dropdown(description='RMC Tables:', options=('Activitysite', 'Administrationroute', 'Administrationtype', 'Aff…

In [44]:
print (rmc_w.value)
rmc_df = pwb.get_dataframe(sparkSession, 'rmc', rmc_w.value)
show_top(rmc_df, 5)

Analyticaltechnique


Unnamed: 0,hdfsIngestTime,offset,AnalyticalTechnique,hasOtherInfo,hasCreationDate,hasStatus,hasLabel,hasUpdateDate,provenanceJob
0,2020-07-13T13:15:18.178756,1315,https://data.elsevier.com/lifescience/entity/reaxys/analyticaltechnique/10107,,2018-02-16 05:21:28,N,Collagen morphometry,,
1,2020-07-13T13:15:28.399548,30,https://data.elsevier.com/lifescience/entity/reaxys/analyticaltechnique/10158,,2018-02-21 02:03:50,N,whole-body fluorescence imaging,,
2,2020-07-13T13:14:57.659112,269,https://data.elsevier.com/lifescience/entity/reaxys/analyticaltechnique/10419,,2018-03-08 23:47:46,N,Blinded Observation: Caliper + Hematoxylin-eosin staining method,,
3,2020-07-13T13:14:52.492911,1322,https://data.elsevier.com/lifescience/entity/reaxys/analyticaltechnique/10700,,2018-03-27 01:17:37,N,Radio-TLC,,
4,2020-07-13T13:15:41.46678,1511,https://data.elsevier.com/lifescience/entity/reaxys/analyticaltechnique/110,,2013-12-10 12:31:48,A,Trypan blue exclusion staining method,,


## Resnet data

![Resnet Schema](Resnet.PNG)

In [45]:
resnet_tbls = pwb.get_table_list('resnet')
resnet_w = widgets.Dropdown(
    options=[x['name'] for x in resnet_tbls],
    description='Resnet Tables:',
    disabled=False,
)

display(resnet_w)

Dropdown(description='Resnet Tables:', options=('Annotation', 'Binding', 'Biomarker', 'Cellexpression', 'Cello…

In [46]:
print (resnet_w.value)
resnet_df = pwb.get_dataframe(sparkSession, 'resnet', resnet_w.value)
show_top(rmc_df, 5)

Annotation


Unnamed: 0,hdfsIngestTime,offset,AnalyticalTechnique,hasOtherInfo,hasCreationDate,hasStatus,hasLabel,hasUpdateDate,provenanceJob
0,2020-07-13T13:15:18.178756,1315,https://data.elsevier.com/lifescience/entity/reaxys/analyticaltechnique/10107,,2018-02-16 05:21:28,N,Collagen morphometry,,
1,2020-07-13T13:15:28.399548,30,https://data.elsevier.com/lifescience/entity/reaxys/analyticaltechnique/10158,,2018-02-21 02:03:50,N,whole-body fluorescence imaging,,
2,2020-07-13T13:14:57.659112,269,https://data.elsevier.com/lifescience/entity/reaxys/analyticaltechnique/10419,,2018-03-08 23:47:46,N,Blinded Observation: Caliper + Hematoxylin-eosin staining method,,
3,2020-07-13T13:14:52.492911,1322,https://data.elsevier.com/lifescience/entity/reaxys/analyticaltechnique/10700,,2018-03-27 01:17:37,N,Radio-TLC,,
4,2020-07-13T13:15:41.46678,1511,https://data.elsevier.com/lifescience/entity/reaxys/analyticaltechnique/110,,2013-12-10 12:31:48,A,Trypan blue exclusion staining method,,


## ChEMBL Data

![ChEMBL Schema](ChEMBL.PNG)

In [47]:
chembl_tbls = pwb.get_table_list('chembl')
chembl_w = widgets.Dropdown(
    options=[x['name'] for x in chembl_tbls],
    description='ChEMBL Tables:',
    disabled=False,
)

display(chembl_w)

Dropdown(description='ChEMBL Tables:', options=('Actiontype', 'Activity', 'Activityproperty', 'Activitysupport…

In [49]:
print (chembl_w.value)
chembl_df = pwb.get_dataframe(sparkSession, 'chembl', chembl_w.value)
show_top(chembl_df, 5)

Assayparameter


Unnamed: 0,hdfsIngestTime,offset,AssayParameter,parameterComments,parameterStandardUnits,parameterUnits,parameterValue,parameterTextValue,parameterStandardValue,hasRelationSymbol,parameterStandardType,parameterType,parameterStandardTextValue,hasStandardRelationSymbol,partOfAssay,provenanceJob
0,2020-07-10T09:10:21.668351,17101,https://data.elsevier.com/lifescience/entity/chembl/assayparameter/3711328,,uL/min,uL/min,100.0,,100.0,=,sample_flow_rate,sample_flow_rate,,=,https://data.elsevier.com/lifescience/entity/chembl/assay/1640177,
1,2020-07-10T09:10:19.469768,17012,https://data.elsevier.com/lifescience/entity/chembl/assayparameter/3711517,,s-1,s-1,0.000266667,,0.0002667,=,radioligand_koff,radioligand_koff,,=,https://data.elsevier.com/lifescience/entity/chembl/assay/1640183,
2,2020-07-10T09:10:22.487795,17102,https://data.elsevier.com/lifescience/entity/chembl/assayparameter/3712007,,uL/min,uL/min,5.0,,5.0,=,immobilization_flow_rate,immobilization_flow_rate,,=,https://data.elsevier.com/lifescience/entity/chembl/assay/1640204,
3,2020-07-10T09:09:52.373336,17239,https://data.elsevier.com/lifescience/entity/chembl/assayparameter/3712383,,mg.kg-1,mg/kg,10.0,,10.0,=,DOSE,DOSE,,=,https://data.elsevier.com/lifescience/entity/chembl/assay/1055,
4,2020-07-10T09:10:22.456137,17103,https://data.elsevier.com/lifescience/entity/chembl/assayparameter/3712607,,mg.kg-1,mg/kg,400.0,,400.0,=,DOSE,DOSE,,=,https://data.elsevier.com/lifescience/entity/chembl/assay/4388,


In [50]:
sc.stop()