In [1]:
import pandas as pd
import numpy as numpy
from importlib import reload
from tqdm import tqdm_notebook as tqdm
import time

import pdaactconn as pc
from trialexplorer import AACTStudySet

import matplotlib.pyplot
%matplotlib inline

## Create the StudySet object; if no conn is passed as a parameter, it defaults to a local connection

In [None]:
# No conn argument is given, so the study set falls back to its default
# connection (local, per this notebook's notes). The notebook-flavoured tqdm
# imported above is handed over as the progress-bar handler.
ss = AACTStudySet.AACTStudySet(tqdm_handler=tqdm)

### Alternatively, we can pass an explicit connection object to the class to connect to a remote source

In [None]:
# Build a connection object that targets the remote source, then hand it to
# the study set explicitly instead of relying on the default connection.
conn = pc.AACTConnection(source=pc.AACTConnection.REMOTE)
ss = AACTStudySet.AACTStudySet(
    conn=conn,
    tqdm_handler=tqdm,
)

### Can also set the connection directly

In [None]:
# Point the study set's existing connection at the local source.
# The LOCAL constant is read from the AACTConnection class (the same way REMOTE
# is referenced when a remote connection is built) rather than from a `conn`
# variable, so this cell also works when no explicit connection object exists.
# NOTE(review): assumes LOCAL is a class-level constant like REMOTE — confirm.
ss.conn.set_source(pc.AACTConnection.LOCAL)

### The study set starts with no data, we need to define our primary filter first

using the list_columns() function, we can see the initial columns that we can filter on

In [None]:
# Preview only the first 10 filterable columns to keep the output short.
ss.list_columns()[:10]

In [None]:
# Total number of columns reported by list_columns().
len(ss.list_columns())

### All filters are combined on an "AND" basis, but we can express "OR" logic by adding it as a single filter

filters are added in SQL syntax

In [None]:
# Each constraint is a SQL-syntax fragment. Constraints combine with AND, so
# these two together bound start_date to calendar year 2018 (inclusive).
ss.add_constraint("start_date >= '2018-01-01'")
ss.add_constraint("start_date <= '2018-12-31'")

In [None]:
# Review the constraints added so far.
ss.show_constraints()

In [None]:
# Remove the first constraint; positions are 0-indexed, so 0 refers to the
# constraint that was added first.
ss.remove_constraint(0)
# Show the remaining constraints to confirm the removal.
ss.show_constraints()

In [None]:
# Add the removed lower bound on start_date back, restoring the 2018 window.
ss.add_constraint("start_date >= '2018-01-01'")
# Show the constraints again to confirm both bounds are present.
ss.show_constraints()

### Loads the studies from the main table into memory, saved in the self.studies dataframe

In [None]:
# Load the studies matching the current constraints from the main table into
# memory; the result is kept on ss.studies.
ss.load_studies()

In [None]:
# Peek at the first rows of the loaded studies DataFrame.
ss.studies.head()

In [None]:
# (rows, columns) of the loaded studies DataFrame.
ss.studies.shape

### To add dimensions, we need to know the names of the dimensions

This is loaded into this attribute on init, which includes a list of implemented dimensions

In [None]:
# Names of the implemented dimensions, populated when the study set is created.
ss.avail_dims.list

### The .avail_dims attribute also supports tab-completion of implemented attributes:

In [None]:
# Attribute-style access to one implemented dimension; this form is
# discoverable through tab-completion.
ss.avail_dims.brief_summaries

### to add a dimension, we just need to add the key of the dimension using this method:

This will call the constructor of the dimension handler and link that Object to the StudySet object

In [None]:
# Register four dimensions by key. Per the notes, this constructs a handler
# for each dimension and links it to the study set; no data is loaded yet.
ss.add_dimensions(['brief_summaries', 'links', 'designs', 'design_outcomes'])

In [None]:
# Inspect the dimension handlers now attached to the study set.
ss.dimensions

### At init, it is empty, but we can call refresh_dim_data to load it from the database:

this uses the self.studies dataframe as the master list, and creates a temp table in the database with its nct_ids

it then joins the dimension tables to that temp table so we only load the needed data

In [None]:
# Load the data for every added dimension from the database, limited to the
# nct_ids currently present in ss.studies.
ss.refresh_dim_data()

### The dimensional data is loaded in the dim object

it comes in 2 forms; the first, raw_data, is the direct return from the database

In [None]:
# Grab the handler for the brief_summaries dimension.
cur_dim = ss.dimensions['brief_summaries']

In [None]:
# raw_data holds the rows as returned from the database; show the first few.
cur_dim.raw_data.head()

the second form, .data, is the same data split into a dict keyed by nct_id for ease of use; both forms refer to the same memory location, so .data does not use any additional memory

In [None]:
# Pick an arbitrary study to use in the lookups below: the entry at position 5
# of the studies index (presumably an nct_id, given how it is used as a key).
test_nct_id = ss.studies.index[5]
# Display the chosen id.
test_nct_id

In [None]:
# Look up this dimension's data for the chosen study via the nct_id-keyed dict.
cur_dim.data[test_nct_id]

### can access other dims too

In [None]:
# Rebind cur_dim to the design_outcomes handler (it no longer refers to
# brief_summaries after this cell).
cur_dim = ss.dimensions['design_outcomes']

In [None]:
# First rows of the raw design_outcomes data.
cur_dim.raw_data.head()

In [None]:
# design_outcomes data for the same study chosen earlier.
cur_dim.data[test_nct_id]

### A key design choice is dropping records from the main studies table:

It will drop the records from the ss.studies table, and clean up any dimensions associated with it.

As a result it is a bit slow:

In [None]:
# Collect the index entries at positions 5, 6 and 7 of ss.studies; these are
# the studies that will be dropped in the next step.
to_drop = [ss.studies.index[pos] for pos in (5, 6, 7)]
# Display the selection.
to_drop

In [None]:
# Drop the selected studies from ss.studies. Per the notes, this also cleans
# up the associated dimension data, which is why it is somewhat slow.
ss.drop_studies(to_drop)

The memory reduction is key as the user iterates and modifies the active studies list, 

**so we should test the memory clearing functionality carefully!!**

### Testing dropping of dimensions

In [None]:
# Snapshot the dimension keys into a list first, so the names are held
# independently of ss.dimensions while the dimensions are being dropped.
# Note: this reuses the name to_drop, which previously held study ids.
to_drop = list(ss.dimensions.keys())
# Drop every dimension currently attached to the study set.
ss.drop_dimensions(to_drop)

In [None]:
# Check what remains in ss.dimensions after the drop.
ss.dimensions

### Reloading the test case

In [2]:
# Re-import the AACTStudySet module so that code edits are picked up without
# restarting the kernel.
reload(AACTStudySet)

# Rebuild the test case from scratch: a fresh study set on the default
# connection, restricted to studies whose start_date falls in 2018.
ss = AACTStudySet.AACTStudySet(tqdm_handler=tqdm)
ss.add_constraint("start_date >= '2018-01-01'")
ss.add_constraint("start_date <= '2018-12-31'")
ss.load_studies()

# Attach two dimensions and load their data for the studies just loaded.
ss.add_dimensions(['result_groups', 'milestones'])
ss.refresh_dim_data()

25837 studies loaded!
Successfuly added these 2 dimensions: ['result_groups', 'milestones']
Failed to add these 0 dimensions: []


HBox(children=(IntProgress(value=0, max=52), HTML(value='')))

Syncing the temp table temp_cur_studies in 52 chunks x 500 records each

 - Loading dimension result_groups
 -- Loading raw data
 -- Creating memory pointers for the .data dictionary keyed by nct_id
 - Loading dimension milestones
 -- Loading raw data
 -- Creating memory pointers for the .data dictionary keyed by nct_id


In [19]:
# First rows of the raw result_groups data.
ss.dimensions['result_groups'].raw_data.head()

Unnamed: 0,id,nct_id,ctgov_group_code,result_type,title,description
0,14105968,NCT02610972,B3,Baseline,Total,Total of all reporting groups
1,14105969,NCT02610972,B2,Baseline,CLINICALLY HEALTHY,Women with a delivery of a healthy normal baby...
2,14105970,NCT02610972,B1,Baseline,CLINICALLY CONFIRMED PREECLAMPSIA,Women clinically diagnosed with preeclampsia (...
3,14105971,NCT02610972,P2,Participant Flow,CLINICALLY HEALTHY,Women with a delivery of a healthy normal baby...
4,14105972,NCT02610972,P1,Participant Flow,CLINICALLY CONFIRMED PREECLAMPSIA,Women clinically diagnosed with preeclampsia (...


In [4]:
# Grab the handler for the milestones dimension.
cur_dim = ss.dimensions['milestones']

In [8]:
# df is just another name for the raw milestones data (plain assignment, so
# no copy is made).
df = cur_dim.raw_data
# First rows of the milestones data.
df.head()

Unnamed: 0,id,nct_id,result_group_id,ctgov_group_code,title,period,description,count
0,5324481,NCT03478891,13048059,P1,NOT COMPLETED,Overall Study,,1
1,5324484,NCT03478891,13048059,P1,COMPLETED,Overall Study,,2
2,5324487,NCT03478891,13048059,P1,Received MAb114,Overall Study,,3
3,5324490,NCT03478891,13048059,P1,STARTED,Overall Study,,3
4,5324631,NCT03465904,13048366,P1,NOT COMPLETED,Overall Study,,89
