

# Exploring MIMIC-IV using Colaboratory and BigQuery

- BigQuery needs to be enabled in Colaboratory. I followed the instructions [here](https://tech.aaronteoh.com/bigquery-colaboratory-basics/) after creating a Google Cloud project that I named `mimic4-bq`. You will need to modify the code to use the ID of the project you created.
- It took me a while to get this right and I didn't take good notes, so if anyone can describe the steps they took to get BigQuery enabled, please share them.


# Using `ibis` to connect to MIMIC-IV on Google BigQuery

Environments in Google Colaboratory are not persistent. If we use any software that is not part of the Google Python Colaboratory environment, we must install it during each session.

We are going to be using Ibis, so this must be installed.

In [None]:
# Colab environments are not persistent, so Ibis (with its BigQuery extra)
# has to be installed again in every session.
!pip install ibis-framework[bigquery]

### Google has a really nice Pandas DataFrame display that we will enable.

In [None]:
# Enable Colab's interactive data-table display for pandas DataFrames.
%load_ext google.colab.data_table
# Uncomment one of the following to reload or unload the extension instead:
#%reload_ext google.colab.data_table
#%unload_ext google.colab.data_table


In [None]:
import ibis
import os

In [None]:
# ID of the Google Cloud project to use; replace it with the ID of the
# project you created.
project_id = "mimic4-bq"
# Publish the project ID through the GOOGLE_CLOUD_PROJECT environment variable.
os.environ.update({"GOOGLE_CLOUD_PROJECT": project_id})
import seaborn as sns

In [None]:
import pandas as pd
import ipywidgets as ipw
from IPython.display import display, HTML, clear_output
import matplotlib.pyplot as plt
from ipywidgets.widgets.interaction import show_inline_matplotlib_plots
from collections import Counter

The Google display helps when there are lots of rows, but not when there are lots of columns. This class is a rough attempt at making it possible to scroll through columns. I've also added a simple visualization. This needs more work, so be patient with unreadable labels, etc.

### Authenticate using `google.colab`

In [None]:
# Authenticate the current Colab user; the confirmation is printed only
# after authenticate_user() has returned.
from google.colab import auth
auth.authenticate_user()
print('Authenticated')

In [None]:
class BQBrowser(ipw.VBox):
    """Interactive widget for paging through BigQuery tables and plotting them.

    The widget stacks, from top to bottom: a row with a database dropdown,
    a table dropdown and a row-offset slider; an output area showing the
    current page of the selected table as a DataFrame; a row of plot
    controls; and an output area for the plot.

    Construction connects to BigQuery, lists the databases whose name
    contains ``"mimic"`` but not ``exclude``, and counts the rows of every
    table in each of them (used as the upper bound of the offset slider),
    so it can take a while.

    Parameters
    ----------
    pid : str
        Passed as ``project_id`` to ``ibis.bigquery.connect``.
    base_project : str
        Prefix prepended to a database name to form the ``dataset_id``
        (the default ends with a dot).
    db : str
        Database used for the initial connection from which the list of
        databases is read.
    exclude : str
        Databases whose name contains this substring are skipped.
    *args
        Accepted for backward compatibility; not used.
    **kwargs
        Only ``children`` is read: extra widgets appended below the
        built-in ones. Other keyword arguments are ignored.
    """

    # Number of rows fetched per page; also the step of the offset slider.
    PAGE_SIZE = 10000

    def __init__(self, pid="mimic4-bq", base_project="physionet-data.",
                 db="mimic_core", exclude="iii", *args, **kwargs):
        self.pid = pid
        self.base_project = base_project
        self.current_db = None
        # No connection until a database is picked; callbacks check for None.
        self.conn = None
        self.df = None
        # True while set_table() is resetting the controls, so that the
        # observers they trigger do not fetch or redraw redundantly.
        self._bq_refreshing = False

        print("This will take a bit of time...")
        conn = self._connect(db)
        self.dbs = sorted(
            d for d in conn.list_databases()
            if exclude not in d and "mimic" in d
        )
        # Maps database name -> {table name: row count}.
        self.info = {}
        for d in self.dbs:
            print("processing database %s" % d)
            c = self._connect(d)
            self.info[d] = {
                t: c.table(t).count().execute() for t in c.list_tables()
            }
        clear_output()
        print("Completed")

        self.sdbs = ipw.Dropdown(options=[None] + self.dbs[:], value=None,
                                 description="Select DB")
        self.sdbs.observe(self.set_db, "value")

        self.stable = ipw.Dropdown(description="Select Table")
        self.stable.observe(self.set_table, "value")
        self.offset = ipw.IntSlider(description="offset", step=self.PAGE_SIZE)
        self.offset.observe(self.update_offset, "value")

        self.out = ipw.Output()
        # Copy into a list so that a tuple of children can be concatenated.
        extra_children = list(kwargs.get("children", []))

        self.graph_type = ipw.Dropdown(
            options=[None, "describe", "categorical", "numeric"],
            value=None, description="Viz Type")
        self.kind = ipw.Dropdown(
            options=["count", "swarm", "box", "boxen", "violin", "bar", "point"],
            value="count")
        opts = [None]
        self.xsel = ipw.Dropdown(options=opts, value=None, description="x")
        self.ysel = ipw.Dropdown(options=opts, value=None, description="y")
        self.hsel = ipw.Dropdown(options=opts, value=None, description="hue")
        self.rsel = ipw.Dropdown(options=opts, value=None, description="row var")
        self.csel = ipw.Dropdown(options=opts, value=None, description="col var")

        # Any change to a plot control redraws the plot.
        for control in (self.graph_type, self.kind, *self._column_selectors()):
            control.observe(self.disp_plot, "value")

        self.plot_out = ipw.Output()

        plot_controls = ipw.HBox([
            self.graph_type,
            self.kind,
            ipw.VBox([self.xsel, self.ysel]),
            ipw.VBox([self.hsel, self.rsel, self.csel]),
        ])

        children = [
            ipw.HBox([self.sdbs, self.stable, self.offset]),
            self.out,
            plot_controls,
            self.plot_out,
        ] + extra_children

        super().__init__(children=children)
        self.disp_df()
        self.disp_plot()

    def _connect(self, dataset):
        """Return an ibis BigQuery connection to ``base_project + dataset``."""
        return ibis.bigquery.connect(
            project_id=self.pid,
            dataset_id=self.base_project + dataset)

    def _column_selectors(self):
        """Return the dropdowns whose options are the DataFrame's columns."""
        return (self.xsel, self.ysel, self.hsel, self.rsel, self.csel)

    def _fetch_page(self):
        """Fetch PAGE_SIZE rows of the selected table from the slider offset."""
        table = self.conn.table(self.stable.value)
        return table.limit(self.PAGE_SIZE, offset=self.offset.value).execute()

    def set_db(self, *args):
        """Connect to the selected database and list its tables in the table dropdown."""
        if self.sdbs.value is None:
            return
        self.current_db = self.sdbs.value
        self.conn = self._connect(self.current_db)
        self.stable.options = [None] + list(self.conn.list_tables())
        self.stable.value = None

    def set_table(self, *args):
        """Load the first page of the selected table and reset all controls."""
        if self.stable.value is None:
            return
        row_count = self.info[self.current_db][self.stable.value]
        # Changing the slider and the dropdown options fires their observers;
        # suppress them here and refresh the display once at the end.
        self._bq_refreshing = True
        try:
            self.offset.max = int(row_count)
            self.offset.value = 0
            self.df = self._fetch_page()
            opts = [None] + list(self.df.columns)
            for selector in self._column_selectors():
                selector.options = opts
                selector.value = None
        finally:
            self._bq_refreshing = False
        self.disp()

    def update_offset(self, *args):
        """Load the page starting at the slider's offset and redisplay."""
        # Nothing to query until both a database and a table are selected.
        if self._bq_refreshing or self.conn is None or self.stable.value is None:
            return
        self.df = self._fetch_page()
        self.disp()

    def disp_df(self, *args):
        """Show the current DataFrame (or None) in the table output area."""
        self.out.clear_output()
        with self.out:
            display(self.df)

    def disp_plot(self, *args):
        """Redraw the plot output area according to the plot controls."""
        if self._bq_refreshing:
            return
        self.plot_out.clear_output()
        # Without a visualization type or loaded data there is nothing to draw.
        if self.graph_type.value is None or self.df is None:
            return
        with self.plot_out:
            if self.graph_type.value == "describe":
                display(self.df.describe())
                return
            if self.graph_type.value == "categorical":
                g = sns.catplot(data=self.df, kind=self.kind.value,
                                x=self.xsel.value,
                                y=self.ysel.value, row=self.rsel.value,
                                col=self.csel.value,
                                hue=self.hsel.value)
                g.set_xticklabels(rotation=45)
            else:
                sns.displot(self.df, x=self.xsel.value, hue=self.hsel.value)
            show_inline_matplotlib_plots()

    def disp(self, *args):
        """Refresh both the table view and the plot."""
        self.disp_df(*args)
        self.disp_plot(*args)


In [None]:
# Build the browser with its default arguments. The constructor counts the
# rows of every table in every matching database, so this takes a while.
browser = BQBrowser()


In [None]:
# Evaluate the widget as the last expression of the cell so it is displayed.
browser