

# Exploring MIMIC-IV using Colaboratory and BigQuery

- BigQuery needs to be enabled in Colaboratory. I followed the instructions [here](https://tech.aaronteoh.com/bigquery-colaboratory-basics/) after creating a Google Cloud project that I named `mimic4-bq`. You will need to modify the code to use the ID of the project you created.
- It took me a while to get this right and I didn't take good notes, so if you had to do anything different to get BigQuery enabled, please share your steps.


# Using `ibis` to connect to MIMIC-IV on Google BigQuery

Environments in Google Colaboratory are not persistent. If we use any software that is not part of the default Google Colaboratory Python environment, we must install it during each session.

We are going to be using Ibis, so this must be installed.

In [None]:
!pip install ibis-framework[bigquery]

### Google has a really nice Pandas DataFrame display that we will enable.

In [None]:
# Load the google.colab.data_table IPython extension (Colab's DataFrame display).
%load_ext google.colab.data_table

In [None]:
import ibis
import os

In [None]:
# Google Cloud project ID used by this notebook; replace it with your own.
project_id = "mimic4-bq"
# Expose the same ID through the GOOGLE_CLOUD_PROJECT environment variable.
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id
import seaborn as sns

In [None]:
import pandas as pd
import ipywidgets as ipw
from IPython.display import display, HTML, clear_output
import matplotlib.pyplot as plt
from ipywidgets.widgets.interaction import show_inline_matplotlib_plots


The Google display helps with having lots of rows, but not with having lots of columns. This class is a rough attempt to be able to scroll through columns. I've also added on a simple visualization. This needs more work, so be patient with unreadable labels, etc.

In [None]:
class PandasBrowser(ipw.VBox):
    """Scroll through the columns of a wide DataFrame and draw simple plots.

    The widget stacks, top to bottom: a "Start col" slider, an output area
    showing the fixed columns plus a window of the remaining columns, a row of
    plot controls, and an output area for the selected plot or summary.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to browse.
    fixed : sequence of column labels, optional
        Columns that are always shown to the left of the scrolling window.
        Defaults to the first column of ``df`` (or no column if ``df`` has
        none).
    **kwargs
        Only ``children`` is read: extra widgets appended below the browser.
    """

    def __init__(self, df, fixed=None, *args, **kwargs):
        self.df = df
        all_cols = list(self.df.columns)
        if fixed is None:
            # Slicing (rather than indexing) keeps a column-less frame usable.
            self.fixed = all_cols[:1]
        else:
            # Copy into a list so the caller's sequence is never mutated and
            # tuples can be concatenated with the scrolling columns later.
            self.fixed = list(fixed)
        self.cols = [c for c in all_cols if c not in self.fixed]
        self.ncols = len(self.cols)
        # Number of scrolling columns displayed at once.
        self.ndisp = max(12 - len(self.fixed), 10)

        # The slider can only move when there are more columns than fit.
        col_max = max(self.ncols - self.ndisp, 0)
        self.start_col = ipw.IntSlider(
            min=0, max=col_max, value=0, description="Start col"
        )
        self.start_col.observe(self.disp_df, "value")
        self.out = ipw.Output()
        extra_children = list(kwargs.get("children", []))
        self.sub = None

        self.graph_type = ipw.Dropdown(
            options=[None, "describe", "categorical", "numeric"],
            value=None,
            description="Plot Type",
        )
        self.kind = ipw.Dropdown(
            options=["count", "swarm", "box", "boxen", "violin", "bar", "point"],
            value="count",
        )
        opts = [None] + all_cols
        # Preselect the first column for x when the frame has one.
        default_x = opts[1] if len(opts) > 1 else None
        self.xsel = ipw.Dropdown(options=opts, value=default_x, description="x")
        self.ysel = ipw.Dropdown(options=opts, value=None, description="y")
        self.hsel = ipw.Dropdown(options=opts, value=None, description="hue")
        self.rsel = ipw.Dropdown(options=opts, value=None, description="row var")
        self.csel = ipw.Dropdown(options=opts, value=None, description="col var")

        # Any change to a plot control redraws the plot.
        for control in (
            self.graph_type,
            self.kind,
            self.xsel,
            self.ysel,
            self.hsel,
            self.rsel,
            self.csel,
        ):
            control.observe(self.disp_plot, "value")

        self.plot_out = ipw.Output()

        controls = ipw.HBox(
            [
                self.graph_type,
                self.kind,
                ipw.VBox([self.xsel, self.ysel]),
                ipw.VBox([self.hsel, self.rsel, self.csel]),
            ]
        )

        children = [self.start_col, self.out, controls, self.plot_out] + extra_children

        super().__init__(children=children)
        self.disp_df()
        self.disp_plot()

    def _visible_cols(self):
        """Return the fixed columns followed by the current scrolling window."""
        start = self.start_col.value
        return self.fixed + self.cols[start:start + self.ndisp]

    def disp_df(self, *args):
        """Redraw the table for the current slider position.

        Accepts and ignores positional arguments so it can be used directly as
        a widget ``observe`` callback.
        """
        self.out.clear_output()
        with self.out:
            display(self.df.loc[:, self._visible_cols()])

    def disp_plot(self, *args):
        """Redraw the plot area according to the "Plot Type" dropdown.

        ``None`` clears the area, "describe" shows ``DataFrame.describe()`` for
        the currently visible columns, "categorical" draws ``sns.catplot`` of
        the selected x column using the selected kind, and "numeric" draws
        ``sns.pairplot`` coloured by the selected hue column.
        """
        self.plot_out.clear_output()
        plot_type = self.graph_type.value
        if plot_type is None:
            return
        with self.plot_out:
            if plot_type == "describe":
                display(self.df.loc[:, self._visible_cols()].describe())
            elif plot_type == "categorical":
                # Only x is passed for now; the y/row/col selectors are not
                # yet forwarded to seaborn.
                g = sns.catplot(
                    data=self.df, kind=self.kind.value, x=self.xsel.value
                )
                g.set_xticklabels(rotation=45)
                show_inline_matplotlib_plots()
            else:
                g = sns.pairplot(data=self.df, hue=self.hsel.value)
                # PairGrid has no set_xticklabels(); rotate labels per axes.
                for ax in g.axes.flat:
                    if ax is not None:
                        ax.tick_params(axis="x", labelrotation=45)
                show_inline_matplotlib_plots()

    def disp(self, *args):
        """Redraw both the table and the plot."""
        self.disp_df(*args)
        self.disp_plot(*args)


### Authenticate using `google.colab`

In [None]:
# Authenticate this Colab session via google.colab's auth helper.
from google.colab import auth
auth.authenticate_user()
# Reached only if authenticate_user() returned without raising.
print('Authenticated')

In [None]:
# Connect to the MIMIC core dataset, reusing the project_id defined earlier in
# the notebook so the project ID only has to be changed in one place.
conn = ibis.bigquery.connect(
    project_id=project_id,
    dataset_id='physionet-data.mimic_core')

### Once we connect we can list all the databases we have access to

In [None]:
# Ask the connection for the databases it can see and print the result.
dbs = conn.list_databases()
print(dbs)

### Since I connected to `mimic_core`, I can list the tables in this database

In [None]:
# Last expression in the cell, so the notebook displays the returned table list.
conn.list_tables()

See also Google's tutorial on using Ibis with BigQuery: https://cloud.google.com/community/tutorials/bigquery-ibis

In [None]:
# Get a handle on the "patients" table; it is executed in a later cell.
patients = conn.table("patients")

### The `schema` method will tell you the data types of each column in the table

In [None]:
# Last expression in the cell, so the notebook displays the table's schema.
patients.schema()

### And do queries

In [None]:
# Execute the patients expression with limit=2000; the result is handed to
# PandasBrowser below, so it is presumably a pandas DataFrame.
pts = patients.execute(limit=2000)

In [None]:
# Wrap the patients result in the column browser; the bare name on the last
# line makes the notebook render the widget.
pv = PandasBrowser(pts)
pv

In [None]:
# Same pattern for the "admissions" table, executed with limit=20000.
adm = conn.table("admissions").execute(limit=20000)

In [None]:
# Browse the admissions result; as the cell's last expression it is rendered.
PandasBrowser(adm)