# Savor Data

> Extract, transform, explore

---

## Setup

In [1]:
# === Imports and config === #
from os import environ
from pprint import pprint
from pathlib import Path

from airtable import Airtable
from dotenv import load_dotenv
import pandas as pd
import numpy as np

# Plotly imports
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objs as go
import plotly.io as pio

# Set plotly default color theme to dark mode
pio.templates.default = "plotly_dark"

# Jupyter + Plotly offline mode (currently disabled).
# Keep the 2 lines below commented out when running in Colab or Visual Studio Code.
# import plotly.offline as pyo
# pyo.init_notebook_mode()  # Set plotly to notebook mode / work offline

# Optional (currently disabled): ignore this Numpy warning when using Plotly Express:
# FutureWarning: Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead.
# import warnings
# warnings.filterwarnings(action='ignore', category=FutureWarning, module='numpy')

# Raise the pandas display limits to 100 rows / 50 columns
pd.options.display.max_rows = 100
pd.options.display.max_columns = 50

# Airtable authentication: load variables from the local .env file, then read the keys.
# NOTE: environ.get returns None when a variable is missing, so a missing key is not reported here.
load_dotenv(dotenv_path=".env")
base_key = environ.get("AIRTABLE_BASE_KEY")
api_key = environ.get("AIRTABLE_API_KEY")

## Pipelines

- Extract + transform records from the "dimension" tables into a lookup that maps each related record's primary key (ID) to its name
- Extract + transform time-series data from `engage_log` into a dataframe

### Relations

In [11]:
def create_lookup(base_key: str, api_key: str, table_fields: dict) -> pd.Series:
    """Creates an ID lookup series from a set of Airtable tables.

    Args:
        base_key: Airtable base key used to connect to each table.
        api_key: Airtable API key used to connect to each table.
        table_fields: Mapping of table name -> list of field names to request
            from that table. Each table's records must include a "name" field,
            because that is the value stored in the lookup.

    Returns:
        Series of record names indexed by Airtable record ID, ordered by table
        (in mapping order) and then by the order the records were returned.

    Raises:
        KeyError: If a retrieved record has no "name" field.
    """
    series_index = []  # Record IDs, used as the series index
    series_data = []  # Record names, used as the series values
    # Iterate over the mapping that was passed in (not a notebook-level global),
    # so the function works with any caller-supplied set of tables.
    for table, fields in table_fields.items():
        airtable = Airtable(base_key, table, api_key=api_key)  # Connect to table
        for record in airtable.get_all(fields=fields):  # Retrieve and transform records
            series_index.append(record["id"])
            series_data.append(record["fields"]["name"])
    # Return series, indexed by ID
    return pd.Series(series_data, index=series_index)

In [12]:
# Dimension tables whose records are referenced by ID, mapped to the fields to fetch from each
tables_and_fields = {
    "mental": ["name"],
    "physical": ["name"],
    "dose": ["name", "supp", "amt", "unit"],
    "who": ["name"],
    "location": ["name", "location", "city", "state"],
    "subloc": ["name"],
    "tag": ["name"],
}

# Build the ID -> name lookup across every table listed above
lookup = create_lookup(base_key, api_key, table_fields=tables_and_fields)
# Sanity check: resolve a known record ID to its name (in this ex, "Code")
lookup.loc["recknqtgREfJulPie"]

'Code'

### Time Series

In [22]:
# Create connection to the engage_log table, reusing the base key and API key from Setup
engage_log = Airtable(base_key, "engage_log", api_key=api_key)

In [44]:
from datetime import datetime, timedelta, timezone

def get_data_for_date_range(
    conn: Airtable,
    start: datetime,
    end: datetime,
) -> list:
    """Retrieve raw Airtable records whose `time_in` is strictly between `start` and `end`.

    Records are requested newest-first (sorted by `-time_in`), so the scan
    skips records at or after `end` and stops at the first record at or
    before `start`; nothing older is fetched.

    Args:
        conn: Airtable table connection to read from.
        start: Exclusive lower bound. Must be timezone-aware, because it is
            compared against timezone-aware UTC timestamps; convert to UTC beforehand.
        end: Exclusive upper bound. Same requirements as `start`.

    Returns:
        List of raw record dicts exactly as yielded by `conn.get_iter`, newest
        first. Empty if no record falls inside the range.
    """
    raw_records = []
    # Page size of 50 = roughly # records in average day
    for page in conn.get_iter(sort=["-time_in"], page_size=50):
        for record in page:
            time_in_raw = record["fields"].get("time_in")
            if time_in_raw is None:
                # Field keys vary per record; one without time_in cannot be
                # placed in the range, so skip it instead of raising KeyError.
                continue
            # Airtable saves records in UTC: replace the trailing ".000Z" with an explicit offset
            time_in = datetime.fromisoformat(time_in_raw[:-5] + "+00:00")
            if time_in >= end:
                # Too new (including exactly at `end`): keep scanning toward older records
                continue
            if time_in <= start:
                # Newest-first order means every remaining record is older still
                return raw_records
            raw_records.append(record)

    return raw_records

In [45]:
# Import the tz submodule explicitly: a bare `import dateutil` only exposes
# `dateutil.tz` if some other package happens to have imported it already.
import dateutil.tz

# Create start and end UTC datetimes
mdt = dateutil.tz.gettz("America/Denver")  # Mountain Time (MST/MDT) timezone info
# Midnight local time on the start date, converted to UTC
start = datetime(2021, 4, 26, tzinfo=mdt).astimezone(timezone.utc)
end = datetime.now(timezone.utc)  # Now, in UTC

# Fetch the raw engage_log records whose time_in falls between start and end
raw_records = get_data_for_date_range(engage_log, start, end)

In [46]:
# Peek at the first two records (newest first, given the -time_in sort) to check the raw structure
raw_records[:2]

[{'id': 'recaB51Lcj0HUVGYR',
  'fields': {'mental_note': '# Savor Dash',
   'id_num': 24736,
   'session': ['rec6zLHF5M5jrYgbF'],
   'subloc': ['rec0jakijbv2aJ2lj'],
   'tags': ['recyPGKraDv5I0YQZ', 'receKaJGCFL7i4a4g'],
   'time_in': '2021-04-28T21:04:00.000Z',
   'mental': ['recknqtgREfJulPie'],
   'physical': ['recUCSBNFwpBrnDGb', 'recpSKLyNVgI0wdjf'],
   'modified': '2021-04-28T21:03:35.000Z',
   'created': '2021-04-28T21:03:11.000Z',
   'duration': {'specialValue': 'NaN'},
   'project_location': ['recidY4IXWvLNWOp0'],
   'name': '24736-Sit-Cod'},
  'createdTime': '2021-04-28T21:03:11.000Z'},
 {'id': 'recNt5OneZpvRyqmg',
  'fields': {'id_num': 24735,
   'session': ['rec6zLHF5M5jrYgbF'],
   'subloc': ['rec0jakijbv2aJ2lj'],
   'time_in': '2021-04-28T21:00:00.000Z',
   'mental': ['recV7J7xUq1TS4UaA'],
   'physical': ['recUCSBNFwpBrnDGb'],
   'modified': '2021-04-28T21:03:07.000Z',
   'created': '2021-04-28T21:03:00.000Z',
   'duration': {'specialValue': 'NaN'},
   'project_location': 