# Savor Data

> Extract, transform, explore

---

## Setup

In [1]:
# === Imports and config === #
from os import environ
from pprint import pprint
from pathlib import Path

from airtable import Airtable
from dotenv import load_dotenv
import pandas as pd
import numpy as np

# Plotly imports
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objs as go
import plotly.io as pio

# Set plotly default color theme to dark mode
pio.templates.default = "plotly_dark"

# Jupyter + Plotly imports: uncomment the 2 lines below when running in classic Jupyter;
# leave them commented out when running in Colab or Visual Studio Code.
# import plotly.offline as pyo
# pyo.init_notebook_mode()  # Set plotly to notebook mode / work offline

# Ignore this Numpy warning when using Plotly Express:
# FutureWarning: Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead.
# import warnings
# warnings.filterwarnings(action='ignore', category=FutureWarning, module='numpy')

# Cap how many rows / columns pandas renders when a DataFrame is displayed
pd.options.display.max_rows = 100
pd.options.display.max_columns = 50

# Airtable authentication
# Credentials are read from the environment after loading the local `.env` file, so no
# secrets are hardcoded in the notebook.
# NOTE: `environ.get` returns None when a variable is unset, so a missing key is not
# reported here; it would only surface later, when the keys are actually used.
load_dotenv(dotenv_path=".env")
base_key = environ.get("AIRTABLE_BASE_KEY")
api_key = environ.get("AIRTABLE_API_KEY")

## Pipelines

- Extract + transform from "dimension" tables to match up related records' primary keys
- Extract + transform time-series data from `engage_log` into dataframe

### Relations

In [2]:
def create_lookup(base_key: str, api_key: str, table_fields: dict) -> pd.Series:
    """Creates an ID lookup series from a set of Airtable tables.

    Parameters
    ----------
    base_key : str
        Key of the Airtable base that holds the tables.
    api_key : str
        Airtable API key used to authenticate.
    table_fields : dict
        Maps each table name to the list of field names to request from it.

    Returns
    -------
    pd.Series
        Record names indexed by Airtable record ID, in the order the tables
        (and the records within them) were retrieved. A record that has no
        ``name`` value is kept, with ``None`` as its name.
    """
    series_index = []  # Index (id) array
    series_data = []  # Series data
    # Iterate over the argument that was passed in, not a notebook-level global,
    # so the function works regardless of which cells have already been run.
    for table, fields in table_fields.items():
        airtable = Airtable(base_key, table, api_key=api_key)  # Connect to table
        for record in airtable.get_all(fields=fields):  # Retrieve records
            series_index.append(record["id"])
            # `.get` tolerates records that come back without a "name" field
            series_data.append(record["fields"].get("name"))
    # Return series, indexed by ID (explicit dtype keeps an empty result well-defined)
    return pd.Series(series_data, index=series_index, dtype="object")

In [3]:
# Related tables to match up via ID
# Maps each table name to the list of fields requested from that table.
# Note: `create_lookup` stores only each record's `name`, so the additional
# fields listed for `dose` and `location` are requested but not used by the lookup.
tables_and_fields = {
    "mental": ["name"],
    "physical": ["name"],
    "dose": ["name", "supp", "amt", "unit"],
    "who": ["name"],
    "location": ["name", "location", "city", "state"],
    "subloc": ["name"],
    "tag": ["name"],
}

# Build a single Series of record names indexed by record ID across all of the tables above
lookup = create_lookup(base_key, api_key, tables_and_fields)
# Confirm it worked by looking up name for id (in this ex, "Code")
lookup["recknqtgREfJulPie"]

'Code'

### Time Series DataFrame

The data comes out of the API like this:

```py
[{'id': 'rec09tUYvKhC92ZSg',
  'fields': {'id_num': 24800,
   'session': ['recXt8s9a9fTRTbLf'],
   'name': '24800-Sta-Cod',
   'modified': '2021-04-29T22:05:23.000Z',
   'created': '2021-04-29T22:04:47.000Z',
   'project_location': ['recidY4IXWvLNWOp0'],
   'time_in': '2021-04-29T22:02:00.000Z',
   'mental': ['recknqtgREfJulPie'],
   'physical': ['recpSKLyNVgI0wdjf'],
   'tag': ['receKaJGCFL7i4a4g', 'recyPGKraDv5I0YQZ'],
   'subloc': ['rec0jakijbv2aJ2lj'],
   'mental_note': '# Savor Dash Data'},
  'createdTime': '2021-04-29T22:04:47.000Z'},
 {'id': 'recyGC5jmggOHxBWm',
  'fields': {'id_num': 24799,
   'session': ['recXt8s9a9fTRTbLf'],
   'name': '24799-Sta-Wor',
   'modified': '2021-04-29T21:58:15.000Z',
   'created': '2021-04-29T21:58:06.000Z',
   'project_location': ['recidY4IXWvLNWOp0'],
   'time_in': '2021-04-29T21:57:00.000Z',
   'mental': ['recV7J7xUq1TS4UaA'],
   'physical': ['recpSKLyNVgI0wdjf'],
   'subloc': ['rec0jakijbv2aJ2lj']},
  'createdTime': '2021-04-29T21:58:06.000Z'}]
```

The records therefore need to be flattened a little before they can be loaded into a DataFrame.

In [26]:
# Create connection to the engage_log table (the source of the time-series data),
# reusing the base key and API key loaded in the Setup cell
engage_log = Airtable(base_key, "engage_log", api_key=api_key)

In [43]:
from datetime import datetime, timezone

def get_data_for_date_range(conn: "Airtable", start: datetime, end: datetime) -> pd.DataFrame:
    """Retrieve Airtable data within the specified time range.

    Records are requested newest-first (sorted by descending ``time_in``), so the
    scan skips records newer than ``end`` and stops at the first record older
    than ``start`` instead of paging through the whole table.

    Parameters
    ----------
    conn : Airtable
        Connection to the table to read; must provide ``get_iter``.
    start, end : datetime
        Bounds of the range, both inclusive. They must be timezone-aware
        (convert to UTC beforehand), because they are compared against the
        UTC timestamps stored in each record's ``time_in`` field.

    Returns
    -------
    pd.DataFrame
        One row per record in range: the record's fields plus its ``id``.
        Empty when no record falls inside the range.
    """
    records = []
    reached_start = False  # Set once a record older than `start` is seen
    # Page size of 50 = roughly # records in average day
    for page in conn.get_iter(sort=["-time_in"], page_size=50):
        for record in page:
            raw_time_in = record["fields"].get("time_in")
            if raw_time_in is None:
                continue  # Cannot place a record without a timestamp in the range
            # Airtable saves records in UTC, marked with a trailing "Z"
            time_in = datetime.fromisoformat(raw_time_in.replace("Z", "+00:00"))
            if time_in.tzinfo is None:
                time_in = time_in.replace(tzinfo=timezone.utc)
            if time_in > end:
                continue  # Newer than the range; keep scanning
            if time_in < start:
                # Sorted newest-first, so everything after this is out of range too
                reached_start = True
                break
            # Flatten id and fields into a new dictionary (the source record is left untouched)
            records.append({**record["fields"], "id": record["id"]})
        if reached_start:
            break

    # Load list of records into dataframe and return
    return pd.DataFrame.from_records(records)

In [28]:
# Import the `tz` submodule explicitly: a bare `import dateutil` does not guarantee
# that `dateutil.tz` is loaded (it only works if another library imported it first).
import dateutil.tz

# Create start and end UTC datetimes
mdt = dateutil.tz.gettz("America/Denver")  # Mountain Time (MST/MDT) timezone info
if mdt is None:
    # gettz returns None when the zone cannot be found; fail here rather than
    # silently building a naive datetime that would shift the query window.
    raise RuntimeError('Time zone "America/Denver" could not be loaded')
# Convert the Mountain Time date (midnight, 2021-04-26) to UTC
start = datetime(2021, 4, 26, tzinfo=mdt).astimezone(timezone.utc)
end = datetime.now(timezone.utc)  # Now/today, in UTC

# Retrieve records for date range
df1_engage = get_data_for_date_range(engage_log, start, end)

In [29]:
# Preview the first rows of the raw records as returned for the date range
df1_engage.head()

Unnamed: 0,mental_note,id_num,session,subloc,tag,time_in,mental,physical,modified,created,project_location,name,id,physical_note,dose,who,moment_log
0,# Savor Dash Data,24800,[recXt8s9a9fTRTbLf],[rec0jakijbv2aJ2lj],"[receKaJGCFL7i4a4g, recyPGKraDv5I0YQZ]",2021-04-29T22:02:00.000Z,[recknqtgREfJulPie],[recpSKLyNVgI0wdjf],2021-04-29T22:05:23.000Z,2021-04-29T22:04:47.000Z,[recidY4IXWvLNWOp0],24800-Sta-Cod,rec09tUYvKhC92ZSg,,,,
1,,24799,[recXt8s9a9fTRTbLf],[rec0jakijbv2aJ2lj],,2021-04-29T21:57:00.000Z,[recV7J7xUq1TS4UaA],[recpSKLyNVgI0wdjf],2021-04-29T21:58:15.000Z,2021-04-29T21:58:06.000Z,[recidY4IXWvLNWOp0],24799-Sta-Wor,recyGC5jmggOHxBWm,,,,
2,,24798,[recXt8s9a9fTRTbLf],[rec0jakijbv2aJ2lj],,2021-04-29T21:39:00.000Z,[rec2wZ0LboOW8fyAy],[recUCSBNFwpBrnDGb],2021-04-29T21:38:47.000Z,2021-04-29T21:38:10.000Z,[recidY4IXWvLNWOp0],24798-Sit-Med,recxhjn6VKc2HGZbG,NSDR / mindfulness.,,,
3,,24797,[recXt8s9a9fTRTbLf],[reclkHtEN4C6wr4dv],,2021-04-29T21:29:00.000Z,[recx5VEAjXKeYLjZF],"[rec3AnOkXgiPoni7u, recpg577q4IdfjJZi]",2021-04-29T21:38:31.000Z,2021-04-29T21:37:53.000Z,[recidY4IXWvLNWOp0],24797-Foo-Thi,rec850VNGKKThCWT5,Pistachios and green drink.,,,
4,,24796,[recXt8s9a9fTRTbLf],[recWNsckWelXKeR7d],,2021-04-29T21:17:00.000Z,[recx5VEAjXKeYLjZF],[recpg577q4IdfjJZi],2021-04-29T21:37:52.000Z,2021-04-29T21:16:56.000Z,[recidY4IXWvLNWOp0],24796-Sna-Thi,recufmGBkq9KCJmxn,"Some almonds, trail mix and chicharrones in th...",,,


### Transformation (Expansion)

Obviously it's a little clunky to work with the relational fields in their current form (e.g. `[rec3AnOkXgiPoni7u, recpg577q4IdfjJZi]`).

- [Expand arrays in cells](https://chrisalbon.com/python/data_wrangling/pandas_expand_cells_containing_lists/)
- [DataFrame JOINs](https://pandas.pydata.org/pandas-docs/stable/getting_started/comparison/comparison_with_sql.html#compare-with-sql-join)

In [46]:
# Spread each row's list of tag IDs across separate columns (one item per column),
# then prefix the positional column labels so they read tag_0, tag_1, ...
tag = df1_engage["tag"].apply(pd.Series).add_prefix("tag_")

# Show the first three rows of the expanded frame
tag.head(3)

Unnamed: 0,tag_0,tag_1,tag_2,tag_3
0,receKaJGCFL7i4a4g,recyPGKraDv5I0YQZ,,
1,,,,
2,,,,


In [41]:
# Functionalize the expansion logic
from typing import List

def expand_list_cols(df: pd.DataFrame, cols: List[str]) -> pd.DataFrame:
    """Expands arrays contained in the columns then concatenates them
    back onto the original DataFrame.

    Each listed column is replaced by numbered columns ``<col>_0``, ``<col>_1``,
    ... holding one list item each; rows with fewer items are padded with NaN.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose listed columns hold lists. It is not modified.
    cols : List[str]
        Names of the columns to expand. Names that are not present in ``df``
        are skipped (a field with no values in the retrieved records never
        becomes a column), and repeated names are expanded only once.

    Returns
    -------
    pd.DataFrame
        The remaining original columns followed by the expanded columns, in
        the order given by ``cols``. For a frame with no rows, the listed
        columns are simply dropped, since there is nothing to expand.
    """
    # Keep only the requested columns that exist, de-duplicated, order preserved
    present = list(dict.fromkeys(col for col in cols if col in df.columns))
    # `drop` returns a new frame, so the caller's DataFrame is left untouched
    remainder = df.drop(columns=present)
    if df.empty:
        return remainder

    # Expand each column into its own dataframe, then rename and number its columns
    # TODO: if only one column, don't suffix with number
    expanded = [df[col].apply(pd.Series).add_prefix(f"{col}_") for col in present]

    # Concatenate original with new dataframes
    return pd.concat([remainder] + expanded, axis=1)

In [42]:
# Expand all relation columns
# These are the columns whose cells hold lists of linked-record IDs
# (see the preview of df1_engage above), one name per related table/field.
relations = [
    "mental",
    "physical",
    "tag",
    "subloc",
    "moment_log",
    "who",
    "dose",
    "session",
    "project_location",
]

# Replace each relation column with numbered columns (e.g. tag_0, tag_1, ...);
# df1_engage itself is left as-is and the result is stored under a new name.
df2_engage = expand_list_cols(df1_engage, relations)
df2_engage.head()

Unnamed: 0,mental_note,id_num,time_in,modified,created,name,id,physical_note,mental_0,mental_1,physical_0,physical_1,tag_0,tag_1,tag_2,tag_3,subloc_0,moment_log_0,who_0,who_1,who_2,dose_0,dose_1,dose_2,dose_3,session_0,project_location_0
0,# Savor Dash Data,24800,2021-04-29T22:02:00.000Z,2021-04-29T22:05:23.000Z,2021-04-29T22:04:47.000Z,24800-Sta-Cod,rec09tUYvKhC92ZSg,,recknqtgREfJulPie,,recpSKLyNVgI0wdjf,,receKaJGCFL7i4a4g,recyPGKraDv5I0YQZ,,,rec0jakijbv2aJ2lj,,,,,,,,,recXt8s9a9fTRTbLf,recidY4IXWvLNWOp0
1,,24799,2021-04-29T21:57:00.000Z,2021-04-29T21:58:15.000Z,2021-04-29T21:58:06.000Z,24799-Sta-Wor,recyGC5jmggOHxBWm,,recV7J7xUq1TS4UaA,,recpSKLyNVgI0wdjf,,,,,,rec0jakijbv2aJ2lj,,,,,,,,,recXt8s9a9fTRTbLf,recidY4IXWvLNWOp0
2,,24798,2021-04-29T21:39:00.000Z,2021-04-29T21:38:47.000Z,2021-04-29T21:38:10.000Z,24798-Sit-Med,recxhjn6VKc2HGZbG,NSDR / mindfulness.,rec2wZ0LboOW8fyAy,,recUCSBNFwpBrnDGb,,,,,,rec0jakijbv2aJ2lj,,,,,,,,,recXt8s9a9fTRTbLf,recidY4IXWvLNWOp0
3,,24797,2021-04-29T21:29:00.000Z,2021-04-29T21:38:31.000Z,2021-04-29T21:37:53.000Z,24797-Foo-Thi,rec850VNGKKThCWT5,Pistachios and green drink.,recx5VEAjXKeYLjZF,,rec3AnOkXgiPoni7u,recpg577q4IdfjJZi,,,,,reclkHtEN4C6wr4dv,,,,,,,,,recXt8s9a9fTRTbLf,recidY4IXWvLNWOp0
4,,24796,2021-04-29T21:17:00.000Z,2021-04-29T21:37:52.000Z,2021-04-29T21:16:56.000Z,24796-Sna-Thi,recufmGBkq9KCJmxn,"Some almonds, trail mix and chicharrones in th...",recx5VEAjXKeYLjZF,,recpg577q4IdfjJZi,,,,,,recWNsckWelXKeR7d,,,,,,,,,recXt8s9a9fTRTbLf,recidY4IXWvLNWOp0
