# Savor Data

> Extract, transform, explore

---

## Setup

In [1]:
# === Imports and config === #
from os import environ
from pprint import pprint
from pathlib import Path

from airtable import Airtable
from dotenv import load_dotenv
import pandas as pd
import numpy as np

# Plotly imports
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objs as go
import plotly.io as pio

# Use the dark Plotly template for every figure by default
pio.templates.default = "plotly_dark"

# # Jupyter + Plotly offline mode (enable the 2 lines below only in classic Jupyter;
# # keep them commented out when running in Colab or Visual Studio Code)
# import plotly.offline as pyo
# pyo.init_notebook_mode()  # Set plotly to notebook mode / work offline

# Optionally silence this Numpy warning raised when using Plotly Express:
# FutureWarning: Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead.
# import warnings
# warnings.filterwarnings(action='ignore', category=FutureWarning, module='numpy')

# Pandas display limits
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 50)

# Airtable authentication: load variables from .env, then read the
# credentials from the process environment (None when a variable is unset)
load_dotenv(dotenv_path=".env")
base_key, api_key = (
    environ.get(var_name)
    for var_name in ("AIRTABLE_BASE_KEY", "AIRTABLE_API_KEY")
)

## Pipelines

- Extract + transform from "dimension" tables to match up related records' primary keys
- Extract + transform time-series data from `engage_log` into dataframe

### Relations

In [4]:
# Build `lookup`: a Series of record names indexed by (table, record id).
#
# Each "dimension" table listed below is fetched from Airtable and every
# record contributes one (table, id) -> name entry.

# Related tables to match up via ID, mapped to the fields requested from each
tables_and_fields = {
    "mental": ["name"],
    "physical": ["name"],
    "dose": ["name", "supp", "amt", "unit"],
    "who": ["name"],
    "location": ["name", "location", "city", "state"],
    "subloc": ["name"],
    "tag": ["name"],
}

tuple_data = [  # Array of tuples to be used as multi-index
    # ("physical", "recAS99RkAJBoTCgq")
]
series_data = [  # Series data to be multi-indexed
    # "Surf",
]

# Loop through tables to retrieve records and save in array
for table, fields in tables_and_fields.items():
    airtable = Airtable(  # Connect to table
        base_key,
        table,
        api_key=api_key
    )
    records = airtable.get_all(fields=fields)  # Retrieve records
    for record in records:  # Transform and save records to arrays
        tuple_data.append((table, record["id"]))
        # Airtable leaves empty fields out of the response, so a record with a
        # blank name has no "name" key. Store None for it instead of raising
        # KeyError and aborting the whole extraction.
        series_data.append(record["fields"].get("name"))

# Create multi-index object
multi_index = pd.MultiIndex.from_tuples(tuple_data, names=["table", "id"])
# Build the multi-index series that will be used to look up record names by id
lookup = pd.Series(series_data, index=multi_index)

In [6]:
# Preview the lookup series of record names indexed by (table, id)
lookup

table   id               
mental  rec0GRGSEJXCTU5od                Learn
        rec2wZ0LboOW8fyAy             Meditate
        rec5yx2uo7ZIXuqzw         Troubleshoot
        rec6RxldfHLJ2lIEQ                Solve
        rec7OsLdgXUN9XczF    Data manipulation
                                   ...        
tag     recyPGKraDv5I0YQZ               python
        recyYB2qZVKDYAADk               sojorn
        recykV5c2g43Nluq8             wishlist
        recywkIrd3W4Pm8B6          trash_panda
        reczTzMYl02Xu8OqS                  nlp
Length: 645, dtype: object

In [10]:
# Sanity check: resolve a known record ID to its name with a single (table, id) key
lookup.loc[("mental", "recknqtgREfJulPie")]

'Code'

### Time Series

In [None]:
# Open a handle on the engage_log table (source of the time-series data)
engage_log = Airtable(
    base_key,
    "engage_log",
    api_key=api_key,
)

In [None]:
# TODO: Retrieve all data from the past 7 days
for page in engage_log.get_iter(sort=["-time_in"]):
    for record in page: