# TetraScience Scientific Data Cloud Configuration Report
This script generates an Excel report with Agent and Pipeline configuration details for a Tetra Data Platform (TDP) organization.

Executing this script requires an Administrator user's personal token, or a token from a Service User with the Administrator role.

Replace the values in <brackets> in the "Notebook Parameters" section below, and run all cells. When prompted in the "Notebook Parameters" cell, paste a valid authentication token into the input box and hit return/enter.

The output of this script is an Excel file saved in the `SAVE_DIR` location. If you are using Google Colab with the default `SAVE_DIR`, you can find and download the file from the Files browser in the sidebar.

## Import Statements

In [None]:
!pip install xlsxwriter
import os
import json
import requests
from datetime import datetime
from zoneinfo import ZoneInfo
import pandas as pd
import getpass
import xlsxwriter

## Notebook Parameters
Required parameters:
* `API_URL` = API base URL for your TDP instance, ending in "/v1/"
 * The format is `https://api.<your TDP hostname>/v1/` -- e.g. `https://api.tetrascience.com/v1/`
* `ORG_SLUG` = organization slug for the target TDP organization
 * Locate this in the platform in Administration > Organization Settings > Settings in the "Organization Slug" pane
* `AUTH_TOKEN` = You will be prompted to paste in a value, which must be an Administrator-level personal access token for TDP, or a token from a Service User with the Administrator role
 * See the linked pages from the [Create a JWT documentation](https://developers.tetrascience.com/reference/authentication#create-a-jwt) for details on these options.

Optional parameters, you may leave the default values:
* `SAVE_DIR` = directory on your local machine to save the output Excel file
 * Leave the default value `./` to save in the same location as this notebook file
* `TZ_IDENTIFIER` = Time zone identifier from the `tz database`, used to generate the timestamp in the filename and on the Info sheet
 * More information about Time Zones in Python can be found [here](https://docs.python.org/3/library/zoneinfo.html).




In [None]:
# Base URL of the TDP API. Endpoint paths are appended to it directly, so it
# must end in "/v1/".
API_URL = "https://api.<your TDP hostname>/v1/"
# Slug of the target organization; sent as the "x-org-slug" request header and
# embedded in the output filename.
ORG_SLUG = ""

# Directory that receives the generated Excel file.
SAVE_DIR = "./"
# tz database name used when generating the report timestamp.
TZ_IDENTIFIER = "America/New_York"

# Prompt for the token interactively instead of hard-coding it in the notebook.
AUTH_TOKEN = getpass.getpass()

## Create filename

This uses the current date and time to document when the configuration was retrieved from the platform.

In [None]:
# Capture one timezone-aware instant so the Info sheet and the filename agree.
now = datetime.now(tz=ZoneInfo(TZ_IDENTIFIER))

# Human-readable form, recorded later on the Info sheet.
timestamp = f"{now:%Y-%m-%d %H:%M:%S}"

# Compact form (no spaces or colons) used in the workbook filename.
filenameBase = f"TDP-config-report__{ORG_SLUG}__"
savefile_name = os.path.join(SAVE_DIR, f"{filenameBase}{now:%Y-%m-%d-%H%M%S}.xlsx")
print("savefile_name =", savefile_name)

## Standard API Headers



In [None]:
# Headers sent with every TDP API request made by this notebook.
headers = {
    "ts-auth-token": AUTH_TOKEN,  # token entered at the prompt
    "x-org-slug": ORG_SLUG,  # target organization slug
}

## API Endpoints

In [None]:
# Endpoint URLs are formed by appending the path straight onto API_URL,
# which is why API_URL has to end with a "/".
PIPELINE_SEARCH = f"{API_URL}pipeline/search"
AGENT_LIST = f"{API_URL}agents?include=labels"

## Export Info Sheet

In [None]:
# Run settings recorded on the "Info" sheet so the report documents how and
# when it was produced.
info_data = {
    "Name": ["timestamp", "API_URL", "ORG_SLUG", "TZ_IDENTIFIER"],
    "Value": [timestamp, API_URL, ORG_SLUG, TZ_IDENTIFIER],
}
info_df = pd.DataFrame(info_data)

In [None]:
# Echo the run settings as plain text (no index, no header row) for a quick check.
print(info_df.to_string(index=False, header=False))

## Pipeline Configuration Sheet

In [None]:
def get_pipeline_page(headers, **kwargs):
    """
        Fetch one page of pipelines from the pipeline search endpoint.

        Args:
            headers: HTTP headers sent with the request.
            **kwargs: Optional paging controls.
                index: value sent as the "from" query parameter.
                size: value sent as the "size" query parameter.

        Returns:
            Tuple (hits, hasNext) read from the JSON response: the pipelines
            on this page and whether more pipelines remain.

        Raises:
            requests.HTTPError: if the API answers with an error status.
    """
    # Forward only the paging options the caller actually supplied.
    query = {}
    if "index" in kwargs:
        query["from"] = kwargs["index"]
    if "size" in kwargs:
        query["size"] = kwargs["size"]

    pipeline_response = requests.get(PIPELINE_SEARCH, headers=headers, params=query)
    # Surface the HTTP status (e.g. an invalid token) here, rather than an
    # opaque KeyError on "hits" when the body is an error payload.
    pipeline_response.raise_for_status()
    payload = pipeline_response.json()

    return payload["hits"], payload["hasNext"]

In [None]:
def get_all_pipelines(headers, size=1):
    """
        Collect every pipeline by requesting successive pages until the API
        reports that none remain.

        Args:
            headers: HTTP headers passed through to get_pipeline_page.
            size: number of pipelines requested per page.

        Returns:
            List of all pipelines, in the order the pages returned them.
    """
    all_pipelines = []
    index = 0
    has_next = True
    while has_next:
        pipes, has_next = get_pipeline_page(headers, size=size, index=index)
        if not pipes:
            # An empty page makes no progress; stop rather than loop forever
            # if the API keeps reporting hasNext.
            break
        all_pipelines.extend(pipes)
        # NOTE(review): the cursor advances by one per request, which lines up
        # with the default size=1. If "from" is a record offset, a larger size
        # would need `index += len(pipes)` -- confirm against the API docs
        # before raising the page size.
        index += 1
    return all_pipelines

In [None]:
def get_pipeline_field(data, field):
    """
        Return the report value for one column of a pipeline record.

        "triggerCondition", "stepsConfig" and "pipelineConfig" are rendered
        as indented JSON text (the text "null" when the field is absent);
        every other field is returned as stored, or None when absent.

        Side effect: when "pipelineConfig" contains a "notificationsConfig"
        entry, that entry is removed from it and stored, as JSON text, under
        data["notificationsConfig"]. "pipelineConfig" must therefore be read
        before "notificationsConfig" for that column to be populated.
    """
    if field in ("triggerCondition", "stepsConfig"):
        return json.dumps(data.get(field), indent=2)

    if field == "pipelineConfig":
        config = data.get(field)
        # A record with no pipelineConfig has nothing to split out; calling
        # .get on None here used to raise AttributeError.
        if isinstance(config, dict) and config.get("notificationsConfig") is not None:
            # Remove notificationsConfig element and insert separately
            notifications = config.pop("notificationsConfig")
            data["notificationsConfig"] = json.dumps(notifications, indent=2)

        return json.dumps(config, indent=2)

    return data.get(field)

In [None]:
# Retrieve every pipeline for the organization (one API request per page).
pipeline_list = get_all_pipelines(headers)

In [None]:
# Pipeline record fields exported as columns, in sheet order.
# "pipelineConfig" must stay ahead of "notificationsConfig": reading the former
# is what fills in the latter (see get_pipeline_field).
pipeline_top_fields = [
    "id",
    "name",
    "description",
    "status",
    "createdAt",
    "updatedAt",
    "triggerCondition",
    "maxParallelWorkflows",
    "priority",
    "retryBehavior",
    "retryConfiguration",
    "protocolSlug",
    "protocolVersion",
    "pipelineConfig",
    "notificationsConfig",
    "stepsConfig",
]

# Columns holding multi-line JSON text; these get text wrapping in Excel.
pipeline_cols_to_wrap = [
    "triggerCondition",
    "pipelineConfig",
    "notificationsConfig",
    "stepsConfig",
]

In [None]:
def pipeline_summary(pipeline_info, org):
    """
        Build one report row for a pipeline: the organization slug followed
        by the value of each entry in pipeline_top_fields, in that order.
    """
    row = [org]
    for field_name in pipeline_top_fields:
        row.append(get_pipeline_field(pipeline_info, field_name))
    return row

In [None]:
# One report row per pipeline, each starting with the organization slug.
pipeline_summaries = [pipeline_summary(a, ORG_SLUG) for a in pipeline_list]

In [None]:
# Column headers mirror the row layout built by pipeline_summary.
pipeline_df = pd.DataFrame(pipeline_summaries, columns=["orgSlug", *pipeline_top_fields])

In [None]:
# Show the pipeline table as the cell output for review before export.
pipeline_df

## Agent Configuration Sheet

In [None]:
# Fetch the organization's agents (the endpoint is asked to include labels).
agent_response = requests.get(AGENT_LIST, headers=headers)
# Stop on an HTTP error (e.g. an invalid token) instead of parsing the error
# body and failing later while summarizing agents.
agent_response.raise_for_status()
agent_list = agent_response.json()

In [None]:
# Top-level agent fields exported as columns, in sheet order.
agent_top_fields = [
    "orgSlug",
    "name",
    "description",
    "id",
    "isEnabled",
    "status",
    "version",
    "labels",
    "tags",
    "metadata",
    "host",
    "createdAt",
    "updatedAt",
    "configStatusUpdatedAt",
    "type",
    "liveType",
    "integrationType",
    "integrationId",
]

# Column taken from the agent's "queue" section.
agent_queue_fields = ["queue_enabled"]

# Column taken from the agent's "config" section.
agent_config_fields = ["destination_id"]

# Columns describing the watched paths of a file-log agent.
agent_paths_fields = [
    "paths",
    "paths_start_date",
    "paths_source_type",
    "paths_interval",
    "paths_labels",
    "paths_tags",
    "paths_metadata",
    "paths_patterns",
    "paths_filewatchmode",
    "paths_fetch_os_created_user",
    "paths_archive",
]

# Full column layout of the agent sheet.
agent_fields = [
    *agent_top_fields,
    *agent_queue_fields,
    *agent_config_fields,
    *agent_paths_fields,
]

# Maintain a list of JSON columns to wrap text in Excel
agent_cols_to_wrap = ["labels", "metadata", "tags", *agent_paths_fields]

In [None]:
def get_agent_field(data, field):
    """
        Look up one top-level agent field, echo it to stdout, and return it.

        "labels", "metadata" and "tags" are rendered as indented JSON text;
        any other field is returned as stored (None when absent).
    """
    value = data.get(field)
    if field not in ("labels", "metadata", "tags"):
        print(field + ": " + str(value))
        return value

    rendered = json.dumps(value, indent=2)
    print(field + ": " + rendered)
    return rendered

In [None]:
def agent_summary(agent_info):
    """
        Build one report row for an agent.

        The row is laid out as agent_top_fields + agent_queue_fields +
        agent_config_fields + agent_paths_fields. The config and path columns
        are filled only for "file-log" agents that have a config; for every
        other agent they hold "N/A". Each path column is the JSON text of a
        list with one entry per watched path.
    """
    agent_top_vals = [get_agent_field(agent_info, x) for x in agent_top_fields]

    queue = agent_info.get("queue")
    if queue:
        agent_queue_vals = [queue.get("enabled")]
    else:
        agent_queue_vals = ["N/A"] * len(agent_queue_fields)

    config = agent_info.get("config")
    is_file_log = agent_info.get("type") == "file-log" and config is not None

    agent_config_vals = [
        config[f] if is_file_log and config.get(f) is not None else "N/A"
        for f in agent_config_fields
    ]

    # Get additional File-Log Agent Configuration Information
    if is_file_log:
        # Tolerate a config without a fileWatcher section: report it as an
        # agent with no watched paths rather than raising KeyError.
        services = config.get("services_configuration") or {}
        watcher = services.get("fileWatcher") or {}
        paths_info = watcher.get("paths") or []

        # Keys are listed in the same order as agent_paths_fields so every
        # value lands under its own header (tags and metadata used to be
        # swapped relative to the paths_tags / paths_metadata columns).
        path_keys = [
            "path",
            "start_date",
            "source_type",
            "interval",
            "labels",
            "tags",
            "metadata",
            "patterns",
            "file_watch_mode",
            "fetch_os_created_user",
            "archive",
        ]
        # Format the lists of Path fields for readability
        agent_subvals = [
            json.dumps([path.get(key) for path in paths_info], indent=2)
            for key in path_keys
        ]
    else:
        agent_subvals = ["N/A"] * len(agent_paths_fields)

    return agent_top_vals + agent_queue_vals + agent_config_vals + agent_subvals

In [None]:
# One report row per agent; get_agent_field also prints each top-level field as it is read.
agent_summaries = [agent_summary(a) for a in agent_list]

In [None]:
# Column headers (agent_fields) mirror the row layout returned by agent_summary.
agent_df = pd.DataFrame(agent_summaries, columns = agent_fields)

In [None]:
# Show the agent table as the cell output for review before export.
agent_df

## Save to Excel

In [None]:
def write_df_to_excel(writer, df, sheet_name, cols_to_wrap):
    """
        Write a DataFrame to its own worksheet and enable text wrapping on
        the listed columns.

        Args:
            writer: pandas ExcelWriter backed by the xlsxwriter engine.
            df: DataFrame to export (written without its index).
            sheet_name: name of the worksheet to create.
            cols_to_wrap: column labels to set 30 wide with wrapped text;
                labels not present in df are skipped. May be empty or None.
    """
    df.to_excel(writer, sheet_name=sheet_name, index=False)
    if not cols_to_wrap:
        return

    worksheet = writer.sheets[sheet_name]
    wrap_format = writer.book.add_format({'text_wrap': True})

    for col in df.columns.get_indexer(cols_to_wrap):
        # get_indexer reports -1 for a label that is not in df; there is no
        # worksheet column to format in that case.
        if col < 0:
            continue
        # With index=False the DataFrame position equals the worksheet
        # column, so the zero-based index can be passed to set_column as is.
        worksheet.set_column(int(col), int(col), 30, wrap_format)

In [None]:
# Write the workbook: the Info sheet first, then one sheet per configuration table.
with pd.ExcelWriter(savefile_name, engine="xlsxwriter") as writer:
    info_df.to_excel(writer, sheet_name="Info", index=False, header=False)
    for frame, sheet, wrap_cols in (
        (agent_df, "Agent Cfg", agent_cols_to_wrap),
        (pipeline_df, "Pipeline Cfg", pipeline_cols_to_wrap),
    ):
        write_df_to_excel(writer, frame, sheet, wrap_cols)