In [103]:
# Papermill parameter: True is the default for local development; the value is
# overwritten when the notebook is run by papermill on a schedule.
is_local_development = True

In [104]:
!python -m pip install gitlabdata --upgrade



In [105]:
import configparser

# import sys
import pandas as pd
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
from datetime import date
import json, os
from pyprojroot import here
from os import environ as env

In [106]:
from gitlabdata.orchestration_utils import (
    data_science_engine_factory,
    query_dataframe,
    snowflake_engine_factory,
    snowflake_stage_load_copy_remove,
    get_env_from_profile,
    dataframe_uploader,
    write_to_gsheets,
    read_from_gsheets,
    query_executor,
    query_from_file,
)

## Create Snowflake engine

In [107]:
# Build the Snowflake engine. The factory that is used depends on whether this
# notebook is run locally (is_local_development is True) or remotely.

if is_local_development:
    # local run: engine built for the "sales_analytics_local" profile target
    snowflake_engine = data_science_engine_factory(
        profile_target="sales_analytics_local"
    )
else:
    # remote run: engine built from the process environment (env is os.environ)
    snowflake_engine = snowflake_engine_factory(env, "SALES_ANALYTICS")

    # NOTE: these two names are only defined on the remote path; they are not
    # referenced in the visible part of this notebook, and using them during
    # local development would raise a NameError.
    raw_db_name = env["SNOWFLAKE_LOAD_DATABASE"]
    prod_db_name = env["SNOWFLAKE_PROD_DATABASE"]

# Display the engine as the cell output.
# NOTE(review): the recorded repr shows the user, role and warehouse - consider
# clearing this output before sharing the notebook.
snowflake_engine

Engine(snowflake://nfiguera%40gitlab.com:***@gitlab/RAW/?authenticator=externalbrowser&role=NFIGUERA&warehouse=DEV_XS)

## Credentials for Gsheet manipulation

Remember to give access to the following two users:

- Data Team runner: data-team-sheets-sa@gitlab-analysis.iam.gserviceaccount.com
- Sales Strategy service account: service-revenue-strat-analytic@revenue-strategy-anal-411d5a72.iam.gserviceaccount.com

In [108]:
# Read the credentials of the Google service account (local development only).
# NOTE(review): nothing is set on the non-local path; presumably the scheduled
# environment already provides GSHEETS_SERVICE_ACCOUNT_CREDENTIALS - confirm.
if is_local_development:
    credentials_path = here("credentials/rsa_gcloud_service_account.json")
    # credentials_path = here("credentials/gsheet_service_file.json")

    with open(credentials_path) as f:
        # drop the newlines so the file content becomes a single-line string
        service_account_credentials = f.read().replace("\n", "")

    # set the credentials as an environment variable
    os.environ["GSHEETS_SERVICE_ACCOUNT_CREDENTIALS"] = service_account_credentials

# X-Ray daily ETL update process

This notebook is run daily to update the X-Ray dashboard. 

The process is responsible for updating the following datasets:

1. QTD Metrics
2. Historical values subset

## Execute Snowflake query

In [109]:
# Load the QTD metrics: query_from_file is given the engine and the SQL file name;
# the result is treated as a pandas DataFrame in the cells below.
qtd_metrics = query_from_file(snowflake_engine, "qtd_metrics_for_xray.sql")

In [110]:
# Sanity check: total QTD booked net ARR over every row of the raw extract
qtd_metrics["qtd_booked_net_arr"].sum()

20194208.07

In [111]:
# Load the historical quarter coverages and stamp every row with today's date
# as the extraction date.
hist_qtr_metrics = query_from_file(snowflake_engine, "hist_qtr_coverages_for_xray.sql")
hist_qtr_metrics["last_extracted_etl"] = date.today()

In [112]:
# Preview the first rows of the historical quarter coverages extract
hist_qtr_metrics.head()

Unnamed: 0,key_agg_day,agg_key_name,agg_key_value,agg_key_day,bookings_linearity,open_1plus_net_arr_coverage,open_3plus_net_arr_coverage,open_4plus_net_arr_coverage,rq_plus_1_open_1plus_net_arr_coverage,rq_plus_1_open_3plus_net_arr_coverage,rq_plus_1_open_4plus_net_arr_coverage,rq_plus_2_open_1plus_net_arr_coverage,rq_plus_2_open_3plus_net_arr_coverage,rq_plus_2_open_4plus_net_arr_coverage,last_extracted_etl
0,entg_emea_neur_1. new_72,key_bu_subbu_division_ot,entg_emea_neur_1. new,72,0.72015,3.486822,3.341787,2.892826,4.271278,2.517282,0.858253,2.890982,1.899353,0.270003,2023-04-13
1,entg_emea_neur_1. new_73,key_bu_subbu_division_ot,entg_emea_neur_1. new,73,0.732437,3.465852,3.346485,2.907103,4.28625,2.522603,0.883942,2.934375,1.91906,0.27538,2023-04-13
2,entg_emea_neur_1. new_74,key_bu_subbu_division_ot,entg_emea_neur_1. new,74,0.744631,3.440084,3.345312,2.91765,4.301697,2.52752,0.9111,2.978396,1.936089,0.28093,2023-04-13
3,entg_emea_neur_1. new_75,key_bu_subbu_division_ot,entg_emea_neur_1. new,75,0.756737,3.409189,3.337745,2.924127,4.317633,2.532059,0.939812,3.022996,1.950217,0.286621,2023-04-13
4,entg_emea_neur_1. new_76,key_bu_subbu_division_ot,entg_emea_neur_1. new,76,0.768757,3.372833,3.323247,2.926183,4.334073,2.536247,0.97017,3.068119,1.961209,0.292421,2023-04-13


In [113]:
# Load the historical fiscal-year / next-4-quarters coverages and stamp every row
# with today's date as the extraction date.
hist_fy_n4q_metrics = query_from_file(
    snowflake_engine, "hist_fy_n4q_coverages_for_xray.sql"
)

hist_fy_n4q_metrics["last_extracted_etl"] = date.today()

In [114]:
# Preview the first rows of the historical FY / N4Q coverages extract
hist_fy_n4q_metrics.head()

Unnamed: 0,agg_key_value_day,agg_key_value,close_day_of_fiscal_year_normalised,cfy_open_1plus_net_arr_coverage,cfy_open_3plus_net_arr_coverage,n4q_open_1plus_net_arr_coverage,n4q_open_3plus_net_arr_coverage,last_updated_at,last_extracted_etl
0,large_amer_latam_latam_ae generated_71,large_amer_latam_latam_ae generated,71,0.789236,0.808845,0.889402,0.841592,2023-04-13,2023-04-13
1,large_amer_latam_latam_ae generated_72,large_amer_latam_latam_ae generated,72,0.787871,0.80721,0.893598,0.842676,2023-04-13,2023-04-13
2,large_amer_latam_latam_ae generated_73,large_amer_latam_latam_ae generated,73,0.786868,0.805916,0.897784,0.843686,2023-04-13,2023-04-13
3,large_amer_latam_latam_ae generated_74,large_amer_latam_latam_ae generated,74,0.786219,0.804955,0.901962,0.844625,2023-04-13,2023-04-13
4,large_amer_latam_latam_ae generated_75,large_amer_latam_latam_ae generated,75,0.785917,0.80432,0.906132,0.845494,2023-04-13,2023-04-13


## QTD Metrics Extracts

### QTD Transform data into a pre-aggregated table

In [115]:
# Work on a copy called 'df' so that the column changes made further down in this
# cell do not modify the raw query result held in qtd_metrics.

df = qtd_metrics.copy()


def check_for_others_or_na(x):
    """Collapse a set of key values to "other" when any of them is a catch-all.

    Parameters
    ----------
    x : pandas.Series of str
        The key values to inspect.

    Returns
    -------
    pandas.Series
        ``x`` itself when no value contains the substring "other" and no value
        is exactly "na"; otherwise a series with every value set to "other".
    """
    key_values = list(x)

    # The tab separator can never be part of "other", so the substring test on the
    # joined text cannot match across two neighbouring values.
    if "other" not in "\t".join(key_values) and "na" not in key_values:
        return x

    return x.apply(lambda _value: "other")


# Key columns considered for the aggregation: the loop further down runs one
# group-by per entry and stacks the results.
# "key_overall" is (re)assigned on df later in this cell, before that loop runs.
agg_key_list = [
    "key_overall",
    "key_sqs",
    "key_ot",
    "key_bu",
    "key_bu_ot",
    "key_bu_sqs",
    "key_bu_subbu",
    "key_bu_subbu_ot",
    "key_bu_subbu_sqs",
    "key_bu_subbu_division",
    "key_bu_subbu_division_ot",
    "key_bu_subbu_division_sqs",
    "key_bu_subbu_division_asm",
]

# Metric columns of df: they are cast to float and then summed per aggregation
# key further down in this cell.
metrics_list = [
    "qtd_open_1plus_deal_count",
    "qtd_open_3plus_deal_count",
    "qtd_open_4plus_deal_count",
    "qtd_closed_deal_count",
    "qtd_pipe_gen_deal_count",
    "qtd_booked_net_arr",
    "qtd_open_1plus_net_arr",
    "qtd_open_3plus_net_arr",
    "qtd_open_4plus_net_arr",
    "qtd_pipe_gen_net_arr",
    "qtd_created_and_closed_net_arr",
    "rq_plus_1_open_1plus_deal_count",
    "rq_plus_1_open_3plus_deal_count",
    "rq_plus_1_open_4plus_deal_count",
    "rq_plus_1_open_1plus_net_arr",
    "rq_plus_1_open_3plus_net_arr",
    "rq_plus_1_open_4plus_net_arr",
    "rq_plus_2_open_1plus_deal_count",
    "rq_plus_2_open_3plus_deal_count",
    "rq_plus_2_open_4plus_deal_count",
    "rq_plus_2_open_1plus_net_arr",
    "rq_plus_2_open_3plus_net_arr",
    "rq_plus_2_open_4plus_net_arr",
    "minus_1_year_pipe_gen_net_arr",
    "cfy_open_1plus_deal_count",
    "cfy_open_3plus_deal_count",
    "cfy_open_4plus_deal_count",
    "cfy_closed_deal_count",
    "cfy_booked_net_arr",
    "cfy_open_1plus_net_arr",
    "cfy_open_1plus_under_1m_net_arr",
    "cfy_open_3plus_net_arr",
    "cfy_open_4plus_net_arr",
    "next_4q_open_1plus_deal_count",
    "next_4q_open_3plus_deal_count",
    "next_4q_open_4plus_deal_count",
    "next_4q_open_1plus_net_arr",
    "next_4q_open_3plus_net_arr",
    "next_4q_open_4plus_net_arr",
    "next_4q_booked_net_arr",
]

# Cast every metric column to float so the group-by sums below are numeric.
df[metrics_list] = df[metrics_list].astype(float)

# Artificial company-wide key used to calculate a global curve for the whole
# company: rows whose business unit is "other" stay "other", all the rest are
# labelled "global".
df["key_overall"] = "other"
df.loc[df["key_bu"] != "other", "key_overall"] = "global"

# One aggregated frame per key column. The frames are collected in a list and
# concatenated once: DataFrame.append was removed in pandas 2.0 and copied the
# accumulated frame on every iteration.
aggregated_frames = []
for agg_key_value in agg_key_list:
    temp_agg = (
        df.groupby(agg_key_value)[metrics_list]
        .sum()
        .reset_index()
        # every pass exposes its key under the same generic column name
        .rename(columns={agg_key_value: "agg_key_value"})
    )
    # remember which key column produced these rows
    temp_agg["agg_key_name"] = agg_key_value
    aggregated_frames.append(temp_agg)

# Row labels are kept as they are (each frame restarts at 0), which matches what
# the chained append calls used to produce.
results = pd.concat(aggregated_frames)

results.columns = results.columns.str.lower()

# Columns of the final QTD extract, in the order in which they are selected from
# `results` at the end of this cell.
fields_order = [
    "agg_key_value",
    "agg_key_name",
    "qtd_booked_net_arr",
    "qtd_open_1plus_net_arr",
    "qtd_open_3plus_net_arr",
    "qtd_open_4plus_net_arr",
    "qtd_pipe_gen_net_arr",
    "rq_plus_1_open_1plus_net_arr",
    "rq_plus_1_open_3plus_net_arr",
    "rq_plus_1_open_4plus_net_arr",
    "rq_plus_2_open_1plus_net_arr",
    "rq_plus_2_open_3plus_net_arr",
    "rq_plus_2_open_4plus_net_arr",
    "qtd_created_and_closed_net_arr",
    "minus_1_year_pipe_gen_net_arr",
    "qtd_open_1plus_deal_count",
    "qtd_open_3plus_deal_count",
    "qtd_open_4plus_deal_count",
    "qtd_closed_deal_count",
    "qtd_pipe_gen_deal_count",
    "rq_plus_1_open_1plus_deal_count",
    "rq_plus_1_open_3plus_deal_count",
    "rq_plus_1_open_4plus_deal_count",
    "rq_plus_2_open_1plus_deal_count",
    "rq_plus_2_open_3plus_deal_count",
    "rq_plus_2_open_4plus_deal_count",
    "current_fiscal_quarter_day_normalised",
    "last_updated",
    "cfy_open_1plus_deal_count",
    "cfy_open_3plus_deal_count",
    "cfy_open_4plus_deal_count",
    "cfy_closed_deal_count",
    "cfy_booked_net_arr",
    "cfy_open_1plus_net_arr",
    "cfy_open_1plus_under_1m_net_arr",
    "cfy_open_3plus_net_arr",
    "cfy_open_4plus_net_arr",
    "next_4q_open_1plus_deal_count",
    "next_4q_open_3plus_deal_count",
    "next_4q_open_4plus_deal_count",
    "next_4q_booked_net_arr",
    "next_4q_open_1plus_net_arr",
    "next_4q_open_3plus_net_arr",
    "next_4q_open_4plus_net_arr",
    "last_extracted_etl",
]

# Keep track of the latest normalised fiscal-quarter day covered by the extract and
# of the date it was taken: both are shown to the business users in the dashboard
# and used in the historical report.
# Series.max() skips missing values, whereas the builtin max() returns an
# order-dependent result as soon as a NaN is present.
results["current_fiscal_quarter_day_normalised"] = df[
    "current_fiscal_quarter_day_normalised"
].max()

# Read the date once so both audit columns always carry the same value, even
# when the cell runs across midnight.
extract_date = date.today()
results["last_updated"] = extract_date
results["last_extracted_etl"] = extract_date

# Final extract: only the expected columns, in the order given by fields_order.
qtd_pre_aggregated = results[fields_order].copy()

In [116]:
# Sanity check of the QTD total: booked net ARR per business unit, taken from
# the rows that were aggregated by "key_bu"
(
    qtd_pre_aggregated.loc[qtd_pre_aggregated["agg_key_name"] == "key_bu"]
    .groupby("agg_key_value")
    .agg({"qtd_booked_net_arr": "sum"})
)

Unnamed: 0_level_0,qtd_booked_net_arr
agg_key_value,Unnamed: 1_level_1
comm,5597009.26
entg,14727008.25
other,-129809.44


### Write to X-Ray source gSheet file

In [117]:
# Write the pre-aggregated QTD table to the "jupyter_qtd_aggregated" tab of the
# X-Ray source spreadsheet.
# NOTE(review): the recorded run logs an HTTP 400 from the addSheet request because
# a tab with this name already exists; whether the data is still written after that
# error cannot be seen from this notebook - confirm in write_to_gsheets.
sheet_id = "1Vwu8euxRgIF3QYWK8hAbp4Vy21AlFfpDwI4MaEEiIWk"
sheet_name = "jupyter_qtd_aggregated"
write_to_gsheets(sheet_id, sheet_name, qtd_pre_aggregated)

ERROR:root:<HttpError 400 when requesting https://sheets.googleapis.com/v4/spreadsheets/1Vwu8euxRgIF3QYWK8hAbp4Vy21AlFfpDwI4MaEEiIWk:batchUpdate?fields=replies%2FaddSheet&alt=json returned "Invalid requests[0].addSheet: A sheet with the name "jupyter_qtd_aggregated" already exists. Please enter another name.">


# Historical QTR Coverages Data Extract

In [118]:
# Write the historical quarter coverages to the "jupyter_hist_qtr_aggregated" tab
# of the X-Ray source spreadsheet.
# NOTE(review): the recorded run logs an HTTP 400 from the addSheet request because
# a tab with this name already exists; whether the data is still written after that
# error cannot be seen from this notebook - confirm in write_to_gsheets.
sheet_id = "1Vwu8euxRgIF3QYWK8hAbp4Vy21AlFfpDwI4MaEEiIWk"
sheet_name = "jupyter_hist_qtr_aggregated"
write_to_gsheets(sheet_id, sheet_name, hist_qtr_metrics)

ERROR:root:<HttpError 400 when requesting https://sheets.googleapis.com/v4/spreadsheets/1Vwu8euxRgIF3QYWK8hAbp4Vy21AlFfpDwI4MaEEiIWk:batchUpdate?fields=replies%2FaddSheet&alt=json returned "Invalid requests[0].addSheet: A sheet with the name "jupyter_hist_qtr_aggregated" already exists. Please enter another name.">


# Historical FY / N4Q Coverages Data Extract

In [119]:
# Write the historical FY / N4Q coverages to the "jupyter_hist_fy_n4q_aggregated"
# tab of the X-Ray source spreadsheet.
# NOTE(review): the recorded run logs an HTTP 400 from the addSheet request because
# a tab with this name already exists; whether the data is still written after that
# error cannot be seen from this notebook - confirm in write_to_gsheets.
sheet_id = "1Vwu8euxRgIF3QYWK8hAbp4Vy21AlFfpDwI4MaEEiIWk"
sheet_name = "jupyter_hist_fy_n4q_aggregated"
write_to_gsheets(sheet_id, sheet_name, hist_fy_n4q_metrics)

ERROR:root:<HttpError 400 when requesting https://sheets.googleapis.com/v4/spreadsheets/1Vwu8euxRgIF3QYWK8hAbp4Vy21AlFfpDwI4MaEEiIWk:batchUpdate?fields=replies%2FaddSheet&alt=json returned "Invalid requests[0].addSheet: A sheet with the name "jupyter_hist_fy_n4q_aggregated" already exists. Please enter another name.">
