In [None]:
# Papermill parameter: this value is overwritten when the notebook is run by papermill on a schedule.
# True selects the local-development branches further down (local engine profile,
# service-account credentials read from a file on disk).
is_local_development = True

In [None]:
!python -m pip install gitlabdata --upgrade

In [None]:
import configparser

# import sys
import pandas as pd
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
from datetime import date
import json, os
from pyprojroot import here
from os import environ as env
import re

In [None]:
from gitlabdata.orchestration_utils import (
    data_science_engine_factory,
    query_dataframe,
    snowflake_engine_factory,
    snowflake_stage_load_copy_remove,
    get_env_from_profile,
    dataframe_uploader,
    write_to_gsheets,
    read_from_gsheets,
    query_executor,
    query_from_file,
)

## Create Snowflake engine

In [None]:
# Build the Snowflake engine. The creation path depends on where the notebook runs:
# - scheduled runs build it from the process environment (`env`) with "SALES_ANALYTICS";
# - local runs build it from the "sales_analytics_local" profile target.
if not is_local_development:
    snowflake_engine = snowflake_engine_factory(env, "SALES_ANALYTICS")

    # database names are only read from the environment on scheduled runs
    raw_db_name = env["SNOWFLAKE_LOAD_DATABASE"]
    prod_db_name = env["SNOWFLAKE_PROD_DATABASE"]
else:
    snowflake_engine = data_science_engine_factory(
        profile_target="sales_analytics_local"
    )

# show the engine as the cell output
snowflake_engine

## Credentials for Gsheet manipulation

Remember to share every Google Sheet used by this notebook with the following two service accounts:

- Data Team runner: data-team-sheets-sa@gitlab-analysis.iam.gserviceaccount.com
- Sales Strategy service account: service-revenue-strat-analytic@revenue-strategy-anal-411d5a72.iam.gserviceaccount.com

In [None]:
# Local runs only: load the Google service-account key file and expose it through
# the GSHEETS_SERVICE_ACCOUNT_CREDENTIALS environment variable.
if is_local_development:
    # alternative key file: here("credentials/gsheet_service_file.json")
    credentials_path = here("credentials/rsa_gcloud_service_account.json")

    with open(credentials_path) as credentials_file:
        raw_credentials = credentials_file.read()

    # strip the newlines so the value is a single line
    service_account_credentials = raw_credentials.replace("\n", "")

    # publish the credentials as an environment variable
    os.environ["GSHEETS_SERVICE_ACCOUNT_CREDENTIALS"] = service_account_credentials

# Tableau ASM consolidation script

The goal of this notebook is to collect several different datasets and stack them into a single tall table that is easier to query from Tableau.

## Load Snowflake data

- Opportunity aggregated / detail
- Opportunity snapshot aggregated

In [None]:
# Live opportunity detail extract, stamped with today's date and its record type.
detail_opty = query_from_file(
    snowflake_engine, "tableau_asm_opportunity_detail.sql"
).assign(
    last_extracted_etl=date.today(),
    record_type="opportunity detail",
)

In [None]:
# Live opportunity aggregate extract, stamped with today's date and its record type.
agg_opty = query_from_file(
    snowflake_engine, "tableau_asm_opportunity_agg.sql"
).assign(
    last_extracted_etl=date.today(),
    record_type="net arr aggregated",
)

In [None]:
# Snapshot opportunity aggregate (open/closed query), stamped with today's date
# and its record type.
snap_opty = query_from_file(
    snowflake_engine, "tableau_asm_opportunity_snap_open_closed_agg.sql"
).assign(
    last_extracted_etl=date.today(),
    record_type="opportunity snapshot cq open closed agg",
)

In [None]:
# Snapshot opportunity aggregate (open, "cq plus 1" query), stamped with today's
# date and its record type.
snap_cq1_opty = query_from_file(
    snowflake_engine, "tableau_asm_opportunity_snap_open_cq_plus_1_agg.sql"
).assign(
    last_extracted_etl=date.today(),
    record_type="opportunity snapshot cq plus 1 open agg",
)

In [None]:
# Snapshot opportunity aggregate (open, "cq plus 2" query), stamped with today's
# date and its record type.
snap_cq2_opty = query_from_file(
    snowflake_engine, "tableau_asm_opportunity_snap_open_cq_plus_2_agg.sql"
).assign(
    last_extracted_etl=date.today(),
    record_type="opportunity snapshot cq plus 2 open agg",
)

In [None]:
# Live pipeline-generation aggregate, stamped with today's date and its record type.
pipe_gen_agg_opty = query_from_file(
    snowflake_engine, "tableau_asm_opportunity_pipe_gen_agg_live.sql"
).assign(
    last_extracted_etl=date.today(),
    record_type="pipe gen live agg narr",
)

In [None]:
# Pipeline-generation snapshot aggregate for the same quarter day, stamped with
# today's date and its record type.
pipe_gen_snap_opty = query_from_file(
    snowflake_engine, "tableau_asm_opportunity_pipe_gen_snap_same_day.sql"
).assign(
    last_extracted_etl=date.today(),
    record_type="pipe gen snap same day narr",
)

In [None]:
# Pipeline-generation snapshot aggregate at quarter end, stamped with today's
# date and its record type.
pipe_gen_snap_qend_opty = query_from_file(
    snowflake_engine, "tableau_asm_opportunity_pipe_gen_snap_agg.sql"
).assign(
    last_extracted_etl=date.today(),
    record_type="pipe gen snap quarter end narr",
)

In [None]:
## Consolidate tables into a single dataframe
# Every extract is stacked row-wise; the result gets a fresh 0..n-1 index.
datasets = [
    # live extracts
    detail_opty,
    agg_opty,
    # snapshot extracts
    snap_opty,
    snap_cq1_opty,
    snap_cq2_opty,
    # pipeline-generation extracts
    pipe_gen_agg_opty,
    pipe_gen_snap_opty,
    pipe_gen_snap_qend_opty,
]
df_consolidated = pd.concat(datasets, ignore_index=True)

In [None]:
# Non-null value count of every column, per record type
df_consolidated.groupby("record_type").count()

# Extra adjustments to the dataset

## Load Industry Category

The input tab is here https://docs.google.com/spreadsheets/d/19PPoHdc5nRZRX3dKGeGyYyTwqZ_x7GwD3EaDnwVm9Xs/edit#gid=976122736

The tab was originally created by Meri and adjusted by me.

In [None]:
# Google Sheet that holds the industry -> industry category mapping tab
GSHEET_ID_NET_ARR = "19PPoHdc5nRZRX3dKGeGyYyTwqZ_x7GwD3EaDnwVm9Xs"

sheet_name = "input_industry_to_industry_category"
sheet_id = GSHEET_ID_NET_ARR

industry_category = read_from_gsheets(sheet_id, sheet_name)

# number of rows read from the mapping tab
print(len(industry_category))

In [None]:
# Attach the industry category from the mapping sheet and overwrite `industry`
# with it, so consumers keep reading the same column name.
#
# how="left" keeps every consolidated row: the default inner join silently
# dropped rows whose industry had no entry in the mapping sheet. Rows without
# a match keep their original industry value.
# Dropping a pre-existing `industry_category` column first lets this cell be
# re-run without producing `_x`/`_y` suffixed duplicates.
df_consolidated = df_consolidated.drop(
    columns=["industry_category"], errors="ignore"
).merge(industry_category, on="industry", how="left")
df_consolidated["industry"] = df_consolidated["industry_category"].fillna(
    df_consolidated["industry"]
)

In [None]:
# Display the consolidated frame after the industry category merge: the original
# `industry` field has been substituted with the adjusted category, so the
# column name used by the view stays the same.
df_consolidated

# Tests



In [None]:
# Sanity check: total net_arr of the "net arr aggregated" rows flagged
# is_open_stage_1_plus that close in FY24-Q2 (quarter is hard-coded).
is_open_1plus = df_consolidated["is_open_stage_1_plus"].eq(True)
is_net_arr_agg = df_consolidated["record_type"].eq("net arr aggregated")
closes_in_quarter = df_consolidated["close_fiscal_quarter_name"].eq("FY24-Q2")

index = is_open_1plus & is_net_arr_agg & closes_in_quarter
df_consolidated.loc[index, "net_arr"].sum()

In [None]:
# Sanity check: net_arr per industry for the "net arr aggregated" rows flagged
# is_open_stage_1_plus that close in FY24-Q3 (quarter is hard-coded).
is_open_1plus = df_consolidated["is_open_stage_1_plus"].eq(True)
is_net_arr_agg = df_consolidated["record_type"].eq("net arr aggregated")
closes_in_quarter = df_consolidated["close_fiscal_quarter_name"].eq("FY24-Q3")

index = is_open_1plus & is_net_arr_agg & closes_in_quarter
df_consolidated[index].groupby("industry")["net_arr"].sum()

In [None]:
### Add Temporary fields
# Placeholder columns that the upload column list expects; every row is set to 0.
for placeholder_column in ("sao_count", "churn_contraction_net_arr"):
    df_consolidated[placeholder_column] = 0

## Save Consolidated Table into Snowflake

In [None]:
# Master list of columns selected from df_consolidated for the Snowflake upload.
# The list order is the column order of the uploaded frame, and every name must
# exist in df_consolidated when the upload cell selects them.
target_columns = [
    "record_type",
    "owner_id",
    "opportunity_owner",
    "account_id",
    "account_name",
    "report_opportunity_user_business_unit",
    "report_opportunity_user_sub_business_unit",
    "report_opportunity_user_division",
    "report_opportunity_user_asm",
    "report_opportunity_user_role_type",
    "deal_size_bin",
    "age_bin",
    "partner_category",
    "sales_qualified_source",
    "stage_name",
    "order_type_stamped",
    "deal_group",
    "sales_type",
    "forecast_category_name",
    "product_category_tier",
    "product_category_deployment",
    "parent_crm_account_upa_country_name",
    "is_web_portal_purchase",
    "is_open",
    "is_stage_1_plus",
    "is_stage_3_plus",
    "fpa_master_bookings_flag",
    "is_eligible_created_pipeline_flag",
    "opportunity_id",
    "opportunity_name",
    "close_date",
    "created_date",
    "pipeline_created_date",
    "report_date",
    "net_arr",
    "booked_net_arr",
    "open_1plus_net_arr",
    "deal_count",
    "booked_deal_count",
    "age_in_days",
    "total_professional_services_value",
    "total_book_professional_services_value",
    "total_lost_professional_services_value",
    "total_open_professional_services_value",
    "prev_quarter_booked_net_arr",
    "prev_quarter_booked_deal_count",
    "prev_quarter_booked_professional_services",
    "prev_year_booked_net_arr",
    "prev_year_booked_deal_count",
    "prev_year_booked_professional_services",
    "is_open_pipeline_range_flag",
    "is_bookings_range_flag",
    "is_open_stage_1_plus",
    "is_open_stage_3_plus",
    "close_fiscal_year",
    "close_fiscal_quarter_name",
    "key_bu_subbu_division_asm_sqs_ot",
    "key_bu_subbu",
    "last_extracted_etl",
    "snapshot_date",
    "is_cfq_flag",
    "is_cfq_plus_1_flag",
    "is_cfq_plus_2_flag",
    "industry",
    "pipeline_landing_quarter",
    "pipeline_created_fiscal_quarter_name",
    "pipeline_created_fiscal_year",
    "lam_dev_count_bin",
    "lam_dev_count",
    "is_pipe_gen_range_flag",
    "prev_year_net_arr",
    "prev_quarter_net_arr",
    "is_eligible_open_pipeline_flag",
    "sao_count",
    "churn_contraction_net_arr",
    "created_fiscal_quarter_name",
    "created_fiscal_year",
    "prev_quarter_booked_churned_contraction_net_arr",
    "booked_churned_contraction_net_arr",
    "booked_churned_contraction_deal_count",
    "churned_contraction_net_arr",
    "prev_year_booked_churned_contraction_net_arr",
    "current_stage_age_bin",
    "is_cfy_flag",
]

In [None]:
# Columns present in the consolidated frame that the master upload list leaves out
consolidated_columns = set(df_consolidated.columns)
list(consolidated_columns.difference(target_columns))

In [None]:
# Log the current wall-clock time.
# Import the class (not the module): a bare `import datetime` here would rebind
# the name `datetime`, which the import cell at the top of the notebook binds to
# the class, to the module for every cell executed afterwards.
from datetime import datetime

# ct stores current time
ct = datetime.now()
print("current time:-", ct)

In [None]:
%%time
# upload to database
print(len(df_consolidated))

table_name = "tableau_asm_consolidated_sources"
schema = "SALES_ANALYTICS"

dataframe_uploader(
    dataframe=df_consolidated[target_columns],
    engine=snowflake_engine,
    table_name=table_name,
    schema=schema,
    if_exists="replace",
    add_uploaded_at=True,
)

In [None]:
# Log the current wall-clock time.
# Import the class (not the module): a bare `import datetime` here would rebind
# the name `datetime`, which the import cell at the top of the notebook binds to
# the class, to the module for every cell executed afterwards.
from datetime import datetime

# ct stores current time
ct = datetime.now()
print("current time:-", ct)