In [None]:
# Parameter cell: this value gets overwritten when the notebook is run by
# papermill on a schedule. True selects the local-development code paths below.
is_local_development = True

# Jupyter JSON upload notebook

This notebook is a template showing how to upload data to Snowflake in two ways: a pandas DataFrame upload and a JSON file upload.

The notebook builds on, or is inspired by, the following resources:

- [GitLab Orchestration Utils](https://gitlab.com/gitlab-data/gitlab-data-utils/-/blob/master/gitlabdata/orchestration_utils.py#L282)
- [Data Science Scoring repo](https://gitlab.com/gitlab-data/data-science-projects/propensity-to-contract-and-churn/-/blob/main/prod/scoring_code.ipynb)

## Status:

20230215 NF: 


The JSON upload process works but only for smaller files (no more than 16MB). That seriously hampers its usefulness. 

The data frame upload process took around 19 minutes to execute the upload for a file of 200k rows. It is slow but it works.

A parameter and the code to handle production vs. local runs were added to the template. The gsheet write function also works. Open question: could the value be passed as a parameter instead of as an OS environment variable?

The JSON file had this error regarding size: 

```
ProgrammingError: (snowflake.connector.errors.ProgrammingError) 100069 (22P02): 01aa607c-0405-b753-0000-289d4df5c93a: Error parsing JSON: document is too large, max size 16777216 bytes
  File 'stages/b712926c-eb9b-4af5-b02c-a2ae80ce832a/upa_summary.json.gz', line 1, character 16777216
  Row 0, column $1
  If you would like to continue loading when an error is encountered, use other values such as 'SKIP_FILE' or 'CONTINUE' for the ON_ERROR option. For more information on loading options, please run 'info loading_data' in a SQL client.
[SQL: copy into raw.sales_analytics.upa_summary_json (jsontext)
                         from @raw.sales_analytics.sales_analytics_load
                         file_format=(type='json'),
                         on_error='abort_statement';]
(Background on this error at: https://sqlalche.me/e/14/f405)
```

-----------------------------
20230207 NF: The DataFrame upload process works, but the JSON process still fails with an error:

```
(snowflake.connector.errors.ProgrammingError) 000904 (42000): 01aa2b92-0405-a900-0000-289d4d4d634a: SQL compilation error: error line 1 at position 43
invalid identifier 'JSONTEXT'
[SQL: copy into raw.sales_analytics.upa_summary (jsontext)
                         from @raw.sales_analytics.sales_analytics_load
                         file_format=(type='json'),
                         on_error='skip_file';]
```

## Local variable set up:

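A local target named `sales_analytics_local` needs to be added to the file `.dbt/profiles.yml`; it is the `profile_target` used to create the Snowflake engine when the notebook runs locally: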

```
output:
   sales_analytics_local:
     type: snowflake
     threads: 16
     account: gitlab
     user: janesmith@gitlab.com # <-- This will be your GitLab email
     role: sales_analytics # <-- Talk to your manager, usually, it is JSMITH for Jane Smith
     database: RAW
     warehouse: DEV_XS # <-- [ANALYST_XS, ENGINEER_XS], depends on your role
     schema: SALES_ANALYTICS
     authenticator: externalbrowser #
```

In [None]:
# install required packages
! pip install  pygsheets
! pip install "pyarrow<5.1.0,>=5.0.0;"
! pip install --upgrade google.cloud
! pip install --upgrade pandas_gbq
! pip install pyprojroot


In [None]:
!{sys.executable} -m pip install gitlabdata --upgrade

In [None]:
import pygsheets
import configparser
import sys
import snowflake.connector
import pandas as pd
from datetime import datetime
import numpy as np                   # v 1.19.2
import matplotlib.pyplot as plt      # v 3.3.2
from matplotlib.lines import Line2D
import matplotlib.ticker as ticker
# calculate the net_arr bucket of open deals
import seaborn as sns
from math import floor 
from datetime import date
import numpy as np
import matplotlib.pyplot as plt

# https://pypi.org/project/pyprojroot/
from pyprojroot import here



In [None]:
from gitlabdata.orchestration_utils import (
    data_science_engine_factory,
    query_dataframe,
    snowflake_engine_factory,
    snowflake_stage_load_copy_remove,
    get_env_from_profile,
    dataframe_uploader,
    write_to_gsheets,
    query_executor
)

In [None]:
import os

# NF: the working directory can change between runs, so capture the current
# one and print it to make relative-path problems easy to spot
cwd = os.getcwd()
print(cwd)
# if it is wrong, switch to the notebook folder of the repository, e.g.
#os.chdir("<path to repos>/sales-strategy-and-analytics-business-intelligence/jupyter_dev/")


## Create Snowflake engine

In [None]:
# The engine creation process differs depending on where the notebook runs:
#  - locally, the engine is built from the `sales_analytics_local` profile target
#  - otherwise (is_local_development is False) it is built by
#    snowflake_engine_factory from `env` and the "sales_analytics" role

if is_local_development:
    snowflake_engine = data_science_engine_factory(profile_target="sales_analytics_local")
else:
    # `env` is not assigned anywhere in this notebook. Honour it when it has
    # been injected (e.g. as an extra papermill parameter); otherwise fall back
    # to a snapshot of the process environment instead of raising a NameError.
    # NOTE(review): presumably the factory expects the Snowflake connection
    # settings as environment-style keys -- confirm against gitlabdata.
    if "env" not in globals():
        env = os.environ.copy()
    snowflake_engine = snowflake_engine_factory(env, "sales_analytics")

# last expression of the cell: display the engine that was created
snowflake_engine

## Execute Snowflake query

In [None]:
def executeScriptFromFile(filename, engine):
    """Run the SQL script stored in ``filename`` and return the query result.

    The whole file is read as a single buffer and handed to
    ``query_dataframe`` together with ``engine``. The file name and the
    length of the script are printed before the query runs.

    Parameters
    ----------
    filename : path of the SQL file to read.
    engine : passed unchanged as the first argument of ``query_dataframe``.

    Returns
    -------
    Whatever ``query_dataframe`` returns, or ``-1`` when running the query
    raised an exception (the error is printed, not re-raised).
    """
    # the context manager closes the file even when reading it fails
    with open(filename, 'r') as sql_file:
        sqlFile = sql_file.read()

    print(filename)
    print(len(sqlFile))

    try:
        return query_dataframe(engine, sqlFile)
    except Exception as error:
        # Best-effort contract kept: report the failure and return the -1
        # sentinel. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit propagate, and printing the error
        # shows why the command did not run.
        print("Command did not run")
        print(f"{type(error).__name__}: {error}")
        return -1

In [None]:
# run the query stored in dbt_bob_upa.sql and keep its result
upa_summary = executeScriptFromFile('dbt_bob_upa.sql', snowflake_engine)

# TEST Total FY Net ARR: sum fy_booked_net_arr over the rows of fiscal year 2023
index = upa_summary['report_fiscal_year'].eq(2023)
upa_summary.loc[index].fy_booked_net_arr.sum()

In [None]:
# read the credentials of the google service account
import json
import os

if is_local_development:
    # only done for local runs: load the service-account file located via here()
    credentials_path = here('credentials/gsheet_service_file.json')

    with open(credentials_path) as credentials_file:
        raw_credentials = credentials_file.read()

    # strip the line breaks so the credentials become a single-line string
    service_account_credentials = raw_credentials.replace('\n', '')

    # set the credentials as an environment variable
    os.environ["GSHEETS_SERVICE_ACCOUNT_CREDENTIALS"] = service_account_credentials


In [None]:
# Write to GSheets: send the first 100 rows of the summary to the sheet below
sheet_id = '1eRo30S0G4-QkGBdpz7jBmRSRsMHkfysfi_xv2ab_28Q'
sheet_name = 'new_nf_testing'
gsheet_rows = upa_summary.head(100)
write_to_gsheets(sheet_id, sheet_name, gsheet_rows)

## Test to_sql from pandas

In [None]:
%%time
print(len(upa_summary))

# this works
dataframe_uploader(
    dataframe = upa_summary,
    engine = snowflake_engine,
    table_name = 'upa_summary',
    schema = "SALES_ANALYTICS",
    if_exists = "replace",
    add_uploaded_at = False
) 

## The JSON Upload process

The JSON process expects a table with two columns:
- JSONTEXT, holding the content of the JSON file
- UPDATED_AT

The table is then processed by DBT models and exposed in prod as a flat table.

**JSON upload only works for smaller files (no more than 16MB)!**

In [None]:
%%time

# OUTPUT SCORES TO JSON
output_filename = here("data/upa_summary.json")

# test with only 10 rows
output_scores = upa_summary.head(1000)
output_scores.to_json(output_filename, orient="records", date_format="iso")

# this table is later processed using dbt models
json_tablename = 'raw.sales_analytics.upa_summary_json'

# creation of target table
create_json_table_query = 'CREATE OR REPLACE TABLE {} (jsontext string, updated_at date)'.format(json_tablename)

# create or replace existing table with the JSON expected format
query_executor(snowflake_engine, create_json_table_query)

snowflake_stage_load_copy_remove(
    output_filename,
    f"raw.sales_analytics.sales_analytics_load",
    json_tablename,
    snowflake_engine,
)