In [1]:
from sqlanalyzer import column_parser, unbundle, query_analyzer
import re, json, time, sys
import pandas as pd

## Extract the columns referenced by a SQL query, given db metadata

In [2]:
def extract_subquery_fields(query, db_fields):
    """Match the fields used by one (sub)query against database metadata.

    The query text is handed to ``column_parser.Parser`` twice: once to
    build the parser and once to ``format_query``. The formatted text is
    then passed, together with ``db_fields``, to ``match_queried_fields``.

    Args:
        query: SQL text of a single query or sub-query.
        db_fields: database metadata forwarded unchanged to
            ``match_queried_fields`` (schema not visible here — TODO confirm).

    Returns:
        Whatever ``match_queried_fields`` returns for the formatted query.
    """
    parser = column_parser.Parser(query)
    return parser.match_queried_fields(parser.format_query(query), db_fields)
 
    
def unnest_query_list(query_list):
    """Flatten a nested list of ``{alias: query}`` dicts into a list of strings.

    Every dict value is either a query string, which is collected, or a
    nested list of dicts of the same shape, which is walked recursively.
    Strings are returned in depth-first, left-to-right order. The previous
    implementation hard-coded three nesting levels and silently dropped
    anything deeper; this version handles any depth and gives the same
    result for inputs nested three levels or fewer.

    Args:
        query_list: iterable of dicts whose values are query strings or
            nested lists/tuples of such dicts.

    Returns:
        A flat list of every query string found. Dict keys are ignored.
    """
    preprocess_list = []

    def _collect(entries):
        # Walk one nesting level, descending into nested lists as they appear
        # so the depth-first order of the original loops is kept.
        for entry in entries:
            for value in entry.values():
                if isinstance(value, str):
                    preprocess_list.append(value)
                elif isinstance(value, (list, tuple)):
                    _collect(value)
                # Any other value type is skipped, as the original did at
                # its innermost level.

    _collect(query_list)
    return preprocess_list


In [3]:
query = """WITH opportunity_product_rev AS
  (SELECT opportunity_id,
          op.product_2_id, p.name AS product_name,
                           COALESCE(CAST(quantity AS DOUBLE) * CAST(list_price AS DOUBLE), 0) AS total_price,
                           netsuite_conn_netsuite_item_key_id_c AS netsuite_conn_net_suite_item_key_id_c,
                           COUNT(netsuite_conn_netsuite_item_key_id_c) OVER (PARTITION BY opportunity_id) AS num_netsuite_items
   FROM sfdc.opportunity_product op
   LEFT JOIN sfdc.products p ON op.product_2_id = p.id
   AND p.dt = '{run_date}'
   WHERE op.dt = '{run_date}'
     AND opportunity_id IS NOT NULL
     AND product_name_c IS NOT NULL
     AND op.is_deleted = FALSE ),

product_mapping AS
  (SELECT opportunity_id,
          op.product_name,
          COALESCE(mapped_product, 'unmapped') AS mapped_product,
          CASE
              WHEN op.opportunity_id = '0063600000B1icTAAR'
                   AND op.product_2_id = '01t36000001SzdWAAS' THEN 0.0
              WHEN op.opportunity_id = '00636000004JKS0AAO'
                   AND op.product_2_id = '01t36000000iDrwAAE' THEN 42.0
              WHEN op.opportunity_id = '00636000004JKS0AAO'
                   AND op.product_2_id = '01t36000001SzdWAAS' THEN 58.0
              WHEN op.opportunity_id = '00636000006IgpvAAC'
                   AND op.product_2_id = '01t36000000iDrwAAE' THEN 9.0
              WHEN op.opportunity_id = '00636000006IgpvAAC'
                   AND op.product_2_id = '01t36000001SzdWAAS' THEN 91.0
              WHEN op.opportunity_id = '00636000008xsoWAAQ'
                   AND op.product_2_id = '01t360000007IMZAA2' THEN 3.0
              WHEN op.opportunity_id = '00636000008xsoWAAQ'
                   AND op.product_2_id = '01t36000000iDrwAAE' THEN 79.5
              WHEN op.opportunity_id = '00636000008xsoWAAQ'
                   AND op.product_2_id = '01t36000001SzdWAAS' THEN 17.5
              WHEN op.opportunity_id = '00636000008xsoWAAQ'
                   AND op.product_2_id = '01t36000003Mn0JAAS' THEN 0.0
              WHEN op.opportunity_id = '0063600000IH8B9AAL'
                   AND op.product_2_id = '01t360000005tjBAAQ' THEN 0.2
              WHEN op.opportunity_id = '0063600000IH8B9AAL'
                   AND op.product_2_id = '01t360000007IMZAA2' THEN 9.6
              WHEN op.opportunity_id = '0063600000IH8B9AAL'
                   AND op.product_2_id = '01t36000001SzdWAAS' THEN 61.4
              WHEN op.opportunity_id = '0063600000IH8B9AAL'
                   AND op.product_2_id = '01t36000004WEfoAAG' THEN 28.3
              WHEN op.opportunity_id = '0063600000IH8B9AAL'
                   AND op.product_2_id = '01t36000004OtprAAC' THEN 0.5
              WHEN op.opportunity_id = '0061R00000ogYPlQAM'
                   AND op.product_2_id = '01t36000005vWnJAAU' THEN 13.33
              WHEN op.opportunity_id = '0061R00000ogYPlQAM'
                   AND op.product_2_id = '01t36000005vg85AAA' THEN 66.67
              WHEN op.opportunity_id = '0061R00000ogYPlQAM'
                   AND op.product_2_id = '01t36000005vX1wAAE' THEN 20.0
              ELSE total_price
          END AS list_price_value
   FROM opportunity_product_rev op
   LEFT JOIN
     (SELECT name AS product_name,
             COALESCE(MAX(service_organization_c), 'unmapped') AS mapped_product,
             COALESCE(MAX(service_organization_c), 'unmapped') AS mid_product,
             COALESCE(MAX(sku_id_c), 'unknown') AS endpoint
      FROM sfdc.products
      WHERE dt = '{run_date}'
      GROUP BY name) sm ON op.product_name = sm.product_name
   WHERE total_price > 0
     AND (netsuite_conn_net_suite_item_key_id_c IS NOT NULL
          OR num_netsuite_items = 0 ) ),

pricing_by_method AS
  (SELECT opportunity_id,
          mapped_product,
          product_name,
          SUM(list_price_value) AS list_price_value
   FROM product_mapping
   GROUP BY opportunity_id,
            mapped_product,
            product_name),

opportunity_to_name AS
  (SELECT id AS account_id,
          CONCAT_WS(',', COLLECT_SET(LOWER(name))) AS account_name
   FROM sfdc.accounts
   WHERE dt = '{run_date}'
   GROUP BY id),

opportunity_arr_tmp_org AS
  (SELECT so.account_id,
          COALESCE(account_name, so.account_id) AS account_name,
          id AS opportunity_id,
          prior_opportunity_c AS prior_opportunity_id,
          DATE_FORMAT(service_start_date_c, 'yyyy-MM-dd') AS service_start_day,
          DATE_FORMAT(CASE
                          WHEN stage_name NOT IN ('Won', '7 - ICR', 'Won - Pending')
                               AND service_start_date_c IS NOT NULL THEN service_start_date_c
                          ELSE effective_date_c
                      END, 'yyyy-MM-dd') AS effective_day, CASE
                                                               WHEN service_end_date_c IS NULL THEN ADD_MONTHS(DATE_FORMAT(service_start_date_c, 'yyyy-MM-dd'), 12)
                                                               ELSE DATE_FORMAT(service_end_date_c, 'yyyy-MM-dd')
                                                           END AS service_end_day,
                                                           stage_name,
                                                           TYPE,
                                                           COALESCE(CAST(arr_c AS DOUBLE), 0) AS arr
   FROM sfdc.opportunities so
   LEFT JOIN opportunity_to_name otn ON so.account_id = otn.account_id
   WHERE dt = '{run_date}'
     AND service_start_date_c IS NOT NULL
     AND so.account_id IS NOT NULL
     AND is_deleted = FALSE
     AND (stage_name IN ('Won',
                         '7 - ICR',
                         'Won - Pending')
          OR (TYPE = 'Renewal Business'
              AND stage_name NOT IN ('Lost',
                                     'Dead',
                                     'Closed - No Decision')))
     AND non_enterprise_c = FALSE
     AND (service_end_date_c IS NULL
          OR DATE_FORMAT(service_end_date_c, 'yyyy-MM') > DATE_FORMAT(service_start_date_c, 'yyyy-MM'))
     AND COALESCE(CAST(arr_c AS DOUBLE), 0) > 0 ),

opportunity_arr_tmp AS
  (SELECT a.account_id,
          a.account_name,
          a.opportunity_id,
          a.prior_opportunity_id,
          a.service_start_day,
          a.effective_day,
          a.service_end_day,
          a.stage_name,
          a.type,
          CASE
              WHEN a.stage_name IN ('Won',
                                    '7 - ICR',
                                    'Won - Pending') THEN a.arr
              ELSE b.arr
          END AS arr
   FROM opportunity_arr_tmp_org a
   LEFT JOIN opportunity_arr_tmp_org b ON COALESCE(a.prior_opportunity_id, '') = b.opportunity_id
   WHERE a.stage_name IN ('Won',
                          '7 - ICR',
                          'Won - Pending')
     OR (a.stage_name NOT IN ('Won',
                              '7 - ICR',
                              'Won - Pending')
         AND b.service_end_day >= DATE_SUB('{run_date}', 45) ) ),

opportunity_arr AS
  (SELECT oa.account_id,
          oa.account_name,
          oa.opportunity_id,
          oa.prior_opportunity_id,
          oa.service_start_day,
          CASE
              WHEN oa.opportunity_id IN ('0063600000VQA62AAH',
                                         '00636000004NSbIAAW',
                                         '0063600000VwjuXAAR') THEN oa.service_start_day
              ELSE oa.effective_day
          END AS effective_day,
          CASE
              WHEN DATE_FORMAT(oaa.service_start_day, 'yyyy-MM-01') <= DATE_FORMAT(oa.service_end_day, 'yyyy-MM-01') THEN ADD_MONTHS(DATE_FORMAT(oaa.service_start_day, 'yyyy-MM-01'), -1)
              ELSE oa.service_end_day
          END AS service_end_day,
          CASE
              WHEN DATE_FORMAT(oaa.service_start_day, 'yyyy-MM-01') <= DATE_FORMAT(oa.service_end_day, 'yyyy-MM-01') THEN TRUE
              ELSE FALSE
          END AS is_superceded,
          oa.stage_name,
          oa.type,
          oa.arr
   FROM opportunity_arr_tmp oa
   INNER JOIN opportunity_arr_tmp oaa ON oa.opportunity_id = oaa.prior_opportunity_id
   AND oaa.type = 'Renewal Business'
   AND oaa.prior_opportunity_id IS NOT NULL
   UNION ALL SELECT a.account_id,
                    a.account_name,
                    a.opportunity_id,
                    a.prior_opportunity_id,
                    a.service_start_day,
                    CASE
                        WHEN a.opportunity_id IN ('0063600000VQA62AAH',
                                                  '00636000004NSbIAAW',
                                                  '0063600000VwjuXAAR') THEN a.service_start_day
                        ELSE a.effective_day
                    END AS effective_day,
                    a.service_end_day,
                    FALSE AS is_superceded,
                             a.stage_name,
                             a.type,
                             a.arr
   FROM opportunity_arr_tmp a
   LEFT JOIN opportunity_arr_tmp b ON a.opportunity_id = b.prior_opportunity_id
   AND b.type = 'Renewal Business'
   AND b.prior_opportunity_id IS NOT NULL
   WHERE b.opportunity_id IS NULL ),

pricing_by_method_nw AS
  (SELECT pm.opportunity_id,
          mapped_product,
          product_name,
          list_price_value
   FROM pricing_by_method pm
   INNER JOIN opportunity_arr oa ON pm.opportunity_id = oa.opportunity_id
   WHERE oa.stage_name IN ('Won',
                           '7 - ICR',
                           'Won - Pending')
   UNION ALL SELECT a.opportunity_id, mapped_product, product_name,
                                                      list_price_value
   FROM
     (SELECT opportunity_id,
             prior_opportunity_id
      FROM opportunity_arr
      WHERE stage_name NOT IN ('Won',
                               '7 - ICR',
                               'Won - Pending')
        AND prior_opportunity_id IS NOT NULL ) a
   INNER JOIN pricing_by_method pmm ON a.prior_opportunity_id = pmm.opportunity_id),

account_product AS
  (SELECT account_id,
          COALESCE(mapped_product, 'maps') AS mapped_product,
          COALESCE(product_name, 'unknown') AS product_name, MIN(service_start_day) AS service_start_day,
                                                             MAX(service_end_day) AS service_end_day
   FROM opportunity_arr oa
   LEFT JOIN pricing_by_method_nw pm ON oa.opportunity_id = pm.opportunity_id
   GROUP BY account_id,
            mapped_product,
            product_name),

arr_by_month_dummy AS
  (SELECT am.account_id,
          COALESCE(account_name, am.account_id) AS account_name,
          mapped_product,
          product_name,
          year_month AS service_month
   FROM
     (SELECT year_month,
             'dummy' AS dummy
      FROM wbr.year_month_dummy_final) ym
   INNER JOIN
     (SELECT account_id,
             mapped_product,
             product_name,
             MAX('dummy') AS dummy,
             MIN(DATE_FORMAT(service_start_day, 'yyyy-MM-01')) AS min_month, MAX(DATE_FORMAT(ADD_MONTHS(service_end_day, 1), 'yyyy-MM-01'))AS max_month
      FROM account_product
      GROUP BY account_id,
               mapped_product,
               product_name) am ON ym.dummy = am.dummy
   LEFT JOIN opportunity_to_name otn ON am.account_id = otn.account_id
   WHERE year_month BETWEEN min_month AND max_month ),

opportunity_product AS
  (SELECT oa.*,
          COALESCE(product_name, 'unknown') AS product_name, COALESCE(CASE
                                                                          WHEN mapped_product IN ('bundled') THEN 'maps'
                                                                          ELSE mapped_product
                                                                      END, 'maps') AS mapped_product, COALESCE(list_price_value, 0) AS list_price_value,
                                                                                                      CASE
                                                                                                          WHEN product_name IS NOT NULL
                                                                                                               AND mapped_product NOT IN ('bundled') THEN COALESCE(list_price_value, 0)
                                                                                                          ELSE 0
                                                                                                      END AS product_value, COUNT(product_name) OVER (PARTITION BY oa.account_id,
                                                                                                                                                                   oa.opportunity_id) AS num_items, SUM(COALESCE(list_price_value, 0)) OVER (PARTITION BY oa.account_id,
                                                                                                                                                                                                                                                          oa.opportunity_id) AS total_value,
                                                                                                                                                                                                                                            SUM(CASE
                                                                                                                                                                                                                                                    WHEN product_name IS NOT NULL
                                                                                                                                                                                                                                                         AND mapped_product NOT IN ('bundled') THEN COALESCE(list_price_value, 0)
                                                                                                                                                                                                                                                    ELSE 0
                                                                                                                                                                                                                                                END) OVER (PARTITION BY oa.account_id,
                                                                                                                                                                                                                                                                        oa.opportunity_id) AS total_product_value
   FROM opportunity_arr oa
   LEFT JOIN pricing_by_method_nw pm ON oa.opportunity_id = pm.opportunity_id),

opp_product_share AS
  (SELECT DATE_FORMAT(service_start_day, 'yyyy-MM-01') AS service_start_month,
          DATE_FORMAT(service_end_day_r, 'yyyy-MM-01') AS service_end_month,
          a.*,
          arr * SHARE AS arr_p
   FROM
     (SELECT *,
             CASE
                 WHEN mapped_product = 'maps'
                      AND product_name = 'unknown'
                      AND num_items <= 1 THEN 1
                 WHEN (total_value > 0
                       AND total_product_value = 0) THEN list_price_value / total_value
                 WHEN total_product_value > 0 THEN product_value / total_product_value
                 ELSE 0
             END AS SHARE,
             service_end_day AS service_end_day_r
      FROM opportunity_product) a), 
      
arr_product_exp AS
  (SELECT md.account_id,
          md.account_name,
          md.product_name,
          CASE
              WHEN md.mapped_product IN ('bundled') THEN 'maps'
              ELSE md.mapped_product
          END AS mapped_product, service_month,
                                 COLLECT_SET(opportunity_id) AS opportunity_id_s, COLLECT_SET(CASE
                                                                                                  WHEN stage_name NOT IN ('Won', '7 - ICR', 'Won - Pending') THEN opportunity_id
                                                                                                  ELSE NULL
                                                                                              END) AS opportunity_id_nw, SUM(arr_p) AS arr_p
   FROM arr_by_month_dummy md
   LEFT JOIN opp_product_share ps ON md.account_id = ps.account_id
   AND md.product_name = ps.product_name
   AND md.service_month BETWEEN ps.service_start_month AND ps.service_end_month
   GROUP BY md.account_id,
            md.account_name,
            md.product_name,
            md.mapped_product,
            service_month),
                                    arr_product_prev AS
  (SELECT account_id,
          account_name,
          mapped_product,
          product_name,
          service_month,
          COALESCE(ROUND(arr_p, 2), 0) AS arr_p,
          COALESCE(LAG(ROUND(arr_p, 2)) OVER (PARTITION BY account_id, product_name
                                              ORDER BY service_month ASC), 0) AS prev_arr_p,
          opportunity_id_s AS opportunity_id,
          CONCAT_WS(',', opportunity_id_s) AS opportunity_id_p,
          LAG(CONCAT_WS(',', opportunity_id_s)) OVER (PARTITION BY account_id,
                                                                   product_name
                                                      ORDER BY service_month ASC) AS prev_opportunity_id_p,
                                                     CONCAT_WS(',', opportunity_id_nw) AS opportunity_id_nw
   FROM arr_product_exp),
                                    arr_product_status AS
  (SELECT account_id,
          account_name,
          mapped_product,
          product_name,
          service_month,
          CASE
              WHEN arr_p > 0
                   AND prev_arr_p = 0 THEN 'new'
              WHEN arr_p = 0
                   AND prev_arr_p > 0 THEN 'churn'
              WHEN arr_p > 0
                   AND arr_p < prev_arr_p THEN 'contraction'
              WHEN arr_p > 0
                   AND arr_p > prev_arr_p THEN 'expansion'
              WHEN arr_p > 0
                   AND arr_p = prev_arr_p
                   AND opportunity_id_p <> prev_opportunity_id_p THEN 'renewal'
              WHEN arr_p > 0
                   AND arr_p = prev_arr_p
                   AND opportunity_id_p = prev_opportunity_id_p THEN 'active'
              WHEN arr_p = 0
                   AND prev_arr_p = 0 THEN 'not_active'
              ELSE 'unknown'
          END AS product_status,
          arr_p,
          prev_arr_p,
          opportunity_id,
          opportunity_id_p,
          prev_opportunity_id_p,
          opportunity_id_nw
   FROM arr_product_prev),
                                    arr_product_status_mid AS
  (SELECT COALESCE(mid_product, 'unmapped') AS mid_product,
          ps.*
   FROM arr_product_status ps
   LEFT JOIN wbr.product_service_mapping sm ON ps.product_name = sm.product_name),
                                    

arr_account AS
  (SELECT account_id,
          service_month,
          arr_a,
          prev_arr_a,
          CASE
              WHEN arr_a > 0
                   AND prev_arr_a = 0 THEN 'new'
              WHEN arr_a = 0
                   AND prev_arr_a > 0 THEN 'churn'
              WHEN arr_a > 0
                   AND arr_a < prev_arr_a THEN 'contraction'
              WHEN arr_a > 0
                   AND arr_a > prev_arr_a THEN 'expansion'
              WHEN arr_a > 0
                   AND arr_a = prev_arr_a
                   AND opportunity_id_a <> prev_opportunity_id_a THEN 'renewal'
              WHEN arr_a > 0
                   AND arr_a = prev_arr_a
                   AND opportunity_id_a = prev_opportunity_id_a THEN 'active'
              WHEN arr_a = 0
                   AND prev_arr_a = 0 THEN 'not_active'
              ELSE 'unknown'
          END AS account_status
   FROM
     (SELECT account_id,
             service_month,
             ROUND(arr_a, 2) AS arr_a,
             ROUND(COALESCE(LAG(arr_a) OVER (PARTITION BY account_id
                                             ORDER BY service_month ASC), 0), 2) AS prev_arr_a,
             opportunity_id_a,
             prev_opportunity_id_a
      FROM
          (SELECT account_id,
                service_month,
                SUM(arr_a) OVER (PARTITION BY account_id
                                 ORDER BY service_month ASC) AS arr_a, CONCAT_WS(',', opportunity_id_a) AS opportunity_id_a,
                                                                       LAG(CONCAT_WS(',', opportunity_id_a)) OVER (PARTITION BY account_id
                                                                                                                   ORDER BY service_month ASC) AS prev_opportunity_id_a
         FROM
           (SELECT aa.account_id,
                   aa.service_month,
                   arr_a,
                   opportunity_id_a
            FROM
              (SELECT account_id,
                      service_month,
                      SUM(COALESCE(arr_p, 0) - COALESCE(prev_arr_p, 0)) AS arr_a
               FROM arr_product_status_mid
               GROUP BY account_id,
                        service_month) aa
            LEFT JOIN
              (SELECT account_id,
                      service_month,
                      COLLECT_SET(opp) AS opportunity_id_a
               FROM arr_product_status_mid LATERAL VIEW EXPLODE (opportunity_id) t AS opp
               GROUP BY account_id,
                        service_month) bb ON aa.account_id = bb.account_id
            AND aa.service_month = bb.service_month) 
            a) 
            
          b)          
     c),
                                    
arr_mapped_product AS
  (SELECT account_id,
          mapped_product,
          service_month,
          arr_m,
          prev_arr_m,
          CASE
              WHEN arr_m > 0
                   AND prev_arr_m = 0 THEN 'new'
              WHEN arr_m = 0
                   AND prev_arr_m > 0 THEN 'churn'
              WHEN arr_m > 0
                   AND arr_m < prev_arr_m THEN 'contraction'
              WHEN arr_m > 0
                   AND arr_m > prev_arr_m THEN 'expansion'
              WHEN arr_m > 0
                   AND arr_m = prev_arr_m
                   AND opportunity_id_m <> prev_opportunity_id_m THEN 'renewal'
              WHEN arr_m > 0
                   AND arr_m = prev_arr_m
                   AND opportunity_id_m = prev_opportunity_id_m THEN 'active'
              WHEN arr_m = 0
                   AND prev_arr_m = 0 THEN 'not_active'
              ELSE 'unknown'
          END AS mapped_status
   FROM
     (SELECT account_id,
             mapped_product,
             service_month,
             ROUND(arr_m, 2) AS arr_m,
             ROUND(COALESCE(LAG(arr_m) OVER (PARTITION BY account_id, mapped_product
                                             ORDER BY service_month ASC), 0), 2) AS prev_arr_m,
             opportunity_id_m,
             prev_opportunity_id_m
      FROM
        (SELECT account_id,
                mapped_product,
                service_month,
                SUM(arr_m) OVER (PARTITION BY account_id,
                                              mapped_product
                                 ORDER BY service_month ASC) AS arr_m,
                                CONCAT_WS(',', opportunity_id_m) AS opportunity_id_m,
                                LAG(CONCAT_WS(',', opportunity_id_m)) OVER (PARTITION BY account_id,
                                                                                         mapped_product
                                                                            ORDER BY service_month ASC) AS prev_opportunity_id_m
         FROM
           (SELECT aa.account_id,
                   aa.service_month,
                   aa.mapped_product,
                   arr_m,
                   opportunity_id_m
            FROM
              (SELECT account_id,
                      mapped_product,
                      service_month,
                      SUM(COALESCE(arr_p, 0) - COALESCE(prev_arr_p, 0)) AS arr_m
               FROM arr_product_status_mid
               GROUP BY account_id,
                        mapped_product,
                        service_month) aa
            LEFT JOIN
              (SELECT account_id,
                      mapped_product,
                      service_month,
                      COLLECT_SET(opp) AS opportunity_id_m
               FROM arr_product_status_mid LATERAL VIEW EXPLODE (opportunity_id) t AS opp
               GROUP BY account_id,
                        mapped_product,
                        service_month) bb ON aa.account_id = bb.account_id
            AND aa.service_month = bb.service_month
            AND aa.mapped_product = bb.mapped_product) a) b) c),
                                    
arr_mid_product AS
  (SELECT account_id,
          mid_product,
          service_month,
          arr_mid,
          prev_arr_mid,
          CASE
              WHEN arr_mid > 0
                   AND prev_arr_mid = 0 THEN 'new'
              WHEN arr_mid = 0
                   AND prev_arr_mid > 0 THEN 'churn'
              WHEN arr_mid > 0
                   AND arr_mid < prev_arr_mid THEN 'contraction'
              WHEN arr_mid > 0
                   AND arr_mid > prev_arr_mid THEN 'expansion'
              WHEN arr_mid > 0
                   AND arr_mid = prev_arr_mid
                   AND opportunity_id_mid <> prev_opportunity_id_mid THEN 'renewal'
              WHEN arr_mid > 0
                   AND arr_mid = prev_arr_mid
                   AND opportunity_id_mid = prev_opportunity_id_mid THEN 'active'
              WHEN arr_mid = 0
                   AND prev_arr_mid = 0 THEN 'not_active'
              ELSE 'unknown'
          END AS mid_status
   FROM
     (SELECT account_id,
             mid_product,
             service_month,
             ROUND(arr_mid, 2) AS arr_mid,
             ROUND(COALESCE(LAG(arr_mid) OVER (PARTITION BY account_id, mid_product
                                               ORDER BY service_month ASC), 0), 2) AS prev_arr_mid,
             opportunity_id_mid,
             prev_opportunity_id_mid
      FROM
        (SELECT account_id,
                mid_product,
                service_month,
                SUM(arr_mid) OVER (PARTITION BY account_id,
                                                mid_product
                                   ORDER BY service_month ASC) AS arr_mid,
                                  CONCAT_WS(',', opportunity_id_mid) AS opportunity_id_mid,
                                  LAG(CONCAT_WS(',', opportunity_id_mid)) OVER (PARTITION BY account_id,
                                                                                             mid_product
                                                                                ORDER BY service_month ASC) AS prev_opportunity_id_mid
         FROM
           (SELECT aa.account_id,
                   aa.service_month,
                   aa.mid_product,
                   arr_mid,
                   opportunity_id_mid
            FROM
              (SELECT account_id,
                      mid_product,
                      service_month,
                      SUM(COALESCE(arr_p, 0) - COALESCE(prev_arr_p, 0)) AS arr_mid
               FROM arr_product_status_mid
               GROUP BY account_id,
                        mid_product,
                        service_month) aa
            LEFT JOIN
              (SELECT account_id,
                      mid_product,
                      service_month,
                      COLLECT_SET(opp) AS opportunity_id_mid
               FROM arr_product_status_mid LATERAL VIEW EXPLODE (opportunity_id) t AS opp
               GROUP BY account_id,
                        mid_product,
                        service_month) bb ON aa.account_id = bb.account_id
            AND aa.service_month = bb.service_month
            AND aa.mid_product = bb.mid_product) a) b) c),
                                    
arr_full_status AS
  (SELECT ps.service_month,
          ps.account_id,
          account_name,
          ps.mapped_product,
          ps.mid_product,
          product_name,
          account_status,
          mapped_status,
          mid_status,
          product_status,
          arr_p - prev_arr_p AS arr_p,
          arr_p AS cum_arr_p,
          prev_arr_p AS prev_cum_arr_p, opportunity_id_p,
                                        prev_opportunity_id_p,
                                        opportunity_id_nw
   FROM arr_product_status_mid ps
   INNER JOIN arr_account aa ON ps.account_id = aa.account_id
   AND ps.service_month = aa.service_month
   INNER JOIN arr_mapped_product mp ON ps.account_id = mp.account_id
   AND ps.service_month = mp.service_month
   AND ps.mapped_product = mp.mapped_product
   INNER JOIN arr_mid_product md ON ps.account_id = md.account_id
   AND ps.service_month = md.service_month
   AND ps.mid_product = md.mid_product)

SELECT service_month,
       account_id,
       account_name,
       mapped_product,
       mid_product,
       product_name,
       account_status,
       mapped_status,
       mid_status,
       product_status,
       arr_p,
       cum_arr_p,
       prev_cum_arr_p,
       opportunity_id_p,
       prev_opportunity_id_p,
       opportunity_id_nw
FROM arr_full_status
"""

In [4]:
# Build a column parser for the full SQL text and produce its formatted form.
# The query is passed both to the Parser constructor and to format_query.
formatter = column_parser.Parser(query)
formatted = formatter.format_query(query)

In [5]:
# Parse the full SQL text with the query analyzer; the result (`query_dict`)
# is what unnest_query_list flattens in the next cell.
analyzer = query_analyzer.Analyzer(query)
query_dict = analyzer.parse_query(query)

In [6]:
# Flatten the nested parse result into a flat list of sub-query strings.
preprocess_list = unnest_query_list(query_dict)

In [None]:
# Display the flattened sub-queries (this cell has no recorded execution count).
preprocess_list

In [16]:
# Collect the fields matched for every flattened sub-query.
#
# The loop variable is deliberately not named `query`: reusing that name
# would overwrite the full SQL string defined in the earlier cell, so
# re-running the formatter/analyzer cells afterwards would silently operate
# on the last sub-query instead.
#
# NOTE(review): `db_fields` is not defined in any cell above this one, and the
# execution counts are out of order (the db_fields_* frames are built in cells
# further down). Presumably `db_fields` is assembled from those frames — TODO
# confirm and move that cell above this one so Restart & Run All works.
col_list = []
for sub_query in preprocess_list:
    col_list.extend(extract_subquery_fields(sub_query, db_fields))


In [17]:
# Display the matched fields; the recorded output below shows dicts with
# 'database_name', 'table_name' and 'column_name' keys.
col_list

[{'database_name': 'sfdc', 'table_name': 'products', 'column_name': 'dt'},
 {'database_name': 'sfdc',
  'table_name': 'opportunity_product',
  'column_name': 'is_deleted'},
 {'database_name': 'sfdc',
  'table_name': 'opportunity_product',
  'column_name': 'opportunity_id'},
 {'database_name': 'sfdc',
  'table_name': 'opportunity_product',
  'column_name': 'dt'},
 {'database_name': 'sfdc', 'table_name': 'products', 'column_name': 'name'},
 {'database_name': 'sfdc',
  'table_name': 'opportunity_product',
  'column_name': 'product_name_c'},
 {'database_name': 'sfdc',
  'table_name': 'opportunity_product',
  'column_name': 'netsuite_conn_netsuite_item_key_id_c'},
 {'database_name': 'sfdc', 'table_name': 'products', 'column_name': 'id'},
 {'database_name': 'sfdc', 'table_name': 'products', 'column_name': 'name'},
 {'database_name': 'sfdc', 'table_name': 'products', 'column_name': 'dt'},
 {'database_name': 'sfdc',
  'table_name': 'products',
  'column_name': 'service_organization_c'},
 {'dat

In [8]:
# Column listing for `sfdc.opportunity_product`: one row per column name.
# pandas broadcasts the scalar 'db_table' value against the 'all_columns' list.
db_fields_1 = pd.DataFrame(
    {
        'db_table': 'sfdc.opportunity_product',
        'all_columns': [
            'actual_quantity_c',
            'annual_list_price_value_c',
            'annual_product_value_c',
            'annual_recurring_revenue_c',
            'contract_is_12_months_or_more_c',
            'created_by_id',
            'created_date',
            'description',
            'discount_c',
            'end_date_c',
            'final_year_of_contract_c',
            'id',
            'invoice_schedule_c',
            'is_deleted',
            'last_modified_by_id',
            'last_modified_date',
            'line_family_c',
            'list_price',
            'list_price_value_c',
            'monthly_recurring_revenue_c',
            'name',
            'netsuite_conn_netsuite_item_id_import_c',
            'netsuite_conn_netsuite_item_key_id_c',
            'netsuite_conn_pushed_from_netsuite_c',
            'netsuite_conn_start_date_c',
            'opp_end_date_lineitem_end_date_c',
            'opportunity_id',
            'opportunity_product_line_types_c',
            'opportunity_service_days_c',
            'overage_price_c',
            'pricebook_entry_id',
            'product_2_id',
            'product_code',
            'product_family_c',
            'product_name_c',
            'product_value_c',
            'quantity',
            'roll_up_summary_years_c',
            'service_date',
            'service_days_c',
            'service_year_c',
            'service_year_to_text_c',
            'system_modstamp',
            'time_fetched_from_salesforce',
            'total_price',
            'unit_price',
            'update_everything_c',
            'x18_digit_opportunity_id_c',
            'dt',
        ],
    }
)


In [9]:
# Column listing for `sfdc.products`: one row per column name.
# pandas broadcasts the scalar 'db_table' value against the 'all_columns' list.
db_fields_2 = pd.DataFrame(
    {
        'db_table': 'sfdc.products',
        'all_columns': [
            'availability_c',
            'billing_type_c',
            'cpm_product_c',
            'created_date',
            'exempt_api_calls_c',
            'family',
            'id',
            'implementing_sdks_c',
            'is_active',
            'is_deleted',
            'launch_date_c',
            'name',
            'netsuite_conn_celigo_update_c',
            'netsuite_conn_item_category_c',
            'netsuite_conn_netsuite_id_c',
            'netsuite_conn_sub_type_c',
            'pql_usage_tier_c',
            'product_code',
            'product_id_c',
            'service_organization_c',
            'sku_id_c',
            'volume_discount_c',
            'dt',
        ],
    }
)


In [10]:
# Column listing for `sfdc.accounts`: one row per column name.
# pandas broadcasts the scalar 'db_table' value against the 'all_columns' list.
db_fields_3 = pd.DataFrame(
    {
        'db_table': 'sfdc.accounts',
        'all_columns': [
            'account_health_c',
            'account_health_flag_c',
            'account_health_last_touch_c',
            'account_notes_c',
            'account_owner_c',
            'account_owner_id_c',
            'account_segment_c',
            'account_source',
            'account_start_date_c',
            'account_tier_c',
            'add_company_tags_single_c',
            'annual_revenue',
            'billing_city',
            'billing_country',
            'billing_postal_code',
            'billing_state',
            'billing_street',
            'churned_date_c',
            'created_by_id',
            'created_date',
            'crunchbase_funding_c',
            'csm_c',
            'customer_tier_c',
            'domain_c',
            'dscorgpkg_lead_source_c',
            'dscorgpkg_naics_codes_c',
            'dscorgpkg_sic_codes_c',
            'finance_arr_c',
            'github_issue_ticket_c',
            'health_update_c',
            'id',
            'industry',
            'industry_group_c',
            'industry_sector_c',
            'initial_deal_arr_c',
            'initial_deal_date_c',
            'is_deleted',
            'last_activity_date',
            'last_modified_date',
            'lfbn_account_domain_c',
            'lost_opportunities_c',
            'lost_renewals_c',
            'mapbox_username_c',
            'naics_code_c',
            'name',
            'netsuite_conn_channel_tier_c',
            'next_renewal_date_c',
            'number_of_employees',
            'number_of_mapbox_users_c',
            'open_opportunities_c',
            'open_renewals_c',
            'owner_id',
            'owner_role_c',
            'parent_id',
            'partner_status_c',
            'partner_type_c',
            'primary_contact_c',
            'primary_use_case_c',
            'rating',
            'record_type_id',
            'region_c',
            'renewal_manager_c',
            'sb_pf_company_c',
            'sdr_c',
            'segmentation_c',
            'shipping_city',
            'shipping_country',
            'shipping_postal_code',
            'shipping_state',
            'shipping_street',
            'sic',
            'solution_engineer_c',
            'sub_industry_c',
            'sub_region_c',
            'support_engineer_c',
            'type',
            'vertical_c',
            'vertical_formula_c',
            'won_opportunities_c',
            'x18_digit_account_id_c',
            'zendesk_result_c',
            'zendesk_zendesk_organization_c',
            'zendesk_zendesk_organization_id_c',
            'zisf_zoominfo_industry_c',
            'dt',
        ],
    }
)


In [11]:
# Column listing for `sfdc.opportunities`: one row per column name.
# pandas broadcasts the scalar 'db_table' value against the 'all_columns' list.
db_fields_4 = pd.DataFrame(
    {
        'db_table': 'sfdc.opportunities',
        'all_columns': [
            'account_id',
            'add_company_tag_c',
            'add_use_cases_c',
            'admin_churn_fc_override_c',
            'agenda_c',
            'amount',
            'arr_c',
            'authority_c',
            'authority_detail_c',
            'autorenewal_c',
            'average_contract_value_acv_c',
            'billing_entity_c',
            'budget_in_usd_c',
            'business_goals_notes_c',
            'campaign_id',
            'churn_acv_c',
            'churn_arr_c',
            'close_date',
            'commit_flag_c',
            'compelling_event_c',
            'confirm_enterprise_requirements_c',
            'contract_signed_c',
            'contracted_expansion_c',
            'contraction_acv_c',
            'country_c',
            'created_by_id',
            'created_by_role_c',
            'created_date',
            'csm_c',
            'customer_presentation_date_c',
            'customer_value_prop_c',
            'department_c',
            'economic_buyer_identified_c',
            'effective_date_c',
            'effective_date_mgr_c',
            'effective_date_vp_c',
            'estimated_annual_revenue_c',
            'exit_arr_c',
            'expected_close_date_c',
            'expected_launch_date_c',
            'final_confirmation_on_triptik_c',
            'final_documents_sent_c',
            'forecast_category',
            'forecast_category_name',
            'forecasted_churn_reportable_c',
            'gclid_c',
            'gclid_date_c',
            'github_ticket_c',
            'id',
            'inbound_message_c',
            'interested_in_c',
            'is_closed',
            'is_deleted',
            'is_split',
            'is_won',
            'last_activity_date',
            'last_modified_date',
            'last_referenced_date',
            'last_trip_tik_update_c',
            'latest_hand_off_date_c',
            'lead_source',
            'lead_source_detail_c',
            'lost_because_c',
            'lost_because_competitor_list_c',
            'lost_because_detail_c',
            'lost_because_notes_c',
            'lost_date_c',
            'mapbox_service_owner_c',
            'mapbox_username_c',
            'name',
            'need_detail_c',
            'need_notes_sdr_c',
            'need_sdr_c',
            'net_new_arr_c',
            'net_new_arr_forecast_c',
            'net_new_arr_forecast_mgr_c',
            'net_new_arr_forecast_vp_c',
            'netsuite_conn_bill_to_tier_c',
            'netsuite_conn_current_sales_order_id_c',
            'netsuite_conn_netsuite_sales_order_number_c',
            'netsuite_conn_ship_to_tier_c',
            'new_acv2019_c',
            'next_step',
            'next_step_c',
            'next_step_date_c',
            'next_steps_new_c',
            'non_enterprise_c',
            'notes_c',
            'objectives_c',
            'opp_renewal_risk_c',
            'opp_renewal_risk_flag_c',
            'opportunity_count_c',
            'opportunity_owner_id_c',
            'opportunity_product_lines_c',
            'opportunity_segment_c',
            'original_renewal_date_c',
            'other_use_case_c',
            'owner_id',
            'owner_role_c',
            'partner_reseller_c',
            'poc_kick_off_date_c',
            'pricebook_2_id',
            'primary_competitor_c',
            'primary_use_case_c',
            'primary_use_case_sdr_c',
            'prior_amount_c',
            'prior_close_date_c',
            'prior_opportunity_c',
            'prior_opportunity_service_end_date_c',
            'prior_stage_c',
            'probability',
            'product_acv_c',
            'qualified_by_c',
            'record_type_id',
            'renewal_acv_c',
            'renewal_arr_c',
            'renewal_arr_override_c',
            'renewal_deadline_c',
            'renewal_health_c',
            'renewal_manager_c',
            'renewal_new_agreement_c',
            'requires_legal_c',
            'sal_date_c',
            'sales_engineer_c',
            'sales_forecast_mgr_c',
            'sales_forecast_vp_c',
            'sales_manager_forecast_last_updated_on_c',
            'sales_manager_forecast_updated_manually_c',
            'sales_rep_forecast_last_updated_on_c',
            'sales_to_cs_hand_off_c',
            'se_github_ticket_c',
            'service_days_c',
            'service_end_date_c',
            'service_start_date_c',
            'service_years_c',
            'shipping_entity_c',
            'stage_0_date_c',
            'stage_1_date_c',
            'stage_2_date_c',
            'stage_3_date_c',
            'stage_4_date_c',
            'stage_5_date_c',
            'stage_6_date_c',
            'stage_7_date_c',
            'stage_change_date_c',
            'stage_duration_c',
            'stage_name',
            'stakeholder_identified_c',
            'sub_vertical_c',
            'tcp_confirmed_with_buyer_c',
            'tcp_customer_tech_signoff_c',
            'tcp_end_date_c',
            'tcp_entered_evaluation_status_c',
            'tcp_entered_review_status_c',
            'tcp_lost_because_c',
            'tcp_products_used_c',
            'tcp_risks_c',
            'tcp_solution_architecture_url_c',
            'tcp_solution_fit_score_c',
            'tcp_solution_notes_c',
            'tcp_start_date_c',
            'tcp_status_c',
            'tcp_tech_owner_c',
            'technical_goals_notes_c',
            'territory_2_id',
            'total_contract_value_tcv_c',
            'trip_tik_created_c',
            'trip_tik_url_c',
            'type',
            'vertical_c',
            'vertical_formula_c',
            'vp_forecast_updated_manually_c',
            'weighted_arr_c',
            'won_date_c',
            'x18_digit_opportunity_id_c',
            'years_c',
            'churn_code_c',
            'churn_sub_code_c',
            'dt',
        ],
    }
)

In [12]:
# Column listing for `wbr.year_month_dummy_final` (a single column).
db_fields_5 = pd.DataFrame(
    {
        'db_table': 'wbr.year_month_dummy_final',
        'all_columns': ['year_month'],
    }
)

In [13]:
# Column listing for `wbr.product_service_mapping`: one row per column name.
db_fields_6 = pd.DataFrame(
    {
        'db_table': 'wbr.product_service_mapping',
        'all_columns': [
            'product_name',
            'mapped_product',
            'mid_product',
            'endpoint',
        ],
    }
)

In [14]:
# Stack the six per-table column listings into one frame with a fresh 0..n-1 index.
# pd.concat replaces the chained DataFrame.append calls: DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0, and a single concat also
# avoids building five intermediate frames.
df = pd.concat(
    [db_fields_1, db_fields_2, db_fields_3, db_fields_4, db_fields_5, db_fields_6],
    ignore_index=True,
)

In [15]:
# `db_fields` is the name the field-matching cells pass to
# extract_subquery_fields() / match_queried_fields().
db_fields = df

## given query

In [None]:
# Example query with three joined sub-selects. The literal previously opened with
# four double quotes and ended with `"` before the closing `"""`, which left a
# stray double quote at both ends of the SQL text; those two characters are removed.
query = """SELECT u.name,\n       b.customer_tier_c,\n       b.name,\n       m.account,\n       b.x18_digit_account_id_c,\n       s.id,\n       m.platform,\n       m.mobile_os,\n       m.num_requests,\n       Row_number() OVER(PARTITION BY s.id) row_\nFROM wbr.map_requests_by_account m\nINNER JOIN\n  (SELECT DISTINCT id\n   FROM mapbox_customer_data.styles\n   WHERE cast(dt AS DATE) >= CURRENT_DATE - INTERVAL '14' DAY\n     AND sources LIKE '%mapbox-streets-v7%' ) s ON m.service_metadata_version = s.id\nLEFT JOIN\n  (SELECT customer_tier_c,\n          csm_c,\n          name,\n          mapbox_username_c,\n          x18_digit_account_id_c\n   FROM sfdc.accounts\n   WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ) b ON m.account = b.mapbox_username_c\nLEFT JOIN\n  (SELECT name,\n          id\n   FROM sfdc.users\n   WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ) u ON b.csm_c = u.id\nWHERE cast(m.dt AS DATE) >= CURRENT_DATE - INTERVAL '14' DAY\n  AND m.service_metadata = 'custom'\n  AND m.service = 'styles'\n  AND b.customer_tier_c IN ('Tier 0',\n                            'Tier 1',\n                            'Tier 2',\n                            'Tier 3',\n                            'Tier 4')
"""

## transformation 1: format query

In [None]:
# Example input for the formatting step: the FROM clause shares a line with the
# SELECT list and the `acct` alias sits on the line after its table name.
query = """SELECT api.name, acct.customer_tier_c, acct.name FROM api_requests_by_account api
LEFT JOIN accounts 
acct ON api.user_id = acct.customer_api_id
"""

In [None]:
# Build a Parser for the example query defined above.
formatter = column_parser.Parser(query)

In [None]:
# Run format_query() on the example and print the result for inspection.
formatted = formatter.format_query(query)
print(formatted)

## transformation 2: separate CTEs

In [None]:
# Example with two CTEs (`a` and `b`) followed by a main SELECT that joins them.
query = """WITH a AS
  (SELECT DISTINCT anonymous_id,
                   user_id
   FROM customer_data.segment_identifies
   WHERE dt >= '2018-07-01'),
     b AS
  (SELECT id,
          email,
          created
   FROM customer_data.accounts)
SELECT a.*,
       b.*
FROM a
LEFT JOIN b ON a.user_id = b.id
WHERE context_campaign_name IS NOT NULL
"""

In [None]:
# Format the CTE example and print the result for inspection.
formatter = column_parser.Parser(query)
formatted = formatter.format_query(query)
print(formatted)

In [None]:
# Run parse_cte() on the raw query; the cells that follow read the result's
# keys() and its 'main' entry.
formatter = column_parser.Parser(query)
cte_query = formatter.parse_cte(query)
cte_query

In [None]:
# List the keys of the parse_cte() result.
cte_query.keys()

In [None]:
# Re-format the 'main' entry of the previous result and run parse_cte() on it again.
# NOTE: this rebinds `cte_query`, so re-running the cell operates on its own output.
formatted = formatter.format_query(cte_query['main'])
cte_query = formatter.parse_cte(formatted)
cte_query

In [None]:
# Same two steps once more, applied to the current `cte_query`:
# format its 'main' entry, then run parse_cte() on the formatted text.
formatted = formatter.format_query(cte_query['main'])
cte_query = formatter.parse_cte(formatted)
cte_query

## transformation 3: match table aliases

In [None]:
# Example with three aliased tables (`m`, `s`, `u`) joined together.
query = """SELECT *
FROM api_requests.requests_by_account m
INNER JOIN mapbox_customer_data.styles s ON m.metadata_version = s.id
LEFT JOIN sfdc.users u ON m.csm = u.id
"""

In [None]:
# Build a Parser for the alias example and format the query.
formatter = column_parser.Parser(query)
formatted = formatter.format_query(query)

In [None]:
# Print the formatted query for inspection.
print(formatted)

In [None]:
# get_table_names() is given the formatted query split into a list of lines.
table_alias_mapping = formatter.get_table_names(formatted.split('\n'))
table_alias_mapping

## transformation 4: find columns

In [None]:
# Match the fields used in the query against the `db_fields` metadata frame.
# NOTE(review): extract_subquery_fields() passes the *formatted* query to
# match_queried_fields(), whereas this cell passes the raw `query` — confirm
# which input the method expects.
fields = formatter.match_queried_fields(query, db_fields)

In [None]:
# Show the matched fields as a table ordered by database, table, then column name.
pd.DataFrame(fields).sort_values(by=['database_name', 'table_name', 'column_name'])

## bonus transformation: upload other query metadata (such as timestamp, user)