In [1]:
from google.cloud import bigquery
from google.oauth2 import service_account

# Construct credentials from service account key file
credentials = service_account.Credentials.from_service_account_file(
    "C:\\Users\\user\\Downloads\\hlai27_bq (1)\\hlai27-isom676-srvacct.json")

# Construct a BigQuery client object
client = bigquery.Client(credentials=credentials)

# Customer segmentation query with corrected type conversion error
QUERY = """
-- Customer Segmentation Query
WITH 
product_averages AS (
  SELECT
    prod_id,
    AVG(sales_amt / NULLIF(sales_qty, 0)) AS avg_price
  FROM
    `msba-emory.isom676_machine_learning.transactions`
  GROUP BY
    prod_id
),

customer_base_metrics AS (
  SELECT
    t.cust_id,
    COUNT(DISTINCT t.trans_id) AS transaction_count,
    SUM(t.sales_amt) AS total_spend,
    SUM(t.sales_qty) AS total_quantity,
    MIN(t.trans_dt) AS first_purchase_date,
    MAX(t.trans_dt) AS last_purchase_date
  FROM
    `msba-emory.isom676_machine_learning.transactions` t
  WHERE
    CAST(t.cust_id AS STRING) LIKE '1%'
  GROUP BY
    t.cust_id
),

category_metrics AS (
  SELECT
    t.cust_id,
    p.prod_category,
    p.prod_subcategory,
    p.prod_section,
    COUNT(DISTINCT t.trans_id) AS category_transactions,
    SUM(t.sales_amt) AS category_spend,
    SUM(CASE 
          WHEN t.sales_qty > 0 AND t.sales_amt / t.sales_qty < pa.avg_price
          THEN t.sales_amt 
          ELSE 0 
        END) AS promo_spend
  FROM
    `msba-emory.isom676_machine_learning.transactions` t
  JOIN
    `msba-emory.isom676_machine_learning.products` p ON t.prod_id = p.prod_id
  JOIN
    product_averages pa ON pa.prod_id = t.prod_id
  WHERE
    CAST(t.cust_id AS STRING) LIKE '1%'
  GROUP BY
    t.cust_id, p.prod_category, p.prod_subcategory, p.prod_section
),

customer_category_metrics AS (
  SELECT
    cust_id,
    SUM(promo_spend) / NULLIF(SUM(category_spend), 0) AS promo_ratio,
    COUNT(DISTINCT prod_category) AS unique_categories,
    MAX(CASE WHEN prod_category = 'Baby' THEN 1 ELSE 0 END) AS buys_baby,
    MAX(CASE WHEN prod_section = 'Dairy' AND prod_subcategory = 'Milk' THEN 1 ELSE 0 END) AS buys_milk
  FROM
    category_metrics
  GROUP BY
    cust_id
),

customer_combined AS (
  SELECT
    b.cust_id,
    b.transaction_count,
    b.total_spend,
    b.total_quantity,
    SAFE_DIVIDE(b.total_spend, b.transaction_count) AS avg_basket,
    SAFE_DIVIDE(DATE_DIFF(b.last_purchase_date, b.first_purchase_date, DAY), 
                GREATEST(b.transaction_count - 1, 1)) AS avg_days_between,
    c.promo_ratio,
    c.unique_categories,
    c.buys_baby,
    c.buys_milk
  FROM
    customer_base_metrics b
  JOIN
    customer_category_metrics c ON b.cust_id = c.cust_id
  WHERE
    b.transaction_count >= 3
),

quartile_bounds AS (
  SELECT
    APPROX_QUANTILES(total_spend, 4) AS spend_quartiles,
    APPROX_QUANTILES(promo_ratio, 4) AS promo_quartiles,
    APPROX_QUANTILES(avg_basket, 4) AS basket_quartiles,
    APPROX_QUANTILES(avg_days_between, 4) AS frequency_quartiles,
    APPROX_QUANTILES(unique_categories, 4) AS category_quartiles
  FROM
    customer_combined
)

SELECT
  segment_name,
  COUNT(*) AS customer_count,
  ROUND(AVG(total_spend), 2) AS avg_total_spend,
  ROUND(AVG(IFNULL(promo_ratio, 0) * 100), 2) AS avg_promo_percent,
  ROUND(AVG(transaction_count), 1) AS avg_transactions,
  ROUND(AVG(avg_basket), 2) AS avg_basket_value,
  ROUND(SUM(buys_baby) * 100.0 / COUNT(*), 2) AS pct_buys_baby,
  ROUND(SUM(buys_milk) * 100.0 / COUNT(*), 2) AS pct_buys_milk
FROM (
  SELECT
    *,
    CASE
      WHEN total_spend >= (SELECT spend_quartiles[OFFSET(3)] FROM quartile_bounds)
       AND IFNULL(promo_ratio, 0) <= (SELECT promo_quartiles[OFFSET(1)] FROM quartile_bounds)
      THEN 'Most Valuable Customers'
      WHEN IFNULL(promo_ratio, 0) >= (SELECT promo_quartiles[OFFSET(3)] FROM quartile_bounds)
      THEN 'Cherry-picker'
      WHEN IFNULL(avg_days_between, 999) <= (SELECT frequency_quartiles[OFFSET(1)] FROM quartile_bounds)
       AND unique_categories >= (SELECT category_quartiles[OFFSET(2)] FROM quartile_bounds)
      THEN 'Frequent Diversified Shoppers'
      WHEN buys_baby = 1 AND total_spend >= (SELECT spend_quartiles[OFFSET(2)] FROM quartile_bounds)
      THEN 'High-value Parents'
      WHEN buys_milk = 0 AND transaction_count >= 10
      THEN 'Non-milk Buyers'
      ELSE 'General Shoppers'
    END AS segment_name
  FROM
    customer_combined
) segments
GROUP BY
  segment_name
ORDER BY
  avg_total_spend DESC
"""

# Use a simpler test query to confirm connection is working
TEST_QUERY = """
SELECT 
  DISTINCT prod_category 
FROM 
  `msba-emory.isom676_machine_learning.products` 
LIMIT 10
"""

try:
    # First, run the test query to confirm the connection
    test_job = client.query(TEST_QUERY)
    test_rows = test_job.result()
    
    print("Successfully connected to BigQuery, product categories in the database include:")
    for row in test_rows:
        print(f"- {row.prod_category}")
    
    print("\nRunning customer segmentation analysis, this may take a few minutes...")
    
    # Then run the main query
    query_job = client.query(QUERY)
    rows = query_job.result()
    
    print("\nCustomer Segregation Analysis Results:")
    print("=================================================================================================================")
    print("{:<25} {:<10} {:<15} {:<10} {:<10} {:<15} {:<10} {:<10}".format(
        "Segment", "Count", "Avg Spend", "Promo %", "Avg Txns", "Avg Basket", "Baby %", "Milk %"))
    print("=================================================================================================================")
    
    for row in rows:
        print("{:<25} {:<10} {:<15.2f} {:<10.2f} {:<10.1f} {:<15.2f} {:<10.2f} {:<10.2f}".format(
            row.segment_name,
            row.customer_count,
            row.avg_total_spend,
            row.avg_promo_percent,
            row.avg_transactions,
            row.avg_basket_value,
            row.pct_buys_baby,
            row.pct_buys_milk
        ))
    
except Exception as e:
    print(f"Error occurred during query execution: {e}")

Successfully connected to BigQuery, product categories in the database include:
- 3rd Party Gift Cards
- Optical
- Tobacco
- Skincare
- Natural Foods
- Household Cleaning Needs
- Home Decor
- Cold Beverages
- Commercial
- Medicinal Products

Running customer segmentation analysis, this may take a few minutes...

Customer Segregation Analysis Results:
Segment                   Count      Avg Spend       Promo %    Avg Txns   Avg Basket      Baby %     Milk %    
Most Valuable Customers   115246     9085.01         27.16      114.6      95.00           50.81      94.54     
Frequent Diversified Shoppers 284671     8525.51         38.92      154.9      58.97           50.12      94.79     
High-value Parents        142000     2552.32         37.20      37.6       74.46           100.00     89.45     
Cherry-picker             548967     1210.91         39368256911365.75 40.9       31.38           16.83      57.89     
General Shoppers          969495     737.50          -14567675368518.46