In [1]:
# Set environmental variables with google service credentials
import os
%env GOOGLE_APPLICATION_CREDENTIALS=/home/yexi/interview_questions/ntuc_iowa_liquor/bq_sql_key.json
print(os.getenv('GOOGLE_APPLICATION_CREDENTIALS'))

env: GOOGLE_APPLICATION_CREDENTIALS=/home/yexi/interview_questions/ntuc_iowa_liquor/bq_sql_key.json
/home/yexi/interview_questions/ntuc_iowa_liquor/bq_sql_key.json


In [67]:
# import necessary libraries for analysis
import pandas as pd
import numpy as np
from scipy.stats import norm, t
from statsmodels.stats.weightstats import ztest
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from google.cloud import bigquery

bqclient = bigquery.Client()

In [3]:
# Test BQ Connect
def implicit():
    from google.cloud import storage

    # If you don't specify credentials when constructing the client, the
    # client library will look for credentials in the environment.
    storage_client = storage.Client()

    # Make an authenticated API request
    buckets = list(storage_client.list_buckets())
    print(buckets)

implicit()

[]


In [10]:
# Download query results.
survey_after_action_event = """
WITH survey_after_event_voucher AS
(
    SELECT
        events.customer_id AS customer_id,
        event_ts,
        country,
        survey_response_ts,
        action_type,
        survey_rating
    FROM `bq-sql-practice.foodpanda_interview.events` AS events
    LEFT JOIN `bq-sql-practice.foodpanda_interview.customer_surveys` AS surveys
        ON events.customer_id = surveys.customer_id
    WHERE survey_response_ts > event_ts
)
SELECT 
    customer_id,
    action_type,
    CAST(DATE_TRUNC(survey_response_ts, day) AS date) AS survey_response_date,
    CAST(DATE_TRUNC(event_ts, day) AS date) AS event_date,
    DATE_DIFF(CAST(DATE_TRUNC(survey_response_ts, day) AS date), CAST(DATE_TRUNC(event_ts, day) AS date), day) AS days_diff,
    survey_rating,
    country
FROM survey_after_event_voucher
"""

survey_after_action_event = (
    bqclient.query(survey_after_action_event)
    .result()
    .to_dataframe(
        create_bqstorage_client=True,
    )
)

# Hypothesis Testing
### Null Hypothesis: those users that received a voucher after >30mins of delay on their order will provide a better avg rating than those that did not receive anything
### Alternative Hypothesis: there is no difference in avg rating between those that receive the vouchers after >30mins of delay on their order and those that did not receive anything

In [13]:
# Initial inspection of avg rating between the 2 groups
# We can see that those that received the voucher has an avg rating of 0.61 vs 0.54 of the control group (no action)
survey_after_action_event.groupby('action_type')['survey_rating'].mean()

action_type
absolute_voucher    0.605909
no_action           0.543718
Name: survey_rating, dtype: float64

In [23]:
# Create 2 samples for treatment (voucher) vs control group (no action)
abs_voucher = survey_after_action_event[survey_after_action_event['action_type'] == 'absolute_voucher']['survey_rating'].to_list()
no_action = survey_after_action_event[survey_after_action_event['action_type'] == 'no_action']['survey_rating'].to_list()

print(f"Treatment Group Sample Size: {len(abs_voucher)}")
print(f"Control Group Sample Size: {len(no_action)}")

Treatment Group Sample Size: 14791
Control Group Sample Size: 27689


In [44]:
# Store the sample sizes of the treatment and control groups
abs_voucher_size = len(abs_voucher)
no_action_size = len(no_action)

# Store the means of the treatment and control groups
abs_voucher_mean = np.mean(abs_voucher)
no_action_mean = np.mean(no_action)

# Store the std dev of the treatment and control groups
abs_voucher_std_dev = np.std(abs_voucher, ddof=1)
no_action_std_dev = np.std(no_action, ddof=1)

# z_left and z_right values @95% confident level
z_left = norm.ppf(0.025)
z_right = norm.ppf(0.975)

# Measure lower and upper bound CI
abs_voucher_lower_bound = abs_voucher_mean + (z_left * (abs_voucher_std_dev / np.sqrt(abs_voucher_size)))
abs_voucher_upper_bound = abs_voucher_mean + (z_right * (abs_voucher_std_dev / np.sqrt(abs_voucher_size)))

no_action_lower_bound = no_action_mean + (z_left * (no_action_std_dev / np.sqrt(no_action_size)))
no_action_upper_bound = no_action_mean + (z_right * (no_action_std_dev / np.sqrt(no_action_size)))

In [55]:
# Print our confidence intervals for both our treatment and control groups
print("====Treatment Group: CI @95%====")
print(f"Mean: {round(abs_voucher_mean, 3)}, CI:[{round(abs_voucher_lower_bound, 3)}, {round(abs_voucher_upper_bound, 3)}]")
print("====Control Group: CI @95%====")
print(f"Mean: {round(no_action_mean, 3)}, CI:[{round(no_action_lower_bound, 3)}, {round(no_action_upper_bound, 3)}]")

====Treatment Group: CI @95%====
Mean: 0.606, CI:[0.598, 0.614]
====Control Group: CI @95%====
Mean: 0.544, CI:[0.538, 0.55]


In [89]:
# Using the z-test to test if the difference in mean is statistically significant
z_value, p_value = ztest(abs_voucher, no_action)

# Test at 95% confidence
if p_value < 0.05 or p_value > 0.95:
    print("The voucher treatment is statistically significant at the 95% confidence level and we fail to reject H0.")
else:
    print("The voucher treatment is not statistically significant at the 95% confidence level and we reject H0.")

The voucher treatment is statistically significant at the 95% confidence level and we fail to reject H0.
