In [None]:
# Are there natural groupings of stores, e.g., stores frequented by cherry-pickers versus stores visited by most loyal customers?

In [None]:
# Build the analysis sample: cleaned transactions for a deterministic ~0.1% sample
# of eligible customers, restricted to dates before 2020-03-01.
#
# NOTE(review): `client` is not defined in this cell; it is presumably a BigQuery
# client created earlier in the notebook/session -- confirm it exists on a fresh kernel.
# NOTE(review): the filter below uses the category label "Coupon", whereas the
# store-level coupon analysis later in this notebook matches 'Coupons'.
# Confirm the actual label in `machine_learning.products` and align the two.
QUERY = """
WITH product_filter AS (
    -- Drop non-merchandise categories
    SELECT *
    FROM `machine_learning.products`
    WHERE prod_category NOT IN ("Gift Cards", "Other", "Front End Service", "Scanning Errors", "Customer Service-Misc", "Empties and Additionals")
),
valid_transactions AS (
    -- USING (instead of ON) makes SELECT * return prod_id once rather than twice.
    -- The inner join already restricts rows to products kept by product_filter,
    -- so no additional IN (SELECT ...) filter is needed.
    SELECT *
    FROM `machine_learning.transactions`
    JOIN product_filter USING (prod_id)
    WHERE trans_dt < "2020-03-01"
        AND
        -- Logic 1: Either sales_qty or sales_wgt is zero, but not both
        ((sales_qty = 0 AND sales_wgt <> 0) OR (sales_qty <> 0 AND sales_wgt = 0))
        AND
        -- Logics 2 and 3 are parallel conditions
        (
            (prod_category NOT IN ("Coupon", "returns") AND (sales_qty > 0 OR sales_wgt > 0))
            OR
            (prod_category IN ("Coupon", "returns") AND (sales_qty < 0 OR sales_wgt < 0))
        )
        AND sales_amt >= 0
),
transactions_per_day AS (
    -- Customer-days with at most 10 distinct transactions
    SELECT cust_id, trans_dt, COUNT(DISTINCT trans_id) AS trans_per_day
    FROM valid_transactions
    GROUP BY cust_id, trans_dt
    HAVING trans_per_day <= 10
),
eligible_custs AS (
    -- Only rows on the customer-days kept above count towards eligibility
    SELECT v.cust_id
    FROM valid_transactions v
    JOIN transactions_per_day tpd ON v.cust_id = tpd.cust_id AND v.trans_dt = tpd.trans_dt
    GROUP BY v.cust_id
    HAVING COUNT(DISTINCT v.trans_id) >= 5
    AND COUNT(DISTINCT v.trans_dt) >= 5
    AND COUNT(v.trans_id) <= 20000
),
sampled_custs AS (
    -- Deterministic hash-based sample: keeps customers whose fingerprint bucket is 0 of 1000
    SELECT cust_id
    FROM eligible_custs
    WHERE MOD(ABS(FARM_FINGERPRINT(CAST(cust_id AS STRING))), 1000) < 1
)
-- valid_transactions is already limited to trans_dt < "2020-03-01",
-- so the cutoff is not repeated here.
SELECT tx.*
FROM valid_transactions tx
JOIN sampled_custs ON tx.cust_id = sampled_custs.cust_id
"""

# Submit the query (API request)
query_job = client.query(QUERY)

# Block until the job finishes, then materialise the result as a DataFrame
sample_transaction = query_job.to_dataframe()


In [None]:
# Load product-category profit margins and attach them to the sampled transactions.
# Imports are local to this cell because `pd` is otherwise only imported in a later
# cell, which makes this cell fail on Restart & Run All.
import os
from pathlib import Path

import pandas as pd

# Workbook location. Override with the PROFIT_MARGIN_XLSX environment variable;
# the default is the original author's local path, kept for backward compatibility.
PROFIT_MARGIN_XLSX = Path(
    os.environ.get(
        "PROFIT_MARGIN_XLSX",
        "C:/Users/ctlan/OneDrive/desktop/AI at Scale/HW/Product Category Profit Margin.xlsx",
    )
)

product_profit_margin_df = pd.read_excel(PROFIT_MARGIN_XLSX)

# Left merge keeps every transaction row; categories missing from the workbook get
# NaN margins. validate='many_to_one' raises if a category appears more than once
# in the workbook, which would otherwise silently duplicate transaction rows.
merged_df = pd.merge(
    sample_transaction,
    product_profit_margin_df,
    on='prod_category',
    how='left',
    validate='many_to_one',
)

# Profit per row = sales amount x category profit margin
merged_df['profit'] = merged_df['sales_amt'] * merged_df['profit_margin']

## Store clustering

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Expects `sample_transaction` (one row per transaction line item) to be loaded
# already, with at least the columns store_id, trans_id and prod_category.

# NOTE(review): the SQL that builds `sample_transaction` filters on the label
# "Coupon", while this cell matches 'Coupons'. Confirm which label the data uses.
COUPON_CATEGORY = 'Coupons'

# 1. Flag each line item that belongs to the coupon category
sample_transaction['is_coupon'] = (sample_transaction['prod_category'] == COUPON_CATEGORY).astype(int)

if not sample_transaction['is_coupon'].any():
    # Surface a label mismatch instead of silently plotting an all-zero histogram
    warnings.warn(
        f"No rows have prod_category == {COUPON_CATEGORY!r}; every coupons_ratio will be 0."
    )

# 2. Per store: number of distinct transactions, and number of distinct transactions
#    containing at least one coupon line. Both are counted in transactions so the
#    ratio stays within [0, 1]; summing coupon line items against unique
#    transactions could exceed 1, and such stores would fall outside the
#    histogram bins and be dropped.
total_transactions = sample_transaction.groupby('store_id')['trans_id'].nunique()
coupons_transactions = (
    sample_transaction.loc[sample_transaction['is_coupon'] == 1]
    .groupby('store_id')['trans_id']
    .nunique()
    .reindex(total_transactions.index, fill_value=0)  # stores with no coupon transactions
)
store_transactions_summary = pd.DataFrame({
    'total_transactions': total_transactions,
    'coupons_transactions': coupons_transactions,
})

# 3. Share of each store's transactions that include a coupon
store_transactions_summary['coupons_ratio'] = (
    store_transactions_summary['coupons_transactions'] / store_transactions_summary['total_transactions']
)

# 4. Histogram of the per-store ratio, with the y-axis as the proportion of stores.
#    linspace gives exact 0.05-wide edges ending at 1.0.
counts, bins = np.histogram(
    store_transactions_summary['coupons_ratio'].dropna(),
    bins=np.linspace(0.0, 1.0, 21),
)

# Convert counts to proportions (guard against an empty summary)
n_stores = counts.sum()
counts = counts / n_stores if n_stores else counts.astype(float)

fig, ax = plt.subplots()
ax.bar(bins[:-1], counts, width=np.diff(bins), align='edge', edgecolor='black')
ax.set_xlabel('Coupons Transaction Ratio')
ax.set_ylabel('Proportion of Stores')
ax.set_title('Distribution of Coupons Transaction Ratio per Store')
ax.set_xticks(np.arange(0, 1.1, 0.1))
plt.show()


In [None]:
# Share of stores whose coupon-transaction ratio is strictly above 0.3.
# The mean of a boolean Series is the fraction of True entries.
exceeds_threshold = store_transactions_summary['coupons_ratio'].gt(0.3)
proportion_above_03 = exceeds_threshold.mean()

print(f"Proportion of stores with a coupons transaction ratio greater than 0.3: {proportion_above_03:.2%}")


In [None]:
# Keep only stores whose coupon-transaction ratio is strictly above 0.3
coupon_heavy_mask = store_transactions_summary['coupons_ratio'].gt(0.3)
filtered_stores = store_transactions_summary.loc[coupon_heavy_mask]

# Order them from the highest ratio to the lowest
sorted_stores = filtered_stores.sort_values('coupons_ratio', ascending=False)

# Show just the ratio column (store_id is the index)
sorted_stores_df = sorted_stores.loc[:, ['coupons_ratio']]
sorted_stores_df

In [None]:
### Loyal stores

# A customer counts as "loyal" when its cust_id, rendered as a string, has exactly
# this many characters.
# NOTE(review): presumably this is the loyalty-card ID format -- confirm against the data dictionary.
LOYAL_CUST_ID_LENGTH = 10

# Step 1: keep the rows that belong to loyal customers (vectorized length check)
is_loyal_row = sample_transaction['cust_id'].astype(str).str.len() == LOYAL_CUST_ID_LENGTH
loyal_customers = sample_transaction[is_loyal_row]

# Step 2: distinct customers per store, overall and loyal-only.
# Despite the "visits" in the variable names (kept for compatibility), these are
# counts of unique cust_id values, not visit counts.
total_visits_per_store = sample_transaction.groupby('store_id')['cust_id'].nunique()
loyal_visits_per_store = loyal_customers.groupby('store_id')['cust_id'].nunique()

# Step 3: share of each store's distinct customers that are loyal.
# Stores with no loyal customers are absent from the numerator and become NaN
# after alignment, hence the fillna(0).
proportion_loyal_visits = loyal_visits_per_store.div(total_visits_per_store).fillna(0)

# Two-column frame (store_id, loyal_customer_proportion) for display and plotting
proportion_loyal_visits_df = (
    proportion_loyal_visits
    .rename('loyal_customer_proportion')
    .rename_axis('store_id')
    .reset_index()
)

# Highest loyal share first
proportion_loyal_visits_df_sorted = proportion_loyal_visits_df.sort_values(
    by='loyal_customer_proportion', ascending=False
)

# Step 4: bar chart, one bar per store
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    proportion_loyal_visits_df_sorted['store_id'].astype(str),
    proportion_loyal_visits_df_sorted['loyal_customer_proportion'],
    color='skyblue',
)
ax.set_xlabel('Store ID')
ax.set_ylabel('Proportion of Loyal Customers')
ax.set_title('Proportion of Loyal Customers by Store')
ax.tick_params(axis='x', rotation=90)  # Rotate store IDs for better readability
plt.show()

In [None]:

# Select the 10 stores with the highest proportion of loyal customers.
# `proportion_loyal_visits_df_sorted` (built in the loyal-store cell) is already
# ordered by loyal_customer_proportion descending, so the first 10 rows are the top 10.
# The name previously used here, `sorted_df_by_loyal_customers`, is not defined in
# the visible notebook and raises NameError on a fresh kernel.
top_10_stores_by_loyal_customers = proportion_loyal_visits_df_sorted.head(10)

top_10_stores_by_loyal_customers