In [None]:
# Are there interesting groupings of customers, e.g., most valuable (buy everything at any price) or 
#  cherry-pickers (buy mostly on promotions), defined by certain categories (buy baby products or never buy milk), etc.?

In [None]:
# Coupon usage per customer.
#
# The query first cleans the raw rows:
#   * FilteredTransactions - rows before 2020-03-01, without prod_id 20189092
#     and without the listed service/error categories.
#   * CustomerVisits       - customers with at least 5 rows and at least 100
#     in total sales_amt.
#     NOTE(review): visit_count is COUNT(*), i.e. rows, not distinct
#     transactions - confirm that this is the intended activity filter.
#   * OverVisits           - customers with more than 10 distinct transactions
#     on a single day; ValidCustomers removes them.
#   * MultipleCustTrans    - transaction ids attached to more than one
#     customer; CleanedData removes them and also enforces the
#     quantity/weight consistency and sign rules.
#
# It then counts, per customer,
#   * total_transactions  - distinct transactions in the cleaned data
#   * coupon_transactions - distinct transactions containing a 'Coupons' row
# and reports coupon_usage_frequency = coupon_transactions / total_transactions.
#
# Both counts use COUNT(DISTINCT trans_id).  Rows carry a prod_id and the
# cleaning steps themselves count DISTINCT trans_id, so a transaction spans
# several rows; a plain COUNT(trans_id) would count rows, not transactions.
#
# Customers without any coupon transaction are dropped by the final inner
# join, so every returned row has coupon_transactions >= 1.
QUERY = (
    """
    WITH FilteredTransactions AS (
        SELECT t.trans_id, t.trans_dt, t.store_id, t.cust_id, t.prod_id,
               t.sales_amt, t.sales_qty, t.sales_wgt, p.prod_category
        FROM `machine_learning.transactions` t
        JOIN `machine_learning.products` p ON t.prod_id = p.prod_id
        WHERE t.trans_dt < '2020-03-01'
        AND t.prod_id != 20189092
        AND p.prod_category NOT IN ('Gift Cards', 'Front End Service', 'Scanning Errors',
                                    'Customer Service-Misc', 'Empties and Additionals')
    ),
    CustomerVisits AS (
        SELECT cust_id,
               COUNT(*) as visit_count,
               SUM(sales_amt) as total_spent
        FROM FilteredTransactions
        GROUP BY cust_id
        HAVING COUNT(*) >= 5 AND SUM(sales_amt) >= 100
    ),
    OverVisits AS (
        SELECT cust_id
        FROM FilteredTransactions
        GROUP BY cust_id, DATE(trans_dt)
        HAVING COUNT(DISTINCT trans_id) > 10
    ),
    ValidCustomers AS (
        SELECT cv.cust_id
        FROM CustomerVisits cv
        WHERE NOT EXISTS (
            SELECT 1
            FROM OverVisits ov
            WHERE ov.cust_id = cv.cust_id
        )
    ),
    ValidTransactions AS (
        SELECT ft.trans_id, ft.trans_dt, ft.store_id, ft.cust_id, ft.prod_id,
               ft.sales_amt, ft.sales_qty, ft.sales_wgt, ft.prod_category
        FROM FilteredTransactions ft
        INNER JOIN ValidCustomers vc ON ft.cust_id = vc.cust_id
    ),
    MultipleCustTrans AS (
        SELECT trans_id
        FROM ValidTransactions
        GROUP BY trans_id
        HAVING COUNT(DISTINCT cust_id) > 1
    ),
    CleanedData AS (
        SELECT *
        FROM ValidTransactions vt
        WHERE vt.trans_id NOT IN (SELECT trans_id FROM MultipleCustTrans)
        AND (
            (vt.sales_qty = 0 AND vt.sales_wgt <> 0) OR (vt.sales_qty <> 0 AND vt.sales_wgt = 0) OR (vt.sales_qty = 1 AND vt.sales_wgt <> 0)
        )
        AND (
            (vt.prod_category NOT IN ('Coupons', 'returns') AND (vt.sales_qty > 0 OR vt.sales_wgt > 0))
            OR
            (vt.prod_category IN ('Coupons', 'returns') AND (vt.sales_qty < 0 OR vt.sales_wgt < 0))
        )
    ),
    TotalTransactions AS (
        -- One count per transaction, not per line item.
        SELECT cust_id,
               COUNT(DISTINCT trans_id) AS total_transactions
        FROM CleanedData
        GROUP BY cust_id
    ),
    CouponTransactions AS (
        -- Transactions that contain at least one 'Coupons' line item.
        SELECT cust_id,
               COUNT(DISTINCT trans_id) AS coupon_transactions
        FROM CleanedData
        WHERE prod_category = 'Coupons'
        GROUP BY cust_id
    )
    SELECT
        tt.cust_id,
        ct.coupon_transactions,
        tt.total_transactions,
        -- Share of a customer's transactions that used a coupon
        SAFE_DIVIDE(ct.coupon_transactions, tt.total_transactions) AS coupon_usage_frequency
    FROM
        TotalTransactions tt
    JOIN
        CouponTransactions ct ON tt.cust_id = ct.cust_id
    ORDER BY
        coupon_usage_frequency DESC;
    """

)

query_job = client.query(QUERY)  # API request
Cherry_picker = query_job.to_dataframe()  # Converts the query results to a DataFrame

# One row per customer with at least one coupon transaction
Cherry_picker

In [None]:
# Cherry_picker holds one row per customer; coupon_usage_frequency is a
# fraction, so 0.03 means 3%.  Keep only customers strictly above that share.
above_three_percent = Cherry_picker['coupon_usage_frequency'].gt(0.03)
filtered_cherry_pickers = Cherry_picker.loc[above_three_percent]

# Show the customers that passed the filter
filtered_cherry_pickers


In [None]:
# Customers who bought at least one product in the 'Baby' category.
#
# The query applies the following steps before selecting the buyers:
#   * FilteredTransactions - rows dated before 2020-03-01, excluding
#     prod_id 20189092 and the listed service/error product categories.
#   * CustomerVisits       - customers with at least 5 rows and at least 100
#     in summed sales_amt (visit_count is COUNT(*), i.e. a row count).
#   * OverVisits           - customers with more than 10 distinct
#     transactions on one calendar day.
#   * ValidCustomers       - CustomerVisits minus OverVisits.
#   * ValidTransactions    - filtered rows belonging to valid customers.
#   * MultipleCustTrans    - transaction ids linked to more than one customer.
#   * CleanedData          - valid rows outside MultipleCustTrans that pass
#     the quantity/weight consistency check and the sign check (positive for
#     regular categories, negative for 'Coupons' and 'returns').
#   * BabyProductBuyers    - distinct cust_id values with a 'Baby' row.
#
# NOTE(review): NOT IN returns no rows if MultipleCustTrans ever contains a
# NULL trans_id - confirm trans_id is never NULL.
QUERY = (
    """
    WITH FilteredTransactions AS (
        SELECT t.trans_id, t.trans_dt, t.store_id, t.cust_id, t.prod_id,
               t.sales_amt, t.sales_qty, t.sales_wgt, p.prod_category
        FROM `machine_learning.transactions` t
        JOIN `machine_learning.products` p ON t.prod_id = p.prod_id
        WHERE t.trans_dt < '2020-03-01'
        AND t.prod_id != 20189092
        AND p.prod_category NOT IN ('Gift Cards', 'Front End Service', 'Scanning Errors',
                                    'Customer Service-Misc', 'Empties and Additionals')
    ),
    CustomerVisits AS (
        SELECT cust_id,
               COUNT(*) as visit_count,
               SUM(sales_amt) as total_spent
        FROM FilteredTransactions
        GROUP BY cust_id
        HAVING COUNT(*) >= 5 AND SUM(sales_amt) >= 100
    ),
    OverVisits AS (
        SELECT cust_id
        FROM FilteredTransactions
        GROUP BY cust_id, DATE(trans_dt)
        HAVING COUNT(DISTINCT trans_id) > 10
    ),
    ValidCustomers AS (
        SELECT cv.cust_id
        FROM CustomerVisits cv
        WHERE NOT EXISTS (
            SELECT 1
            FROM OverVisits ov
            WHERE ov.cust_id = cv.cust_id
        )
    ),
    ValidTransactions AS (
        SELECT ft.trans_id, ft.trans_dt, ft.store_id, ft.cust_id, ft.prod_id,
               ft.sales_amt, ft.sales_qty, ft.sales_wgt, ft.prod_category
        FROM FilteredTransactions ft
        INNER JOIN ValidCustomers vc ON ft.cust_id = vc.cust_id
    ),
    MultipleCustTrans AS (
        SELECT trans_id
        FROM ValidTransactions
        GROUP BY trans_id
        HAVING COUNT(DISTINCT cust_id) > 1
    ),
CleanedData AS (
    SELECT *
    FROM ValidTransactions vt
    WHERE vt.trans_id NOT IN (SELECT trans_id FROM MultipleCustTrans)
    AND (
        (vt.sales_qty = 0 AND vt.sales_wgt <> 0) OR (vt.sales_qty <> 0 AND vt.sales_wgt = 0) OR (vt.sales_qty = 1 AND vt.sales_wgt <> 0)
    )
    AND (
        (vt.prod_category NOT IN ('Coupons', 'returns') AND (vt.sales_qty > 0 OR vt.sales_wgt > 0))
        OR
        (vt.prod_category IN ('Coupons', 'returns') AND (vt.sales_qty < 0 OR vt.sales_wgt < 0))
    )
),

    BabyProductBuyers AS (
    SELECT DISTINCT cust_id
    FROM CleanedData
    WHERE prod_category = 'Baby'
)

SELECT cust_id
FROM BabyProductBuyers
ORDER BY cust_id;
    """

)

query_job = client.query(QUERY)  # API request
weird_customer = query_job.to_dataframe()  # Converts the query results to a DataFrame

# Single-column frame (cust_id) of customers with a 'Baby' purchase, sorted by cust_id
weird_customer

### Customer pattern analysis

In [None]:
# Customer pattern analysis
#
# Splits customers into three (possibly overlapping) segments and reports, for
# each, its share of all customers and its share of total sales_amt:
#   1. customers with no 'Coupons' row at all,
#   2. customers buying from only 1-3 product categories,
#   3. customers buying from 50 or more product categories.
#
# Assumes merged_df has 'cust_id', 'prod_category' and 'sales_amt' columns.
# The "profit" names below are sums of sales_amt.
import pandas as pd

NARROW_MAX_CATEGORIES = 3    # upper bound of the "1-3 categories" segment
BROAD_MIN_CATEGORIES = 50    # lower bound of the "buy everything" segment

# Identify all unique customers
all_customers = merged_df['cust_id'].unique()

# 1. Never use coupon
# Customers with at least one 'Coupons' row
customers_used_coupons = merged_df.loc[merged_df['prod_category'] == 'Coupons', 'cust_id'].unique()
# Everyone else has never used a coupon
customers_never_use_coupon = set(all_customers) - set(customers_used_coupons)
# na=False keeps the mask strictly boolean: without it a missing
# prod_category yields NaN/NA in the mask, and negating or indexing with it fails.
is_coupon_row = merged_df['prod_category'].str.contains('Coupons', na=False)
never_coupon_rows = merged_df['cust_id'].isin(customers_never_use_coupon) & ~is_coupon_row
profit_never_use_coupon = merged_df.loc[never_coupon_rows, 'sales_amt'].sum()

# 2. Only buy One/Two/Three certain categories
# Number of distinct categories each customer bought from
cust_category_counts = merged_df.groupby('cust_id')['prod_category'].nunique()
is_narrow_buyer = (cust_category_counts >= 1) & (cust_category_counts <= NARROW_MAX_CATEGORIES)
customers_one_two_three_categories = cust_category_counts[is_narrow_buyer].index
profit_one_two_three_categories = merged_df.loc[
    merged_df['cust_id'].isin(customers_one_two_three_categories), 'sales_amt'
].sum()

# 3. Buy everything (50+ categories)
customers_buy_everything = cust_category_counts[cust_category_counts >= BROAD_MIN_CATEGORIES].index
profit_buy_everything = merged_df.loc[
    merged_df['cust_id'].isin(customers_buy_everything), 'sales_amt'
].sum()

# Totals used as denominators
total_customers = len(all_customers)
total_profit = merged_df['sales_amt'].sum()

# Share of customers and share of sales_amt per segment
data = {
    'Category': ['Never Use Coupon', 'Buy 1-3 Categories', 'Buy 50+ Categories'],
    'Customer Count Proportion': [
        len(customers_never_use_coupon) / total_customers,
        len(customers_one_two_three_categories) / total_customers,
        len(customers_buy_everything) / total_customers
    ],
    'Profit Proportion': [
        profit_never_use_coupon / total_profit,
        profit_one_two_three_categories / total_profit,
        profit_buy_everything / total_profit
    ]
}

# Create the output DataFrame
output_df = pd.DataFrame(data)

output_df


### Loyal Customer Analysis

In [None]:
import pandas as pd

# Assumes merged_df carries 'cust_id' and 'profit' columns.
# NOTE(review): the other analysis cells sum 'sales_amt' - confirm that a
# 'profit' column really exists on merged_df.


def _has_ten_characters(cust_id):
    """Return True when the string form of cust_id is exactly 10 characters long."""
    return len(str(cust_id)) == 10


# A row is flagged loyal when its cust_id is 10 characters long
merged_df['is_loyal'] = merged_df['cust_id'].apply(_has_ten_characters)
loyal_rows = merged_df[merged_df['is_loyal']]

# Customer head-counts: everyone vs. loyal only
total_customers = merged_df['cust_id'].nunique()
loyal_customers = loyal_rows['cust_id'].nunique()

# Profit totals: everyone vs. loyal only
total_profit = merged_df['profit'].sum()
loyal_customers_profit = loyal_rows['profit'].sum()

# Loyal share of customers and of profit
loyal_customers_ratio = loyal_customers / total_customers
loyal_customers_profit_ratio = loyal_customers_profit / total_profit

# Two-row summary table
output_df = pd.DataFrame(
    [
        ('Loyal Customer Ratio', loyal_customers_ratio),
        ('Loyal Customer Profit Ratio', loyal_customers_profit_ratio),
    ],
    columns=['Metric', 'Value'],
)

output_df

### Cherry pickers

In [None]:
import numpy as np  # np is used below and was not imported in this cell
import pandas as pd
import matplotlib.pyplot as plt

# Distribution of the share of each customer's transactions that used a coupon.
# Assumes sample_transaction has 'cust_id', 'trans_id' and 'prod_category' columns.

# 1. Flag every row that is a 'Coupons' line (column is added to sample_transaction)
sample_transaction['is_coupon'] = (sample_transaction['prod_category'] == 'Coupons').astype(int)

# 2. Collapse to one row per (customer, transaction): is_coupon becomes 1 when
#    the transaction has at least one coupon line.  Summing the raw row flags
#    instead would count coupon lines, which can exceed the number of
#    transactions and push the ratio above 1 (outside the histogram range).
per_transaction = (
    sample_transaction
    .groupby(['cust_id', 'trans_id'], as_index=False)['is_coupon']
    .max()
)

# 3. Per customer: number of transactions and number of coupon transactions
cust_transactions_summary = per_transaction.groupby('cust_id').agg(
    total_transactions=('trans_id', 'nunique'),
    coupons_transactions=('is_coupon', 'sum'),
)

# 4. Share of each customer's transactions that used a coupon (always in [0, 1])
cust_transactions_summary['coupons_ratio'] = (
    cust_transactions_summary['coupons_transactions'] / cust_transactions_summary['total_transactions']
)

# 5. Histogram over 20 bins of width 0.05; integer arange avoids float-step drift in the edges
bin_edges = np.arange(21) / 20
counts, bins = np.histogram(cust_transactions_summary['coupons_ratio'], bins=bin_edges)

# Turn counts into proportions of customers (left as zeros when there is no data)
n_customers = counts.sum()
counts = counts / n_customers if n_customers else counts.astype(float)

# Plot
plt.bar(bins[:-1], counts, width=0.05, align='edge', edgecolor='black')
plt.xlabel('Coupons Transaction Ratio')
plt.ylabel('Proportion of Customers')
plt.title('Distribution of Coupons Transaction Ratio per Customer')
plt.xticks(np.arange(11) / 10)
plt.show()



### Customer distribution and profit contribution by number of categories purchased

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Two bar charts by number of distinct product categories a customer bought:
# the share of customers in each bin and the share of total sales_amt
# ("profit" in the labels) contributed by each bin.
# Assumes merged_df has 'cust_id', 'prod_category' and 'sales_amt' columns.

# Number of distinct categories purchased by each customer
category_counts_per_customer = merged_df.groupby('cust_id')['prod_category'].nunique()

# Total sales_amt per customer
profit_per_customer = merged_df.groupby('cust_id')['sales_amt'].sum()

# Both series share the cust_id index, so they align row by row
customer_analysis_df = pd.DataFrame({
    'Category_Count': category_counts_per_customer,
    'Total_Profit': profit_per_customer
})

# Explicit edges so the bins really are 1-5, 6-10, 11-15, 16-20 and 21+.
# bins=5 would instead create five equal-width bins over the observed range,
# which the labels would then misdescribe.  Intervals are right-closed:
# (0, 5], (5, 10], (10, 15], (15, 20], (20, inf).
category_bin_edges = [0, 5, 10, 15, 20, float('inf')]
category_bin_labels = ['1-5', '6-10', '11-15', '16-20', '21+']
bins_category_count = pd.cut(
    customer_analysis_df['Category_Count'],
    bins=category_bin_edges,
    labels=category_bin_labels,
)

# Share of customers per bin
plt.figure(figsize=(10, 6))
bins_category_count.value_counts(normalize=True).sort_index().plot(kind='bar', color='red')
plt.title('Customer Distribution by Number of Product Categories Purchased')
plt.xlabel('Number of Product Categories')
plt.ylabel('Proportion of Customers')
plt.show()

# Attach the bin to each customer so sales can be summed per bin
customer_analysis_df['Category_Count_Bin'] = bins_category_count

# observed=False keeps empty bins on the chart and makes the categorical
# grouping behaviour explicit.
profit_contribution_by_bin = (
    customer_analysis_df
    .groupby('Category_Count_Bin', observed=False)['Total_Profit']
    .sum()
)

# Convert per-bin totals into proportions of the overall total
profit_contribution_by_bin_normalized = profit_contribution_by_bin / profit_contribution_by_bin.sum()

# Share of total sales_amt per bin
plt.figure(figsize=(10, 6))
profit_contribution_by_bin_normalized.plot(kind='bar', color='red')
plt.title('Profit Contribution by Number of Product Categories Purchased')
plt.xlabel('Number of Product Categories')
plt.ylabel('Proportion of Profit')
plt.show()
