In [None]:
# Question: Other than product categories and sub-categories, are there other product groupings, e.g.,
# Key Value Items (KVI), Key Value Categories (KVC), traffic drivers, always promoted versus seldom/never promoted, etc.?

In [None]:
from google.cloud import bigquery
from google.oauth2 import service_account

# Construct credentials from service account key file
credentials = service_account.Credentials.from_service_account_file(
    'tche368-isom676-srvacct_srvacct.json')  # Update the file path as needed

# Construct a BigQuery client object
client = bigquery.Client(credentials=credentials)

# Pull every valid transaction line dated before 2020-03-01 for a hash-based
# sample of customers. CTE pipeline:
#   product_filter       -> products outside the listed excluded categories
#   valid_transactions   -> transaction lines joined to those products that pass
#                           the quantity/weight/amount sanity rules
#   transactions_per_day -> customer-days with at most 10 distinct transactions
#   eligible_custs       -> customers with >= 5 transactions on >= 5 distinct days
#                           and <= 20000 transaction lines (counted on those days)
#   sampled_custs        -> customers whose hashed id falls in 1 of 1000 buckets
#
# NOTE(review): transactions_per_day only influences which customers are
# eligible; the final SELECT still returns *all* valid transactions of a sampled
# customer, including days with more than 10 transactions. Confirm this is the
# intended behaviour.
QUERY = """
WITH product_filter AS (
    SELECT *
    FROM `machine_learning.products`
    WHERE prod_category NOT IN ("Gift Cards", "Other", "Front End Service", "Scanning Errors", "Customer Service-Misc", "Empties and Additionals")
),
valid_transactions AS (
    -- USING (prod_id) makes SELECT * return the join key once; joining with
    -- ON a.prod_id = b.prod_id would select prod_id from both tables.
    -- The inner join already restricts rows to product_filter, so no additional
    -- "prod_id IN (SELECT ...)" predicate is required.
    SELECT *
    FROM `machine_learning.transactions` a
    JOIN product_filter b USING (prod_id)
    WHERE trans_dt < "2020-03-01"
        AND
        -- Logic 1: Either sales_qty or sales_wgt is zero, but not both
        ((sales_qty = 0 AND sales_wgt <> 0) OR (sales_qty <> 0 AND sales_wgt = 0))
        AND
        -- Logics 2 and 3 are parallel conditions
        (
            (prod_category NOT IN ("Coupon", "returns") AND (sales_qty > 0 OR sales_wgt > 0))
            OR
            (prod_category IN ("Coupon", "returns") AND (sales_qty < 0 OR sales_wgt < 0))
        )
        AND sales_amt >= 0
),
transactions_per_day AS (
    SELECT cust_id, trans_dt, COUNT(DISTINCT trans_id) AS trans_per_day
    FROM valid_transactions
    GROUP BY cust_id, trans_dt
    HAVING trans_per_day <= 10
),
eligible_custs AS (
    SELECT v.cust_id
    FROM valid_transactions v
    JOIN transactions_per_day tpd ON v.cust_id = tpd.cust_id AND v.trans_dt = tpd.trans_dt
    GROUP BY v.cust_id
    HAVING COUNT(DISTINCT v.trans_id) >= 5
    AND COUNT(DISTINCT v.trans_dt) >= 5
    AND COUNT(v.trans_id) <= 20000
),
sampled_custs AS (
    -- Hash-based sampling: keeps customers whose fingerprint lands in bucket 0 of 1000.
    SELECT cust_id
    FROM eligible_custs
    WHERE MOD(ABS(FARM_FINGERPRINT(CAST(cust_id AS STRING))), 1000) < 1
)
-- valid_transactions is already limited to trans_dt < "2020-03-01",
-- so the date filter is not repeated here.
SELECT tx.*
FROM valid_transactions tx
JOIN sampled_custs ON tx.cust_id = sampled_custs.cust_id
"""

# Execute the query
query_job = client.query(QUERY)  # API request

# Convert to DataFrame
sample_transaction = query_job.to_dataframe()  # Waits for query to finish and converts it to DataFrame


In [None]:
# pandas must be imported in this cell: the only other `import pandas as pd`
# in the notebook sits in a later cell, so on a fresh kernel this cell would
# otherwise fail with NameError on `pd`.
import pandas as pd

# Load product and profit margin data.
# The sheet is read for its 'prod_category' (merge key) and 'profit_margin' columns.
product_profit_margin_df = pd.read_excel("C:/Users/ctlan/OneDrive/desktop/AI at Scale/HW/Product Category Profit Margin.xlsx")

# Merge with product_profit_margin to get profit margins.
# how='left' keeps every transaction; categories missing from the sheet get a
# NaN profit_margin (and therefore a NaN profit below).
merged_df = pd.merge(sample_transaction, product_profit_margin_df, on='prod_category', how='left')

# Calculate profit for each transaction line: sales amount times category margin
merged_df['profit'] = merged_df['sales_amt'] * merged_df['profit_margin']

## customer clustering

In [None]:
import pandas as pd

# Number of rows kept in each ranking below.
TOP_N = 10

# Load the per-category profit margins; the sheet is read for its
# 'prod_category' (merge key) and 'profit_margin' columns.
product_profit_margin_df = pd.read_excel("C:/Users/ctlan/OneDrive/desktop/AI at Scale/HW/Product Category Profit Margin.xlsx")

# Attach the margin to every transaction line. how='left' keeps all
# transactions; categories missing from the sheet end up with NaN margin/profit.
merged_df = pd.merge(sample_transaction, product_profit_margin_df, on='prod_category', how='left')

# Calculate profit for each transaction
merged_df['profit'] = merged_df['sales_amt'] * merged_df['profit_margin']

# KVI: top products by total profit
kvi_top10 = (
    merged_df.groupby('prod_id')
    .agg(total_profit=('profit', 'sum'))
    .reset_index()
    .sort_values('total_profit', ascending=False)
    .head(TOP_N)
)

# KVC: top product categories by total profit
kvc_top10 = (
    merged_df.groupby('prod_category')
    .agg(total_profit=('profit', 'sum'))
    .reset_index()
    .sort_values('total_profit', ascending=False)
    .head(TOP_N)
)

# Traffic items: top products by total sales quantity
traffic_top10 = (
    merged_df.groupby('prod_id')
    .agg(total_sales_qty=('sales_qty', 'sum'))
    .reset_index()
    .sort_values('total_sales_qty', ascending=False)
    .head(TOP_N)
)

# One (Category, Ranking_Type, values) triple per ranking. Building the rows
# from the columns directly avoids iterrows(), which returns each row as a
# Series and does not preserve per-column dtypes.
ranking_specs = [
    ('KVI', 'Profit', kvi_top10['prod_id']),
    ('KVC', 'Profit', kvc_top10['prod_category']),
    ('Traffic', 'Sales Quantity', traffic_top10['prod_id']),
]

# Long-format records: one row per ranked product / category
data = [
    {'Category': label, 'Ranking_Type': ranking_type, 'Value': value}
    for label, ranking_type, values in ranking_specs
    for value in values.tolist()
]

# Explicit columns keep the schema stable even when no rows were produced
final_output_df = pd.DataFrame(data, columns=['Category', 'Ranking_Type', 'Value'])

final_output_df

In [None]:
def _totals_and_top10(frame, key, measure, total_name):
    """Sum `measure` per `key` and return (all totals, the 10 largest totals).

    The totals frame has two columns: `key` and `total_name`.
    """
    totals = (
        frame.groupby(key, as_index=False)[measure]
        .sum()
        .rename(columns={measure: total_name})
    )
    leaders = totals.sort_values(total_name, ascending=False).head(10)
    return totals, leaders


# KVI: total profit per product, and the 10 most profitable products
kvi_sum_profit, kvi_top10 = _totals_and_top10(merged_df, 'prod_id', 'profit', 'sum_profit')

# KVC: total profit per product category, and the 10 most profitable categories
kvc_sum_profit, kvc_top10 = _totals_and_top10(merged_df, 'prod_category', 'profit', 'sum_profit')

# Traffic drivers: total sales amount per product, and the 10 products with the largest totals
traffic_sum_sales_amt, traffic_top10 = _totals_and_top10(merged_df, 'prod_id', 'sales_amt', 'sum_sales_amt')

# Note: The resulting top 10 lists for KVI, KVC, and Traffic Drivers are stored in kvi_top10, kvc_top10, and traffic_top10 respectively



In [None]:
# Display the current KVI table: the 10 products with the highest summed profit.
kvi_top10

In [None]:
# Display the current KVC table: the 10 product categories with the highest summed profit.
kvc_top10

In [None]:
# Display the current traffic-driver table (top 10 products). Which metric it is
# ranked by depends on the cell that assigned it last: one cell in this notebook
# ranks by summed sales_qty, another by summed sales_amt.
traffic_top10

## Q5: Often promoted vs. seldom/never promoted products

In [None]:


# Assuming sample_transaction has been loaded

# Settings for the promotion-frequency heuristic.
# Both 'Coupon' and 'Coupons' are listed because the extraction query in this
# notebook spells the category "Coupon" while this cell originally filtered on
# 'Coupons'; excluding both guarantees coupons are dropped either way.
# NOTE(review): confirm the exact category labels against the products table.
EXCLUDED_CATEGORIES = ['Coupon', 'Coupons', 'In-Store']
PROMOTED_QUANTILE = 0.75   # std-dev percentile at/above which a product is "Often Promoted"
TOP_N_PROMOTED = 20        # number of products shown in the final table

# Exclude coupon and 'In-Store' transactions. .copy() makes the result an
# independent frame, so adding the 'month' column below does not write to a
# slice of sample_transaction (which raises SettingWithCopyWarning).
sample_transaction_filtered = sample_transaction[
    ~sample_transaction['prod_category'].isin(EXCLUDED_CATEGORIES)
].copy()

# Convert transaction dates to datetime format and extract the month
sample_transaction_filtered['month'] = pd.to_datetime(sample_transaction_filtered['trans_dt']).dt.to_period('M')

# Group by product and month to calculate monthly sales quantities.
# NOTE(review): months in which a product has no rows are absent here rather
# than counted as zero, so the std-dev below only reflects months with sales.
monthly_sales_qty = sample_transaction_filtered.groupby(['prod_id', 'month'])['sales_qty'].sum().reset_index()

# Standard deviation of monthly sales quantity per product; products with a
# single month yield NaN, which is replaced by 0.
std_sales_qty = (
    monthly_sales_qty.groupby('prod_id')['sales_qty']
    .std()
    .fillna(0)
    .reset_index()
    .rename(columns={'sales_qty': 'std_sales_qty'})
)

# "Often Promoted" threshold: the chosen percentile of the per-product standard deviations
threshold = std_sales_qty['std_sales_qty'].quantile(PROMOTED_QUANTILE)

# Label each product by comparing its standard deviation with the threshold
is_often_promoted = std_sales_qty['std_sales_qty'] >= threshold
std_sales_qty['promotion_frequency'] = is_often_promoted.map(
    {True: 'Often Promoted', False: 'Seldom/Never Promoted'}
)

# Top "Often Promoted" products, sorted by their standard deviation (largest first)
top_20_often_promoted = (
    std_sales_qty[std_sales_qty['promotion_frequency'] == 'Often Promoted']
    .sort_values('std_sales_qty', ascending=False)
    .head(TOP_N_PROMOTED)
)

# Attach each product's category via the unique (prod_id, prod_category) pairs
# found in the filtered transactions
prod_id_to_category = sample_transaction_filtered[['prod_id', 'prod_category']].drop_duplicates()
top_20_often_promoted_with_category = pd.merge(top_20_often_promoted, prod_id_to_category, on='prod_id', how='left')

# The resulting DataFrame has the top often-promoted products with their categories
top_20_often_promoted_with_category
