In [66]:
import mysql.connector
import json
from datetime import datetime

def convert_to_datetime(ts):
    """
    Convert a timestamp (either in seconds or milliseconds)
    to a Python datetime object.
    """
    try:
        ts = float(ts)
    except Exception:
        return None
    # If the numeric timestamp is in milliseconds, convert to seconds.
    if ts > 1e10:
        ts /= 1000.0
    return datetime.fromtimestamp(ts)

def load_json_objects(file_path):
    data = []
    seen_ids = set()  # Track unique IDs
    duplicates = 0    # Count duplicates
    
    with open(file_path) as f:
        for line in f:
            if line.strip():
                record = json.loads(line)
                # Get the unique ID from the record (using _id.$oid)
                record_id = record.get('_id', {}).get('$oid')
                if record_id not in seen_ids:
                    seen_ids.add(record_id)
                    data.append(record)
                else:
                    duplicates += 1
                    
    print(f"Loaded {len(data)} unique records from {file_path}")
    print(f"Found {duplicates} duplicate records")
    return data

# Connect to MySQL database
conn = mysql.connector.connect(
    host='localhost',
    user='root',  # replace with your MySQL username
    # password='your_password',  # replace with your MySQL password
)
cursor = conn.cursor()
# First, drop the previous database
cursor.execute("DROP DATABASE IF EXISTS `fetch`")
cursor.execute("CREATE DATABASE `fetch`")
conn.database = 'fetch'

# Create USER table
cursor.execute("""
CREATE TABLE IF NOT EXISTS USER (
    user_id VARCHAR(255) PRIMARY KEY,
    state VARCHAR(255),
    created_date DATETIME,
    last_login DATETIME,
    role VARCHAR(255),
    active BOOLEAN,
    sign_up_source VARCHAR(255)
);
""")

# Create BRAND table
cursor.execute("""
CREATE TABLE IF NOT EXISTS BRAND (
    brand_id VARCHAR(255) PRIMARY KEY,
    barcode VARCHAR(255),
    name VARCHAR(255),
    brand_code VARCHAR(255),
    category VARCHAR(255),
    category_code VARCHAR(255),
    cpg_id VARCHAR(255),
    top_brand BOOLEAN
);
""")

# Create RECEIPT table
cursor.execute("""
CREATE TABLE IF NOT EXISTS RECEIPT (
    receipt_id VARCHAR(255) PRIMARY KEY,
    user_id VARCHAR(255) DEFAULT NULL,
    bonus_points_earned INT,
    bonus_points_earned_reason TEXT,
    create_date DATETIME,
    date_scanned DATETIME,
    finished_date DATETIME,
    modify_date DATETIME,
    points_awarded_date DATETIME,
    points_earned DECIMAL(10, 2),
    purchase_date DATETIME,
    purchased_item_count INT,
    total_spent DECIMAL(10, 2),
    rewards_receipt_status VARCHAR(255)
);
""")

# Create RECEIPT_ITEM table
cursor.execute("""
CREATE TABLE IF NOT EXISTS RECEIPT_ITEM (
    receipt_item_id VARCHAR(255) PRIMARY KEY,
    receipt_id VARCHAR(255),
    barcode VARCHAR(255),
    description TEXT,
    final_price DECIMAL(10, 2),
    item_price DECIMAL(10, 2),
    quantity_purchased INT,
    partner_item_id VARCHAR(255),
    needs_fetch_review BOOLEAN,
    prevent_target_gap_points BOOLEAN,
    user_flagged_barcode VARCHAR(255),
    user_flagged_new_item BOOLEAN,
    user_flagged_price DECIMAL(10, 2),
    user_flagged_quantity INT,
    brand_code VARCHAR(255),
    FOREIGN KEY (receipt_id) REFERENCES RECEIPT(receipt_id)
);
""")

# Load JSON data
users_data = load_json_objects('users.json')
brands_data = load_json_objects('brands.json')
receipts_data = load_json_objects('receipts.json')

# Insert USERS data
for user in users_data:
    try:
        created_date = convert_to_datetime(user.get('createdDate', {}).get('$date', 0))
        last_login = convert_to_datetime(user.get('lastLogin', {}).get('$date', 0))
        cursor.execute("""
            INSERT INTO USER (user_id, state, created_date, last_login, role, active, sign_up_source)
            VALUES (%s, %s, %s, %s, %s, %s, %s)
        """, (
            user['_id']['$oid'],
            user.get('state', ''),
            created_date,
            last_login,
            user.get('role', ''),
            user.get('active', False),
            user.get('signUpSource', '')
        ))
    except mysql.connector.IntegrityError:
        print(f"Skipping duplicate user_id: {user['_id']['$oid']}")
        continue

# Insert BRANDS data
for brand in brands_data:
    try:
        cursor.execute("""
            INSERT INTO BRAND (brand_id, barcode, brand_code, category, category_code, cpg_id, top_brand, name)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
        """, (
            brand['_id']['$oid'],
            brand.get('barcode', ''),
            brand.get('brandCode', ''),
            brand.get('category', ''),
            brand.get('categoryCode', ''),
            brand.get('cpg', {}).get('$id', {}).get('$oid', ''),
            brand.get('topBrand', False),
            brand.get('name', '')
        ))
    except mysql.connector.IntegrityError:
        print(f"Skipping duplicate brand_id: {brand['_id']['$oid']}")
        continue

# Insert RECEIPTS data & process nested receipt items
for receipt in receipts_data:
    try:
        receipt_id = receipt['_id']['$oid']
        cursor.execute("""
            INSERT INTO RECEIPT (
                receipt_id,
                user_id,
                bonus_points_earned,
                bonus_points_earned_reason,
                create_date,
                date_scanned,
                finished_date,
                modify_date,
                points_awarded_date,
                points_earned,
                purchase_date,
                purchased_item_count,
                total_spent,
                rewards_receipt_status
            )
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """, (
            receipt_id,
            receipt.get('userId', ''),
            receipt.get('bonusPointsEarned', 0),
            receipt.get('bonusPointsEarnedReason', ''),
            convert_to_datetime(receipt.get('createDate', {}).get('$date', 0)),
            convert_to_datetime(receipt.get('dateScanned', {}).get('$date', 0)),
            convert_to_datetime(receipt.get('finishedDate', {}).get('$date', 0)),
            convert_to_datetime(receipt.get('modifyDate', {}).get('$date', 0)),
            convert_to_datetime(receipt.get('pointsAwardedDate', {}).get('$date', 0)),
            receipt.get('pointsEarned', 0),
            convert_to_datetime(receipt.get('purchaseDate', {}).get('$date', 0)),
            receipt.get('purchasedItemCount', 0),
            receipt.get('totalSpent', 0),
            receipt.get('rewardsReceiptStatus', '')
        ))
    except mysql.connector.IntegrityError as e:
        print(f"Error inserting receipt_id {receipt_id}: {e}")
        continue

    # Process nested receipt items using the key "rewardsReceiptItemList"
    receipt_items = receipt.get("rewardsReceiptItemList", [])
    for index, item in enumerate(receipt_items):
        receipt_item_id = f"{receipt_id}-{index}"

        print(item)
        try:
            cursor.execute("""
                INSERT INTO RECEIPT_ITEM (
                    receipt_item_id,
                    receipt_id,
                    barcode,
                    description,
                    final_price,
                    item_price,
                    quantity_purchased,
                    partner_item_id,
                    needs_fetch_review,
                    prevent_target_gap_points,
                    user_flagged_barcode,
                    user_flagged_new_item,
                    user_flagged_price,
                    user_flagged_quantity,
                    brand_code
                )
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            """, (
                receipt_item_id,
                receipt_id,
                item.get('barcode', ''),
                item.get('description', ''),
                float(item.get('finalPrice', 0)),
                float(item.get('itemPrice', 0)), 
                int(item.get('quantityPurchased', 0)),
                item.get('partnerItemId', ''),
                item.get('needsFetchReview', False),
                item.get('preventTargetGapPoints', False),
                item.get('userFlaggedBarcode', ''),
                item.get('userFlaggedNewItem', False),
                float(item.get('userFlaggedPrice', 0)),
                int(item.get('userFlaggedQuantity', 0)),
                item.get('brandCode', '')
            ))
        except mysql.connector.IntegrityError:
            print(f"Skipping duplicate receipt_item_id: {receipt_item_id}")
            continue

conn.commit()

# Verification: print counts for each table
tables = ['USER', 'BRAND', 'RECEIPT', 'RECEIPT_ITEM']
for table in tables:
    print(f"\nTable: {table}")
    cursor.execute(f"SHOW COLUMNS FROM {table};")
    columns_info = cursor.fetchall()  
    columns = [col[0] for col in columns_info]
    print("Columns:", columns)
    
    cursor.execute(f"SELECT * FROM {table} LIMIT 5")
    rows = cursor.fetchall()
    print("Sample rows:")
    for row in rows:
        print(row)

cursor.close()
conn.close()

Loaded 212 unique records from users.json
Found 283 duplicate records
Loaded 1167 unique records from brands.json
Found 0 duplicate records
Loaded 1119 unique records from receipts.json
Found 0 duplicate records
{'barcode': '4011', 'description': 'ITEM NOT FOUND', 'finalPrice': '26.00', 'itemPrice': '26.00', 'needsFetchReview': False, 'partnerItemId': '1', 'preventTargetGapPoints': True, 'quantityPurchased': 5, 'userFlaggedBarcode': '4011', 'userFlaggedNewItem': True, 'userFlaggedPrice': '26.00', 'userFlaggedQuantity': 5}
{'barcode': '4011', 'description': 'ITEM NOT FOUND', 'finalPrice': '1', 'itemPrice': '1', 'partnerItemId': '1', 'quantityPurchased': 1}
{'barcode': '028400642255', 'description': 'DORITOS TORTILLA CHIP SPICY SWEET CHILI REDUCED FAT BAG 1 OZ', 'finalPrice': '10.00', 'itemPrice': '10.00', 'needsFetchReview': True, 'needsFetchReviewReason': 'USER_FLAGGED', 'partnerItemId': '2', 'pointsNotAwardedReason': 'Action not allowed for user and CPG', 'pointsPayerId': '5332f5fbe4b

In [84]:
conn = mysql.connector.connect(
    host='localhost',
    user='root',  
    database='fetch',
)

cursor = conn.cursor()

In [117]:
# What are the top 5 brands by receipts scanned for most recent month?

query = """

SELECT 
    DATE_FORMAT(r.date_scanned, '%Y-%m') AS formatted_date,
    b.brand_id,
    b.name,
    COUNT(DISTINCT ri.receipt_id) AS receipts_scanned
FROM BRAND b
INNER JOIN RECEIPT_ITEM ri 
ON (
    CASE
        WHEN b.barcode = ri.user_flagged_barcode THEN 1
        WHEN b.barcode = ri.barcode THEN 2
        WHEN b.brand_code = ri.brand_code AND b.brand_code <> '' AND b.brand_code IS NOT NULL THEN 3
        ELSE 0
    END
) > 0
LEFT JOIN RECEIPT r
ON r.receipt_id = ri.receipt_id
GROUP BY DATE_FORMAT(r.date_scanned, '%Y-%m'), b.brand_id, b.name
ORDER BY formatted_date DESC, receipts_scanned DESC
LIMIT 5;

"""

# Execute the query.
cursor.execute(query)
results = cursor.fetchall()

# Print the results.
print("Top 5 brands based on distinct receipts scanned in the latest month:")
for row in results:
    print(row)

Top 5 brands based on distinct receipts scanned in the latest month:
('2021-02', '5bd201f090fa074576779a1a', 'Viva', 1)
('2021-01', '5332f5fbe4b03c9a25efd0b9', 'Pepsi', 23)
('2021-01', '55a41b88e4b0d0a65b3692f0', 'Kraft', 22)
('2021-01', '5bd2013f965c7d66d92731ec', 'Kleenex', 21)
('2021-01', '592486bee410d61fcea3d130', 'KNORR', 19)


In [118]:
# What are the top 5 brands by receipts scanned for most recent month?
# Test

import pandas as pd

# Assuming 'cursor' is your database cursor
query = """
WITH BrandCount AS (
    SELECT 
        DATE_FORMAT(r.date_scanned, '%Y-%m') AS formatted_date,
        COUNT(DISTINCT b.brand_id) AS brand_count
    FROM BRAND b
    INNER JOIN RECEIPT_ITEM ri 
    ON (
        b.barcode = ri.barcode 
        OR b.barcode = ri.user_flagged_barcode 
        OR (b.brand_code = ri.brand_code AND b.brand_code <> '' AND b.brand_code IS NOT NULL)
    )
    LEFT JOIN RECEIPT r
    ON r.receipt_id = ri.receipt_id
    GROUP BY DATE_FORMAT(r.date_scanned, '%Y-%m')
)
SELECT 
    DATE_FORMAT(r.date_scanned, '%Y-%m') AS formatted_date,
    b.brand_id,
    b.name,
    COUNT(DISTINCT ri.receipt_id) AS receipts_scanned
FROM BRAND b
INNER JOIN RECEIPT_ITEM ri 
ON (
    CASE
        WHEN b.barcode = ri.user_flagged_barcode THEN 1
        WHEN b.barcode = ri.barcode THEN 2
        WHEN b.brand_code = ri.brand_code AND b.brand_code <> '' AND b.brand_code IS NOT NULL THEN 3
        ELSE 0
    END
) > 0
LEFT JOIN RECEIPT r
ON r.receipt_id = ri.receipt_id
WHERE DATE_FORMAT(r.date_scanned, '%Y-%m') IN (
    SELECT formatted_date 
    FROM BrandCount 
    WHERE brand_count >= 5
    UNION
    SELECT DATE_FORMAT(DATE_SUB(MIN(r.date_scanned), INTERVAL 1 MONTH), '%Y-%m')
    FROM BrandCount 
    WHERE brand_count < 5
)
GROUP BY DATE_FORMAT(r.date_scanned, '%Y-%m'), b.brand_id, b.name
ORDER BY formatted_date DESC, receipts_scanned DESC
LIMIT 5;
"""

cursor.execute(query)
results = cursor.fetchall()

# Get column names from the cursor
columns = [desc[0] for desc in cursor.description]

# Create a DataFrame
df = pd.DataFrame(results, columns=columns)

# Print the DataFrame
print(df)


  formatted_date                  brand_id     name  receipts_scanned
0        2021-01  5332f5fbe4b03c9a25efd0b9    Pepsi                23
1        2021-01  55a41b88e4b0d0a65b3692f0    Kraft                22
2        2021-01  5bd2013f965c7d66d92731ec  Kleenex                21
3        2021-01  592486bee410d61fcea3d130    KNORR                19
4        2021-01  5887a372e4b02187f85cdad9  Doritos                19


In [119]:
# What are the top 5 brands by receipts scanned for most recent month?

query = """
-- Step 1: Create a CTE to count the number of unique brands for each year-month
WITH BrandCount AS (
    SELECT 
        DATE_FORMAT(r.date_scanned, '%Y-%m') AS formatted_date,  -- Extract year-month
        COUNT(DISTINCT b.brand_id) AS brand_count  -- Count unique brands in that month
    FROM BRAND b
    INNER JOIN RECEIPT_ITEM ri 
    ON (
        b.barcode = ri.barcode 
        OR b.barcode = ri.user_flagged_barcode 
        OR (b.brand_code = ri.brand_code AND b.brand_code <> '' AND b.brand_code IS NOT NULL)
    )
    LEFT JOIN RECEIPT r
    ON r.receipt_id = ri.receipt_id
    GROUP BY DATE_FORMAT(r.date_scanned, '%Y-%m')  -- Group by year-month
)

-- Step 2: Retrieve the top 5 brands considering the latest available month
SELECT 
    DATE_FORMAT(r.date_scanned, '%Y-%m') AS formatted_date,  -- Format the date as YYYY-MM
    b.brand_id,  -- Brand ID
    b.name,  -- Brand name
    COUNT(DISTINCT ri.receipt_id) AS receipts_scanned  -- Count of distinct receipts scanned
FROM BRAND b
INNER JOIN RECEIPT_ITEM ri 
ON (
    CASE
        WHEN b.barcode = ri.user_flagged_barcode THEN 1
        WHEN b.barcode = ri.barcode THEN 2
        WHEN b.brand_code = ri.brand_code AND b.brand_code <> '' AND b.brand_code IS NOT NULL THEN 3
        ELSE 0
    END
) > 0
LEFT JOIN RECEIPT r
ON r.receipt_id = ri.receipt_id

-- Step 3: Apply filtering logic to ensure at least 5 brands are selected
WHERE DATE_FORMAT(r.date_scanned, '%Y-%m') IN (
    -- If a month has 5 or more brands, use that month
    SELECT formatted_date 
    FROM BrandCount 
    WHERE brand_count >= 5
    
    UNION
    
    -- If the latest month has fewer than 5 brands, include the previous month instead
    SELECT DATE_FORMAT(DATE_SUB(MIN(r.date_scanned), INTERVAL 1 MONTH), '%Y-%m')
    FROM BrandCount 
    WHERE brand_count < 5
)

-- Step 4: Group by formatted year-month and brand details
GROUP BY DATE_FORMAT(r.date_scanned, '%Y-%m'), b.brand_id, b.name

-- Step 5: Order the results by most recent month and the highest receipt count
ORDER BY formatted_date DESC, receipts_scanned DESC

-- Step 6: Limit the output to 5 results
LIMIT 5;
"""

# Execute the query.
cursor.execute(query)
results = cursor.fetchall()

# Print the results.
print("Top 5 brands based on distinct receipts scanned in the latest month:")
for row in results:
    print(row)

Top 5 brands based on distinct receipts scanned in the latest month:
('2021-01', '5332f5fbe4b03c9a25efd0b9', 'Pepsi', 23)
('2021-01', '55a41b88e4b0d0a65b3692f0', 'Kraft', 22)
('2021-01', '5bd2013f965c7d66d92731ec', 'Kleenex', 21)
('2021-01', '592486bee410d61fcea3d130', 'KNORR', 19)
('2021-01', '5887a372e4b02187f85cdad9', 'Doritos', 19)


In [120]:
# How does the ranking of the top 5 brands by receipts scanned for the recent month compare to the ranking for the previous month?
# Example
# Brand   Ranking_this_month  Rainking_last_month
# A        1                  4
# B        2                  2
# C        3                  3
# D        4                  1
# E        5                  N/A

query = """
-- Step 1: Identify months with at least 5 brands; otherwise, use the previous month
WITH BrandCount AS (
    SELECT 
        DATE_FORMAT(r.date_scanned, '%Y-%m') AS formatted_date,  
        COUNT(DISTINCT b.brand_id) AS brand_count  
    FROM BRAND b
    INNER JOIN RECEIPT_ITEM ri 
    ON (
        CASE
            WHEN b.barcode = ri.user_flagged_barcode THEN 1
            WHEN b.barcode = ri.barcode THEN 2
            WHEN b.brand_code = ri.brand_code AND b.brand_code <> '' AND b.brand_code IS NOT NULL THEN 3
            ELSE 0
        END
    ) > 0
    LEFT JOIN RECEIPT r
    ON r.receipt_id = ri.receipt_id
    GROUP BY formatted_date  
),
ValidMonths AS (
    -- Get months with at least 5 brands or fallback to the previous month
    SELECT formatted_date 
    FROM BrandCount 
    WHERE brand_count >= 5
    UNION
    SELECT DATE_FORMAT(DATE_SUB(MIN(formatted_date), INTERVAL 1 MONTH), '%Y-%m')
    FROM BrandCount 
    WHERE brand_count < 5
),
monthly AS (
    -- Step 2: Retrieve brand data for the two most recent valid months
    SELECT 
        DATE_FORMAT(r.date_scanned, '%Y-%m') AS formatted_date,
        b.brand_id,
        b.name,
        COUNT(DISTINCT ri.receipt_id) AS receipts_scanned
    FROM BRAND b
    INNER JOIN RECEIPT_ITEM ri 
    ON (
        b.barcode = ri.barcode 
        OR b.barcode = ri.user_flagged_barcode 
        OR (b.brand_code = ri.brand_code AND b.brand_code <> '' AND b.brand_code IS NOT NULL)
    )
    LEFT JOIN RECEIPT r
    ON r.receipt_id = ri.receipt_id
    WHERE DATE_FORMAT(r.date_scanned, '%Y-%m') IN (SELECT formatted_date FROM ValidMonths)
    GROUP BY formatted_date, b.brand_id, b.name
),
ranked AS (
    -- Step 3: Assign rankings to brands for each valid month
    SELECT 
        formatted_date,
        brand_id,
        name,
        receipts_scanned,
        RANK() OVER (PARTITION BY formatted_date ORDER BY receipts_scanned DESC) AS ranking
    FROM monthly
),
latest_month AS (
    -- Step 4: Identify the most recent valid month
    SELECT formatted_date FROM ValidMonths ORDER BY formatted_date DESC LIMIT 1
),
previous_month AS (
    -- Step 5: Identify the previous valid month
    SELECT formatted_date FROM ValidMonths ORDER BY formatted_date DESC LIMIT 1 OFFSET 1
)
-- Step 6: Compare rankings between the most recent and previous valid months
SELECT 
    r1.name AS Brand,
    r1.ranking AS Ranking_this_month,
    COALESCE(r2.ranking, 'N/A') AS Ranking_last_month
FROM ranked r1
LEFT JOIN ranked r2 
ON r1.brand_id = r2.brand_id  -- Match by brand_id for accuracy
AND r2.formatted_date = (SELECT formatted_date FROM previous_month)  -- Match with previous month
WHERE r1.formatted_date = (SELECT formatted_date FROM latest_month)  -- Get latest month data
ORDER BY r1.ranking;


"""

cursor.execute(query)
results = cursor.fetchall()
print(results)

# Filter only the top 5 ranked brands
top_5_brands = results[:5]

print("Brand | Ranking_this_month | Ranking_last_month")
print("------------------------------------------------")
for row in top_5_brands:
    print(f"{row[0]} | {row[1]} | {row[2]}")

[('Pepsi', 1, 'N/A'), ('Kraft', 2, 'N/A'), ('Kleenex', 3, 'N/A'), ('Doritos', 4, 'N/A'), ('KNORR', 4, 'N/A'), ('Rice A Roni', 6, 'N/A'), ('Cracker Barrel Cheese', 7, 'N/A'), ("Mayo by HELLMANN'S/BEST FOODS", 7, 'N/A'), ('Swanson', 9, 'N/A'), ('Yuban Coffee', 9, 'N/A'), ('Dole Chilled Fruit Juices', 9, 'N/A'), ('Tostitos', 9, 'N/A'), ('Planters', 13, 'N/A'), ('Quaker', 14, 'N/A'), ('Jell-O', 15, 'N/A'), ('Mountain Dew', 15, 'N/A'), ('Philadelphia', 17, 'N/A'), ('Velveeta', 17, 'N/A'), ('Cool Whip', 19, 'N/A'), ('Cheetos', 19, 'N/A'), ('Classico', 19, 'N/A'), ('Lunchables', 19, 'N/A'), ('Diet Chris Cola', 19, 'N/A'), ('Prego', 19, 'N/A'), ('Pepperidge Farm', 19, 'N/A'), ('Kettle Brand', 26, 'N/A'), ('V8', 26, 'N/A'), ('Huggies', 28, 'N/A'), ('Oscar Mayer', 28, 'N/A'), ('Huggies', 28, 'N/A'), ('Grey Poupon', 31, 'N/A'), ('Pacific Foods', 31, 'N/A'), ('JUST CRACK AN EGG Scramble Kit', 31, 'N/A'), ('Ore-Ida', 31, 'N/A'), ('Cottonelle', 31, 'N/A'), ('KLONDIKE', 31, 'N/A'), ('Stove Top', 31, 

In [121]:
# When considering average spend from receipts with 'rewardsReceiptStatus’ of ‘Accepted’ or ‘Rejected’, which is greater?
# When considering total number of items purchased from receipts with 'rewardsReceiptStatus’ of ‘Accepted’ or ‘Rejected’, which is greater?
# Assume Finished means the receipt is processed and accepted

query_avg_spend = """
SELECT
    rewards_receipt_status,
    COUNT(receipt_id) AS receipt_count, 
    SUM(purchased_item_count) AS count_itemsPurchased, 
    AVG(purchased_item_count) AS avg_itemsPurchased, 
	AVG(total_spent) AS avg_spend
FROM RECEIPT
where rewards_receipt_status in ('FINISHED', 'REJECTED')
group by rewards_receipt_status
"""

# Execute the query.
cursor.execute(query_avg_spend)
results = cursor.fetchall()

# # Print the results.
# print("Rewards Receipt Status | Receipt Count | Count of Items Purchased | Average of Items Purchased | Average Spend")
# print("-----------------------------------------")
# print(results)

# Get column names from the cursor
columns = [desc[0] for desc in cursor.description]

# Create a DataFrame
df = pd.DataFrame(results, columns=columns)

# Print the DataFrame
print(df)



  rewards_receipt_status  receipt_count count_itemsPurchased  \
0               REJECTED             71                  173   
1               FINISHED            518                 8184   

  avg_itemsPurchased  avg_spend  
0             2.4366  23.326056  
1            15.7992  80.854305  


In [116]:
# Which brand has the most spend among users who were created within the past 6 months?

query = """
WITH max_date AS (
    SELECT MAX(created_date) AS max_created_date
    FROM USER
),
recent_users AS (
    SELECT user_id
    FROM USER, max_date
    WHERE created_date >= DATE_SUB(max_created_date, INTERVAL 6 MONTH)
),
brand_spend AS (
    SELECT 
        b.brand_id,
        b.name,
        SUM(r.total_spent) AS total_spend
    FROM RECEIPT r
    JOIN RECEIPT_ITEM ri ON r.receipt_id = ri.receipt_id
    INNER JOIN BRAND b 
    ON (
        CASE
            WHEN b.barcode = ri.user_flagged_barcode THEN 1
            WHEN b.barcode = ri.barcode THEN 2
            WHEN b.brand_code = ri.brand_code AND b.brand_code <> '' AND b.brand_code IS NOT NULL THEN 3
            ELSE 0
        END
    ) > 0
    WHERE r.user_id IN (SELECT user_id FROM recent_users)
    GROUP BY b.brand_id, b.name
)
SELECT b.brand_id, b.name, total_spend
FROM brand_spend b
ORDER BY total_spend DESC
LIMIT 1
"""

cursor.execute(query)
result = cursor.fetchone()

print("Brand with Most Spend Among Users Created Within the Past 6 Months:")
print("---------------------------------------------------------------------")
if result:
    print(f"Brand ID: {result[0]}, Name: {result[1]}, Total Spend: ${result[2]:.2f}")

Brand with Most Spend Among Users Created Within the Past 6 Months:
---------------------------------------------------------------------
Brand ID: 5332f5fbe4b03c9a25efd0b9, Name: Pepsi, Total Spend: $78870.86


In [125]:
# Which brand has the most transactions among users who were created within the past 6 months?
query = """
WITH max_date AS (
    SELECT MAX(created_date) AS max_created_date
    FROM USER
),
recent_users AS (
    SELECT user_id
    FROM USER, max_date
    WHERE created_date >= DATE_SUB(max_created_date, INTERVAL 6 MONTH)
),
brand_transactions AS (
    SELECT 
        b.brand_id,
        b.name,
        COUNT(DISTINCT r.receipt_id) AS transaction_count
    FROM RECEIPT r
    JOIN RECEIPT_ITEM ri ON r.receipt_id = ri.receipt_id
    INNER JOIN BRAND b 
    ON (
        CASE
            WHEN b.barcode = ri.user_flagged_barcode THEN 1
            WHEN b.barcode = ri.barcode THEN 2
            WHEN b.brand_code = ri.brand_code AND b.brand_code <> '' AND b.brand_code IS NOT NULL THEN 3
            ELSE 0
        END
    ) > 0
    WHERE r.user_id IN (SELECT user_id FROM recent_users)
    GROUP BY b.brand_id, b.name
)
SELECT b.brand_id, b.name, transaction_count
FROM brand_transactions b
ORDER BY transaction_count DESC
LIMIT 1
"""

cursor.execute(query)
result = cursor.fetchone()

# Get column names from the cursor
columns = [desc[0] for desc in cursor.description]

# Create a DataFrame
df = pd.DataFrame(result, columns=columns)

# Print the DataFrame
print(df)

ValueError: 3 columns passed, passed data had 5 columns