# In [40]:
import pandas as pd
import sqlite3

# Read the three take-home CSV extracts into DataFrames.
prods = pd.read_csv("PRODUCTS_TAKEHOME.csv")
trans = pd.read_csv("TRANSACTION_TAKEHOME.csv")
users = pd.read_csv("USER_TAKEHOME.csv")

# Open an in-memory SQLite database; nothing is persisted to disk.
conn = sqlite3.connect(":memory:")

# Mirror each DataFrame into its own SQLite table. An existing table of the
# same name is replaced, and the DataFrame index is not written as a column.
for _table_name, _frame in (
    ("products", prods),
    ("transactions", trans),
    ("users", users),
):
    _frame.to_sql(_table_name, conn, index=False, if_exists="replace")

# Cleaning PRODUCTS data: drop rows without a barcode, cast the barcode to
# text, and remove exact duplicate rows.
#
# The barcode is deliberately part of the de-duplication key. Grouping only by
# category/brand would collapse distinct products that share those attributes
# into a single row carrying an arbitrarily chosen barcode, silently dropping
# every other barcode in that group.
#
# The CAST expression is the same one used for transactions_cleaned, so the
# two cleanBarcode columns stay directly comparable.
# NOTE(review): if pandas loaded BARCODE as float, CAST(... AS TEXT) yields
# values with a trailing '.0' rather than a plain digit string -- confirm this
# is the intended key format.
conn.execute("""CREATE TABLE products_cleaned AS
SELECT DISTINCT
    CAST(barcode AS TEXT) AS cleanBarcode,
    CATEGORY_1, CATEGORY_2, CATEGORY_3, CATEGORY_4, BRAND
FROM products
WHERE BARCODE IS NOT NULL""")

# Cleaning TRANSACTIONS data: build transactions_cleaned from the raw table.
#   - barcode is cast to TEXT and exposed as cleanBarcode
#   - PURCHASE_DATE and SCAN_DATE are passed through SQLite's DATETIME()
#   - FINAL_QUANTITY is exposed under the new name cleanQty, so queries against
#     transactions_cleaned must use cleanQty (FINAL_QUANTITY does not exist there)
#   - rows whose FINAL_QUANTITY is the literal string 'zero' are excluded
conn.execute("""CREATE TABLE transactions_cleaned AS 
SELECT 
    CAST( barcode AS TEXT) AS cleanBarcode, 
    RECEIPT_ID, 
    DATETIME(PURCHASE_DATE) AS PURCHASE_DATE, 
    DATETIME(SCAN_DATE) AS SCAN_DATE, STORE_NAME, USER_ID,
    FINAL_QUANTITY AS cleanQty, FINAL_SALE
    
FROM transactions
WHERE FINAL_QUANTITY != 'zero'""") 
## NOTE: no de-duplication is applied to this table; the GROUP BY that would do it is left disabled:
##GROUP BY CAST( barcode AS TEXT), RECEIPT_ID,  DATETIME(PURCHASE_DATE),  DATETIME(SCAN_DATE), FINAL_QUANTITY, FINAL_SALE

# Cleaning USERS data: build users_cleaned from the raw table.
#   - rows with a NULL ID are dropped
#   - CREATED_DATE and BIRTH_DATE are passed through SQLite's DATETIME()
#   - the GROUP BY lists every selected expression, so it acts as a DISTINCT
#     and removes exact duplicate rows

conn.execute( """CREATE TABLE users_cleaned AS 
SELECT 
    ID, 
    DATETIME(CREATED_DATE) AS CREATED_DATE, 
    DATETIME(BIRTH_DATE) AS BIRTH_DATE, 
    STATE, LANGUAGE, GENDER
FROM users
WHERE ID IS NOT NULL
GROUP BY ID, DATETIME(CREATED_DATE), DATETIME(BIRTH_DATE), STATE, LANGUAGE, GENDER""") 

# Commit the three CREATE TABLE ... AS statements issued on this connection.
conn.commit()

## Here are my validation queries. This is where I test the results of the
## new cleaned tables.
## NOTE: transactions_cleaned exposes the quantity column as cleanQty (the raw
## FINAL_QUANTITY name does not exist in that table), so every query against
## it must reference cleanQty.

## Distinct category paths whose CATEGORY_2 contains "Dips & Salsa".
query1 = """ SELECT CATEGORY_1, CATEGORY_2, CATEGORY_3, CATEGORY_4
FROM products_cleaned
WHERE CATEGORY_2 like '%Dips & Salsa%'
GROUP BY CATEGORY_1, CATEGORY_2, CATEGORY_3, CATEGORY_4
ORDER BY CATEGORY_2
LIMIT 100
"""

## Sample of the cleaned users table.
query2 = """ SELECT *
FROM users_cleaned
LIMIT 100
"""

## Number of cleaned transactions with no barcode. COUNT(*) without GROUP BY
## always returns exactly one row, so no LIMIT is needed.
query3 = """ SELECT COUNT(*)
FROM transactions_cleaned
WHERE cleanBarcode is null
"""

## Fully identical transaction rows (every column matches) that occur more
## than once.
query4 = """ SELECT   cleanBarcode,
    RECEIPT_ID,
    PURCHASE_DATE,
    SCAN_DATE, STORE_NAME, USER_ID,
    cleanQty, FINAL_SALE, COUNT(*)
FROM transactions_cleaned
GROUP BY cleanBarcode, RECEIPT_ID, PURCHASE_DATE, SCAN_DATE, STORE_NAME, USER_ID, cleanQty, FINAL_SALE
HAVING  COUNT(*) > 1
LIMIT 10
"""

## Barcode / sale / quantity combinations that occur more than once.
## PURCHASE_DATE is not part of the grouping key, so it is aggregated with
## MIN() (earliest purchase in the group) instead of being selected as a bare
## column, which would return the value of an arbitrary row.
query5 = """SELECT cleanBarcode, MIN(PURCHASE_DATE) AS PURCHASE_DATE, FINAL_SALE, cleanQty, COUNT(*)
FROM transactions_cleaned
GROUP BY cleanBarcode, FINAL_SALE, cleanQty
HAVING  COUNT(*) > 1"""

##validates cleanBarcode for transaction data
query6 = """SELECT cleanBarcode, FINAL_SALE, cleanQty, COUNT(*)
FROM transactions_cleaned
WHERE cleanBarcode IS NULL OR cleanBarcode = '0.0' OR cleanBarcode = '-1.0'
GROUP BY cleanBarcode, FINAL_SALE, cleanQty
"""

## Checks for transactions recorded with a quantity of 0.
query = """SELECT cleanQty, SUM(FINAL_SALE), COUNT(*)
FROM transactions_cleaned
WHERE cleanQty = 0
GROUP BY cleanQty
"""

# Execute the currently selected validation query on the in-memory database
# and show the resulting DataFrame.
df = pd.read_sql_query(sql=query, con=conn)
print(df)


# Output of the cell above:
# Empty DataFrame
# Columns: [cleanQty, SUM(FINAL_SALE), COUNT(*)]
# Index: []
