In [None]:
import pandas as pd
import sqlite3

# Read the three take-home CSV exports into DataFrames, in the order
# products, transactions, users.
prods, trans, users = (
    pd.read_csv(csv_path)
    for csv_path in (
        "PRODUCTS_TAKEHOME.csv",
        "TRANSACTION_TAKEHOME.csv",
        "USER_TAKEHOME.csv",
    )
)



# Open a SQLite database that exists only in memory for this session
conn = sqlite3.connect(":memory:")

# Copy each DataFrame into its own table (without the pandas index),
# replacing the table if it is already present
for table_name, frame in (
    ("products", prods),
    ("transactions", trans),
    ("users", users),
):
    frame.to_sql(table_name, conn, index=False, if_exists="replace")

# SQL expression that renders a barcode as a plain digit string.
#
# When pandas parses the barcode column as float, SQLite stores it as REAL,
# and CAST(<REAL> AS TEXT) keeps the float rendering (e.g. "12345.0", or
# exponent form for large values). REAL values are therefore converted
# through INTEGER first; values of any other storage class (INTEGER, TEXT,
# NULL) are cast to TEXT directly, exactly as before.
#
# The same expression is used for both tables so that cleanBarcode has an
# identical format on either side of a join.
barcode_as_text = """CASE WHEN typeof(BARCODE) = 'real'
         THEN CAST(CAST(BARCODE AS INTEGER) AS TEXT)
         ELSE CAST(BARCODE AS TEXT)
    END"""

# Cleaning PRODUCTS data: drop rows with no barcode, remove exact duplicate
# rows, and store the barcode as text.
#
# DISTINCT is applied over every selected column, barcode included. The
# previous GROUP BY listed only the category/brand columns, so SQLite kept a
# single arbitrary barcode per category/brand combination and discarded all
# other products sharing that combination.
conn.execute(f"""CREATE TABLE products_cleaned AS
SELECT DISTINCT
    {barcode_as_text} AS cleanBarcode,
    CATEGORY_1, CATEGORY_2, CATEGORY_3, CATEGORY_4, BRAND
FROM products
WHERE BARCODE IS NOT NULL""")

# Cleaning TRANSACTIONS data: store the barcode as text and normalise the
# purchase/scan timestamps with DATETIME(). No rows are filtered out, so a
# NULL barcode stays NULL.
conn.execute(f"""CREATE TABLE transactions_cleaned AS
SELECT
    {barcode_as_text} AS cleanBarcode,
    RECEIPT_ID,
    DATETIME(PURCHASE_DATE) AS PURCHASE_DATE,
    DATETIME(SCAN_DATE) AS SCAN_DATE, STORE_NAME, USER_ID,
    FINAL_QUANTITY, FINAL_SALE
FROM transactions""")

# Build USERS without null IDs and without duplicate rows: grouping by every
# selected column collapses identical rows into one. The two date columns
# are passed through DATETIME().
users_cleaned_sql = """CREATE TABLE users_cleaned AS 
SELECT 
    ID, 
    DATETIME(CREATED_DATE) AS CREATED_DATE, 
    DATETIME(BIRTH_DATE) AS BIRTH_DATE, 
    STATE, LANGUAGE, GENDER
FROM users
WHERE ID IS NOT NULL
GROUP BY ID, DATETIME(CREATED_DATE), DATETIME(BIRTH_DATE), STATE, LANGUAGE, GENDER"""
conn.execute(users_cleaned_sql)

# Commit the pending transaction on the connection
conn.commit()

## Ad-hoc queries used to spot-check the cleaned tables.
## Only `query` is executed below; the others are kept for manual use.

# Distinct category paths whose second-level category mentions "Dips & Salsa"
query1 = """ SELECT CATEGORY_1, CATEGORY_2, CATEGORY_3, CATEGORY_4
FROM products_cleaned
WHERE CATEGORY_2 like '%Dips & Salsa%'
GROUP BY CATEGORY_1, CATEGORY_2, CATEGORY_3, CATEGORY_4
ORDER BY CATEGORY_2
LIMIT 100
"""

# First 100 rows of the cleaned users table
query2 = """ SELECT *
FROM users_cleaned
LIMIT 100
"""

# First 10 rows of the cleaned transactions table
query = """ SELECT *
FROM transactions_cleaned
LIMIT 10
"""

# Run the selected query against the in-memory database and show the result
df = pd.read_sql_query(sql=query, con=conn)
print(df)


                          ID         CREATED_DATE           BIRTH_DATE STATE  \
0   5351b1efe4b0e3638af82697  2014-04-18 23:14:55  1995-01-12 00:00:00  None   
1   536178cfe4b012a86bd734f0  2014-04-30 22:27:27  1990-01-23 00:00:00  None   
2   5364098ae4b0a1ed6a0c2f85  2014-05-02 21:09:30  1995-01-19 00:00:00  None   
3   536409c8e4b060d2e95bb71b  2014-05-02 21:10:32  1993-04-07 00:00:00  None   
4   53640a0ee4b060d2e95bb7a9  2014-05-02 21:11:42  1968-08-09 00:00:00  None   
..                       ...                  ...                  ...   ...   
95  56c90241e4b089775946f2a8  2016-02-21 00:18:09  1986-04-15 00:00:00  None   
96  56ca2b6ee4b03934a7909b36  2016-02-21 21:26:06  1971-07-26 00:00:00  None   
97  56d42b85e4b0cad901c0bb9c  2016-02-29 11:29:09  1950-04-29 05:00:00    MA   
98  56d6be26e4b086f9a34e57d9  2016-03-02 10:19:18  1989-11-03 00:00:00    PR   
99  56db73bee4b0796561f73ffd  2016-03-06 00:03:10  1983-07-15 00:00:00  None   

   LANGUAGE  GENDER  
0      None    No