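# Format Data with SQL

Loads the Willy Wonka orders CSV into a local Postgres database, derives month/week/day-of-week columns from `order_date`, and splits the comma-separated SKU lists into one row per line item.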

### Loading initial data

In [5]:
import pandas as pd
from sqlalchemy import create_engine

# Raw order export; the path is relative, so it is resolved against the
# notebook's working directory.
orders_csv_path = 'solutions_simulation_willy_wonka.csv'
df = pd.read_csv(orders_csv_path)

# Local Postgres database `willy_wonka` on port 5432; the URL carries no
# username or password.
postgres_url = 'postgresql://@localhost:5432/willy_wonka'
engine = create_engine(postgres_url)

def format_data(df):
    """Return a new frame with lower-cased column names and a parsed ``order_date``.

    The input frame is left untouched, so the cell that calls this function
    can be re-run safely and the caller's raw frame keeps its original headers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings and which contains a column
        named ``order_date`` once the labels are lower-cased.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with lower-cased column labels and ``order_date``
        converted with ``pd.to_datetime``.

    Raises
    ------
    KeyError
        If no column is called ``order_date`` after lower-casing.
    """
    # rename() returns a new frame; assigning to df.columns would instead
    # rename the caller's DataFrame in place.
    lowered = df.rename(columns=str.lower)
    return lowered.assign(order_date=pd.to_datetime(lowered['order_date']))

def insert_data(df, engine, table_name='raw_orders'):
    """Write ``df`` to a SQL table, replacing any existing table of that name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist. ``to_sql`` is called with its default index
        handling, so the frame's index is written as a column as well.
    engine : SQLAlchemy engine/connection (or sqlite3 connection)
        Target database handle, passed straight to ``DataFrame.to_sql``.
    table_name : str, default 'raw_orders'
        Destination table. The default keeps the original two-argument call
        working unchanged.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, to allow chaining.
    """
    # if_exists='replace' drops and recreates the table, so re-running the
    # cell overwrites the previous contents instead of appending to them.
    df.to_sql(table_name, engine, if_exists='replace')
    return df


In [46]:
# Normalise the CSV frame (lower-case headers, parsed order_date), then
# write it to the database through insert_data.
formatted_df = format_data(df)
# insert_data returns the frame it was given, so inserted_df refers to the
# same object as formatted_df.
inserted_df = insert_data(formatted_df, engine)


### Formatting Data

In [72]:
# Read every row of `raw_orders` back from the database.
# NOTE: this rebinds `df`, which until now held the frame loaded from the CSV;
# re-running the earlier format_data(df) cell after this one would operate on
# this frame instead.
df = pd.read_sql("""select * from raw_orders""", engine)
# Optional inspection, left disabled:
# df.columns

In [78]:
# Re-select the order columns from `raw_orders` and add three calendar fields
# extracted from order_date: month, week and dow (day of week).
query = """select order_id, user_id, list_sku,
       list_sku_category, list_sku_flavor, list_lineitem_id,
       order_date, order_value, order_item_count, order_single_item,
       order_channel, order_coupon_code, order_referrer,
       order_ad_shown, order_store_city, order_store_id,
       order_day_part, order_is_freeshipping, user_loyalty_program,
       user_age, user_gender,
       EXTRACT(month FROM order_date) as month, EXTRACT(week FROM order_date) as week, 
EXTRACT(DOW FROM order_date) as dow from raw_orders"""

df_with_date = pd.read_sql(sql=query, con=engine)
# Persist the enriched frame as `orders`, replacing any earlier version.
# Kept as the cell's final expression so its return value is still displayed.
df_with_date.to_sql(name='orders', con=engine, if_exists='replace')

27

In [80]:
# Display the column labels of the frame that was written to `orders`.
df_with_date.columns

Index(['order_id', 'user_id', 'list_sku', 'list_sku_category',
       'list_sku_flavor', 'list_lineitem_id', 'order_date', 'order_value',
       'order_item_count', 'order_single_item', 'order_channel',
       'order_coupon_code', 'order_referrer', 'order_ad_shown',
       'order_store_city', 'order_store_id', 'order_day_part',
       'order_is_freeshipping', 'user_loyalty_program', 'user_age',
       'user_gender', 'month', 'week', 'dow'],
      dtype='object')

In [81]:


# Turn each order row into one row per line item. The list_* columns hold
# comma-separated values; `first_sku` takes element 1 of each list and
# `second_sku` takes element 2, with every other column copied unchanged.
# The `where list_sku != ''` filter belongs to the second SELECT only, and
# `union` (not `union all`) removes rows that are exact duplicates.
# NOTE(review): only positions 1 and 2 are extracted, so an order with three
# or more entries would lose the rest - confirm the data never exceeds two.
query = """with first_sku as (
    select order_id, user_id, split_part(list_sku, ',', 1) as list_sku,
	split_part(list_sku_category, ',', 1) as list_sku_category,
     split_part(list_sku_flavor, ',', 1) as list_sku_flavor, 
     split_part(list_lineitem_id, ',', 1) as list_lineitem_id,
     order_date, month, week, dow,
       order_value, order_item_count, order_single_item, order_channel,
       order_coupon_code, order_referrer, order_ad_shown,
       order_store_city, order_store_id, order_day_part,
       order_is_freeshipping, 
       user_loyalty_program, user_age, user_gender
     from orders
     ), second_sku as (
    select order_id, user_id, split_part(list_sku, ',', 2) as list_sku,
	split_part(list_sku_category, ',', 2) as list_sku_category,
     split_part(list_sku_flavor, ',', 2) as list_sku_flavor, 
     split_part(list_lineitem_id, ',', 2) as list_lineitem_id,
     order_date, month, week, dow,
       order_value, order_item_count, order_single_item, order_channel,
       order_coupon_code, order_referrer, order_ad_shown,
       order_store_city, order_store_id, order_day_part,
       order_is_freeshipping, 
       user_loyalty_program, user_age, user_gender
     from orders 
     )
     select * from first_sku union select * from second_sku where list_sku != '' order by order_id asc 
     """

# NOTE: `df` is rebound here to the line-item result.
df = pd.read_sql(sql=query, con=engine)
# Store the result as `raw_lineitems`, replacing any earlier version; kept as
# the last expression so the cell still displays to_sql's return value.
df.to_sql(name='raw_lineitems', con=engine, if_exists='replace')

27

In [71]:
# Preview two rows of the `line_items` table.
# NOTE(review): the cells visible in this notebook write `raw_orders`,
# `orders` and `raw_lineitems`; none of them creates `line_items` - confirm
# where that table comes from, otherwise this fails on a fresh database.
pd.read_sql('select * from line_items limit 2', engine)

Unnamed: 0,index,list_lineitem_id,order_id,user_id,order_date,list_sku,list_sku_category,order_item_count,order_single_item,order_channel,...,order_store_id,order_day_part,order_is_freeshipping,user_loyalty_program,user_age,user_gender,list_sku_flavor,month,week,dow
0,0,ITEM-2009089674,TXN-52889719,USER-16961214,2021-05-04,SKU-3002,Energy Bar,ITEM-2009089674,False,digital_direct,...,Dallas-3,evening,1,bronze,49,F,Peanut Butter,5.0,18.0,2.0
1,1,ITEM-2008965888,TXN-52889719,USER-16961214,2021-05-04,SKU-3014,Energy Shot,ITEM-2008965888,False,digital_direct,...,Dallas-3,evening,1,bronze,49,F,Chocolate Brownie,5.0,18.0,2.0


In [None]:
# Preview two rows of the `line_items` table (same statement as the cell
# directly above; this copy has not been executed).
# NOTE(review): no visible cell creates `line_items` - confirm its origin.
pd.read_sql('select * from line_items limit 2', engine)