## Create database tables

In [1]:
import pandas as pd
import psycopg, os
from sqlalchemy import create_engine

In [2]:
print('Connecting to the PostgreSQL database...') 
conn = psycopg.connect(
    host="localhost",
    port='5432',
    dbname="5310 ETL",
    user="postgres",
    password="123")
cur = conn.cursor()

Connecting to the PostgreSQL database...


In [3]:
cur.execute("DROP TABLE IF EXISTS department CASCADE")
conn.commit()
cur.execute("DROP TABLE IF EXISTS position CASCADE")
conn.commit()
cur.execute("DROP TABLE IF EXISTS sales_country CASCADE")
conn.commit()
cur.execute("DROP TABLE IF EXISTS sales_region CASCADE")
conn.commit()
cur.execute("DROP TABLE IF EXISTS sales_state_province CASCADE")
conn.commit()
cur.execute("DROP TABLE IF EXISTS sales_district CASCADE")
conn.commit()
cur.execute("DROP TABLE IF EXISTS sales_city CASCADE")
conn.commit()
cur.execute("DROP TABLE IF EXISTS store CASCADE")
conn.commit()
cur.execute("DROP TABLE IF EXISTS employee CASCADE")
conn.commit()
cur.execute("DROP TABLE IF EXISTS employee_closure CASCADE")
conn.commit()
cur.execute("DROP TABLE IF EXISTS customer CASCADE")
conn.commit()
cur.execute("DROP TABLE IF EXISTS warehouse_class CASCADE")
conn.commit()
cur.execute("DROP TABLE IF EXISTS warehouse CASCADE")
conn.commit()
cur.execute("DROP TABLE IF EXISTS promotion CASCADE")
conn.commit()
cur.execute("DROP TABLE IF EXISTS product_class CASCADE")
conn.commit()
cur.execute("DROP TABLE IF EXISTS vendor CASCADE")
conn.commit()
cur.execute("DROP TABLE IF EXISTS product CASCADE")
conn.commit()
cur.execute("DROP TABLE IF EXISTS time_by_day CASCADE")
conn.commit()
cur.execute("DROP TABLE IF EXISTS inventory_delivery CASCADE")
conn.commit()
cur.execute("DROP TABLE IF EXISTS sales CASCADE")
conn.commit()
cur.execute("DROP TABLE IF EXISTS currency CASCADE")
conn.commit()
cur.execute("DROP TABLE IF EXISTS account CASCADE")
conn.commit()
cur.execute("DROP TABLE IF EXISTS category CASCADE")
conn.commit()
cur.execute("DROP TABLE IF EXISTS expense CASCADE")
conn.commit()
cur.execute("DROP TABLE IF EXISTS salary_record CASCADE")
conn.commit()


createCmd = """
            CREATE TABLE department (
                department_id SERIAL PRIMARY KEY,
                department_description VARCHAR(255) NOT NULL
            );

            CREATE TABLE position (
                position_id SERIAL PRIMARY KEY,
                position_title VARCHAR(255) NOT NULL,
                pay_type VARCHAR(50) NOT NULL,
                min_scale INT,
                max_scale INT,
                management_role VARCHAR(50) NOT NULL
            );

            CREATE TABLE sales_country (
                country_id SERIAL PRIMARY KEY,
                sales_country VARCHAR(255) NOT NULL
            );

            CREATE TABLE sales_region (
                region_id SERIAL PRIMARY KEY,
                sales_region VARCHAR(255) NOT NULL,
                country_id INT,
                FOREIGN KEY (country_id) REFERENCES sales_country(country_id)
            );

            CREATE TABLE sales_state_province (
                state_province_id SERIAL PRIMARY KEY,
                sales_state_province VARCHAR(255) NOT NULL,
                region_id INT NOT NULL,
                FOREIGN KEY (region_id) REFERENCES sales_region(region_id)
            );

            CREATE TABLE sales_district (
                sales_district_id SERIAL PRIMARY KEY,
                sales_district VARCHAR(255) NOT NULL,
                state_province_id INT,
                FOREIGN KEY (state_province_id) REFERENCES sales_state_province(state_province_id)
            );

            CREATE TABLE sales_city (
                city_id SERIAL PRIMARY KEY,
                sales_city VARCHAR(255) NOT NULL,
                sales_district_id INT,
                FOREIGN KEY (sales_district_id) REFERENCES sales_district(sales_district_id)
            );

            CREATE TABLE store (
                store_id SERIAL PRIMARY KEY,
                store_type VARCHAR(50) NOT NULL,
                store_name VARCHAR(255) NOT NULL,
                region_id INT NOT NULL,
                store_street_address VARCHAR(255) NOT NULL,
                store_postal_code INT,
                store_phone VARCHAR(20),
                store_fax VARCHAR(20),
                first_opened_date DATE,
                last_remodel_date DATE,
                store_sqft INT,
                grocery_sqft INT,
                frozen_sqft INT,
                meat_sqft INT,
                coffee_bar CHAR(1),
                video_store CHAR(1),
                salad_bar CHAR(1),
                prepared_food CHAR(1),
                florist CHAR(1),
                FOREIGN KEY (region_id) REFERENCES sales_region(region_id)
            );

            CREATE TABLE employee (
                employee_id SERIAL PRIMARY KEY,
                first_name VARCHAR(255) NOT NULL,
                last_name VARCHAR(255) NOT NULL,
                position_id INT NOT NULL,
                store_id INT NOT NULL,
                department_id INT NOT NULL,
                salary DOUBLE PRECISION,
                hire_date DATE,
                end_date DATE,
                birth_date DATE,
                education_level VARCHAR(100),
                marital_status CHAR(1),
                gender CHAR(1),
                FOREIGN KEY (position_id) REFERENCES position(position_id),
                FOREIGN KEY (store_id) REFERENCES store(store_id),
                FOREIGN KEY (department_id) REFERENCES department(department_id)
            );

            CREATE TABLE employee_closure (
                employee_id INT,
                supervisor_id INT,
                distance INT,
                PRIMARY KEY (employee_id, supervisor_id),
                FOREIGN KEY (employee_id) REFERENCES employee(employee_id),
                FOREIGN KEY (supervisor_id) REFERENCES employee(employee_id)
            );

            CREATE TABLE customer (
                customer_id SERIAL PRIMARY KEY,
                account_num BIGINT NOT NULL,
                fname VARCHAR(255),
                lname VARCHAR(255),
                mi VARCHAR(50),
                address1 VARCHAR(255),
                postal_code INT,
                city VARCHAR(255),
                country_id INT,
                phone1 VARCHAR(50),
                phone2 VARCHAR(50),
                birthdate DATE,
                marital_status CHAR(1),
                yearly_income VARCHAR(255),
                gender CHAR(1),
                total_children INT,
                num_children_at_home INT,
                education VARCHAR(255),
                date_accnt_opened DATE,
                member_card VARCHAR(50),
                occupation VARCHAR(255),
                houseowner CHAR(1),
                num_cars_owned INT,
                FOREIGN KEY (country_id) REFERENCES sales_country(country_id)
            );

            CREATE TABLE warehouse_class (
                warehouse_class_id SERIAL PRIMARY KEY,
                Description VARCHAR(255)
            );

            CREATE TABLE warehouse (
                warehouse_id SERIAL PRIMARY KEY,
                warehouse_class_id INT NOT NULL,
                stores_id INT NOT NULL,
                warehouse_name VARCHAR(255),
                wa_address1 VARCHAR(255),
                city_id INT,
                warehouse_state_province VARCHAR(255),
                warehouse_country VARCHAR(255),
                warehouse_postal_code INT,
                warehouse_phone VARCHAR(50),
                warehouse_fax VARCHAR(50),
                FOREIGN KEY (warehouse_class_id) REFERENCES warehouse_class(warehouse_class_id),
                FOREIGN KEY (stores_id) REFERENCES store(store_id),
                FOREIGN KEY (city_id) REFERENCES sales_city(city_id)
            );

            CREATE TABLE promotion (
                promotion_id SERIAL PRIMARY KEY,
                promotion_name VARCHAR(255),
                media_type VARCHAR(255),
                cost DOUBLE PRECISION,
                start_date DATE,
                end_date DATE
            );

            CREATE TABLE product_class (
                product_class_id SERIAL PRIMARY KEY,
                product_subcategory VARCHAR(255)
            );

            CREATE TABLE vendor (
                vendor_id SERIAL PRIMARY KEY,
                vendor_name VARCHAR(255)
            );

            CREATE TABLE product (
                product_id SERIAL PRIMARY KEY,
                product_class_id INT NOT NULL,
                product_name VARCHAR(255),
                vendor_id INT NOT NULL,
                "SKU" BIGINT,
                "SRP" DECIMAL(4,2),
                gross_weight DECIMAL(4,2),
                net_weight DECIMAL(4,2),
                recyclable_package CHAR(1),
                low_fat CHAR(1),
                units_per_case INT,
                cases_per_pallet INT,
                shelf_width DECIMAL(4,2),
                shelf_height DECIMAL(4,2),
                shelf_depth DECIMAL(4,2),
                FOREIGN KEY (product_class_id) REFERENCES product_class(product_class_id),
                FOREIGN KEY (vendor_id) REFERENCES vendor(vendor_id)
            );

            CREATE TABLE time_by_day (
                time_id SERIAL PRIMARY KEY,
                the_date DATE NOT NULL,
                the_day VARCHAR(255),
                day_of_month INT,
                week_of_year INT,
                month_of_year INT,
                quarter CHAR(2)
            );

            CREATE TABLE inventory_delivery (
                product_id INT,
                time_id INT,
                warehouse_id INT,
                store_id INT,
                units_ordered INT,
                units_shipped INT,
                warehouse_sales DOUBLE PRECISION,
                warehouse_cost DOUBLE PRECISION,
                supply_time INT,
                store_invoice DOUBLE PRECISION,
                PRIMARY KEY (product_id, time_id, warehouse_id, store_id),
                FOREIGN KEY (product_id) REFERENCES product(product_id),
                FOREIGN KEY (time_id) REFERENCES time_by_day(time_id),
                FOREIGN KEY (warehouse_id) REFERENCES warehouse(warehouse_id),
                FOREIGN KEY (store_id) REFERENCES store(store_id)
            );

            CREATE TABLE sales (
                product_id INT,
                time_id INT,
                customer_id INT,
                promotion_id INT,
                store_id INT,
                store_sales DECIMAL(5,2),
                store_cost DECIMAL(7,4),
                unit_sales INT,
                PRIMARY KEY (product_id, time_id, customer_id, promotion_id, store_id,store_cost),
                FOREIGN KEY (product_id) REFERENCES product(product_id),
                FOREIGN KEY (time_id) REFERENCES time_by_day(time_id),
                FOREIGN KEY (customer_id) REFERENCES customer(customer_id),
                FOREIGN KEY (promotion_id) REFERENCES promotion(promotion_id),
                FOREIGN KEY (store_id) REFERENCES store(store_id)
            );

            CREATE TABLE currency (
                currency_id SERIAL PRIMARY KEY,
                date DATE,
                currency VARCHAR(255),
                conversion_ratio DECIMAL(4,2)
            );

            CREATE TABLE account (
                account_id SERIAL PRIMARY KEY,
                account_parent INT,
                account_description VARCHAR(255),
                account_type VARCHAR(255),
                account_rollup CHAR(1)
            );

            CREATE TABLE category (
                category_id VARCHAR(255) PRIMARY KEY,
                category_parent VARCHAR(255),
                category_description VARCHAR(255),
                category_rollup CHAR(1)
            );

            CREATE TABLE expense (
                store_id INT,
                account_id INT,
                exp_time_id INT,
                category_id VARCHAR(255),
                currency_id INT,
                amount DOUBLE PRECISION,
                PRIMARY KEY (store_id, account_id, exp_time_id, currency_id),
                FOREIGN KEY (store_id) REFERENCES store(store_id),
                FOREIGN KEY (account_id) REFERENCES account(account_id),
                FOREIGN KEY (exp_time_id) REFERENCES time_by_day(time_id),
                FOREIGN KEY (category_id) REFERENCES category(category_id),
                FOREIGN KEY (currency_id) REFERENCES currency(currency_id)
            );

            CREATE TABLE salary_record (
                pay_date DATE,
                employee_id INT,
                currency_id INT NOT NULL,
                salary_paid DOUBLE PRECISION,
                overtime_paid DOUBLE PRECISION,
                vacation_accrued INT,
                vacation_used INT,
                PRIMARY KEY (pay_date, employee_id),
                FOREIGN KEY (employee_id) REFERENCES employee(employee_id),
                FOREIGN KEY (currency_id) REFERENCES currency(currency_id)
            );
"""

cur.execute(createCmd)
conn.commit()


### department table

In [5]:
engine = create_engine('postgresql+psycopg2://postgres:123@localhost:5432/5310 ETL')

department_df = pd.read_csv('/Users/wangyuan/Desktop/Group2 Dataset/department.csv')
department_df.head()

Unnamed: 0,department_id,department_description
0,1,HQ General Management
1,2,HQ Information Systems
2,3,HQ Marketing
3,4,HQ Human Resources
4,5,HQ Finance and Accounting


In [6]:
department_df.to_sql('department', engine, if_exists='append', index=False)

12

### position table

In [7]:
position_df = pd.read_csv('/Users/wangyuan/Desktop/Group2 Dataset/position.csv')
position_df.head()

Unnamed: 0,position_id,position_title,pay_type,min_scale,max_scale,management_role
0,1,President,Monthly,25000.0,85000.0,Senior Management
1,6,HQ Information Systems,Monthly,6700.0,10000.0,Middle Management
2,7,HQ Marketing,Monthly,5000.0,8500.0,Middle Management
3,8,HQ Human Resources,Monthly,5000.0,6700.0,Middle Management
4,9,HQ Finance and Accounting,Monthly,5000.0,6700.0,Middle Management


In [8]:
position_df.to_sql('position', engine, if_exists='append', index=False)

18

### sales_country table

In [9]:
region_df = pd.read_csv('/Users/wangyuan/Desktop/Group2 Dataset/region.csv')

countries = region_df[['sales_country']].drop_duplicates().reset_index(drop=True)
countries.insert(0, 'country_id', range(1, 1 + len(countries)))
countries.head()

Unnamed: 0,country_id,sales_country
0,1,USA
1,2,Mexico
2,3,Canada


In [10]:
countries.to_sql('sales_country', engine, if_exists='append', index=False)

3

### sales_region table

In [11]:
regions = region_df[['sales_region', 'sales_country']].drop_duplicates().reset_index(drop=True)
regions = regions.merge(countries, on='sales_country', how='left')
regions = regions[['sales_region', 'country_id']].reset_index(drop=True)
regions.insert(0, 'region_id', range(1, 1 + len(regions)))
regions.head()

Unnamed: 0,region_id,sales_region,country_id
0,1,Central West,1
1,2,Mexico Central,2
2,3,South West,1
3,4,Mexico West,2
4,5,Canada West,3


In [12]:
regions.to_sql('sales_region', engine, if_exists='append', index=False)

7

### sales_state_province table

In [13]:
state_provinces = region_df[['sales_state_province', 'sales_region']].drop_duplicates().reset_index(drop=True)
state_provinces = state_provinces.merge(regions, on='sales_region', how='left')
state_provinces = state_provinces[['sales_state_province', 'region_id']].reset_index(drop=True)
state_provinces.insert(0, 'state_province_id', range(1, 1 + len(state_provinces)))
state_provinces.head()

Unnamed: 0,state_province_id,sales_state_province,region_id
0,1,CA,1
1,2,DF,2
2,3,CA,3
3,4,Jalisco,4
4,5,BC,5


In [14]:
state_provinces.to_sql('sales_state_province', engine, if_exists='append', index=False)

13

### sales_district table

In [15]:
districts = region_df[['sales_district', 'sales_state_province']].drop_duplicates().reset_index(drop=True)
districts = districts.merge(state_provinces, on='sales_state_province', how='left')
districts = districts[['sales_district', 'state_province_id']].reset_index(drop=True)
districts.insert(0, 'sales_district_id', range(1, 1 + len(districts)))
districts.head()

Unnamed: 0,sales_district_id,sales_district,state_province_id
0,1,San Francisco,1
1,2,San Francisco,3
2,3,Mexico City,2
3,4,Los Angeles,1
4,5,Los Angeles,3


In [16]:
districts.to_sql('sales_district', engine, if_exists='append', index=False)

26

### sales_city table

In [17]:
cities = region_df[['sales_city', 'sales_district']].drop_duplicates().reset_index(drop=True)
cities = cities.merge(districts, on='sales_district', how='left')
cities = cities[['sales_city', 'sales_district_id']].reset_index(drop=True)
cities.insert(0, 'city_id', range(1, 1 + len(cities)))
cities.head()

Unnamed: 0,city_id,sales_city,sales_district_id
0,1,San Francisco,1
1,2,San Francisco,2
2,3,Mexico City,3
3,4,Los Angeles,4
4,5,Los Angeles,5


In [18]:
cities.to_sql('sales_city', engine, if_exists='append', index=False)

156

### store table

In [19]:
store_df = pd.read_csv('/Users/wangyuan/Desktop/Group2 Dataset/store.csv')
columns_needed = ['store_type', 'store_name', 'store_street_address', 'store_postal_code', 'store_phone', 'store_fax', 'first_opened_date', 'last_remodel_date', 'store_sqft', 'grocery_sqft', 'frozen_sqft', 'meat_sqft', 'coffee_bar', 'video_store', 'salad_bar', 'prepared_food', 'florist', 'store_state']
store_df = store_df[columns_needed]
state_province_query = "SELECT ssp.sales_state_province, sr.region_id FROM sales_state_province ssp JOIN sales_region sr ON ssp.region_id = sr.region_id"
state_province_df = pd.read_sql(state_province_query, engine)
store_df = store_df.merge(state_province_df, left_on='store_state', right_on='sales_state_province', how='left')
store_df.drop(['store_state', 'sales_state_province'], axis=1, inplace=True)
columns_before = ['store_type', 'store_name', 'region_id']
columns_after = [col for col in store_df.columns if col not in columns_before]
store_final_df = store_df[columns_before + columns_after]
store_final_df.head()

Unnamed: 0,store_type,store_name,region_id,store_street_address,store_postal_code,store_phone,store_fax,first_opened_date,last_remodel_date,store_sqft,grocery_sqft,frozen_sqft,meat_sqft,coffee_bar,video_store,salad_bar,prepared_food,florist
0,Supermarket,Store 1,4,2853 Bailey Rd,55555,262-555-5124,262-555-5121,1/9/1982,12/5/1990,23593.0,17475.0,3671.0,2447.0,f,f,f,f,f
1,Small Grocery,Store 2,6,5203 Catanzaro Way,55555,605-555-8203,605-555-8201,4/2/1970,6/4/1973,28206.0,22271.0,3561.0,2374.0,t,f,f,f,f
2,Supermarket,Store 3,6,1501 Ramsey Circle,55555,509-555-1596,509-555-1591,6/14/1959,11/19/1967,39696.0,24390.0,9184.0,6122.0,f,f,t,t,f
3,Gourmet Supermarket,Store 4,2,433 St George Dr,55555,304-555-1474,304-555-1471,9/27/1994,12/1/1995,23759.0,16844.0,4149.0,2766.0,t,f,t,t,t
4,Small Grocery,Store 5,4,1250 Coggins Drive,55555,801-555-4324,801-555-4321,9/18/1978,6/29/1991,24597.0,15012.0,5751.0,3834.0,t,f,f,f,f


In [20]:
store_final_df.to_sql('store', engine, if_exists='append', index=False)

28

### employee table

In [21]:
employee_df = pd.read_csv('/Users/wangyuan/Desktop/Group2 Dataset/employee.csv')
employee_df.head()

Unnamed: 0,employee_id,first_name,last_name,position_id,store_id,department_id,salary,hire_date,end_date,birth_date,education_level,marital_status,gender
0,1,Sheri,Nowmer,1,1,1,80000,1994/12/1,,1961/8/26,Graduate Degree,S,F
1,2,Derrick,Whelply,2,1,1,40000,1994/12/1,,1915/7/3,Graduate Degree,M,M
2,4,Michael,Spence,2,1,1,40000,1998/1/1,,1969/6/20,Graduate Degree,S,M
3,5,Maya,Gutierrez,2,2,1,35000,1998/1/1,,1951/5/10,Bachelors Degree,M,F
4,6,Roberta,Damstra,3,2,2,25000,1994/12/1,,1942/10/8,Bachelors Degree,M,F


In [22]:
employee_df.to_sql('employee', engine, if_exists='append', index=False)

155

### employee_closure table

In [23]:
employee_closure_df = pd.read_csv('/Users/wangyuan/Desktop/Group2 Dataset/employee_closure.csv')
employee_closure_df.head()

Unnamed: 0,employee_id,supervisor_id,distance
0,1,1,0
1,2,2,0
2,4,4,0
3,5,5,0
4,6,6,0


In [24]:
employee_closure_df.to_sql('employee_closure', engine, if_exists='append', index=False)

179

### customer table

In [25]:
customer_df = pd.read_csv('/Users/wangyuan/Desktop/Group2 Dataset/customer.csv')

country_mapping = pd.read_sql_query('SELECT country_id, sales_country FROM sales_country', engine)
customer_df = customer_df.merge(country_mapping, left_on='country', right_on='sales_country', how='left')
customer_df.drop(columns=['country', 'sales_country'], inplace=True)
customer_df.head()

Unnamed: 0,customer_id,account_num,fname,lname,mi,address1,postal_code,city,phone1,phone2,...,gender,total_children,num_children_at_home,education,date_accnt_opened,member_card,occupation,houseowner,num_cars_owned,country_id
0,1,87462024688,Sheri,Nowmer,A.,2433 Bailey Road,15057,Tlaxiaco,271-555-9715,119-555-1969,...,F,4,2,Partial High School,1991/9/10,Bronze,Skilled Manual,Y,4,2
1,2,87470586299,Derrick,Whelply,I.,2219 Dewing Avenue,17172,Sooke,211-555-7669,807-555-9033,...,M,1,0,Partial High School,1993/3/11,Bronze,Professional,N,3,3
2,3,87475757600,Jeanne,Derry,,7640 First Ave.,73980,Issaquah,656-555-2272,221-555-2493,...,F,1,1,Bachelors Degree,1991/6/11,Bronze,Professional,Y,2,1
3,4,87500482201,Michael,Spence,J.,337 Tosca Way,74674,Burnaby,929-555-7279,272-555-2844,...,M,4,4,Partial High School,1994/5/21,Normal,Skilled Manual,N,2,3
4,5,87514054179,Maya,Gutierrez,,8668 Via Neruda,57355,Novato,387-555-7172,260-555-6936,...,F,3,0,Partial College,1992/8/21,Silver,Manual,N,3,1


In [26]:
customer_df.to_sql('customer', engine, if_exists='append', index=False)

281

### warehouse_class table

In [27]:
warehouse_class_df = pd.read_csv('/Users/wangyuan/Desktop/Group2 Dataset/warehouse_class.csv')
warehouse_class_df.head()

Unnamed: 0,warehouse_class_id,description
0,1,Small Independent
1,2,Medium Independent
2,3,Large Independent
3,4,Small Owned
4,5,Medium Owned


In [28]:
warehouse_class_df.to_sql('warehouse_class', engine, if_exists='append', index=False)

6

### warehouse table

In [29]:
warehouse_df = pd.read_csv('/Users/wangyuan/Desktop/Group2 Dataset/warehouse.csv')
city_mapping = pd.read_sql_query('SELECT city_id, sales_city FROM sales_city', engine)
warehouse_df = warehouse_df.merge(city_mapping, left_on='warehouse_city', right_on='sales_city', how='left')
warehouse_df.drop(columns=['warehouse_city', 'sales_city'], inplace=True)
warehouse_df.head()

Unnamed: 0,warehouse_class_id,stores_id,warehouse_name,wa_address1,warehouse_state_province,warehouse_country,warehouse_postal_code,warehouse_phone,warehouse_fax,city_id
0,1,1,Salka Warehousing,9716 Alovera Road,Guerrero,Mexico,55555,821-555-1658,594-555-2908,41.0
1,1,1,Salka Warehousing,9716 Alovera Road,Guerrero,Mexico,55555,821-555-1658,594-555-2908,42.0
2,2,2,Foster Products,958 Hilltop Dr,WA,USA,55555,315-555-8947,119-555-3826,125.0
3,3,3,"Destination, Inc.",4162 Euclid Ave,WA,USA,55555,517-555-3022,136-555-4501,123.0
4,4,4,Anderson Warehousing,5657 Georgia Dr,Zacatecas,Mexico,55555,681-555-1655,946-555-4848,40.0


In [30]:
warehouse_df.to_sql('warehouse', engine, if_exists='append', index=False)

29

### promotion table

In [31]:
promotion_df = pd.read_csv('/Users/wangyuan/Desktop/Group2 Dataset/promotion.csv')
promotion_df.head()

Unnamed: 0,promotion_id,promotion_name,media_type,cost,start_date,end_date
0,0,0,No Promotion,,,
1,1,High Roller Savings,Product Attachment,14435.0,1996/1/3,1996/1/6
2,2,Green Light Special,Product Attachment,8907.0,1996/1/18,1996/1/20
3,3,Wallet Savers,Radio,12512.0,1996/2/2,1996/2/5
4,4,Weekend Markdown,In-Store Coupon,11256.0,1996/2/13,1996/2/15


In [32]:
promotion_df.to_sql('promotion', engine, if_exists='append', index=False)

864

### product_class table

In [33]:
product_class_df = pd.read_csv('/Users/wangyuan/Desktop/Group2 Dataset/product_class.csv')
product_class_df.head()

Unnamed: 0,product_class_id,product_subcategory
0,1,Nuts
1,2,Shellfish
2,3,Canned Fruit
3,4,Spices
4,5,Pasta


In [34]:
product_class_df.to_sql('product_class', engine, if_exists='append', index=False)

110

### vendor table

In [35]:
df = pd.read_csv('/Users/wangyuan/Desktop/Group2 Dataset/product.csv')

vendors = df[['brand_name']].drop_duplicates().reset_index(drop=True)
vendors.rename(columns={'brand_name': 'vendor_name'}, inplace=True)
vendors.head()

Unnamed: 0,vendor_name
0,Washington
1,Jeffers
2,Blue Label
3,Green Ribbon
4,King


In [36]:
vendors.to_sql('vendor', engine, if_exists='append', index=False)

111

### product table

In [37]:
vendor_mapping = pd.read_sql_query('SELECT vendor_id, vendor_name FROM vendor', engine)
df = df.merge(vendor_mapping, left_on='brand_name', right_on='vendor_name', how='left')
df.drop(columns=['brand_name', 'vendor_name'], inplace=True)
df.head()

Unnamed: 0,product_id,product_class_id,product_name,SKU,SRP,gross_weight,net_weight,recyclable_package,low_fat,units_per_case,cases_per_pallet,shelf_width,shelf_height,shelf_depth,vendor_id
0,1,30,Washington Berry Juice,90748583674,2.85,8.39,6.39,f,f,30,14,16.9,12.6,7.4,1
1,2,52,Washington Mango Drink,96516502499,0.74,7.42,4.42,f,t,18,8,13.4,3.71,22.6,1
2,3,52,Washington Strawberry Drink,58427771925,0.83,13.1,11.1,t,t,17,13,14.4,11.0,7.77,1
3,4,19,Washington Cream Soda,64412155747,3.64,10.6,9.6,t,f,26,10,22.9,18.9,7.93,1
4,5,19,Washington Diet Soda,85561191439,2.19,6.66,4.65,t,f,7,10,20.7,21.9,19.2,1


In [38]:
df.to_sql('product', engine, if_exists='append', index=False)

560

### time_by_day table

In [39]:
time_by_day_df = pd.read_csv('/Users/wangyuan/Desktop/Group2 Dataset/time_by_day.csv')
time_by_day_df['the_date'] = pd.to_datetime(time_by_day_df['the_date'])
time_by_day_df.head()

Unnamed: 0,time_id,the_date,the_day,day_of_month,week_of_year,month_of_year,quarter
0,738,1998-01-07,Wednesday,7,4,1,Q1
1,739,1998-01-08,Thursday,8,4,1,Q1
2,740,1998-01-09,Friday,9,4,1,Q1
3,741,1998-01-10,Saturday,10,4,1,Q1
4,742,1998-01-11,Sunday,11,5,1,Q1


In [40]:
time_by_day_df.to_sql('time_by_day', engine, if_exists='append', index=False)

365

### inventory_delivery table

In [41]:
inventory_df = pd.read_csv('/Users/wangyuan/Desktop/Group2 Dataset/inventory_fact_1998.csv')
inventory_df.head()

Unnamed: 0,product_id,time_id,warehouse_id,store_id,units_ordered,units_shipped,warehouse_sales,warehouse_cost,supply_time,store_invoice
0,308,765,1,1,53,7,7.4774,2.6171,4,3.0358
1,325,765,1,1,84,62,50.5548,28.8162,1,32.8505
2,275,765,1,1,24,24,29.8224,15.5076,2,17.2134
3,425,765,1,1,65,65,46.592,27.0234,3,30.2662
4,1534,765,1,1,67,67,57.7406,33.4895,4,38.5129


In [42]:
inventory_df.to_sql('inventory_delivery', engine, if_exists='append', index=False)

282

### sales table

In [43]:
sales_df = pd.read_csv('/Users/wangyuan/Desktop/Group2 Dataset/sales_fact_1998.csv')
sales_df.head()

Unnamed: 0,product_id,time_id,customer_id,promotion_id,store_id,store_sales,store_cost,unit_sales
0,173,748,2094,54,1,4.29,1.8447,3
1,1119,748,2094,54,1,9.51,3.5187,3
2,1242,748,2094,54,1,7.92,2.8512,4
3,460,748,2094,54,1,6.44,2.7048,4
4,104,748,2094,54,1,11.67,3.9678,3


In [44]:
sales_df.to_sql('sales', engine, if_exists='append', index=False)

553

### currency table

In [45]:
currency_df = pd.read_csv('/Users/wangyuan/Desktop/Group2 Dataset/currency.csv')
currency_df.head()

Unnamed: 0,date,currency,conversion_ratio
0,1/1/1998,USD,1.0
1,2/1/1998,USD,1.0
2,3/1/1998,USD,1.0
3,4/1/1998,USD,1.0
4,5/1/1998,USD,1.0


In [46]:
currency_df.to_sql('currency', engine, if_exists='append', index=False)

36

### account table

In [47]:
account_df = pd.read_csv('/Users/wangyuan/Desktop/Group2 Dataset/account.csv')
account_df.head()

Unnamed: 0,account_id,account_parent,account_description,account_type,account_rollup
0,1000,,Assets,Asset,~
1,2000,,Liabilities,Liability,~
2,3000,5000.0,Net Sales,Income,+
3,3100,3000.0,Gross Sales,Income,+
4,3200,3000.0,Cost of Goods Sold,Income,-


In [48]:
account_df.to_sql('account', engine, if_exists='append', index=False)

11

### category table

In [49]:
category_df = pd.read_csv('/Users/wangyuan/Desktop/Group2 Dataset/category.csv')
category_df.head()

Unnamed: 0,category_id,category_parent,category_description,category_rollup
0,ACTUAL,,Current Year's Actuals,
1,ADJUSTMENT,,Adjustment for Budget input,
2,BUDGET,,Current Year's Budget,
3,FORECAST,,Forecast,


In [50]:
category_df.to_sql('category', engine, if_exists='append', index=False)

4

### expense table

In [51]:
expense_df = pd.read_csv('/Users/wangyuan/Desktop/Group2 Dataset/expense_fact.csv')
currency_df = pd.read_sql('currency', con=engine)
time_by_day_df = pd.read_sql('time_by_day', con=engine)
time_by_day_df['the_date'] = pd.to_datetime(time_by_day_df['the_date'])
currency_df['date'] = pd.to_datetime(currency_df['date'])
currency_df['year_month'] = currency_df['date'].dt.to_period('M')
currency_map = currency_df.set_index('year_month')['currency_id'].to_dict()
time_by_day_df['year_month'] = time_by_day_df['the_date'].dt.to_period('M')
time_by_day_df['currency_id'] = time_by_day_df['year_month'].map(currency_map)
time_currency_map = time_by_day_df.set_index('time_id')['currency_id'].to_dict()
expense_df['currency_id'] = expense_df['exp_time_id'].apply(lambda x: time_currency_map.get(x, None))
expense_df.head()

Unnamed: 0,store_id,account_id,exp_time_id,category_id,currency_id,amount
0,1,4100,732,ACTUAL,25,743.62
1,1,4100,763,ACTUAL,26,743.62
2,1,4100,791,ACTUAL,27,743.62
3,1,4100,822,ACTUAL,28,743.62
4,1,4100,852,ACTUAL,29,743.62


In [52]:
expense_df.to_sql('expense', con=engine, if_exists='append', index=False)

152

### salary_record table

In [53]:
salary_df = pd.read_csv('/Users/wangyuan/Desktop/Group2 Dataset/salary.csv')
salary_df.head()

Unnamed: 0,pay_date,employee_id,currency_id,salary_paid,overtime_paid,vacation_accrued,vacation_used
0,1998/1/1,1,1,80.0,0.0,1,0
1,1998/1/1,2,1,40.0,0.0,1,0
2,1998/1/1,4,1,40.0,0.0,1,0
3,1998/1/1,5,1,35.0,0.0,1,0
4,1998/1/1,6,1,25.0,0.0,1,0


In [54]:
salary_df.to_sql('salary_record', engine, if_exists='append', index=False)

860