d
**Definition of data paths in DBFS**
<br>`northwind_path` path of Northwind Data Warehouse project
- `raw_path` path of csv files imported from Postgresql
- `trusted_path` path of trusted layer files
- `refined_path` path of refined layer files
- `analytics_path` path of refined layer files, including aggregations

In [0]:
# User name that the project's DBFS root path and database name are derived from.
username = "tfukuda"

In [0]:
# Root folder of the Northwind Data Warehouse project.
# Every path below ends with "/" so that names can be appended directly.
northwind_path = f"/{username}/northwind_dw/"

# One sub-folder per data layer, directly under the project root.
raw_path = f"{northwind_path}raw/"
trusted_path = f"{northwind_path}trusted/"
refined_path = f"{northwind_path}refined/"

# Nested inside the refined layer folder.
analytics_path = f"{refined_path}user_analytics/"

**Database configuration**

In [0]:
# Create the per-user project database; IF NOT EXISTS makes re-running this cell harmless.
spark.sql(f"CREATE DATABASE IF NOT EXISTS project_{username}")
# Switch the Spark session to that database.
spark.sql(f"USE project_{username}")

Out[14]: DataFrame[]

**Definition of dictionary with table columns in trusted layer: `northwind_columns`**
  - key: table name in the `northwind` database
  - value: tuple of tuples containing (`column name`, `data type`, `alias`, `comment`)

In [0]:
# Trusted-layer column catalogue, keyed by source table name.
# Each value is a tuple of 4-tuples: (column name, data type, alias, comment).
# Comment strings carry their own double quotes; '""' is an empty quoted comment.
# An alias ending in "2" marks a foreign key renamed to avoid a duplicated
# column name after denormalization.
northwind_columns = {
    'categories': (
        ('category_id', 'short', 'category_id', '""'),
        ('category_name', 'string', 'category_name', '""'),
        ('description', 'string', 'description', '""'),
        ('picture', 'binary', 'picture', '""'),
    ),

    # Both columns are foreign keys, so both are renamed.
    'customer_customer_demo': (
        ('customer_id', 'string', 'customer_id2', '""'),
        ('customer_type_id', 'string', 'customer_type_id2', '""'),
    ),

    'customer_demographics': (
        ('customer_type_id', 'string', 'customer_type_id', '""'),
        ('customer_desc', 'string', 'customer_desc', '""'),
    ),

    'customers': (
        ('customer_id', 'string', 'customer_id', '""'),
        ('company_name', 'string', 'company_name', '""'),
        ('contact_name', 'string', 'contact_name', '"first last"'),
        ('contact_title', 'string', 'contact_title', '""'),
        ('address', 'string', 'address', '""'),
        ('city', 'string', 'city', '""'),
        ('region', 'string', 'region', '""'),
        ('postal_code', 'string', 'postal_code', '""'),
        ('country', 'string', 'country', '""'),
        ('phone', 'string', 'phone', '""'),
        ('fax', 'string', 'fax', '""'),
    ),

    'employees': (
        ('employee_id', 'short', 'employee_id', '""'),
        ('last_name', 'string', 'last_name', '""'),
        ('first_name', 'string', 'first_name', '""'),
        ('title', 'string', 'title', '""'),
        ('title_of_courtesy', 'string', 'title_of_courtesy', '""'),
        ('birth_date', 'date', 'birth_date', '"YYYY-mm-dd"'),
        ('hire_date', 'date', 'hire_date', '"YYYY-mm-dd"'),
        ('address', 'string', 'address', '""'),
        ('city', 'string', 'city', '""'),
        ('region', 'string', 'region', '""'),
        ('postal_code', 'string', 'postal_code', '""'),
        ('country', 'string', 'country', '""'),
        ('home_phone', 'string', 'home_phone', '""'),
        ('extension', 'string', 'extension', '""'),
        ('photo', 'binary', 'photo', '""'),
        ('notes', 'string', 'notes', '"short resumee"'),
        ('reports_to', 'short', 'reports_to', '""'),
        ('photo_path', 'string', 'photo_path', '""'),
    ),

    # Both columns are foreign keys, so both are renamed.
    'employee_territories': (
        ('employee_id', 'short', 'employee_id2', '""'),
        ('territory_id', 'string', 'territory_id2', '""'),
    ),

    'order_details': (
        ('order_id', 'short', 'order_id2', '""'),  # FK renamed
        ('product_id', 'short', 'product_id', '""'),
        ('unit_price', 'float', 'unit_price', '""'),
        ('quantity', 'short', 'quantity', '""'),
        ('discount', 'float', 'discount', '""'),
    ),

    'orders': (
        ('order_id', 'short', 'order_id', '""'),
        ('customer_id', 'string', 'customer_id', '""'),
        ('employee_id', 'short', 'employee_id', '""'),
        ('order_date', 'date', 'order_date', '"YYYY-mm-dd"'),
        # The next three entries reuse order_date as new columns for partitioning.
        ('order_date', 'string', 'order_year', '"YYYY"'),
        ('order_date', 'string', 'order_month', '"mm"'),
        ('order_date', 'string', 'order_day', '"dd"'),
        ('required_date', 'date', 'required_date', '"YYYY-mm-dd"'),
        ('shipped_date', 'date', 'shipped_date', '"YYYY-mm-dd"'),
        ('ship_via', 'short', 'ship_via', '""'),
        ('freight', 'float', 'freight', '""'),
        ('ship_name', 'string', 'ship_name', '"first last"'),
        ('ship_address', 'string', 'ship_address', '""'),
        ('ship_city', 'string', 'ship_city', '""'),
        ('ship_region', 'string', 'ship_region', '""'),
        ('ship_postal_code', 'string', 'ship_postal_code', '""'),
        ('ship_country', 'string', 'ship_country', '""'),
    ),

    'products': (
        ('product_id', 'short', 'product_id', '""'),
        ('product_name', 'string', 'product_name', '""'),
        ('supplier_id', 'short', 'supplier_id2', '""'),  # FK renamed
        ('category_id', 'short', 'category_id2', '""'),  # FK renamed
        ('quantity_per_unit', 'string', 'quantity_per_unit', '""'),
        ('unit_price', 'float', 'unit_price', '""'),
        ('units_in_stock', 'short', 'units_in_stock', '""'),
        ('units_on_order', 'short', 'units_on_order', '""'),
        ('reorder_level', 'short', 'reorder_level', '""'),
        ('discontinued', 'integer', 'discontinued', '""'),
    ),

    'region': (
        ('region_id', 'short', 'region_id', '""'),
        ('region_description', 'string', 'region_description', '""'),
    ),

    'shippers': (
        ('shipper_id', 'short', 'shipper_id', '""'),
        ('company_name', 'string', 'company_name', '""'),
        ('phone', 'string', 'phone', '""'),
    ),

    'suppliers': (
        ('supplier_id', 'short', 'supplier_id', '""'),
        ('company_name', 'string', 'company_name', '""'),
        ('contact_name', 'string', 'contact_name', '"first last"'),
        ('contact_title', 'string', 'contact_title', '""'),
        ('address', 'string', 'address', '""'),
        ('city', 'string', 'city', '""'),
        ('region', 'string', 'region', '""'),
        ('postal_code', 'string', 'postal_code', '""'),
        ('country', 'string', 'country', '""'),
        ('phone', 'string', 'phone', '""'),
        ('fax', 'string', 'fax', '""'),
        ('homepage', 'string', 'homepage', '""'),
    ),

    'territories': (
        ('territory_id', 'string', 'territory_id', '""'),
        ('territory_description', 'string', 'territory_description', '""'),
        ('region_id', 'short', 'region_id2', '""'),  # FK renamed
    ),

    'us_states': (
        ('state_id', 'short', 'state_id', '""'),
        ('state_name', 'string', 'state_name', '""'),
        ('state_abbr', 'string', 'state_abbr', '""'),
        ('state_region', 'string', 'state_region', '""'),
    ),
}

**Definition of list with table names of Northwind relational database: `northwind_tables`**

In [0]:
# Table names are the keys of northwind_columns, in definition order.
northwind_tables = list(northwind_columns)

**Definition of list with CSV file names: `northwind_files`**

In [0]:
# One CSV file name per table, in the same order as northwind_tables.
northwind_files = [f"{table}.csv" for table in northwind_tables]

**Definition of list with table names of trusted layer: `northwind_tables_trusted`**

In [0]:
# Trusted-layer table names: each source table name with a "_trusted" suffix.
northwind_tables_trusted = [f"{table}_trusted" for table in northwind_tables]

**Definition of list with table names of refined layer: `northwind_tables_refined`**

In [0]:
# Refined-layer table names (presumably "ft_" = fact table, "dm_" = dimension table; confirm).
northwind_tables_refined = [
    'ft_orders',
    'dm_customers',
    'dm_employees',
    'dm_products',
    'dm_shippers',
]

**Import Functions**

In [0]:
%run ./utilities