# Assignment - ETL transform to DWH

## 1. Import libraries

### 1.1 Load libraries

In [1]:
from sqlalchemy.schema import CreateSchema
from sqlalchemy import create_engine, text
from sqlalchemy.exc import SQLAlchemyError
import pandas as pd
from urllib.parse import quote_plus

### 1.2 Database connection

#### 1.2.a MSSQL

In [3]:
server_name = '127.0.0.1'
database_name = 'master'
username = 'sa'
password = 'trung_password123'
driver_name = 'ODBC Driver 18 for SQL Server'
# Add your port number here (1433 is default)
port_number = 1433

raw_connection_string = (
    f'DRIVER={{{driver_name}}};'
    f'SERVER={server_name},{port_number};' # Note the 'host,port' format
    f'DATABASE={database_name};'
    f'UID={username};'
    f'PWD={password};'
    # **CRITICAL: This is the trust connection setting you need**
    f'TrustServerCertificate=yes;' 
)

quoted_connection_string = quote_plus(raw_connection_string)

connection_url = f"mssql+pyodbc:///?odbc_connect={quoted_connection_string}"

mssql_engine = create_engine(connection_url)
try:
    with mssql_engine.connect() as conn:
        # optional: issue a lightweight query
        conn.execute(text("SELECT 1"))
    print("Connection successful")
except SQLAlchemyError as e:
    print("Connection failed:", e)
    # handle/log error accordingly

mssql_engine

Connection successful


Engine(mssql+pyodbc:///?odbc_connect=DRIVER%3D%7BODBC+Driver+18+for+SQL+Server%7D%3BSERVER%3D127.0.0.1%2C1433%3BDATABASE%3Dmaster%3BUID%3Dsa%3BPWD%3Dtrung_password123%3BTrustServerCertificate%3Dyes%3B)

#### 1.2.b PostgreSQL

In [3]:
server_name = 'localhost' 
database_name = 'postgres'
username = 'postgres'
password = 'Trungtq'
driver_name = 'PostgreSQL JDBC Driver'

# Add your port number here (1433 is default)
port_number = 5432

# Modified connection_url with the port:
connection_url = f"postgresql+psycopg2://{username}:{password}@{server_name}:{port_number}/{database_name}"

postgre_engine = create_engine(connection_url)
postgre_engine

Engine(postgresql+psycopg2://postgres:***@localhost:5432/postgres)

## 2. Extract

Extract and process tables:

- Table `Product`: only keep those sellable items

- Tables `ProductCostHistory`, `ProductListPriceHistory`: merge them into a single table

- Tables `SalesOrderDetail`, `SalesOrderHeader`: retrieve `OrderDate` from `SalesOrderHeader` and merge to `SalesOrderDetail`.

In [4]:
# Load table product
input_product_df = pd.read_sql("""
        SELECT ProductID, Name, ProductSubcategoryID, FinishedGoodsFlag
        FROM CompanyX.Production.Product
    """.strip(), mssql_engine)

input_product_df = input_product_df[input_product_df['FinishedGoodsFlag'] == 1].reset_index(drop=True).drop(columns=['FinishedGoodsFlag'])
salable_products = list(input_product_df['ProductID'].unique())

input_product_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 295 entries, 0 to 294
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   ProductID             295 non-null    int64  
 1   Name                  295 non-null    object 
 2   ProductSubcategoryID  295 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 7.0+ KB


In [5]:
# Load table ProductSubCategory
input_psc_df = pd.read_sql("""
        SELECT ProductSubcategoryID, Name, ProductCategoryID
        FROM CompanyX.Production.ProductSubcategory
    """.strip(), mssql_engine)

input_psc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37 entries, 0 to 36
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   ProductSubcategoryID  37 non-null     int64 
 1   Name                  37 non-null     object
 2   ProductCategoryID     37 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 1020.0+ bytes


In [6]:
# Load table ProductCategory
input_pc_df = pd.read_sql("""
        SELECT ProductCategoryID, Name
        FROM CompanyX.Production.ProductCategory
    """.strip(), mssql_engine)

input_pc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   ProductCategoryID  4 non-null      int64 
 1   Name               4 non-null      object
dtypes: int64(1), object(1)
memory usage: 196.0+ bytes


In [7]:
# Load table ProductCostHistory, ProductListPriceHistory
input_pch_df = pd.read_sql("""
        SELECT ProductID, StartDate, EndDate, StandardCost
        FROM CompanyX.Production.ProductCostHistory
    """.strip(), mssql_engine)

input_plph_df = pd.read_sql("""
        SELECT ProductID, StartDate, EndDate, ListPrice
        FROM CompanyX.Production.ProductListPriceHistory
    """.strip(), mssql_engine)

merge_df = pd.merge(input_pch_df, input_plph_df, on=['ProductID', 'StartDate', 'EndDate'], how='inner')

# Only keep salable products
merge_df = merge_df[merge_df['ProductID'].isin(salable_products)].reset_index(drop=True)

merge_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   ProductID     395 non-null    int64         
 1   StartDate     395 non-null    datetime64[ns]
 2   EndDate       200 non-null    datetime64[ns]
 3   StandardCost  395 non-null    float64       
 4   ListPrice     395 non-null    float64       
dtypes: datetime64[ns](2), float64(2), int64(1)
memory usage: 15.6 KB


In [8]:
# Load table SalesOrderDetail
input_sod_df = pd.read_sql("""
        SELECT ProductID, OrderQty, LineTotal, SalesOrderID
        FROM CompanyX.Sales.SalesOrderDetail
    """.strip(), mssql_engine)

# Load data SalesOrderHeader
input_soh_df = pd.read_sql("""
        SELECT SalesOrderID, OrderDate, CustomerID
        FROM CompanyX.Sales.SalesOrderHeader
    """.strip(), mssql_engine)

input_sod_df = pd.merge(input_sod_df, input_soh_df[['SalesOrderID', 'OrderDate', 'CustomerID']], on='SalesOrderID', how='left')

# Only keep salable products
input_sod_df = input_sod_df[input_sod_df['ProductID'].isin(salable_products)].reset_index(drop=True)

input_sod_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121317 entries, 0 to 121316
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   ProductID     121317 non-null  int64         
 1   OrderQty      121302 non-null  float64       
 2   LineTotal     121317 non-null  float64       
 3   SalesOrderID  121317 non-null  int64         
 4   OrderDate     121317 non-null  datetime64[ns]
 5   CustomerID    121317 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(3)
memory usage: 5.6 MB


## 3. Data transformation

### 3.1 `Dim` tables

In [9]:
# Interval dimension
dimDate_df = merge_df[['StartDate', 'EndDate']].copy()
dimDate_df.drop_duplicates(inplace=True)
dimDate_df['Id'] = dimDate_df.index + 1

# Product dimension
dimProduct_df = input_product_df
dimProductSubcategory_df = input_psc_df
dimProductCategory_df = input_pc_df

# ProductPriceCostHistory
dimProductPriceCostHistory_df = pd.merge(merge_df, right=dimDate_df, on=['StartDate', 'EndDate'], how='inner').drop(columns=['StartDate', 'EndDate'])
dimProductPriceCostHistory_df.rename(columns={'Id': 'Interval'}, inplace=True)
dimProductPriceCostHistory_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ProductID     395 non-null    int64  
 1   StandardCost  395 non-null    float64
 2   ListPrice     395 non-null    float64
 3   Interval      395 non-null    int64  
dtypes: float64(2), int64(2)
memory usage: 12.5 KB


### 3.2 `Fact` tables

In [10]:
def assign_interval(fact_df, dim_df, date_col="OrderDate"):
    dim = dim_df.sort_values("StartDate")
    fact = fact_df.sort_values(date_col)

    merged = pd.merge_asof(
        fact,
        dim,
        left_on=date_col,
        right_on="StartDate",
        direction="backward"
    )

    return merged[
        (merged[date_col] >= merged["StartDate"]) &
        ((merged["EndDate"].isna()) | (merged[date_col] <= merged["EndDate"]))
    ]


factProductSales_df = assign_interval(input_sod_df, dimDate_df, date_col="OrderDate").drop(columns=['StartDate', 'EndDate'])
factProductSales_df.rename(columns={'Id': 'Interval'}, inplace=True)

factProductSales_df.iloc[20000:20010]

Unnamed: 0,ProductID,OrderQty,LineTotal,SalesOrderID,OrderDate,CustomerID,Interval
20000,836,5.0,1622.2635,48043,2012-09-30,29716,2
20001,770,3.0,1409.382,48043,2012-09-30,29716,2
20002,760,6.0,2818.764,48043,2012-09-30,29716,2
20003,789,9.0,13194.09,48043,2012-09-30,29716,2
20004,708,4.0,80.746,48043,2012-09-30,29716,2
20005,715,4.0,115.3616,48043,2012-09-30,29716,2
20006,791,4.0,5864.04,48043,2012-09-30,29716,2
20007,792,4.0,5235.75,48043,2012-09-30,29716,2
20008,738,4.0,735.7528,48043,2012-09-30,29716,2
20009,707,8.0,161.492,48043,2012-09-30,29716,2


### Deal with missing value
1. Fact table: remove those rows with NaN value, because each row is an isolated record
2. EndDate of Dimtable: accept Null value as the interval of applying is not end

In [11]:
factProductSales_df.dropna(inplace=True)
factProductSales_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 121302 entries, 0 to 121316
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   ProductID     121302 non-null  int64         
 1   OrderQty      121302 non-null  float64       
 2   LineTotal     121302 non-null  float64       
 3   SalesOrderID  121302 non-null  int64         
 4   OrderDate     121302 non-null  datetime64[ns]
 5   CustomerID    121302 non-null  int64         
 6   Interval      121302 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(4)
memory usage: 7.4 MB


## 4. Load

### 4.1 Create new schema

In [12]:
schema_name = 'dwh'

with postgre_engine.connect() as connection:
    try:
        connection.execute(CreateSchema(schema_name, if_not_exists=True))
        connection.commit()
        print(f"Schema '{schema_name}' created successfully (or already exists).")
    except Exception as e:
        connection.rollback()
        print(f"Error creating schema '{schema_name}': {e}")

Schema 'dwh' created successfully (or already exists).


### 4.2 Write data into schema

In [13]:
dimDate_df.to_sql('DimDate', postgre_engine, schema=schema_name, if_exists='replace', index=False)
dimProduct_df.to_sql('DimProduct', postgre_engine, schema=schema_name, if_exists='replace', index=False)
dimProductSubcategory_df.to_sql('DimProductSubcategory', postgre_engine, schema=schema_name, if_exists='replace', index=False)
dimProductCategory_df.to_sql('DimProductCategory', postgre_engine, schema=schema_name, if_exists='replace', index=False)
dimProductPriceCostHistory_df.to_sql('DimProductPriceCostHistory', postgre_engine, schema=schema_name, if_exists='replace', index=False)
factProductSales_df.to_sql('FactProductSales', postgre_engine, schema=schema_name, if_exists='replace', index=False)

302