## Task: Load dim_product Dimension Table to the database

In [1]:
# Further data cleaning and preprocessing steps would go here
from sqlalchemy import create_engine

engine = create_engine("postgresql+psycopg2://postgres:new_password@localhost:5432/Superstore_db")

# Test connection
connection = engine.connect()
print("Connection successful")


Connection successful


In [2]:
import pandas as pd
# Load the dataset 
df = pd.read_csv("Sample_ Superstore.csv")  # exact filename
print(df.shape)
print(df.columns)

(9994, 19)
Index(['Row ID', 'Order ID', 'Order Date', 'Ship Date', 'Ship Mode',
       'Customer ID', 'Segment', 'Country', 'City', 'State', 'Region',
       'Product ID', 'Category', 'Sub-Category', 'Product Name', 'Sales',
       'Quantity', 'Discount', 'Profit'],
      dtype='object')


In [3]:
raw_df = df.copy()
raw_df.shape

(9994, 19)

### Loading dim_product Dimension Table

In [4]:
#Selecting product related columns for dimension table
# Creating Dim_Product table

dim_product = raw_df[[
    "Product ID",
    "Product Name",
    "Category",
    "Sub-Category"
]]


In [5]:
dim_product["Product ID"].nunique() #number of unique products


1862

In [6]:
dim_product.shape #number of rows and columns


(9994, 4)

In [7]:
# Removing duplicate entries based on product_id
dim_product_clean = (
    dim_product
    .drop_duplicates(subset=["Product ID"])
    .reset_index(drop=True)
)
dim_product_clean.shape

(1862, 4)

In [8]:
dim_product_clean["Product ID"].nunique()


1862

In [9]:
# Renaming columns to follow consistent naming conventions
dim_product_clean = dim_product_clean.rename(columns={
    "Product ID": "product_id",
    "Product Name": "product_name",
    "Category": "category",
    "Sub-Category": "sub_category"
})
dim_product_clean.head()

Unnamed: 0,product_id,product_name,category,sub_category
0,FUR-BO-10001798,Bush Somerset Collection Bookcase,Furniture,Bookcases
1,FUR-CH-10000454,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",Furniture,Chairs
2,OFF-LA-10000240,Self-Adhesive Address Labels for Typewriters b...,Office Supplies,Labels
3,FUR-TA-10000577,Bretford CR4500 Series Slim Rectangular Table,Furniture,Tables
4,OFF-ST-10000760,Eldon Fold 'N Roll Cart System,Office Supplies,Storage


In [10]:
print(dim_product_clean.columns)
dim_product_clean.shape

Index(['product_id', 'product_name', 'category', 'sub_category'], dtype='object')


(1862, 4)

### Load dim_product_clean into PostgreSQL

In [11]:
dim_product_clean.to_sql(
    "dim_product",
    engine,
    if_exists="replace",
    index=False
)


InternalError: (psycopg2.errors.DependentObjectsStillExist) cannot drop table dim_product because other objects depend on it
DETAIL:  constraint fact_sales_product_id_fkey on table fact_sales depends on table dim_product
HINT:  Use DROP ... CASCADE to drop the dependent objects too.

[SQL: 
DROP TABLE dim_product]
(Background on this error at: https://sqlalche.me/e/20/2j85)

##### Steps implemented:

Dimensional modeling logic

De-duplication using business keys

Schema alignment between pandas & Postgres

Proper dimension table loading

In [12]:
# Keep unique products
dim_product_unique = dim_product_clean.drop_duplicates(subset=['product_id'])

# Fetch existing product_ids from DB
existing_ids = pd.read_sql("SELECT product_id FROM dim_product", engine)

# Filter out already existing products
dim_product_to_insert = dim_product_unique[
    ~dim_product_unique['product_id'].isin(existing_ids['product_id'])
]

# Insert only new rows
dim_product_to_insert.to_sql(
    'dim_product',
    engine,
    if_exists='append',
    index=False
)


0