## Task: Load the dim_region dimension table into the database

In [1]:
# Create the SQLAlchemy engine for the Superstore PostgreSQL database and verify it can connect.
import os
from getpass import getpass

from sqlalchemy import create_engine
from sqlalchemy.engine import URL

# Keep the credential out of the notebook: read it from the environment,
# and fall back to an interactive prompt when the variable is not set.
db_password = os.environ.get("PGPASSWORD") or getpass("PostgreSQL password: ")

# URL.create escapes special characters in the password, which a hand-built URL string would not.
engine = create_engine(
    URL.create(
        drivername="postgresql+psycopg2",
        username="postgres",
        password=db_password,
        host="localhost",
        port=5432,
        database="Superstore_db",
    )
)

# Test connection; the with-block closes the connection again instead of leaving it open.
with engine.connect():
    print("Connection successful")


Connection successful


In [2]:
import pandas as pd

# Read the Superstore export into a DataFrame.
# The space after the underscore is part of the file's actual name.
df = pd.read_csv("Sample_ Superstore.csv")

# Sanity check: dimensions on the first line, column labels after it.
print(df.shape, df.columns, sep="\n")

(9994, 19)
Index(['Row ID', 'Order ID', 'Order Date', 'Ship Date', 'Ship Mode',
       'Customer ID', 'Segment', 'Country', 'City', 'State', 'Region',
       'Product ID', 'Category', 'Sub-Category', 'Product Name', 'Sales',
       'Quantity', 'Discount', 'Profit'],
      dtype='object')


In [3]:
# Take an independent (deep) copy of the loaded frame to serve as the source
# for the dimension table, then display its dimensions.
raw_df = df.copy(deep=True)
raw_df.shape

(9994, 19)

### Loading the dim_region dimension table
### Selecting the region-related columns for the dimension table

In [4]:
# Build the starting frame for the region dimension by keeping only the
# four geographic columns of the raw data (all rows retained).
dim_region = raw_df.loc[:, ["Country", "Region", "State", "City"]]
dim_region.shape

(9994, 4)

In [5]:
# Collapse repeated rows so each (Country, Region, State, City) combination
# appears once, renumbering the index from 0.
dim_region_clean = dim_region.drop_duplicates(ignore_index=True)
dim_region_clean.shape

(604, 4)

In [6]:
# Show the column labels of the de-duplicated frame (still in their original capitalised form here).
dim_region_clean.columns

Index(['Country', 'Region', 'State', 'City'], dtype='object')

In [7]:
# Switch the four column labels to their lower-case form for consistent naming.
dim_region_clean = dim_region_clean.rename(
    columns={label: label.lower() for label in ("Country", "Region", "State", "City")}
)


### Load dim_region_clean into PostgreSQL

In [8]:
# Append every row of dim_region_clean (one per unique country/region/state/city
# combination) to the dim_region table through the SQLAlchemy engine.
# NOTE(review): the recorded output of this cell is an IntegrityError. The table
# has a unique constraint "dim_region_region_key" on region, while this frame
# repeats each region across many rows, so the insert is rejected.
# Either de-duplicate on region before loading or revisit the table's constraint.
dim_region_clean.to_sql(
    "dim_region",
    engine,
    if_exists="append",  # add rows to the existing table rather than replacing it
    index=False  # do not write the DataFrame index as a column
)

IntegrityError: (psycopg2.errors.UniqueViolation) duplicate key value violates unique constraint "dim_region_region_key"
DETAIL:  Key (region)=(South) already exists.

[SQL: INSERT INTO dim_region (country, region, state, city) VALUES (%(country__0)s, %(region__0)s, %(state__0)s, %(city__0)s), (%(country__1)s, %(region__1)s, %(state__1)s, %(city__1)s), (%(country__2)s, %(region__2)s, %(state__2)s, %(city__2)s), (%(countr ... 40341 characters truncated ... , %(state__602)s, %(city__602)s), (%(country__603)s, %(region__603)s, %(state__603)s, %(city__603)s)]
[parameters: {'country__0': 'United States', 'state__0': 'Kentucky', 'city__0': 'Henderson', 'region__0': 'South', 'country__1': 'United States', 'state__1': 'California', 'city__1': 'Los Angeles', 'region__1': 'West', 'country__2': 'United States', 'state__2': 'Florida', 'city__2': 'Fort Lauderdale', 'region__2': 'South', 'country__3': 'United States', 'state__3': 'North Carolina', 'city__3': 'Concord', 'region__3': 'South', 'country__4': 'United States', 'state__4': 'Washington', 'city__4': 'Seattle', 'region__4': 'West', 'country__5': 'United States', 'state__5': 'Texas', 'city__5': 'Fort Worth', 'region__5': 'Central', 'country__6': 'United States', 'state__6': 'Wisconsin', 'city__6': 'Madison', 'region__6': 'Central', 'country__7': 'United States', 'state__7': 'Utah', 'city__7': 'West Jordan', 'region__7': 'West', 'country__8': 'United States', 'state__8': 'California', 'city__8': 'San Francisco', 'region__8': 'West', 'country__9': 'United States', 'state__9': 'Nebraska', 'city__9': 'Fremont', 'region__9': 'Central', 'country__10': 'United States', 'state__10': 'Pennsylvania', 'city__10': 'Philadelphia', 'region__10': 'East', 'country__11': 'United States', 'state__11': 'Utah', 'city__11': 'Orem', 'region__11': 'West', 'country__12': 'United States', 'state__12': 'Texas' ... 2316 parameters truncated ... 
'city__591': 'Hagerstown', 'region__591': 'East', 'country__592': 'United States', 'state__592': 'New Jersey', 'city__592': 'East Orange', 'region__592': 'East', 'country__593': 'United States', 'state__593': 'Illinois', 'city__593': 'Arlington Heights', 'region__593': 'Central', 'country__594': 'United States', 'state__594': 'Illinois', 'city__594': 'Oswego', 'region__594': 'Central', 'country__595': 'United States', 'state__595': 'Minnesota', 'city__595': 'Coon Rapids', 'region__595': 'Central', 'country__596': 'United States', 'state__596': 'California', 'city__596': 'San Clemente', 'region__596': 'West', 'country__597': 'United States', 'state__597': 'California', 'city__597': 'Dublin', 'region__597': 'West', 'country__598': 'United States', 'state__598': 'California', 'city__598': 'San Luis Obispo', 'region__598': 'West', 'country__599': 'United States', 'state__599': 'Arkansas', 'city__599': 'Springdale', 'region__599': 'South', 'country__600': 'United States', 'state__600': 'California', 'city__600': 'Lodi', 'region__600': 'West', 'country__601': 'United States', 'state__601': 'Texas', 'city__601': 'La Porte', 'region__601': 'Central', 'country__602': 'United States', 'state__602': 'Ohio', 'city__602': 'Mason', 'region__602': 'East', 'country__603': 'United States', 'state__603': 'Georgia', 'city__603': 'Woodstock', 'region__603': 'South'}]
(Background on this error at: https://sqlalche.me/e/20/gkpj)

In [None]:
# Re-check the column labels after the rename; they should now be lower-case.
dim_region_clean.columns

Index(['country', 'region', 'state', 'city'], dtype='object')

In [None]:
# Reduce the frame to a single row per region (first occurrence wins),
# with the region column placed first.
# Note: the following cell assigns dim_region_unique again.
dim_region_unique = (
    dim_region_clean
    .loc[:, ['region', 'country', 'state', 'city']]
    .drop_duplicates(subset='region')
)




In [None]:
from sqlalchemy import inspect

# Keep one row per region. The dim_region table rejects repeated region values
# (unique constraint "dim_region_region_key"), so only the first
# country/state/city seen for each region is retained.
dim_region_unique = dim_region_clean.drop_duplicates(subset=['region'])

# Skip regions that are already stored, so re-running this cell does not
# violate the unique constraint a second time.
if inspect(engine).has_table('dim_region'):
    existing_regions = pd.read_sql('SELECT region FROM dim_region', engine)['region']
    new_regions = dim_region_unique[~dim_region_unique['region'].isin(existing_regions)]
else:
    # No table yet: to_sql will create it, so every row is new.
    new_regions = dim_region_unique

# Append only the missing rows; rows_inserted stays 0 when the table is already up to date.
rows_inserted = 0
if not new_regions.empty:
    rows_inserted = new_regions.to_sql(
        'dim_region',
        engine,
        if_exists='append',  # keep existing rows
        index=False
    )

# Display the number of rows written as the cell output.
rows_inserted


4

In [None]:
# Print the dimensions of the per-region frame, followed by its rows.
print(dim_region_unique.shape, dim_region_unique, sep="\n")

(4, 4)
          country   region         state          city
0   United States    South      Kentucky     Henderson
1   United States     West    California   Los Angeles
5   United States  Central         Texas    Fort Worth
10  United States     East  Pennsylvania  Philadelphia
