In [15]:
# Step 4: Transform Phase — etl_transform.ipynb

import os
import pandas as pd

# -----------------------------
# 1. Load datasets from data/
# -----------------------------
# Absolute locations of the two input CSV files read by this step.
raw_path = r'C:\Users\USER\OneDrive\Desktop\ET_EXAM_GIFT_662\data\validated_data.csv'
inc_path = r'C:\Users\USER\OneDrive\Desktop\ET_EXAM_GIFT_662\data\incremental_data.csv'

# Read the full file first, then the incremental one (same order as the paths).
full_df, inc_df = (pd.read_csv(csv_path) for csv_path in (raw_path, inc_path))

# Report the (rows, columns) shape of each freshly loaded frame.
for caption, loaded in (("Full dataset shape:", full_df),
                        ("Incremental dataset shape:", inc_df)):
    print(caption, loaded.shape)

# -----------------------------
# 1b. Rename columns for easier access
# -----------------------------
# Mapping from the raw CSV headers to identifier-friendly column names.
rename_cols = {
    '113 Cause Name': 'Cause_Code',
    'Cause Name': 'Cause_Name',
    'Age-adjusted Death Rate': 'Age_Adjusted_Rate',
}

# Rename in place so both frames end up with the same column names.
for frame in (full_df, inc_df):
    frame.rename(columns=rename_cols, inplace=True)

print("Columns after renaming:", full_df.columns)

# -----------------------------
# 2. Prepare 'transformed/' folder
# -----------------------------
# Output directory for the transformed CSVs; create it only when it is missing.
transformed_folder = r'C:\Users\USER\OneDrive\Desktop\ET_EXAM_GIFT_662\transformed'
folder_present = os.path.exists(transformed_folder)
if folder_present:
    print(f"'transformed' folder already exists at: {transformed_folder}")
else:
    os.makedirs(transformed_folder)
    print(f"'transformed' folder created at: {transformed_folder}")

# -----------------------------
# 3. Apply Transformations
# -----------------------------

# 3.1 Cleaning — Remove duplicates
def _dedupe_and_report(frame, label):
    """Print the shape of *frame* before and after dropping exact duplicate rows.

    Returns the de-duplicated frame; *label* prefixes the two report lines.
    """
    print(f"{label} before:", frame.shape)
    deduped = frame.drop_duplicates()
    print(f"{label} after:", deduped.shape)
    return deduped


print("\n[Cleaning] Remove duplicates")
full_df = _dedupe_and_report(full_df, "Full")

inc_df = _dedupe_and_report(inc_df, "Incremental")

# 3.2 Cleaning — Handle missing values
def _fill_with_median(frame, label, columns=('Deaths', 'Age_Adjusted_Rate')):
    """Fill missing values in *columns* of *frame* with that column's own median.

    The frame is modified in place. Per-column null counts are printed before
    and after the fill, prefixed with *label*.
    """
    print(f"{label} missing before:\n", frame.isnull().sum())
    for column in columns:
        # The median is taken from the same frame, before this column is filled.
        frame[column] = frame[column].fillna(frame[column].median())
    print(f"{label} missing after:\n", frame.isnull().sum())


print("\n[Cleaning] Handle missing values")
_fill_with_median(full_df, "Full")

_fill_with_median(inc_df, "Incremental")

# 3.3 Standardization — Capitalize State
# NOTE(review): str.title() upper-cases the first letter of every word, so a
# value containing a lower-case connective (e.g. "... of ...") would come out
# as "... Of ..." — confirm that is acceptable for the State values in use.
print("\n[Standardization] Capitalize State")
print("Before:\n", full_df['State'].head())
for frame in (full_df, inc_df):
    frame['State'] = frame['State'].str.title()
print("After:\n", full_df['State'].head())

# 3.4 Enrichment — Add Deaths_Rate_Scaled
# The new column is simply the Deaths count divided by 1000.
print("\n[Enrichment] Add Deaths_Rate_Scaled")
for frame in (full_df, inc_df):
    frame['Deaths_Rate_Scaled'] = frame['Deaths'].div(1000)
preview_columns = ['Deaths', 'Deaths_Rate_Scaled']
print(full_df[preview_columns].head())

# 3.5 Categorization — Create Rate_Category
# Buckets for Age_Adjusted_Rate (right edge inclusive):
#   Low [0, 50], Medium (50, 100], High (100, 200], Very High (200, inf)
# The top bin is open-ended and the lowest edge is inclusive, so a rate of
# exactly 0 or a rate above 1000 still gets a label. With a closed top edge of
# 1000 and the default include_lowest=False, pd.cut silently returned NaN for
# those rows. Negative rates remain NaN, since they fall outside every bucket.
print("\n[Categorization] Create Rate_Category")
bins = [0, 50, 100, 200, float('inf')]
labels = ['Low', 'Medium', 'High', 'Very High']
for frame in (full_df, inc_df):
    frame['Rate_Category'] = pd.cut(
        frame['Age_Adjusted_Rate'],
        bins=bins,
        labels=labels,
        include_lowest=True,
    )
print(full_df[['Age_Adjusted_Rate', 'Rate_Category']].head())

# 3.6 Filtering — Keep only relevant columns
# Both frames are narrowed to the same ordered column list; a missing column
# raises KeyError rather than being silently skipped.
print("\n[Filtering] Keep relevant columns")
cols = [
    'Year', 'Cause_Code', 'Cause_Name', 'State',
    'Deaths', 'Age_Adjusted_Rate', 'Rate_Category', 'Deaths_Rate_Scaled',
]
full_df = full_df.loc[:, cols]
inc_df = inc_df.loc[:, cols]
print("Columns after filtering:", full_df.columns)

# -----------------------------
# 4. Save transformed datasets
# -----------------------------
# Write each frame to its own CSV inside the transformed folder, without the index.
outputs = (
    (full_df, 'transformed_full.csv'),
    (inc_df, 'transformed_incremental.csv'),
)
for frame, file_name in outputs:
    frame.to_csv(os.path.join(transformed_folder, file_name), index=False)
print("\nTransformed datasets saved in '/transformed/' folder")


Full dataset shape: (11868, 6)
Incremental dataset shape: (1000, 6)
Columns after renaming: Index(['Year', 'Cause_Code', 'Cause_Name', 'State', 'Deaths',
       'Age_Adjusted_Rate'],
      dtype='object')
'transformed' folder already exists at: C:\Users\USER\OneDrive\Desktop\ET_EXAM_GIFT_662\transformed

[Cleaning] Remove duplicates
Full before: (11868, 6)
Full after: (10868, 6)
Incremental before: (1000, 6)
Incremental after: (1000, 6)

[Cleaning] Handle missing values
Full missing before:
 Year                 0
Cause_Code           0
Cause_Name           0
State                0
Deaths               0
Age_Adjusted_Rate    0
dtype: int64
Full missing after:
 Year                 0
Cause_Code           0
Cause_Name           0
State                0
Deaths               0
Age_Adjusted_Rate    0
dtype: int64
Incremental missing before:
 Year                 0
Cause_Code           0
Cause_Name           0
State                0
Deaths               0
Age_Adjusted_Rate    0
dtype: int64
