# Join State Regions with AQI-Income-Race-PopulationDensity Data

This notebook performs the final integration of the state regions and divisions data with the existing dataset containing AQI, income, race, and population density information.

In [None]:
import pandas as pd
import os

# --- Output location for the region-enriched dataset ---
output_dir = "../JOINED-aqi-income-race-populationDensity-region"
output_file = os.path.join(output_dir, "joined-data-with-region.csv")

# --- Input location: both source CSVs live in the same folder ---
base_path = "../JOINED-aqi-income-race-populationDensity"
regions_file = os.path.join(base_path, "us-census-bureau-states-regions.csv")
aqi_data_file = os.path.join(base_path, "aqi-income-race-populationDensity-joined.csv")

# Create the output folder up front; exist_ok=True keeps re-runs of this cell harmless.
os.makedirs(output_dir, exist_ok=True)

print("Paths initialized.")

## 1. Load Datasets

In [None]:
# Read both inputs in order: the regions lookup first, then the AQI table.
df_regions, df_aqi = (
    pd.read_csv(csv_path) for csv_path in (regions_file, aqi_data_file)
)

# Report the dimensions of each frame as a quick sanity check on the load.
print(f"Regions dataset shape: {df_regions.shape}")
print(f"AQI dataset shape: {df_aqi.shape}")

# Preview the first rows of the regions lookup.
df_regions.head()

## 2. Normalize State Columns

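The `State` columns in both datasets must match exactly for a clean join, so we strip leading and trailing whitespace and convert the values in both to title case. Note that the normalized values overwrite the original `State` values and therefore appear in the exported data.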

In [None]:
def _normalize_state(states):
    """Return the given Series with surrounding whitespace removed and values title-cased."""
    trimmed = states.str.strip()
    return trimmed.str.title()


# Apply the same normalization to both frames so their join keys line up.
df_regions['State'] = _normalize_state(df_regions['State'])
df_aqi['State'] = _normalize_state(df_aqi['State'])

print("State columns normalized.")

## 3. Perform the Join

We will perform a left join to keep all records from the AQI dataset and add the region/division information.

In [None]:
# Build the State -> Region/Division lookup. Exact duplicate rows (for example,
# a state listed twice in the regions table) carry no extra information and
# would only multiply AQI rows in the join, so collapse them first.
region_lookup = df_regions[['State', 'Region', 'Division']].drop_duplicates()

# Left join keeps every AQI record. validate='many_to_one' makes pandas raise a
# MergeError right here if any State still appears more than once in the lookup
# (i.e. conflicting Region/Division values), instead of silently duplicating
# AQI rows.
df_joined = df_aqi.merge(
    region_lookup,
    on='State',
    how='left',
    validate='many_to_one',
)

print(f"Joined dataset shape: {df_joined.shape}")
df_joined.head()

## 4. Verification

Let's check for any null values in the `Region` column to confirm that every state was mapped to a region, and verify that the left join did not change the number of rows.

In [None]:
# Rows with a missing Region found no match in the regions table;
# collect the distinct State values of those rows.
region_missing = df_joined['Region'].isnull()
null_regions = df_joined.loc[region_missing, 'State'].unique()

if len(null_regions) == 0:
    print("All records successfully mapped to a region.")
else:
    print(f"States with no region mapping: {null_regions}")

# The joined frame must have exactly as many rows as the AQI frame.
assert len(df_aqi) == len(df_joined), "Row count mismatch!"
print("Row count verified.")

## 5. Export Data

In [None]:
# Write the joined frame to disk; index=False leaves the row index out of the CSV.
df_joined.to_csv(path_or_buf=output_file, index=False)
print(f"Dataset exported to {output_file}")