In [6]:
import sys; sys.path.append("..")
import pandas as pd
from src.data_processing import preprocess_data, handle_outliers

# ---------------------------------------------------------------------------
# Housing data cleaning cell: load the raw CSV, run the project's
# preprocessing and outlier helpers, one-hot encode the categorical column,
# save the result, and print a few sanity checks.
# ---------------------------------------------------------------------------

# Input / output locations (relative to the notebook's working directory).
raw_housing_path = '../data/raw/housing.csv'
cleaned_housing_path = '../data/processed/cleaned_housing.csv'

df = pd.read_csv(raw_housing_path)

# Step 1 - preprocessing with median imputation.
# Author's justification: median was chosen for total_bedrooms because its
# distribution is skewed, and a mean would be pulled by extreme values.
df_clean = preprocess_data(df, fill_strategy='median')

# Step 2 - outlier handling on the numeric columns listed below.
# NOTE(review): the exact rule is defined inside handle_outliers (not visible
# here); the output's row index starting at 2 suggests rows are dropped.
outlier_columns = [
    'Price',
    'median_income',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
]
df_no_outliers = handle_outliers(df_clean, outlier_columns)

# Step 3 - type conversion: one-hot encode the categorical column, dropping
# the first level so the dummy columns are not perfectly collinear.
categorical_columns = ['ocean_proximity']
df_final = pd.get_dummies(
    df_no_outliers,
    columns=categorical_columns,
    drop_first=True,
)

# Step 4 - persist the processed table without the row index.
df_final.to_csv(cleaned_housing_path, index=False)

# ---- Sanity checks --------------------------------------------------------

# First five and last five rows of the processed frame.
print("----------- PROCESSED HEAD -----------\n")
head_preview = df_final.head()
print(head_preview, "\n")

print("----------- PROCESSED TAIL -----------\n")
tail_preview = df_final.tail()
print(tail_preview, "\n")

# Spot-check selected columns.
# NOTE(review): presumably these are the renamed/engineered columns produced
# by preprocess_data - confirm against src/data_processing.
print("----------- NEW COLUMNS CREATED -----------\n")
engineered_columns = ['Price', 'Rooms_Per_Household', 'Bedrooms_Per_Room']
engineered_preview = df_final[engineered_columns].head()
print(engineered_preview)

# Count of null entries per column.
print("----------- MISSING VALUES -----------\n")
missing_per_column = df_final.isnull().sum()
print(missing_per_column, "\n")


----------- PROCESSED HEAD -----------

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   
5    -122.25     37.85                52.0        919.0           213.0   
6    -122.25     37.84                52.0       2535.0           489.0   

   population  households  median_income     Price  Rooms_Per_Household  \
2       496.0       177.0         7.2574  352100.0             8.288136   
3       558.0       219.0         5.6431  341300.0             5.817352   
4       565.0       259.0         3.8462  342200.0             6.281853   
5       413.0       193.0         4.0368  269700.0             4.761658   
6      1094.0       514.0         3.6591  299200.0             4.931907   

   Bedrooms_Per_Room  ocean_proximity_INLAND  ocean_proxim