In [2]:
import rootutils

# Must run before the project-local imports below (`notebooks.*`, `src.*`).
# NOTE(review): presumably this locates the repository root via the
# `.project-root` marker file and, because of `pythonpath=True`, puts it on
# `sys.path` so those imports resolve — confirm against the rootutils docs.
rootutils.setup_root("../", indicator=".project-root", pythonpath=True)

import pandas as pd
from scipy.stats import pearsonr
from notebooks.utils.feature_selection import df_numerical_corr, df_mutual_info
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
from src.data_preproc.encoding import one_hot_encode
from src.data_preproc.outlier_removal import iqr
from src.data_preproc.imputation import stat_imputation
from src.data_preproc.preproc_pipeline import DataPreproc
from functools import partial


# Raw housing dataset, addressed relative to the notebook's directory.
DATA_RAW_FN = "../data/housing_raw.csv"

# Each preprocessing step is a project transform with its column selection
# bound up front, so the pipeline only has to hand it a DataFrame.
encode_step = partial(one_hot_encode, columns=["ocean_proximity"])
# NOTE(review): the IQR filter also targets the one-hot column
# `ocean_proximity_ISLAND`; presumably this is meant to drop the island
# rows — confirm that this is intended.
outlier_step = partial(
    iqr,
    columns=["total_rooms", "population", "households", "ocean_proximity_ISLAND"],
)
impute_step = partial(stat_imputation, columns=["total_bedrooms"], mode="median")

# The outlier step references a column created by the encoding step, so the
# order of this list matters (assuming DataPreproc applies it in order).
transformations = [encode_step, outlier_step, impute_step]

pipeline = DataPreproc(transforms=transformations, store_intermediate=True)

# `df` is the preprocessed frame that every later cell works with.
df = pipeline(pd.read_csv(DATA_RAW_FN))

In [3]:
# Preview the first five rows of the preprocessed frame.
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0.0,0.0,0.0,1.0,0.0
1,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0.0,0.0,0.0,1.0,0.0
2,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0.0,0.0,0.0,1.0,0.0
3,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0.0,0.0,0.0,1.0,0.0
4,-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,0.0,0.0,0.0,1.0,0.0


# Let's compute the correlation and mutual information between the input variables and the target variable

In [23]:
# Column we want to explain.
target_variable = "median_house_value"

# Numeric columns present in the raw data.
numeric_inputs = [
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
]
# One-hot indicator columns derived from `ocean_proximity`.
# `ocean_proximity_ISLAND` is not part of the feature list.
proximity_inputs = [
    "ocean_proximity_<1H OCEAN",
    "ocean_proximity_INLAND",
    "ocean_proximity_NEAR BAY",
    "ocean_proximity_NEAR OCEAN",
]
input_variables = numeric_inputs + proximity_inputs

# Both scoring helpers are called with the same feature/target selection.
selection = dict(input_variables=input_variables, target_variable=target_variable)

# Correlation table; rows containing any missing value are discarded.
results_corr = df_numerical_corr(df, **selection).dropna(axis=0, how="any")

# Mutual-information table for the same features.
results_mi = df_mutual_info(df, **selection)

In [20]:
# Display the mutual-information table returned by `df_mutual_info`.
# NOTE(review): the saved output of this cell lists 10 features while
# `input_variables` contains 12, and the execution counts are out of order —
# re-run the notebook top to bottom to refresh it.
results_mi

Unnamed: 0,Mutual Info
longitude,0.413266
latitude,0.393675
housing_median_age,0.034276
total_rooms,0.040505
total_bedrooms,0.01455
population,0.019736
households,0.024724
median_income,0.386754
ocean_proximity_<1H OCEAN,0.102575
ocean_proximity_INLAND,0.203377


In [None]:
# Radar chart of the absolute correlation of every input feature with the
# target. The sign is dropped so that only the strength is compared.
fig = go.Figure()
fig.add_trace(
    go.Scatterpolar(theta=results_corr.index, r=results_corr["r"].abs(), fill="toself")
)
# Title the chart so it can be told apart from the mutual-information one.
fig.update_layout(title_text=f"Absolute correlation |r| with {target_variable}")

fig.show()

# Radar chart of the mutual-information score of every input feature.
fig = go.Figure()
fig.add_trace(
    go.Scatterpolar(
        theta=results_mi.index, r=results_mi["Mutual Info"].abs(), fill="toself"
    )
)
fig.update_layout(title_text=f"Mutual information with {target_variable}")

# Show this figure explicitly, like the first one, instead of relying on the
# cell echoing the value returned by `add_trace`.
fig.show()

# Feature Preprocessing, Outlier Detection

1) Convert data to float -> One Hot Encoding