In [None]:
!pip3 install tensorflow==2.14.0
!pip3 install scikeras==0.12.0
!pip3 install pandas
!pip3 install matplotlib
!pip3 install seaborn
!pip3 install scikit-learn
!pip3 install cartopy

In [None]:
import math

import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the housing CSV (path is relative to the notebook's working directory).
df = pd.read_csv("../data/housing.csv")
# Preview the first 10 rows.
df.head(10)

In [None]:
# Number of rows in the dataset.
len(df)

In [None]:
# Rename the label column to "target". Rebinding `df` (instead of inplace=True)
# keeps the step an explicit assignment, and `columns=` states the axis directly.
df = df.rename(columns={"median_house_value": "target"})
df.head(1)

In [None]:
# Show the dtype of every column.
df.dtypes

In [None]:
# Per column: does it contain at least one missing value?
df.isna().any()

In [None]:
# How many rows have no value for total_bedrooms?
df["total_bedrooms"].isna().sum()

In [None]:
# Fill missing bedroom counts with the column mean, then re-check every column for nulls.
df["total_bedrooms"] = df["total_bedrooms"].fillna(value=df["total_bedrooms"].mean())
df.isna().any()

In [None]:
# Column groups referenced by the plotting and correlation cells.
numerical_features = [
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
]
categorical_features = ["ocean_proximity"]

In [None]:
def plot_scatter_matrix(df):
    """Show a compact scatter matrix of the columns of ``df``.

    The diagonal panels are KDE plots. On every panel the x label is rotated
    to 90 degrees, the y label to 0 degrees, both labels use font size 8, and
    all tick marks are removed. The figure is displayed with ``plt.show()``;
    nothing is returned.
    """
    grid = pd.plotting.scatter_matrix(df, alpha=0.2, diagonal="kde", figsize=(6, 6))

    for panel in grid.ravel():
        x_label = panel.xaxis.label
        y_label = panel.yaxis.label
        x_label.set_rotation(90)
        y_label.set_rotation(0)
        y_label.set_size(8)
        x_label.set_size(8)
        # Anchor the y label at x=-1, y=0.5, i.e. to the left of the panel.
        panel.get_yaxis().set_label_coords(-1, 0.5)
        panel.set_xticks(())
        panel.set_yticks(())

    plt.show()

# Scatter matrix restricted to the numerical feature columns.
plot_scatter_matrix(df[numerical_features])

In [None]:
def get_redundant_pairs(df):
    """Return the set of column pairs on or below the diagonal.

    For columns ``c0, c1, ...`` the result contains every ``(ci, cj)`` with
    ``j <= i``: each column paired with itself and with every column that
    precedes it.
    """
    names = df.columns
    return {
        (later, earlier)
        for position, later in enumerate(names)
        for earlier in names[: position + 1]
    }

def get_top_abs_correlations(df, n):
    """Return the ``n`` largest absolute pairwise correlations in ``df``.

    The correlation matrix is flattened to a Series indexed by column pairs.
    Self-pairs and one of each mirrored pair (as listed by
    ``get_redundant_pairs``) are dropped before ranking in descending order.
    """
    flat_corr = df.corr().abs().unstack()
    unique_pairs = flat_corr.drop(labels=get_redundant_pairs(df))
    ranked = unique_pairs.sort_values(ascending=False)
    return ranked.iloc[:n]
    
# Up to 20 strongest absolute correlations between pairs of numerical features.
get_top_abs_correlations(df[numerical_features], 20)

In [None]:
# Heatmap of the correlation matrix of the numerical features.
sns.heatmap(df[numerical_features].corr())

In [None]:
# Map of every row's coordinates, coloured by the target (median house value).
ax = plt.figure(figsize=(12, 8)).add_subplot(projection=ccrs.PlateCarree())
ax.set_extent([-125, -112, 30, 45], crs=ccrs.PlateCarree())  # Adjust the extent as needed

# Background layers: coastline, dotted borders, land with a black edge.
ax.add_feature(cfeature.COASTLINE)
ax.add_feature(cfeature.BORDERS, linestyle=':')
ax.add_feature(cfeature.LAND, edgecolor='black')

sc = ax.scatter(
    df['longitude'],
    df['latitude'],
    c=df['target'],
    cmap='viridis',
    s=5,
    alpha=0.7,
    transform=ccrs.PlateCarree(),
)
plt.colorbar(sc, label='Median house value')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('Median house value according to location')

plt.show()

In [None]:
# How well do the raw coordinates alone predict the ocean_proximity category?
# Reports the mean score of a decision tree over 10 cross-validation folds.
# (DecisionTreeClassifier and cross_val_score come from the notebook's import cell.)
clf = DecisionTreeClassifier(random_state=0)
cross_val_score(clf, df[["latitude", "longitude"]], df[["ocean_proximity"]], cv=10).mean()

In [None]:
# List the distinct ocean_proximity categories.
df["ocean_proximity"].unique()

In [None]:
# Encode ocean_proximity with scikit-learn's OrdinalEncoder (default category order).
enc = OrdinalEncoder()
# fit_transform returns a 2-D (n_samples, 1) array; flatten it so a plain 1-D
# column is assigned.
df["ocean_proximity_enc"] = enc.fit_transform(df[["ocean_proximity"]]).ravel()
df.head(5)

In [None]:
# Linear fit of the OrdinalEncoder codes on latitude/longitude: mean of the
# "neg_root_mean_squared_error" score over 10 cross-validation folds.
reg = LinearRegression()

cross_val_score(
    estimator=reg,
    X=df[["latitude", "longitude"]],
    y=df[["ocean_proximity_enc"]],
    scoring="neg_root_mean_squared_error",
    cv=10,
).mean()

In [None]:
# Hand-assigned integer codes for the ocean_proximity categories.
custom_encoding = {"ISLAND": 0, "NEAR OCEAN": 1, "NEAR BAY": 2, "<1H OCEAN": 3, "INLAND": 4}
# Series.map touches only this column and produces a numeric column directly.
# A category missing from the mapping becomes NaN instead of silently staying a string.
df["ocean_proximity_enc_2"] = df["ocean_proximity"].map(custom_encoding)
df.head(5)

In [None]:
# Same linear fit as for the OrdinalEncoder codes, now on the hand-assigned codes:
# mean "neg_root_mean_squared_error" over 10 cross-validation folds.
reg = LinearRegression()

cross_val_score(
    estimator=reg,
    X=df[["latitude", "longitude"]],
    y=df[["ocean_proximity_enc_2"]],
    scoring="neg_root_mean_squared_error",
    cv=10,
).mean()

In [None]:
# Absolute correlation of each numerical feature and both proximity encodings
# with the target, strongest first.
corr_with_target = (
    df[[*numerical_features, "ocean_proximity_enc", "ocean_proximity_enc_2"]]
    .corrwith(df["target"])
    .abs()
    .sort_values(ascending=False)
)
corr_with_target

In [None]:
# Left: target against median income. Right: mean target per ocean_proximity category.
fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(10,6), sharey=True)

means = df.groupby("ocean_proximity")["target"].mean()

ax0.scatter(df["median_income"], df["target"])
ax0.set_xlabel("Median income (tens of thousands of US Dollars)")
ax0.set_ylabel("Median house value")

ax1.bar(means.index, means)
# Rotate the category labels on the bar chart's own axes, after the bars exist,
# rather than via pyplot's implicit "current axes".
ax1.tick_params(axis="x", labelrotation=45)
ax1.set_xlabel("Ocean proximity")
ax1.set_ylabel("Median house value")
plt.show()

In [None]:
# One box plot per numerical feature, laid out on a 3x3 grid.
fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(10, 10))

for ax, feature in zip(axs.ravel(), numerical_features):
    ax.set_title(feature)
    ax.boxplot(df[feature])

# Hide the last panel of the grid.
axs.flat[-1].axis("off")
plt.show()

In [None]:
# Derived feature: total rooms divided by total bedrooms, shown as a box plot.
df["rooms_per_bedroom"] = df["total_rooms"].div(df["total_bedrooms"])
plt.boxplot(df["rooms_per_bedroom"])
plt.title("rooms_per_bedroom")
plt.show()

In [None]:
# Derived feature: total rooms divided by households, shown as a box plot.
df["rooms_per_household"] = df["total_rooms"].div(df["households"])
plt.boxplot(df["rooms_per_household"])
plt.title("rooms_per_household")
plt.show()

In [None]:
# Inspect the rows whose rooms_per_household exceeds 40.
df.loc[df["rooms_per_household"] > 40]

In [None]:
# Derived feature: longitude and latitude summed into a single value.
df["encoded_position"] = df["longitude"].add(df["latitude"])
df["encoded_position"]

In [None]:
# Derived feature: population divided by total bedrooms.
df["population_per_bedrooms"] = df["population"].div(df["total_bedrooms"])

In [None]:
# Absolute correlation with the target of every column except the raw
# category and the target itself, strongest first.
corr_with_target = (
    df.drop(columns=["ocean_proximity", "target"])
    .corrwith(df["target"])
    .abs()
    .sort_values(ascending=False)
)
corr_with_target

In [None]:
def remove_outliers(df, quantile=0.05):
    """Drop every row that has an out-of-fence value in at least one column.

    Per column, the fences are ``low - 1.5 * spread`` and
    ``high + 1.5 * spread``, where ``low`` and ``high`` are the ``quantile``
    and ``1 - quantile`` quantiles and ``spread`` is their difference. With
    the default ``quantile=0.05`` these are the 5th and 95th percentiles,
    not the quartiles.

    Returns the rows of ``df`` in which no value lies strictly outside its
    column's fences.
    """
    low = df.quantile(quantile)
    high = df.quantile(1 - quantile)
    spread = high - low

    below_fence = df.lt(low - 1.5 * spread)
    above_fence = df.gt(high + 1.5 * spread)
    row_has_outlier = (below_fence | above_fence).any(axis=1)
    return df[~row_has_outlier]

# Outlier filtering works on numeric columns, so drop the raw category column first.
df_no_outliers = remove_outliers(df.drop(columns="ocean_proximity"))
df_no_outliers

In [None]:
# Absolute correlation with the target after outlier removal, strongest first.
corr_with_target_no_outliers = (
    df_no_outliers.drop(columns=["target"])
    .corrwith(df_no_outliers["target"])
    .abs()
    .sort_values(ascending=False)
)

# Absolute and relative change in correlation caused by removing outliers.
diff = corr_with_target_no_outliers - corr_with_target
rel_diff = (diff / corr_with_target_no_outliers).sort_values(ascending=False)

# Side-by-side table, ordered by the relative change.
pd.DataFrame(
    {
        "Correlation with target": corr_with_target,
        "Correlation with target without outliers": corr_with_target_no_outliers,
        "Difference": diff,
        "Relative Difference": rel_diff,
    }
).sort_values("Relative Difference", ascending=False)

In [None]:
def _drop_quantile_outliers(df: pd.DataFrame, quantile: float) -> pd.DataFrame:
    """Drop rows with any value outside its column's quantile-based fences."""
    low = df.quantile(quantile)
    high = df.quantile(1 - quantile)
    spread = high - low
    outside = (df < (low - 1.5 * spread)) | (df > (high + 1.5 * spread))
    return df[~outside.any(axis=1)]


def apply_feature_engineering(df: pd.DataFrame, remove_outliers, outlier_quantile: float = 0.05) -> pd.DataFrame:
    """Apply the notebook's feature-engineering steps and return a new frame.

    Steps, in order:
      1. fill missing ``total_bedrooms`` with the column mean;
      2. replace ``ocean_proximity`` by the integer column ``ocean_proximity_enc``;
      3. add ``rooms_per_bedroom``, ``rooms_per_household``,
         ``encoded_position`` and ``population_per_bedrooms``;
      4. optionally drop rows containing outliers.

    Args:
        df: Frame with at least the columns ``total_bedrooms``,
            ``ocean_proximity``, ``total_rooms``, ``households``,
            ``longitude``, ``latitude`` and ``population``. It is not modified.
        remove_outliers: Truthy to drop outlier rows after the features are built.
        outlier_quantile: Quantile used for the outlier fences (default 0.05).

    Returns:
        A new DataFrame with the engineered columns and without
        ``ocean_proximity``.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()
    df["total_bedrooms"] = df["total_bedrooms"].fillna(df["total_bedrooms"].mean())

    custom_encoding = {"ISLAND": 0, "NEAR OCEAN": 1, "NEAR BAY": 2, "<1H OCEAN": 3, "INLAND": 4}
    # Categories missing from the mapping become NaN.
    df["ocean_proximity_enc"] = df["ocean_proximity"].map(custom_encoding)
    # `columns=` is required: a bare drop("ocean_proximity") looks for a row label.
    df = df.drop(columns="ocean_proximity")

    df["rooms_per_bedroom"] = df["total_rooms"] / df["total_bedrooms"]
    df["rooms_per_household"] = df["total_rooms"] / df["households"]
    df["encoded_position"] = df["longitude"] + df["latitude"]
    df["population_per_bedrooms"] = df["population"] / df["total_bedrooms"]

    if remove_outliers:
        # The `remove_outliers` parameter shadows the notebook-level function of
        # the same name inside this body, so the filter lives in a private helper.
        df = _drop_quantile_outliers(df, outlier_quantile)
    return df