In [33]:
# Core libraries
import numpy as np
import matplotlib.pyplot as plt

# Third-party libraries
import pandas as pd
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.cluster import KMeans

In [None]:
# Load the California housing data and assemble one frame with the target
# ('MedHouseVal') as the first column, followed by the feature columns.
california = fetch_california_housing()
target_frame = pd.DataFrame(data=california.target, columns=['MedHouseVal'])
feature_frame = pd.DataFrame(data=california.data, columns=california.feature_names)
df = pd.concat([target_frame, feature_frame], axis=1)
df

In [None]:
# One histogram per column of df; the extra spacing between subplots keeps
# titles and tick labels from overlapping.
hist_options = {"figsize": (12, 10), "bins": 30, "edgecolor": "black"}
df.hist(**hist_options)
plt.subplots_adjust(wspace=0.4, hspace=0.7)

In [None]:
# NOTE(review): columns_drop is defined but never applied in this cell, so
# Latitude and Longitude still appear in the pairplot — confirm intent.
columns_drop = ["Longitude", "Latitude"]

pairplot_columns = [
    'MedHouseVal', 'MedInc', 'HouseAge', 'AveRooms', 'AveBedrms',
    'Population', 'AveOccup', 'Latitude', 'Longitude',
]
subset = df.loc[:, pairplot_columns]

# Bin the target into 6 quantile intervals, then label every row with the
# midpoint of its interval so the hue has only a handful of distinct values.
house_value_bins = pd.qcut(subset["MedHouseVal"], 6, retbins=False)
subset["MedHouseVal"] = house_value_bins.apply(lambda interval: interval.mid)

_ = sns.pairplot(data=subset, hue="MedHouseVal", palette="viridis")

In [None]:
# Ordinary least squares of the median house value on all eight features.
ols_features = [
    'MedInc', 'HouseAge', 'AveRooms', 'AveBedrms',
    'Population', 'AveOccup', 'Latitude', 'Longitude',
]
y = df['MedHouseVal']

# statsmodels does not add an intercept on its own, hence the explicit
# constant column.
X = sm.add_constant(df[ols_features])

model = sm.OLS(y, X).fit()
print(model.summary())

In [None]:
# Plot every row of df as a point on a 'carto-positron' basemap, coloured by
# its median house value.
#
# The frame only carries Latitude/Longitude — there is no county key to pass
# as `locations` — so a choropleth layer built on the county GeoJSON has
# nothing to match and draws no regions. The scatter layer alone carries the
# whole visualisation, so the figure is built from it directly.
fig = px.scatter_mapbox(
    df,
    lat='Latitude',
    lon='Longitude',
    color='MedHouseVal',
    color_continuous_scale='viridis',
    # Pin the colour axis to the full observed range of the target.
    range_color=(df['MedHouseVal'].min(), df['MedHouseVal'].max()),
    mapbox_style='carto-positron',
    zoom=5,
    center={'lat': 36.7783, 'lon': -119.4179},
    opacity=0.7,
    hover_name='MedHouseVal',
    title='Scatter Map of Median House Values in California'
)

fig.update_layout(
    margin=dict(l=0, r=0, t=30, b=0),
    height=600,
    width=800
)

fig.show()

In [None]:
# Optional (disabled): write the current `fig` to 'plotly_plot.html' without opening it.
# from plotly.offline import plot
# plot(fig, filename='plotly_plot.html', auto_open=False)

Visualize a clustering example. Ideally, the clusters should line up with the target variable Y (MedHouseVal), even though Y itself must of course not be included among the clustering features.

In [None]:
N_CLUSTERS = 15
CLUSTER_FEATURES = [
    'MedInc', 'HouseAge', 'AveRooms', 'AveBedrms',
    'AveOccup', 'Latitude', 'Longitude', 'Population',
]

# random_state pins the centroid initialisation so the cluster labels (and the
# map below) are reproducible across kernel restarts.
# NOTE(review): the features are clustered on their raw scales; large-valued
# columns such as Population may dominate the distances — confirm whether
# standardising first is wanted.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(df[CLUSTER_FEATURES])

# The frame has no county key to pass as `locations`, so a choropleth layer on
# the county GeoJSON would draw no regions; the points are plotted directly.
fig = px.scatter_mapbox(
    df,
    lat='Latitude',
    lon='Longitude',
    color='Cluster',
    color_continuous_scale='viridis',
    range_color=(df['Cluster'].min(), df['Cluster'].max()),
    mapbox_style='carto-positron',
    zoom=5,
    center={'lat': 36.7783, 'lon': -119.4179},
    opacity=0.7,
    hover_name='Cluster',
    title=f'Scatter Map of {N_CLUSTERS} KMeans Clusters'
)

fig.update_layout(
    margin=dict(l=0, r=0, t=30, b=0),
    height=600,
    width=800
)

fig.show()


Regression

In [34]:
def _fit_and_score(model, X_train, y_train, X_test, y_test):
    """Fit `model` on the training split and evaluate it on the test split.

    Returns a tuple ``(predictions, r2, rmse, mae)`` computed on the test split.
    """
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    # RMSE is taken as sqrt(MSE) rather than via `squared=False`, which newer
    # scikit-learn releases deprecate and then remove.
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    return pred, r2_score(y_test, pred), rmse, mean_absolute_error(y_test, pred)


# Rebuild the frame from the raw dataset so this section does not depend on
# columns added to df by earlier cells.
california = fetch_california_housing()
df = pd.concat([pd.DataFrame(data=california.target, columns=['MedHouseVal']), pd.DataFrame(data=california.data, columns = california.feature_names)], axis=1)

X = df.drop(['MedHouseVal'], axis=1)
y = df['MedHouseVal']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

n = 10 #number of clusters
# n_init is passed explicitly: leaving it unset raises a FutureWarning on
# scikit-learn versions where the default is changing, and 10 matches the
# value used when the recorded results were produced.
kmeans = KMeans(n_clusters=n, n_init=10, random_state=42)

# The clustering is fitted on the training split only; the test split is
# merely assigned to the learned centroids. `.assign` returns a new frame, so
# the frames handed back by train_test_split are not mutated in place.
X_train = X_train.assign(cluster=kmeans.fit_predict(X_train))
X_train = pd.get_dummies(X_train, columns=['cluster'], drop_first=True)

X_test = X_test.assign(cluster=kmeans.predict(X_test))
X_test = pd.get_dummies(X_test, columns=['cluster'], drop_first=False)
# Align the test columns with the training design matrix: drop dummies the
# training frame does not have (including the dropped reference level), add
# any missing ones as all-False, and use the same column order.
X_test = X_test.reindex(columns=X_train.columns, fill_value=False)

ols_model = LinearRegression()
ols_pred, ols_r2, ols_rmse, ols_mae = _fit_and_score(ols_model, X_train, y_train, X_test, y_test)

lasso_model = Lasso(alpha=0.1)
lasso_pred, lasso_r2, lasso_rmse, lasso_mae = _fit_and_score(lasso_model, X_train, y_train, X_test, y_test)

ridge_model = Ridge(alpha=0.1)
ridge_pred, ridge_r2, ridge_rmse, ridge_mae = _fit_and_score(ridge_model, X_train, y_train, X_test, y_test)

results_df = pd.DataFrame({
    'Model': ['OLS', 'Lasso', 'Ridge'],
    'R-squared': [ols_r2, lasso_r2, ridge_r2],
    'RMSE': [ols_rmse, lasso_rmse, ridge_rmse],
    'MAE': [ols_mae, lasso_mae, ridge_mae]
})

print(results_df)

  super()._check_params_vs_input(X, default_n_init=10)


   Model  R-squared      RMSE       MAE
0    OLS   0.576707  0.744773  0.532895
1  Lasso   0.531817  0.783270  0.581607
2  Ridge   0.576724  0.744758  0.532893
