In [25]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = 'iframe'

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn import preprocessing

from sklearn.cluster import KMeans

import plotly.express as px

from sklearn.metrics import silhouette_score

In [3]:
home_data = pd.read_csv('../data/california_housing_prices/housing.csv', usecols = ['longitude', 'latitude', 'median_house_value'])
home_data.head()

Unnamed: 0,longitude,latitude,median_house_value
0,-122.23,37.88,452600.0
1,-122.22,37.86,358500.0
2,-122.24,37.85,352100.0
3,-122.25,37.85,341300.0
4,-122.25,37.85,342200.0


In [4]:
home_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   median_house_value  20640 non-null  float64
dtypes: float64(3)
memory usage: 483.8 KB


### Visualize the Data

In [5]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=home_data['longitude'], y=home_data['latitude'],mode='markers', 
                         marker=dict(color=home_data['median_house_value'],showscale=True)))
fig.update_layout(title=f"house value", yaxis_title='latitude', xaxis_title='longitude')
fig.show()

### Normalizing the Data

In [27]:
X_train, X_test, y_train, y_test = train_test_split(home_data[['latitude', 'longitude','median_house_value']], home_data[['median_house_value']], test_size=0.33, random_state=0)
X_train_norm = preprocessing.normalize(X_train)
X_test_norm = preprocessing.normalize(X_test)

### Fitting and Evaluating the Model

In [30]:
kmeans = KMeans(n_clusters = 3, random_state = 42)
kmeans.fit(X_train_norm)

KMeans(n_clusters=3, random_state=42)

In [31]:
kmeans.labels_

array([2, 0, 2, ..., 0, 0, 1])

In [32]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=X_train['longitude'], y=X_train['latitude'],mode='markers', 
                         marker=dict(color=kmeans.labels_,showscale=True)))
fig.update_layout(title=f"house value", yaxis_title='latitude', xaxis_title='longitude')
fig.show()

In [33]:
box_df= y_train
box_df['Cluster']=kmeans.labels_

In [34]:
fig = px.box(box_df, x="Cluster", y="median_house_value")
fig.show()

In [35]:
kmeans.inertia_

0.0007795012160685037

In [36]:
kmeans.score(X_train_norm)

-0.0007795012160685038

In [37]:
kmeans.score(X_test_norm)

-0.000364182266925372

In [38]:
#"Opposite of the value of X on the K-means objective." It means negative of the K-means objective.

### Choosing the best number of clusters

In [39]:
K = range(2, 8)
fits = []
score = []


for k in K:
    # train the model for current value of k on training data
    model = KMeans(n_clusters = k, random_state = 0).fit(X_train_norm)
    
    # append the model to fits
    fits.append(model)
    
    # Append the silhouette score to scores
    score.append(model.inertia_)

In [40]:
score

[0.0013679826700646889,
 0.0007795048576701006,
 0.000529398708359306,
 0.0003631728120112357,
 0.00025637670865566155,
 0.00018848748106956386]

In [41]:
grid_param = { 
    'n_clusters': range(2, 8),
}
model = KMeans(random_state = 0)
grid_mse = GridSearchCV(estimator=model, param_grid=grid_param, cv=5, verbose=0)
grid_mse.fit(X_train_norm)
print("Best parameters found: ", grid_mse.best_params_)
print("best score: ", grid_mse.best_score_)

Best parameters found:  {'n_clusters': 7}
best score:  -3.852138795434785e-05


In [46]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=X_train['longitude'], y=X_train['latitude'],mode='markers', 
                         marker=dict(color=fits[5].labels_,showscale=True)))
fig.update_layout(title=f"house value", yaxis_title='latitude', xaxis_title='longitude')
fig.show()