In [37]:
# Import the modules
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans


In [24]:
# Read in the CSV file as a Pandas DataFrame.
# The "date" column is used as the index and parsed into datetimes.
# `infer_datetime_format` is intentionally not passed: it is deprecated
# since pandas 2.0, where the date format is inferred automatically.
home_sales_df = pd.read_csv(
    Path("../Resources/national-home-sales.csv"),
    index_col="date",
    parse_dates=True,
)

# Review the DataFrame
home_sales_df.head()

Unnamed: 0_level_0,inventory,homes_sold,median_sale_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-01,1250798,377964,289000
2020-02-01,1265253,405992,294000
2020-03-01,1316823,507324,303000
2020-04-01,1297460,436855,304000
2020-05-01,1289500,421351,299000


In [25]:
# Create a list to store the inertia values
inertia = list()

# Create a list holding the candidate values of k (1 through 10)
k = [*range(1, 11)]

In [26]:
# Evaluate each candidate value of k with the K-means algorithm.
# The list is reset first so that re-running this cell does not append a
# second set of values, which would leave `inertia` longer than `k`.
inertia = []
for i in k:
    # Fit a model with i clusters to the home_sales_df DataFrame
    k_model = KMeans(n_clusters=i, random_state=0)
    k_model.fit(home_sales_df)
    # Record the computed inertia from the model's `inertia_` attribute
    inertia.append(k_model.inertia_)
    



In [27]:
# Pair the k values with their inertia values in a dictionary
elbow_data = dict(k=k, inertia=inertia)

# Build the elbow DataFrame from that dictionary
elbow_df = pd.DataFrame(data=elbow_data)

# Preview the first rows of the DataFrame
elbow_df.head()



Unnamed: 0,k,inertia
0,1,8048111000000.0
1,2,3451654000000.0
2,3,1894158000000.0
3,4,1357434000000.0
4,5,1114805000000.0


In [40]:
# Plot inertia against k as a line chart, using the values of k as x-axis ticks
elbow_df.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=k,
)



## Perform the following tasks for each of the two most likely values of `k`:

* Define a K-means model using `k` to define the clusters, fit the model, make predictions, and add the prediction values to a copy of the DataFrame called `home_sales_predictions_df`.

* Plot the clusters. The x-axis should reflect home "inventory", and the y-axis should reflect either the "median_sale_price" or "homes_sold" variable.

In [32]:
# Define the model with the lower value of k clusters (k=3)
# Use a random_state of 1 to generate the model
model = KMeans(n_clusters=3, random_state=1)

# Fit the model
model.fit(home_sales_df)

# Make predictions: the cluster label assigned to each row
predictions = model.predict(home_sales_df)

# Create a copy of the DataFrame and name it home_sales_predictions_df
home_sales_predictions_df = home_sales_df.copy()

# Add a column with the predicted cluster labels to home_sales_predictions_df
home_sales_predictions_df["clusters_lower"] = predictions


In [33]:

# Review the first rows of the DataFrame, including the `clusters_lower` column
home_sales_predictions_df.head()

Unnamed: 0_level_0,inventory,homes_sold,median_sale_price,clusters_lower
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-01,1250798,377964,289000,1
2020-02-01,1265253,405992,294000,1
2020-03-01,1316823,507324,303000,1
2020-04-01,1297460,436855,304000,1
2020-05-01,1289500,421351,299000,1


In [34]:
# Scatter inventory against homes sold, grouped by the `clusters_lower` labels
clusters_lower_scatter = home_sales_predictions_df.hvplot.scatter(
    x="inventory", y="homes_sold", by="clusters_lower"
)

# Format the y-axis tick labels with the "%.0f" pattern (no decimal places)
clusters_lower_scatter.opts(yformatter="%.0f")

In [35]:
# Define the model with the higher value of k clusters (k=4)
# Use a random_state of 1 to generate the model
model = KMeans(n_clusters=4, random_state=1)

# Fit the model
model.fit(home_sales_df)

# Make predictions: the cluster label assigned to each row
# NOTE: the variable name is left unchanged so that any other cell reading
# it keeps working.
prediciton_k4 = model.predict(home_sales_df)

# Add a column with the predicted cluster labels to home_sales_predictions_df
home_sales_predictions_df["clusters_higher"] = prediciton_k4


In [36]:
# Scatter inventory against homes sold, grouped by the `clusters_higher` labels
clusters_higher_scatter = home_sales_predictions_df.hvplot.scatter(
    x="inventory", y="homes_sold", by="clusters_higher"
)

# Format the y-axis tick labels with the "%.0f" pattern (no decimal places)
clusters_higher_scatter.opts(yformatter="%.0f")


## Answer the following question

* Considering the plot, what’s the best number of clusters to choose, or value of k? 

From the scatter plots, it appears that the optimal value for k, the number of clusters, is probably 3. It appears to better group the monthly housing trends among different levels of inventory. However, 4 clusters is probably not wrong either, as for a certain range of inventory it appears to identify two unique clusters according to the number of homes sold. Overall, though, the best differentiation across multiple variables would likely be 3 clusters.