In [11]:
# Import the modules
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [34]:
# Display the installed hvplot version (environment check)
hvplot.__version__

'0.8.4'

In [35]:
# Print the version of the `python` executable that the shell resolves.
# NOTE(review): this is the interpreter on the shell's PATH, which is presumably
# the same one the kernel runs on -- confirm with sys.version if it matters.
!python -V

Python 3.7.16


In [12]:
# Load the stock data CSV into a DataFrame, using the parsed "date" column as the index
spread_df = pd.read_csv(
    Path("../Resources/stock_data.csv"),
    infer_datetime_format=True,
    parse_dates=True,
    index_col="date",
)

# Preview the first rows of the DataFrame
spread_df.head()

Unnamed: 0_level_0,close,volume,open,high,low,returns,hi_low_spread
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2009-04-30,3.61,18193730,3.55,3.73,3.53,0.02849,0.2
2009-05-01,3.82,16233940,3.55,3.9,3.55,0.058172,0.35
2009-05-04,4.26,21236940,3.9,4.3,3.83,0.115183,0.47
2009-05-05,4.32,16369170,4.36,4.39,4.11,0.014085,0.28
2009-05-06,4.31,15075630,4.45,4.45,4.12,-0.002315,0.33


In [13]:
# Empty accumulator for the inertia measured at each candidate k
inertia = list()

# Candidate numbers of clusters to evaluate: 1 through 10 inclusive
k = [*range(1, 11)]

In [14]:
# Elbow-method sweep on the unscaled data: for every candidate value in k,
# fit a K-means model to spread_df and record the model's `inertia_`
for i in k:
    # fit() returns the fitted estimator, so construction and fitting can be chained
    k_model = KMeans(n_clusters=i, random_state=1).fit(spread_df)
    inertia.append(k_model.inertia_)

In [15]:
# Pair the candidate k values with their inertia, keyed by column name
elbow_data = dict(k=k, inertia=inertia)

# Build the elbow-curve DataFrame from that mapping
df_elbow = pd.DataFrame(data=elbow_data)

# Preview the first rows
df_elbow.head()

Unnamed: 0,k,inertia
0,1,2.835703e+18
1,2,9.903144e+17
2,3,4.999629e+17
3,4,3.0621e+17
4,5,2.101772e+17


In [16]:
# Draw the elbow curve: inertia as a function of k, with one x-axis tick per k
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

In [17]:
# Create the scaler (default settings) that will standardize the feature columns
scaler = StandardScaler()

In [20]:
# Fit the scaler to spread_df and transform it in one step.
# The result is an array, not a DataFrame: column labels and the date index are not carried over.
spread_scaled = scaler.fit_transform(spread_df)

In [24]:
# Inspect the scaled values as a DataFrame, reusing the original column labels
pd.DataFrame(data=spread_scaled, columns=spread_df.columns)

Unnamed: 0,close,volume,open,high,low,returns,hi_low_spread
0,-0.679974,-0.511488,-0.690524,-0.670844,-0.682939,0.731966,-0.355360
1,-0.643852,-0.569864,-0.690524,-0.642324,-0.679406,1.536581,0.043405
2,-0.568169,-0.420841,-0.630345,-0.575219,-0.629931,3.082054,0.362417
3,-0.557849,-0.565836,-0.551252,-0.560121,-0.580457,0.341459,-0.142685
4,-0.559569,-0.604367,-0.535777,-0.550055,-0.578690,-0.103095,-0.009763
...,...,...,...,...,...,...,...
2511,3.510107,0.185227,3.544378,3.482952,3.603651,-0.242357,0.973857
2512,3.594391,0.480224,3.530622,3.543346,3.628388,0.434556,1.558712
2513,3.456785,0.635761,3.628629,3.545024,3.527673,-0.802344,3.100603
2514,3.494627,0.398296,3.454968,3.383972,3.472898,0.175265,1.372622


In [25]:
# Reset the inertia accumulator before repeating the sweep on the scaled data
inertia = list()

# Candidate numbers of clusters to evaluate: 1 through 10 inclusive
k = [*range(1, 11)]

In [26]:
# Elbow-method sweep on the standardized data: for every candidate value in k,
# fit a K-means model to spread_scaled and record the model's `inertia_`
for i in k:
    # fit() returns the fitted estimator, so construction and fitting can be chained
    k_model = KMeans(n_clusters=i, random_state=1).fit(spread_scaled)
    inertia.append(k_model.inertia_)

In [27]:
# Pair the candidate k values with the inertia from the latest sweep, keyed by column name
elbow_data = dict(k=k, inertia=inertia)

# Build the elbow-curve DataFrame from that mapping
df_elbow = pd.DataFrame(data=elbow_data)

# Preview the first rows
df_elbow.head()

Unnamed: 0,k,inertia
0,1,17612.0
1,2,8696.386368
2,3,6260.146033
3,4,5332.582793
4,5,4694.732543


In [28]:
# Draw the elbow curve for the current df_elbow: inertia versus k, one x-axis tick per k
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

## Perform the following tasks for each of the two most likely values of `k`:

* Define a K-means model using `k` to define the clusters, fit the model, make predictions, and add the prediction values to a copy of the scaled DataFrame and call it `spread_df_predictions`.

* Plot the clusters. The x-axis should reflect the "hi_low_spread", and the y-axis should reflect the "close" price.

In [29]:
# Define the model with the lower value of k clusters
# Use a random_state of 1 to generate the model
model = KMeans(n_clusters=3, random_state=1)

# Fit the model on the standardized features (spread_scaled), the same data the
# second elbow curve was computed from. Fitting on the raw spread_df would let
# the large-magnitude `volume` column dominate the distance calculation.
model.fit(spread_scaled)

# Make predictions: one cluster label per row of spread_scaled
k_lower = model.predict(spread_scaled)

# Create a copy of the DataFrame and name it as spread_df_predictions.
# The copy keeps the original units so the scatter-plot axes stay readable;
# spread_scaled was produced from spread_df row-for-row, so labels align by position.
spread_df_predictions = spread_df.copy()

# Add a class column with the labels to the spread_df_predictions DataFrame
spread_df_predictions['clusters_lower'] = k_lower

In [30]:
# Scatter the high-low spread (x) against the closing price (y),
# grouping points by the lower-k cluster label; y ticks are shown without decimals
spread_df_predictions.hvplot.scatter(
    x="hi_low_spread", y="close", by="clusters_lower"
).opts(yformatter="%.0f")

In [31]:
# Define the model with the higher value of k clusters
# Use a random_state of 1 to generate the model
model = KMeans(n_clusters=4, random_state=1)

# Fit the model on the standardized features (spread_scaled), the same data the
# second elbow curve was computed from. Fitting on the raw spread_df would let
# the large-magnitude `volume` column dominate the distance calculation.
model.fit(spread_scaled)

# Make predictions: one cluster label per row of spread_scaled
k_higher = model.predict(spread_scaled)

# Add a class column with the labels to the spread_df_predictions DataFrame.
# spread_scaled was produced from spread_df row-for-row, so labels align by position.
spread_df_predictions['clusters_higher'] = k_higher

In [32]:
# Scatter the high-low spread (x) against the closing price (y),
# grouping points by the higher-k cluster label; y ticks are shown without decimals
spread_df_predictions.hvplot.scatter(
    x="hi_low_spread", y="close", by="clusters_higher"
).opts(yformatter="%.0f")

## Answer the following question

* Considering the plot, what’s the best number of clusters to choose, or value of k? 

From the scatter plots, it is a little hard to tell given the variability and quantity of the data (zooming in helps), but the optimal value for k, the number of clusters, appears to be 3.