In [None]:
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.datasets import make_moons
from scipy.cluster.hierarchy import dendrogram
from sklearn.datasets import make_blobs

# new import statements
from sklearn.cluster import KMeans, AgglomerativeClustering

### `StandardScaler` with `KMeans`

Recall that `StandardScaler` should always be applied after applying `PolynomialFeatures`.

In [None]:
x = datasets.make_blobs(centers=np.array([(0, 0), (0, 20), (3, 20)]))[0]
df = pd.DataFrame(x)
df ???

In [None]:
km_c = ???
km_c.???
km_c.???

#### `fit_predict(...)` is a shortcut for `fit` and `predict` method invocations.

In [None]:
???

In [None]:
# -1 => white, 0 => gray, 1 => black
df.plot.scatter(x=0, y=1, figsize=(6, 4), ???)

**Observation**: the scale (units) of each column is intentionally not specified.

In [None]:
df

Let's make a copy of the data. Assuming initial data for both columns is in "km", let's convert one column (`0`) into "meters". 

In [None]:
df2 = ???
??? # km => m
df2.head()

In [None]:
# Cluster the rescaled copy into 2 groups and colour each point by its predicted label.
df2.plot.scatter(x=0, y=1, figsize=(6,4), c=KMeans(2).fit_predict(df2), vmin=-1, vmax=1)

**Observations**:
- One would expect to see the same clusters, but that is not happening here. Why?
    - After the conversion, differences along the x-axis (meters) are numerically far larger than differences along the y-axis (km), so they dominate the distance computation
    - That is, KMeans has no notion of units: it treats the raw numbers in both columns as directly comparable
- This is not too far off from realistic datasets. 
    - That is, real-world dataset columns might have different units. 
    - For example, one column might represent temperature data, whereas another might represent distance.

#### Conclusion: `StandardScaler` should be applied before `KMeans`

In [None]:
model = ???

df2.plot.scatter(x=0, y=1, figsize=(6, 4), c=model.fit_predict(df2), vmin=-1, vmax=1)

## K-Means use cases:

1. estimator
2. transformer:
    - sometimes we'll use an unsupervised learning technique (like k-means) to pre-process data, creating better inputs for a supervised learning technique (like logistic regression)

In [None]:
def make_data(n_samples=250, noise_seed=None):
    """Build a toy binary-classification dataset: labelled blobs plus uniform noise.

    Parameters
    ----------
    n_samples : int, default 250
        Number of blob points AND number of noise points, so the result has
        ``2 * n_samples`` rows.
    noise_seed : int or None, default None
        Seed for the uniform noise. ``None`` keeps the original behaviour of
        drawing from NumPy's global random state; pass an int to get a
        reproducible dataset.

    Returns
    -------
    pandas.DataFrame
        Columns ``x0`` and ``x1`` (coordinates) and ``y`` (bool label). Blob
        points are ``True`` unless they belong to blob 0; every noise point is
        ``False``. Rows carry a fresh, unique 0..2*n_samples-1 index.
    """
    xcols = ["x0", "x1"]

    # Five seeded blobs; blob 0 is the negative class, blobs 1-4 are positive.
    x, y = datasets.make_blobs(n_samples=n_samples, centers=5, random_state=5)
    blobs = pd.DataFrame(x, columns=xcols)
    blobs["y"] = y > 0

    # Uniform background noise over [-10, 10) x [-10, 10), all labelled negative.
    rng = np.random if noise_seed is None else np.random.default_rng(noise_seed)
    noise = pd.DataFrame(rng.uniform(-10, 10, size=(n_samples, 2)), columns=xcols)
    noise["y"] = False

    # ignore_index gives every row a unique label; without it both halves keep
    # their own 0..n_samples-1 index and label-based lookups become ambiguous.
    return pd.concat((blobs, noise), ignore_index=True)

df = make_data()
# Translate the boolean label into a matplotlib colour code: True -> "r", False -> "b".
df["color"] = ["r" if label else "b" for label in df["y"]]
train, test = train_test_split(df)

In [None]:
plt.rcParams["font.size"] = 16
fig, ax = plt.subplots(ncols=2, figsize=(10, 4))
# `c` already holds explicit colour codes ("r"/"b"), so no colormap is involved.
# Colormap limits such as vmin would be ignored (matplotlib warns about them),
# so none are passed here.
train.plot.scatter(x="x0", y="x1", c=train["color"], ax=ax[0])
# Test points are drawn in a single colour (black), i.e. without their labels.
test.plot.scatter(x="x0", y="x1", c="k", ax=ax[1])
ax[0].set_title("Training Data")
ax[1].set_title("Test Data")
plt.subplots_adjust(wspace=0.4)

#### Objective: use `LogisticRegression` to classify points as "red" or "blue".

In [None]:
# Baseline: plain logistic regression on the two raw coordinates.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LogisticRegression().fit(train[["x0", "x1"]], train["y"])
# Accuracy on the held-out split.
model.score(test[["x0", "x1"]], test["y"])

In [None]:
model = ???
model.fit(train[["x0", "x1"]], train["y"])
model.score(test[["x0", "x1"]], test["y"])

In [None]:
model = ???
model.fit(train[["x0", "x1"]], train["y"])
model.score(test[["x0", "x1"]], test["y"])

### Wisconsin counties example

In [None]:
df = gpd.read_file("counties.geojson")
df.head()

#### If we want to use "POP100", "AREALAND", "developed", "forest", "pasture", "crops" for clustering, what transformer should we use? 

- StandardScaler.

### Goal here: cluster counties based on similar land usage.

In [None]:
df.plot()

In [None]:
df.plot(column="crops")

In [None]:
df.plot(column="forest")

### KMeans

In [None]:
xcols = ["developed", "forest", "pasture", "crops"]

# instantiate
km_c = ???
# fit
km_c.fit(df[xcols])
# predict
clusters = ???

print(???)
print(clusters)

df.plot(???)

**Observation**: cluster numbering is arbitrary. That is, if you re-run the above cell, the same group of counties may be assigned a different cluster number.

### Agglomerative clustering

- import statement
```python
from sklearn.cluster import AgglomerativeClustering
```

In [None]:
xcols = ["developed", "forest", "pasture", "crops"]

# instantiate
agg_c = ???
# fit
agg_c.fit(df[xcols])
# predict
clusters = agg_c.predict(df[xcols])

print(clusters)

df.plot(column=clusters, cmap="tab10")

**Observations**: 
- no centroids => no inertia => no elbow plots (how do we pick cluster count?):
    - AttributeError: 'AgglomerativeClustering' object has no attribute 'inertia_'
- no `predict` method, but there is `fit_predict`:
    - AttributeError: 'AgglomerativeClustering' object has no attribute 'predict'
    - why?
        - because each point could lead to a completely different tree
        - remember unlike KMeans (which is top-down), AgglomerativeClustering is bottom-up

In [None]:
xcols = ["developed", "forest", "pasture", "crops"]

# instantiate
agg_c = AgglomerativeClustering(4) # use compute_distances=True to save the distances
# fit_predict
clusters = agg_c.???

print(clusters)

df.plot(column=clusters, cmap="tab10")

In [None]:
# The original df has 72 rows
len(df)

**Observations:**
- `agg_c.children_` lists all intermediate children of the dendrogram
- [44, 62] => Rows 44 and 62 from the original data were grouped together
- [19, 73] => 
    - What's 73? Original df length (72) + i, where i is the 0-based row of `children_` that created the group. So 72 refers to the first group ([44, 62]) and 73 refers to the group created by row 1
    - 19 was grouped with that previously merged group
- Last row is the root of the dendrogram

In [None]:
agg_c.children_

In [None]:
# Let's peek at the first group
df.iloc[[44,  62]]

#### Count the number of nodes in the subtree given node index
- leaf nodes have index 0-71 (from the original df)
- intermediate nodes have index >= 72 (from `agg_c.children_`)

In [None]:
def node_count(node_idx):
    """Return how many nodes (leaves and intermediate nodes) the subtree rooted at node_idx has.

    Indices below ``len(df)`` are leaves (original rows). Any other index
    refers to row ``node_idx - len(df)`` of ``agg_c.children_``, which holds
    the two children merged to create that node.
    """
    n_leaves = len(df)
    if node_idx >= n_leaves:
        children = agg_c.children_[node_idx - n_leaves]
        # 1 for this intermediate node, plus everything beneath each child.
        return 1 + sum(node_count(child) for child in children)
    # A leaf is a subtree of exactly one node.
    return 1
    
node_count(75)

In [None]:
# total number of nodes in the dendrogram
node_count(len(df) + len(agg_c.children_) - 1)

#### Linkage Matrix 
- 4 columns: 
    - left node
    - right node
    - distances
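    - node count (note: `scipy` defines this column as the number of original observations, i.e. leaf nodes, in the newly formed cluster)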

In [None]:
# The merge distance for each group, in the same order as the rows of agg_c.children_
# (requires compute_distances=True when instantiating AgglomerativeClustering)
agg_c.distances_

In [None]:
# use the above node_count function
# Column 4 of a scipy linkage matrix is the number of ORIGINAL observations (leaves)
# in each merged cluster, not the total number of tree nodes.
# node_count counts leaves AND intermediate nodes; a binary subtree with k leaves
# has 2k - 1 nodes, so k = (node_count + 1) // 2.
counts = [
    (node_count(node_idx) + 1) // 2
    for node_idx in range(len(df), len(df) + len(agg_c.children_))
]
# Reshape to a single column so it can be stacked next to children_ and distances_.
counts = np.array(counts).reshape(-1, 1)
counts[:10, :]

In [None]:
# Assemble the 4-column linkage matrix: [left child, right child, distance, count].
# column_stack turns the 1-D distances array into a single column and keeps the
# 2-D children_ and counts arrays as they are.
linkage = np.column_stack([agg_c.children_, agg_c.distances_, counts])
linkage

#### Create dendrogram
- import statement
```python
from scipy.cluster.hierarchy import dendrogram
```

In [None]:
# Draw the merge tree on a wide figure, labelling each leaf with its df["NAME"] value.
fig, ax = plt.subplots(figsize=(16, 5))
dendrogram(linkage, ax=ax, labels=df["NAME"].values)
ax.tick_params(labelsize=12);  # trailing ';' keeps the cell from echoing a return value