In [12]:
# Initial imports
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
import hvplot.pandas
import plotly.figure_factory as ff

In [2]:
# Load the preprocessed 2019 "whr" dataset (not iris) from the Resources folder
# and preview its first five rows.
file_path = "../Resources/unsupervised_learning_whr_2019.csv"
df_2019 = pd.read_csv(filepath_or_buffer=file_path)
df_2019.head(5)

Unnamed: 0,gdp,family,lifeexpectancy,freedom,generosity,trust
0,1.34,1.587,0.986,0.596,0.153,0.393
1,1.383,1.573,0.996,0.592,0.252,0.41
2,1.488,1.582,1.028,0.603,0.271,0.341
3,1.286,1.484,1.062,0.362,0.153,0.079
4,1.149,1.442,0.91,0.516,0.109,0.054


In [3]:
# Standardize every column of df_2019 with StandardScaler, then print the
# first five scaled rows as a sanity check.
scaler = StandardScaler()
scaled_2019 = scaler.fit_transform(df_2019)
print(scaled_2019[:5])

[[ 1.09498097  1.26581562  1.08766556  1.43260518 -0.33170285  2.98292872]
 [ 1.20364872  1.21847653  1.12949572  1.40411035  0.70438435  3.1623664 ]
 [ 1.46900019  1.24890881  1.26335222  1.48247112  0.90322937  2.43406051]
 [ 0.9585145   0.91753518  1.40557475 -0.2343422  -0.33170285 -0.33139083]
 [ 0.61229401  0.77551791  0.76975637  0.86270864 -0.79218605 -0.59526978]]


In [16]:
# Initialize the PCA model, configured to keep two principal components.
pca = PCA(n_components=2)

In [17]:
# Fit PCA on the standardized 2019 data and project it onto the principal
# components (two, per the n_components=2 setting of the `pca` model).
pca_2019 = pca.fit_transform(scaled_2019)

In [18]:
# Wrap the PCA output array in a DataFrame with one named column per component
# and preview the first rows.
pc_columns = ["principal component 1", "principal component 2"]
df_pca_2019 = pd.DataFrame(pca_2019, columns=pc_columns)
df_pca_2019.head()

Unnamed: 0,principal component 1,principal component 2
0,-3.138875,1.050389
1,-3.300299,1.816897
2,-3.347485,1.521363
3,-1.456917,-1.183592
4,-1.18821,-0.989879


In [19]:
# Fetch the explained variance ratio of each fitted principal component
# (the fraction of total variance each component accounts for).
pca.explained_variance_ratio_

array([0.49415675, 0.240063  ])

In [20]:
# Create the dendrogram from the principal-component columns only.
# A later cell adds a "class" label column to df_pca_2019 in place; dropping it
# here (a no-op when the column is absent) keeps this cell idempotent, so a
# re-run after labelling does not treat the cluster labels as a feature.
dendrogram_features = df_pca_2019.drop(columns=["class"], errors="ignore")
fig = ff.create_dendrogram(dendrogram_features, color_threshold=0)
fig.update_layout(width=800, height=500)
fig.show()

In [21]:
# Run the hierarchical (agglomerative) clustering algorithm with 3 clusters.
# Fit on the principal-component columns only: a later cell writes the
# resulting labels back into df_pca_2019 as "class", so dropping that column
# here (a no-op when it is absent) prevents a re-run from clustering on the
# previous run's labels.
cluster_features = df_pca_2019.drop(columns=["class"], errors="ignore")
agg = AgglomerativeClustering(n_clusters=3)
model = agg.fit(cluster_features)

In [22]:
# Add the cluster label of each row to df_pca_2019 as a new "class" column.
# NOTE: this mutates df_pca_2019 in place, so any cell that reads the whole
# frame will also see the "class" column if it is re-run after this one.
df_pca_2019["class"] = model.labels_
df_pca_2019.head()

Unnamed: 0,principal component 1,principal component 2,class
0,-3.138875,1.050389,2
1,-3.300299,1.816897,2
2,-3.347485,1.521363,2
3,-1.456917,-1.183592,1
4,-1.18821,-0.989879,1


In [23]:
# Scatter-plot the rows in principal-component space, grouped by the "class"
# cluster label; the label is also included in the hover tooltip.
cluster_plot = df_pca_2019.hvplot.scatter(
    by="class",
    hover_cols=["class"],
    x="principal component 1",
    y="principal component 2",
)
cluster_plot