<img src="https://i.imgur.com/6FvkWEr.jpeg" width="600">

In [2]:
import pandas as pd

# Load the Pokémon data set from the JSON file next to the notebook.
df = pd.read_json('pokemon.json')

# Features used for clustering, selected by name rather than by position so the
# selection cannot silently drift if the file's column order or count changes
# (a missing column raises KeyError instead of yielding the wrong features).
FEATURE_COLUMNS = ['HP', 'Attack', 'Defense', 'SpecialAtk', 'SpecialDef']
X_train = df[FEATURE_COLUMNS]

# Show the first 5 rows.
X_train.head()

Unnamed: 0,HP,Attack,Defense,SpecialAtk,SpecialDef
0,39,52,43,60,50
1,58,64,58,80,65
2,35,55,40,50,50
3,60,90,55,90,80
4,38,41,40,50,65


In [3]:
# Feature standardization
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features. fit() hands back the fitted
# estimator, so both names below refer to the same object; `scalar` is kept
# bound because it is the name this cell has always exposed.
scaler = StandardScaler().fit(X_train)
scalar = scaler

# Apply the fitted scaling to the same training features.
X_train_std = scaler.transform(X_train)

# Show the first two standardized rows.
X_train_std[:2, :]

array([[-1.15734402, -0.72671047, -0.84787115, -0.47156812, -0.78920593],
       [-0.35171516, -0.33802419, -0.24169379,  0.11647846, -0.17526249]])

In [4]:
# Group the standardized features with hierarchical (agglomerative) clustering.
# Only the parameters listed here are set explicitly; all others keep their defaults:
#   n_clusters=4, linkage='ward'
# NOTE(review): the exercise header also listed affinity='euclidean', which is
# not passed here -- presumably the library default is relied on; confirm for
# the installed scikit-learn version.
from sklearn.cluster import AgglomerativeClustering

# Build the estimator and train it in one expression; `model` ends up bound to
# whatever fit() returns, which is the object whose labels_ are read afterwards.
model = AgglomerativeClustering(linkage='ward', n_clusters=4).fit(X_train_std)

In [5]:
# Count how many samples fall into each cluster.
# model.labels_ is a NumPy array with one integer cluster label per sample,
# e.g. array([0, 0, 0, 2, 0, 3, 0, 2, ...]); it is converted to a plain Python
# list and kept under the name li_clusters.
li_clusters = model.labels_.tolist()

# Tally every label in a single pass over the list.
cluster_sizes = {}
for label in li_clusters:
    cluster_sizes[label] = cluster_sizes.get(label, 0) + 1

# Report the size of each cluster. Ids run from 0 to (number of distinct
# labels - 1); an id that never occurs is reported with a count of 0.
for cluster_id in range(len(cluster_sizes)):
    print('Cluster', cluster_id, '個數:', cluster_sizes.get(cluster_id, 0))

Cluster 0 個數: 76
Cluster 1 個數: 15
Cluster 2 個數: 49
Cluster 3 個數: 16


In [10]:
# Add a `cluster` column holding the cluster label assigned to each Pokémon.
df['cluster'] = model.labels_

# Mean Speed of every cluster, computed once up front (missing Speed values are
# skipped by mean(), so they do not affect their own cluster's average).
speed_mean_by_cluster = df.groupby('cluster')['Speed'].mean()

# The filled-in values go into a copy: `df` keeps its missing entries, so
# re-running this cell finds and reports the same rows again.
df_filled = df.copy()

# Visit only the rows whose Speed is missing. Rows and columns are addressed by
# label/name, so nothing depends on the index being 0..n-1 or on Speed and
# cluster sitting at particular column positions.
for idx in df.index[df['Speed'].isna()]:
    # Show the Pokémon that has the missing value.
    print(df.loc[idx, :].tolist())

    # Within-cluster mean Speed, rounded to a whole number; the value that is
    # reported is also the value that is written.
    fill_value = round(speed_mean_by_cluster.loc[df.loc[idx, 'cluster']], 0)
    df_filled.loc[idx, 'Speed'] = fill_value

    # Report the value used to fill the gap.
    print(f"Speed = {fill_value}")

[60.0, 48.0, 45.0, 43.0, 90.0, nan, 0.0]
Speed = 58.0
[70.0, 75.0, 60.0, 105.0, 60.0, nan, 2.0]
Speed = 92.0
