In [1]:
import pandas as pd
import numpy as np

np.random.seed(42)

from matplotlib import pyplot
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

mpl.rcParams['axes.unicode_minus'] = False
mpl.rc("font", family = "Malgun Gothic")

from sklearn.metrics import silhouette_score

In [2]:
df = pd.read_excel('/content/preprocessing_data.xlsx')
df

Unnamed: 0,서울시 동,가구원수,서울시 구,구별 가구수,반려동물 비율,반려동물 가구수 구,반려동물 가구수 동,면적,반려동물 가구밀도,폐사안락사수,보호소 수,보호소 밀도,폐사안락사 비율,동물병원약국개수,동물병원약국 밀도
0,가락1동,9534,송파구,255766,0.216,55245.456,2059.344,1.34,1536.823881,15,1,0.746269,11.194030,11,8.208955
1,가락2동,11482,송파구,255766,0.216,55245.456,2480.112,0.96,2583.450000,13,1,1.041667,13.541667,11,11.458333
2,가락본동,10576,송파구,255766,0.216,55245.456,2284.416,1.13,2021.607080,19,1,0.884956,16.814159,10,8.849558
3,가리봉동,4901,구로구,164083,0.195,31996.185,955.695,0.40,2389.237500,0,0,0.000000,0.000000,2,5.000000
4,가산동,14124,금천구,103432,0.179,18514.328,2528.196,2.52,1003.252381,0,0,0.000000,0.000000,14,5.555556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420,청구동,5764,중구,55093,0.191,10522.763,1100.924,0.34,3238.011765,0,0,0.000000,0.000000,2,4.545455
421,동화동,3771,중구,55093,0.191,10522.763,720.261,0.26,2770.234615,0,0,0.000000,0.000000,2,2.325581
422,항동,5988,구로구,164083,0.195,31996.185,1167.660,1.40,834.042857,0,0,0.000000,0.000000,2,3.174603
423,위례동,9041,송파구,255766,0.216,55245.456,1952.856,2.55,765.825882,0,0,0.000000,0.000000,2,1.904762


In [3]:
# 독립변수 추출
col = ['서울시 동', '가구원수', '서울시 구', '구별 가구수', '반려동물 비율', '반려동물 가구수 구',
       '반려동물 가구수 동', '면적', '폐사안락사수', '보호소 수', '동물병원약국개수']
x = df.drop(col,axis=1)
x

Unnamed: 0,반려동물 가구밀도,보호소 밀도,폐사안락사 비율,동물병원약국 밀도
0,1536.823881,0.746269,11.194030,8.208955
1,2583.450000,1.041667,13.541667,11.458333
2,2021.607080,0.884956,16.814159,8.849558
3,2389.237500,0.000000,0.000000,5.000000
4,1003.252381,0.000000,0.000000,5.555556
...,...,...,...,...
420,3238.011765,0.000000,0.000000,4.545455
421,2770.234615,0.000000,0.000000,2.325581
422,834.042857,0.000000,0.000000,3.174603
423,765.825882,0.000000,0.000000,1.904762


In [4]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

scaled_x = scaler.fit_transform(x) # 설명변수

In [5]:
df_scaled_x = pd.DataFrame(scaled_x,columns = x.columns)
df_scaled_x

Unnamed: 0,반려동물 가구밀도,보호소 밀도,폐사안락사 비율,동물병원약국 밀도
0,-0.378614,0.160694,0.129900,0.243147
1,0.711349,0.543612,0.292286,0.926818
2,0.126243,0.340471,0.518644,0.377930
3,0.509095,-0.806678,-0.644390,-0.432020
4,-0.934279,-0.806678,-0.644390,-0.315130
...,...,...,...,...
420,1.393014,-0.806678,-0.644390,-0.527656
421,0.905868,-0.806678,-0.644390,-0.994719
422,-1.110495,-0.806678,-0.644390,-0.816084
423,-1.181536,-0.806678,-0.644390,-1.083260


# Find Optimal K

## KMeans Clustering

In [6]:
from sklearn.cluster import KMeans

np.random.seed(42)

best_sil_score = 0

for n in range(2,200):
    n_clusters=n

    kmeans = KMeans(n_clusters).fit(scaled_x)
    cluster_kmeans = [i+1 for i in kmeans.labels_]

    df_scaled_x["cluster"] = kmeans.labels_
    df_scaled_x

    sil_score = silhouette_score(df_scaled_x, kmeans.labels_)
    
    if sil_score > best_sil_score:
        best_sil_score = sil_score
        print(f'number of cluster: {n}, best silhouette score: {sil_score}')


number of cluster: 2, best silhouette score: 0.39921324031089744
number of cluster: 4, best silhouette score: 0.4339335958070593
number of cluster: 6, best silhouette score: 0.4960267175852212
number of cluster: 7, best silhouette score: 0.5315681444925242
number of cluster: 9, best silhouette score: 0.5379861576232808
number of cluster: 11, best silhouette score: 0.556663284229338
number of cluster: 13, best silhouette score: 0.5891506437947149
number of cluster: 16, best silhouette score: 0.6018145642294924
number of cluster: 18, best silhouette score: 0.6071291200359566
number of cluster: 20, best silhouette score: 0.6173646946161564
number of cluster: 21, best silhouette score: 0.6179271154225533
number of cluster: 24, best silhouette score: 0.6474178883928192
number of cluster: 27, best silhouette score: 0.6585927840643182
number of cluster: 29, best silhouette score: 0.6586781410620485
number of cluster: 31, best silhouette score: 0.6791250948874494
number of cluster: 36, best si

## Gausian Mixture Clustering

In [7]:
from sklearn.mixture import GaussianMixture

np.random.seed(42)

best_sil_score = 0
for n in range(2,200):
    gmm = GaussianMixture(n_components=n, random_state=42)
    gmm.fit(scaled_x)
    label_gmm = gmm.predict(scaled_x)

    df_scaled_x["cluster"] = label_gmm

    sil_score = silhouette_score(df_scaled_x, label_gmm)
    
    if sil_score > best_sil_score:
        best_sil_score = sil_score
        print(f'number of cluster: {n}, best silhouette score: {sil_score}')



number of cluster: 2, best silhouette score: 0.283627477268619
number of cluster: 5, best silhouette score: 0.37527648158476457
number of cluster: 7, best silhouette score: 0.38443925887825464
number of cluster: 9, best silhouette score: 0.39405670746509003
number of cluster: 10, best silhouette score: 0.44637860975607896
number of cluster: 12, best silhouette score: 0.45864194881762266
number of cluster: 18, best silhouette score: 0.46798980350345987
number of cluster: 19, best silhouette score: 0.48443620000663323
number of cluster: 20, best silhouette score: 0.5025378108234573
number of cluster: 21, best silhouette score: 0.5290961602129326
number of cluster: 22, best silhouette score: 0.5364127825292626
number of cluster: 24, best silhouette score: 0.5777205494481487
number of cluster: 27, best silhouette score: 0.592637396962527
number of cluster: 28, best silhouette score: 0.6286522606527916
number of cluster: 30, best silhouette score: 0.6367708421643669
number of cluster: 31, b

## Hierarchical Clustering

In [8]:
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import linkage, dendrogram

linkages=['ward', 'complete', 'average', 'single']
affinities = ['euclidean', 'l1', 'l2', 'manhattan', 'cosine']
np.random.seed(42)


for linkage in linkages:
    for affinity in affinities:
        best_sil_score = 0
        if linkage == 'ward' and affinity != 'euclidean':
            continue
        for n in range(2,200):
            hierarchical_cluster = AgglomerativeClustering(n_clusters=n, affinity=affinity, linkage=linkage)
            label_hk = hierarchical_cluster.fit_predict(scaled_x)

            df_scaled_x["cluster"] = label_hk

            sil_score = silhouette_score(df_scaled_x, label_hk)
            
            if sil_score > best_sil_score:
                best_sil_score = sil_score
        print(f'linkage: ${linkage}, affinity: ${affinity}')
        print(f'number of cluster: {n}, best silhouette score: {sil_score}')



linkage: $ward, affinity: $euclidean
number of cluster: 199, best silhouette score: 0.6548927993408774
linkage: $complete, affinity: $euclidean
number of cluster: 199, best silhouette score: 0.642765656845505
linkage: $complete, affinity: $l1
number of cluster: 199, best silhouette score: 0.6368328640209893
linkage: $complete, affinity: $l2
number of cluster: 199, best silhouette score: 0.642765656845505
linkage: $complete, affinity: $manhattan
number of cluster: 199, best silhouette score: 0.6368328640209893
linkage: $complete, affinity: $cosine
number of cluster: 199, best silhouette score: 0.6229021570707012
linkage: $average, affinity: $euclidean
number of cluster: 199, best silhouette score: 0.5957274124837253
linkage: $average, affinity: $l1
number of cluster: 199, best silhouette score: 0.5769062285077404
linkage: $average, affinity: $l2
number of cluster: 199, best silhouette score: 0.5957274124837253
linkage: $average, affinity: $manhattan
number of cluster: 199, best silhouet

## K Medoids Clustering

In [9]:
!pip install https://github.com/scikit-learn-contrib/scikit-learn-extra/archive/master.zip

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting https://github.com/scikit-learn-contrib/scikit-learn-extra/archive/master.zip
  Downloading https://github.com/scikit-learn-contrib/scikit-learn-extra/archive/master.zip
[K     / 740 kB 424 kB/s
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: scikit-learn-extra
  Building wheel for scikit-learn-extra (PEP 517) ... [?25l[?25hdone
  Created wheel for scikit-learn-extra: filename=scikit_learn_extra-0.3.0.dev0-cp38-cp38-linux_x86_64.whl size=1487280 sha256=90d4e792fb5a9090a1e7a6f6a123c2922c88411859271cff1dcebb7332d26862
  Stored in directory: /tmp/pip-ephem-wheel-cache-2btxi23_/wheels/0b/80/c3/2a9fbb64d751545f875783fadb673fda2e1242d8a2b1b674f3
Successfully built scikit-learn-extra
Installing collected packages: scikit-lea

In [10]:
from sklearn_extra.cluster import KMedoids
np.random.seed(42)

best_sil_score = 0

for n in range(2,200):
    n_clusters=n

    kmedoids = KMedoids(n_clusters).fit(scaled_x)
    cluster_kmedoidss = [i+1 for i in kmedoids.labels_]

    df_scaled_x["cluster"] = kmedoids.labels_
    df_scaled_x

    sil_score = silhouette_score(df_scaled_x, kmedoids.labels_)
    
    if sil_score > best_sil_score:
        best_sil_score = sil_score
        print(f'number of cluster: {n}, best silhouette score: {sil_score}')

number of cluster: 2, best silhouette score: 0.38929941892689385
number of cluster: 4, best silhouette score: 0.4087000371658631
number of cluster: 5, best silhouette score: 0.41304261606506304
number of cluster: 6, best silhouette score: 0.4787635207343427
number of cluster: 7, best silhouette score: 0.49633266019051087
number of cluster: 17, best silhouette score: 0.5157788425710851
number of cluster: 18, best silhouette score: 0.5255360580537791
number of cluster: 19, best silhouette score: 0.5477356015795766
number of cluster: 21, best silhouette score: 0.566318952655525
number of cluster: 26, best silhouette score: 0.5720500656947796
number of cluster: 27, best silhouette score: 0.5781110791181548
number of cluster: 29, best silhouette score: 0.5820468227498208
number of cluster: 30, best silhouette score: 0.5879670731244646
number of cluster: 44, best silhouette score: 0.6075318764740231
number of cluster: 68, best silhouette score: 0.6087353482605766
number of cluster: 82, best 