<a href="https://colab.research.google.com/github/sungjin-kim-data/K-Means-Clustering/blob/main/K_Means_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import math
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import StandardScaler

plt.style.use("default")

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Load the dataset and preview its first rows.
csv_path = '/content/dataset.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Inspect the DataFrame summary reported by df.info().
df.info()

In [None]:
# Data cleaning.
# Encode the diagnosis column as integers (M -> 1, B -> 0) and keep it as `label`.
# The nested dict scopes the replacement to the 'diagnosis' column only; a flat
# collection of values would be matched against every column of the frame.
df = df.replace({'diagnosis': {'M': 1, 'B': 0}})

label = df['diagnosis']
label[:5]

In [None]:
# List the column names to decide which columns are unnecessary and should be dropped.
df.columns

In [None]:
# Drop the 'id', 'diagnosis' and 'Unnamed: 32' columns so they are not used as features.
unused_columns = ['id', 'diagnosis', 'Unnamed: 32']
df = df.drop(columns=unused_columns)

In [None]:
# Check the summary statistics of the remaining columns.
df.describe()

In [None]:
# Apply a log transformation to every column.
# log1p (log(1 + x)) is used instead of log so that zero-valued cells map to 0
# rather than -inf; StandardScaler in the next step rejects infinite inputs.
# NOTE(review): assumes all feature values are non-negative — confirm against df.describe().
df_log = np.log1p(df)
# Preview only the first rows instead of displaying the whole frame.
df_log.head()

In [None]:
# Standardize df_log with StandardScaler so that differences in units
# between columns do not distort the result.
sc = StandardScaler().fit(df_log)
features = sc.transform(df_log)

In [None]:
# Display the standardized feature array produced by the scaler.
features

In [None]:
# K-Means Clustering 적용 ( K = 2 , random_state=42 )
from sklearn.cluster import KMeans

In [None]:
# Fit K-Means with two clusters (fixed seed for reproducibility) on the scaled
# features, then store each row's cluster assignment in a new 'cluster' column.
kmeans = KMeans(n_clusters=2, random_state=42).fit(features)
df['cluster'] = kmeans.labels_

In [None]:
# Visualize df['cluster'] on two of the features to interpret the result.
ax = sns.scatterplot(data=df, x='compactness_mean', y='perimeter_mean', hue='cluster')
ax.set_title('compactness_mean by perimeter_mean')
ax.legend(fontsize=15)
plt.show()

In [None]:
# Agreement between the diagnosis labels and the K-Means cluster assignments.
# K-Means numbers its clusters arbitrarily, so cluster 0 may correspond to label 1.
# With two clusters the inverted mapping scores exactly 1 - acc, so report the
# better of the two mappings.
label2 = df['cluster']
acc = (label == label2).mean()
acc = max(acc, 1 - acc)
round(acc, 2)

## **Elbow Method를 보고 K 값 확인**

In [None]:
# Elbow method: fit K-Means for K = 1..10 and record the inertia of each fit.
k_values = range(1, 11)
inertia_list = []

for i in k_values:
  kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
  # Fit the model
  kmeans.fit(features)
  # Inertia: sum of squared distances from each point to its centroid
  inertia_list.append(kmeans.inertia_)

# Plot inertia against the number of clusters
plt.plot(k_values, inertia_list, 'o-')
plt.title('Elbow Method')
plt.xlabel('n_clusters')
# xlim only takes the left/right limits; the tick spacing is set separately via xticks.
plt.xlim(1, 10)
plt.xticks(k_values)
plt.ylabel('inertia')
plt.show()

In [None]:
# Keep two columns in df_1 for comparison with the K = 2 result.
plot_columns = ['compactness_mean', 'perimeter_mean']
df_1 = df.loc[:, plot_columns]

In [None]:
# Look at the data before scaling.
compactness = df_1['compactness_mean']
perimeter = df_1['perimeter_mean']
plt.scatter(compactness, perimeter)
plt.title('compactness_mean by perimeter_mean')
plt.show()

In [None]:
# Feature scaling: `sc` is rebound to a scaler fitted on the two columns of df_1.
sc = StandardScaler().fit(df_1)
X = sc.transform(df_1)

In [None]:
# Scatter plot of the same two features after scaling.
x_scaled, y_scaled = X[:, 0], X[:, 1]
plt.scatter(x_scaled, y_scaled)
plt.title('compactness_mean by perimeter_mean')
plt.show()

In [None]:
# Use K = 4 as the chosen number of clusters.
kmeans = KMeans(n_clusters=4, random_state=42)
# Fit on the scaled features and keep the cluster index assigned to each point.
y_kmeans = kmeans.fit(X).labels_

In [None]:
# Before plotting, get the coordinates of each centroid.
# The model was fit on X, so these are in the scaled feature space.
centers = kmeans.cluster_centers_
centers

In [None]:
# Plot every cluster in the scaled space, with its centroid and cluster index.
for cluster, (center_x, center_y) in enumerate(centers):
  members = y_kmeans == cluster
  plt.scatter(X[members, 0], X[members, 1], s=40, ec='black')
  # Centroid marker
  plt.scatter(center_x, center_y, s=300, ec='black', color='yellow', marker='s')
  # Cluster index drawn on top of the centroid
  plt.text(center_x, center_y, cluster, va='center', ha='center')

plt.title('compactness_mean by perimeter_mean')
plt.show()

In [None]:
# Undo the feature scaling: map X back through the scaler's inverse transform
# and preview the first five rows.
X_org = sc.inverse_transform(X)
X_org[:5]

In [None]:
# Map the centroids back through the same scaler's inverse transform.
centers_org = sc.inverse_transform(centers)
centers_org

In [None]:
# Plot every cluster after undoing the scaling, with its centroid and cluster index.
for cluster, (center_x, center_y) in enumerate(centers_org):
  members = y_kmeans == cluster
  plt.scatter(X_org[members, 0], X_org[members, 1], s=40, ec='black')
  # Centroid marker
  plt.scatter(center_x, center_y, s=300, ec='black', color='yellow', marker='s')
  # Cluster index drawn on top of the centroid
  plt.text(center_x, center_y, cluster, va='center', ha='center')

plt.title('compactness_mean by perimeter_mean')
plt.show()