<a href="https://colab.research.google.com/github/withlionbuddha/learning.ai/blob/ground/SemiSupervisedClustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
from sklearn.metrics import adjusted_rand_score
from sklearn.semi_supervised import LabelPropagation, LabelSpreading

# Load the Iris dataset that ships with scikit-learn.
iris = datasets.load_iris()
# Keep only the first two feature columns (2-D data, for convenience)
# together with the class targets.
X, y = iris.data[:, :2], iris.target


In [None]:
# Seed NumPy's global RNG so the labeled/unlabeled split is reproducible.
np.random.seed(42)

# Only 10% of all samples keep their label.
num_total_samples = y.shape[0]
num_labeled_samples = int(0.1 * num_total_samples)

# Shuffle the sample indices in place; the first `num_labeled_samples`
# entries become the labeled set and the remainder the unlabeled set.
indices = np.arange(num_total_samples)
np.random.shuffle(indices)
labeled_indices, unlabeled_indices = (
    indices[:num_labeled_samples],
    indices[num_labeled_samples:],
)

# Start with every sample marked as unlabeled (-1), then restore the true
# label for the labeled subset only.
labels = np.full_like(y, -1)
labels[labeled_indices] = y[labeled_indices]


In [None]:
# Create the Label Propagation model with a kNN kernel (7 neighbours).
# LabelPropagation lives in sklearn.semi_supervised and must be imported
# before this cell runs.
label_prop_model = LabelPropagation(kernel='knn', n_neighbors=7)

# Fit on every sample; entries of `labels` equal to -1 are the ones that
# were marked as unlabeled when `labels` was built.
label_prop_model.fit(X, labels)

# Read the label the fitted model assigned to each training sample.
predicted_labels = label_prop_model.transduction_


In [None]:
# Evaluation: Adjusted Rand Index between the true targets and the
# labels produced by Label Propagation.
ari = adjusted_rand_score(y, predicted_labels)
print(f"Adjusted Rand Index (Label Propagation): {ari:.4f}")

# Visualization: one scatter series per predicted class, each drawn in
# its own colour.
colors = ['red', 'green', 'blue']
for class_label, color in zip(np.unique(predicted_labels), colors):
    class_data = X[predicted_labels == class_label]
    # Transposing yields the two feature columns as (x, y) coordinate rows.
    plt.scatter(*class_data.T, c=color, label=f'Class {class_label}')

plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Semi-Supervised Clustering with Label Propagation')
plt.legend()
plt.show()


In [None]:
# Create the Label Spreading model with a kNN kernel (7 neighbours).
# LabelSpreading lives in sklearn.semi_supervised and must be imported
# before this cell runs.
label_spread_model = LabelSpreading(kernel='knn', n_neighbors=7)

# Fit on every sample; entries of `labels` equal to -1 are the ones that
# were marked as unlabeled when `labels` was built.
label_spread_model.fit(X, labels)

# Read the label the fitted model assigned to each training sample.
predicted_labels_spread = label_spread_model.transduction_

# Evaluation: Adjusted Rand Index against the true targets.
ari_spread = adjusted_rand_score(y, predicted_labels_spread)
print(f"Adjusted Rand Index (Label Spreading): {ari_spread:.4f}")


In [None]:
# NOTE(review): this cell repeats the preceding Label Spreading cell
# statement for statement; it rebuilds and refits the same model and
# rebinds the same names. Kept so the notebook's cell sequence is unchanged.
# LabelSpreading lives in sklearn.semi_supervised and must be imported
# before this cell runs.

# Create the Label Spreading model (kNN kernel, 7 neighbours).
label_spread_model = LabelSpreading(kernel='knn', n_neighbors=7)

# Fit on all samples, labeled and unlabeled (-1) alike.
label_spread_model.fit(X, labels)

# Labels assigned to each training sample by the fitted model.
predicted_labels_spread = label_spread_model.transduction_

# Evaluation: Adjusted Rand Index against the true targets.
ari_spread = adjusted_rand_score(y, predicted_labels_spread)
print(f"Adjusted Rand Index (Label Spreading): {ari_spread:.4f}")


In [None]:
from sklearn.cluster import KMeans

# Run K-Means as a baseline that uses no label information.
# n_init is set explicitly: its default changed from 10 to 'auto' in
# scikit-learn 1.4 (1.2/1.3 emit a FutureWarning when it is omitted), so
# leaving it implicit makes the score depend on the installed version even
# though random_state is fixed.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans_labels = kmeans.fit_predict(X)

# Evaluation: Adjusted Rand Index between the true targets and the
# cluster assignments (ARI is invariant to how cluster ids are numbered).
kmeans_ari = adjusted_rand_score(y, kmeans_labels)
print(f"Adjusted Rand Index (K-Means): {kmeans_ari:.4f}")


In [None]:
# Print the ARI of every approach side by side for comparison.
print("=== Adjusted Rand Index Scores ===")
for method_name, method_ari in (
    ("Label Propagation", ari),
    ("Label Spreading", ari_spread),
    ("K-Means (No Labels)", kmeans_ari),
):
    print(f"{method_name}: {method_ari:.4f}")


In [None]:
# Label Spreading using an RBF kernel (gamma=20) instead of the kNN kernel.
# LabelSpreading lives in sklearn.semi_supervised and must be imported
# before this cell runs.
label_spread_rbf = LabelSpreading(kernel='rbf', gamma=20)

# Fit on all samples; entries of `labels` equal to -1 are the unlabeled ones.
label_spread_rbf.fit(X, labels)

# Labels assigned to each training sample by the fitted model.
predicted_labels_rbf = label_spread_rbf.transduction_

# Evaluation: Adjusted Rand Index against the true targets.
ari_rbf = adjusted_rand_score(y, predicted_labels_rbf)
print(f"Adjusted Rand Index (Label Spreading with RBF Kernel): {ari_rbf:.4f}")
