# In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris

# Load the Iris dataset (you can replace this with any numeric feature matrix)
iris = load_iris()
data = iris.data  # Iris dataset features
labels = iris.target  # True species labels; kept for reference, not used below

# Wrap the features in a DataFrame so each column carries its feature name
df = pd.DataFrame(data, columns=iris.feature_names)

# Standardize the features before clustering: K-means relies on Euclidean
# distance, so features with a larger numeric range would otherwise dominate.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)

# Number of clusters to look for; used by both the model and the plot below
N_CLUSTERS = 3

# Apply K-means clustering.
# n_init is passed explicitly because scikit-learn changed its default from 10
# to 'auto' in version 1.4 (with a FutureWarning in 1.2-1.3); pinning it keeps
# the number of restarts, and therefore the result, the same across versions.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=42)
kmeans.fit(scaled_data)

# Get the cluster label assigned to each data point
cluster_labels = kmeans.labels_

# Add the cluster labels to the dataframe (done after scaling, so the
# 'Cluster' column is never fed into the scaler or the model)
df['Cluster'] = cluster_labels

# Per-cluster mean of every feature, in the original (unscaled) units
grouped_df = df.groupby('Cluster').mean()

# Display the grouped dataset (mean of features for each cluster)
print("Grouped Dataset (Mean of Features by Cluster):")
print(grouped_df)

# Show how the original data points are assigned to clusters
print("\nOriginal Data with Cluster Labels:")
print(df)

# Project the scaled features onto their first two principal components so
# the clusters can be drawn in 2D
pca = PCA(n_components=2)
pca_components = pca.fit_transform(scaled_data)

# Plot the clusters (using PCA components for 2D plot)
plt.figure(figsize=(8, 6))
plt.scatter(pca_components[:, 0], pca_components[:, 1], c=cluster_labels, cmap='viridis')
plt.title(f'K-means Clustering into {N_CLUSTERS} Clusters on Iris Dataset')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
# Cluster ids are discrete, so only tick the colorbar at the actual ids
plt.colorbar(ticks=list(range(N_CLUSTERS)), label='Cluster')
plt.show()