In [None]:
import numpy as np
import matplotlib.pyplot as plt

def euclidean_distance(a, b, axis=-1):
    """Return the Euclidean (L2) distance between ``a`` and ``b``.

    The difference ``a - b`` is computed with NumPy broadcasting and the
    squared differences are summed along ``axis``.

    Parameters
    ----------
    a, b : array_like
        Points or collections of points with broadcast-compatible shapes.
    axis : int, tuple of int or None, default -1
        Axis (or axes) holding the coordinates of each point.  With the
        default, two 1-D vectors give a single scalar, while a 2-D array of
        points against one point gives one distance per row.  Pass ``None``
        to reduce over every axis.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        The distance(s) after reducing over ``axis``.
    """
    # Cast to float so unsigned/integer inputs cannot wrap around on subtraction.
    diff = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
    if diff.ndim == 0:
        # A 0-d difference has no axis to reduce over; sum everything instead.
        axis = None
    return np.sqrt(np.sum(diff ** 2, axis=axis))

def _nearest_centroid(data, centroids):
    """Return, for every row of ``data``, the index of its closest centroid."""
    # Broadcast to (n_samples, k, n_features) and reduce over the feature axis
    # so that each point gets one distance per centroid.
    distances = np.linalg.norm(
        data[:, np.newaxis, :] - centroids[np.newaxis, :, :], axis=2
    )
    return np.argmin(distances, axis=1)


def kmeans(data, k, max_iterations=100, random_state=42):
    """Cluster ``data`` into ``k`` groups with Lloyd's k-means algorithm.

    Parameters
    ----------
    data : array_like of shape (n_samples, n_features)
        Points to cluster.
    k : int
        Number of clusters; must satisfy ``1 <= k <= n_samples``.
    max_iterations : int, default 100
        Upper bound on the number of centroid updates.
    random_state : int or None, default 42
        Seed for the private generator that picks the initial centroids.
        The default reproduces the previously hard-coded seed.

    Returns
    -------
    centroids : numpy.ndarray of shape (k, n_features)
        Final cluster centers.
    labels : numpy.ndarray of shape (n_samples,)
        Index of the centroid assigned to each point.

    Raises
    ------
    ValueError
        If ``data`` is not 2-D or ``k`` is outside ``1..n_samples``.
    """
    data = np.asarray(data, dtype=float)
    if data.ndim != 2:
        raise ValueError("data must be a 2-D array of shape (n_samples, n_features)")
    n_samples = data.shape[0]
    if not 1 <= k <= n_samples:
        raise ValueError(f"k must be between 1 and n_samples={n_samples}, got {k}")

    # A private RandomState draws the same stream as np.random.seed(seed) would,
    # but leaves the caller's global NumPy RNG state untouched.
    rng = np.random.RandomState(random_state)
    centroids = data[rng.choice(n_samples, k, replace=False)]

    # Assign up front so labels exist even when max_iterations is 0.
    labels = _nearest_centroid(data, centroids)

    for _ in range(max_iterations):
        # Update each centroid to the mean of its members; a cluster that lost
        # every point keeps its previous centroid instead of becoming NaN.
        new_centroids = centroids.copy()
        for i in range(k):
            members = data[labels == i]
            if len(members):
                new_centroids[i] = members.mean(axis=0)

        converged = np.allclose(centroids, new_centroids)
        centroids = new_centroids
        if converged:
            break

        # Re-assign so the returned labels always match the returned centroids.
        labels = _nearest_centroid(data, centroids)

    return centroids, labels



In [None]:
# Example usage: draw a small 2-D standard-normal sample to cluster.
np.random.seed(42)  # fixed seed so the same sample is drawn on every run
num_samples = 100
data = np.random.randn(num_samples, 2)

# Check the layout explicitly; a bare `data.shape` mid-cell is never displayed.
assert data.shape == (num_samples, 2)

# Label the figure so it can be read on its own when the notebook is skimmed.
plt.scatter(data[:, 0], data[:, 1])
plt.title("Sample data")
plt.xlabel("feature 0")
plt.ylabel("feature 1");




In [None]:
# Cluster the sample into k groups with the hand-written kmeans() above.
k = 3
centroids, labels = kmeans(data, k)

# Report the centroids, then the labels; sep="\n" puts every item on its own
# line, and the leading "\n" in the second heading leaves a blank line between
# the two sections.
print("Centroids:", centroids, "\nLabels:", labels, sep="\n")

In [None]:
# conda install -c conda-forge kneed

import matplotlib.pyplot as plt
# from kneed import KneeLocator
from sklearn.datasets import make_blobs

from sklearn.preprocessing import StandardScaler

In [None]:
# Build a synthetic clustering data set with scikit-learn's make_blobs:
# 5000 samples spread over 40 centers, each with standard deviation 2.75.
# random_state=42 pins the draw so the data set is reproducible.
# NOTE(review): the data is generated from 40 centers, but the cells below fit
# n_clusters=3 and scan k in 1..10 — confirm that mismatch is intentional.
features, true_labels = make_blobs(
    n_samples=5000,
    centers=40,
    cluster_std=2.75,
    random_state=42
)


# Rescale the features with StandardScaler before clustering; the fitted
# `scaler` is kept at module level so it stays available to later cells.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)


# Leftover inspection snippets (disabled): scatter of the scaled data coloured
# by true label, and the first five raw vs. scaled rows.
# plt.scatter(scaled_features[:, 0],scaled_features[:, 1],c= true_labels)
# print(features[:5])
# print(scaled_features[:5])

In [None]:

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def kmeans_fit(data,n_clusters=3,n_init=10,max_iter=300):
    """Fit scikit-learn's ``KMeans`` to ``data`` and return the cluster labels.

    The estimator always uses ``init="random"`` and ``random_state=42``; the
    remaining settings are forwarded from the arguments.

    Parameters
    ----------
    data : array-like
        Samples passed straight to ``KMeans.fit``.
    n_clusters : int, default 3
        Number of clusters to form.
    n_init : int, default 10
        Forwarded to ``KMeans(n_init=...)``.
    max_iter : int, default 300
        Forwarded to ``KMeans(max_iter=...)``.

    Returns
    -------
    The fitted estimator's ``labels_`` attribute.

    Side effects
    ------------
    Prints the fitted estimator's ``n_iter_`` value.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    estimator = KMeans(
        n_clusters=n_clusters,
        init="random",
        n_init=n_init,
        max_iter=max_iter,
        random_state=42,
    ).fit(data)

    print(f'converged after {estimator.n_iter_} iterations')

    return estimator.labels_
# Cluster the standardized blobs; `labels` is rebound here to the sklearn result.
labels = kmeans_fit(
    scaled_features,
    n_clusters=3,
    n_init=10,
    max_iter=300,
)

In [None]:
# # The lowest SSE value
# print(kmeans.inertia_)

# # Final locations of the centroid
# print(kmeans.cluster_centers_)

# # The number of iterations required to converge
# print(kmeans.n_iter_)

In [None]:
# Two separate figures over the same scaled points: the first coloured by the
# labels make_blobs generated, the second by the labels KMeans assigned.
# Titles and axis labels make the figures distinguishable on their own.
plt.figure()
plt.scatter(scaled_features[:, 0], scaled_features[:, 1], c=true_labels)
plt.title("Ground-truth blob labels")
plt.xlabel("scaled feature 0")
plt.ylabel("scaled feature 1")

plt.figure()
plt.scatter(scaled_features[:, 0], scaled_features[:, 1], c=labels)
plt.title("KMeans cluster assignments")
plt.xlabel("scaled feature 0")
plt.ylabel("scaled feature 1");



In [None]:
# Shared KMeans settings for the sweep over k below; these are the same values
# that kmeans_fit() hard-codes or receives at its call site.
kmeans_kwargs = {
    "init": "random",
    "n_init": 10,
    "max_iter": 300,
    "random_state": 42,
}

# Collect the SSE (the fitted model's inertia_) for each k from 1 to 10.
# NOTE(review): this rebinds the module-level names `k` and `kmeans`; after this
# cell runs, `kmeans` is a KMeans estimator and no longer the hand-written
# kmeans() function, so re-running the earlier kmeans(data, k) cell will fail.
sse = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(scaled_features)
    sse.append(kmeans.inertia_)