# DATA CLUSTERING K-means

Step 1: Create a DataFrame

In [1]:
import pandas as pd

# Creating the DataFrame
data = {
    'points': [18.0, 19.0, 14.0, 14.0, 14.0, 11.0, 20.0, 28.0, 30.0, 31.0, 35.0, 33.0, 25.0, 25.0, 27.0, 29.0, 30.0, 19.0, 23.0],
    'assists': [3.0, 4.0, 5.0, 4.0, 7.0, 7.0, 8.0, 7.0, 6.0, 9.0, 12.0, 14.0, 9.0, 4.0, 3.0, 4.0, 12.0, 15.0, 11.0],
    'rebounds': [15, 14, 10, 8, 14, 14, 13, 9, 5, 4, 11, 6, 5, 3, 8, 12, 7, 6, 5]
}

df = pd.DataFrame(data)
print(df)


    points  assists  rebounds
0     18.0      3.0        15
1     19.0      4.0        14
2     14.0      5.0        10
3     14.0      4.0         8
4     14.0      7.0        14
5     11.0      7.0        14
6     20.0      8.0        13
7     28.0      7.0         9
8     30.0      6.0         5
9     31.0      9.0         4
10    35.0     12.0        11
11    33.0     14.0         6
12    25.0      9.0         5
13    25.0      4.0         3
14    27.0      3.0         8
15    29.0      4.0        12
16    30.0     12.0         7
17    19.0     15.0         6
18    23.0     11.0         5


Step 2: One Iteration of K-means Clustering

In [2]:
import numpy as np

# Function to calculate Euclidean distance
def euclidean_distance(point, centroid):
    return np.sqrt(np.sum((point - centroid) ** 2))

# Initial centroids
C1 = np.array([18, 3, 15])
C2 = np.array([30, 8, 5])

# Calculate distances and assign clusters
distances = []
clusters = []

for index, row in df.iterrows():
    player_point = np.array([row['points'], row['assists'], row['rebounds']])
    distance_to_C1 = euclidean_distance(player_point, C1)
    distance_to_C2 = euclidean_distance(player_point, C2)
    
    distances.append((distance_to_C1, distance_to_C2))
    cluster = 1 if distance_to_C1 < distance_to_C2 else 2
    clusters.append(cluster)

# Add clusters to DataFrame
df['cluster'] = clusters
print(df[['points', 'assists', 'rebounds', 'cluster']])


    points  assists  rebounds  cluster
0     18.0      3.0        15        1
1     19.0      4.0        14        1
2     14.0      5.0        10        1
3     14.0      4.0         8        1
4     14.0      7.0        14        1
5     11.0      7.0        14        1
6     20.0      8.0        13        1
7     28.0      7.0         9        2
8     30.0      6.0         5        2
9     31.0      9.0         4        2
10    35.0     12.0        11        2
11    33.0     14.0         6        2
12    25.0      9.0         5        2
13    25.0      4.0         3        2
14    27.0      3.0         8        2
15    29.0      4.0        12        2
16    30.0     12.0         7        2
17    19.0     15.0         6        2
18    23.0     11.0         5        2


Step 3: Plotting and K-means Implementation

In [7]:
import matplotlib.pyplot as plt

plt.scatter(df['points'], df['assists'], c='blue')
plt.xlabel('Points')
plt.ylabel('Assists')
plt.title('Scatter Plot of Players')
plt.show()



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/lib/python3/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/lib/python3/dist-packages/traitlets/config/application.py", line 846, in launch_instance
    app.start()
  File "/usr/lib/python3/dist-packages/ipykernel/kernelapp.py", line 677, in start
    s

AttributeError: _ARRAY_API not found

ImportError: numpy.core.multiarray failed to import

In [4]:
def plot_clusters(df, k):
    plt.figure(figsize=(8, 6))
    for cluster in range(1, k + 1):
        cluster_data = df[df['cluster'] == cluster]
        plt.scatter(cluster_data['points'], cluster_data['assists'], label=f'Cluster {cluster}')
    plt.xlabel('Points')
    plt.ylabel('Assists')
    plt.title(f'K-means Clustering (K={k})')
    plt.legend()
    plt.show()

# Call this for K=2, K=3, K=4 with appropriate cluster assignments.


In [5]:
def calculate_sse(df, centroids):
    sse = 0
    for index, row in df.iterrows():
        cluster = row['cluster']
        centroid = centroids[cluster - 1]
        sse += euclidean_distance(np.array([row['points'], row['assists'], row['rebounds']]), centroid) ** 2
    return sse

# Keep track of SSE for each K
sse_values = []
for k in range(1, 5):
    # Recalculate clusters and centroids for K
    # Then calculate SSE
    sse_values.append(calculate_sse(df, centroids))  # Update centroids accordingly

plt.plot(range(1, 5), sse_values, marker='o')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Sum of Squared Errors (SSE)')
plt.title('K vs SSE')
plt.show()


NameError: name 'centroids' is not defined

In [6]:
# Manhattan Distance
def manhattan_distance(point, centroid):
    return np.sum(np.abs(point - centroid))

# Minkowski Distance
def minkowski_distance(point, centroid, p=3):
    return np.sum(np.abs(point - centroid) ** p) ** (1/p)
