# K-Means Clustering Using Basic Python & Numpy

In [40]:
import numpy as np

# This function is used for upload txt file
def load_dataset(name):
    return np.loadtxt(name)

# This function is used for find the Euclidian Distance
def euclidian(a, b):
    return np.linalg.norm(a-b)

#This function is used for Kmean Clustering 
def kmeans(k, epsilon=0, distance='euclidian'):
    history_centroids = []
    if distance == 'euclidian':
        dist_method = euclidian
    dataset = load_dataset('C:\\Users\\hamza jamil\\Downloads\\ccc\\__MACOSX\\0.txt')
    # dataset = dataset[:, 0:dataset.shape[1] - 1]
    num_instances, num_features = dataset.shape
    prototypes = dataset[np.random.randint(0, num_instances - 1, size=k)]
    history_centroids.append(prototypes)
    prototypes_old = np.zeros(prototypes.shape)
    belongs_to = np.zeros((num_instances, 1))
    norm = dist_method(prototypes, prototypes_old)
    iteration = 0
    while norm > epsilon:
        iteration += 1
        norm = dist_method(prototypes, prototypes_old)
        prototypes_old = prototypes
        for index_instance, instance in enumerate(dataset):
            dist_vec = np.zeros((k, 1))
            for index_prototype, prototype in enumerate(prototypes):
                dist_vec[index_prototype] = dist_method(prototype,
                                                        instance)

            belongs_to[index_instance, 0] = np.argmin(dist_vec)

        tmp_prototypes = np.zeros((k, num_features))

        for index in range(len(prototypes)):
            instances_close = [i for i in range(len(belongs_to)) if belongs_to[i] == index]
            prototype = np.mean(dataset[instances_close], axis=0)
            # prototype = dataset[np.random.randint(0, num_instances, size=1)[0]]
            tmp_prototypes[index, :] = prototype

        prototypes = tmp_prototypes

        history_centroids.append(tmp_prototypes)

    

    return prototypes, history_centroids, belongs_to





#   load_dataset function for read txt file

In [48]:
print(load_dataset('C:\\Users\\hamza jamil\\Downloads\\ccc\\__MACOSX\\0.txt')) # here we call over load_dataset
                                                                      #functoin to read our txt file

[[  4.5799537    2.02328985   0.82315443 ...  -5.2333601    3.69233373
    1.94677647]
 [  4.1820568    2.32421058   0.92980167 ...  -5.56948353   3.36895663
    1.66670782]
 [  4.1477836    2.17320989   1.06224779 ...  -5.5818858    3.60442882
    1.93620213]
 ...
 [ -8.33967264 -13.2846668   -1.34615581 ...  -9.5650505    7.36895766
   -4.60840407]
 [ -8.59027707   7.20808236  -3.96995165 ...  -0.33034336   6.41448931
   -4.23826982]
 [ -2.89575506  13.53716692   4.87929015 ...  -3.97744592   9.37005611
    1.70085442]]


#    k-means function to find the cluster

In [43]:
print(kmeans(4)) # here we call over k-mean functoin and we create 4 cluster k=4

(array([[-8.96855599,  8.85681671,  7.23039717, ...,  3.90886588,
        -6.19124522, -0.04005864],
       [-6.33463617, -9.09072332, -1.94509773, ..., -8.40639861,
         4.38997794, -2.64637259],
       [-4.6891471 , -9.45191223,  1.56609696, ...,  2.95925003,
        10.84315482,  1.35099566],
       [ 4.27838844,  2.07683016,  0.92553469, ..., -5.43982768,
         3.56527557,  1.80501458]]), [array([[-10.37517709,   7.47409974,   5.49697099, ...,   4.56381399,
        -10.35139235,  -1.15145834],
       [  4.32930046,   2.15741606,   0.98483694, ...,  -5.45394686,
          3.4190858 ,   1.78909215],
       [ -1.90219677,  -5.91062475,  -0.82718072, ...,   3.65155077,
          9.03581899,   2.03085853],
       [  4.41673406,   2.18701562,   1.15878746, ...,  -5.378129  ,
          3.48716666,   1.88540813]]), array([[-8.96855599,  8.85681671,  7.23039717, ...,  3.90886588,
        -6.19124522, -0.04005864],
       [-2.20939432, -4.74356612, -0.8201257 , ..., -7.25260077,
     

# euclidian function to find the distance

In [45]:
print(euclidian(-2.3435123,5.012345)) # here we call over function euclidian and distance betweeen two points

7.3558573


# silhoutte Method

In [4]:

def silhoutte( clusters, parsedData=np.genfromtxt("C:\\Users\\hamza jamil\\Downloads\\ccc\\__MACOSX\\0.txt")):  # calculateSC function with parameters
        sc = 0
        s = 0
        y = []
        for i in range(len(clusters)): 
            for j in range(len(clusters[i])):
                x_val = parsedData.ix[clusters[i][j]][2]
                y_val = parsedData.ix[clusters[i][j]][3]
                d = 0
                for l in range(len(clusters[i])):
                    dist1 = ((parsedData.ix[clusters[i][j]][2] - parsedData.ix[clusters[i][l]][2]) ** 2)
                    dist2 = ((parsedData.ix[clusters[i][j]][3] - parsedData.ix[clusters[i][l]][3]) ** 2)
                    eud = (dist1 + dist2) ** .5
                    d += eud
                a = d / (len(clusters[i]) - 1)
                b_vals = []
                for m in range(len(clusters)):
                    if m == i:
                        continue
                    c = 0
                    for j in range(len(clusters[m])):
                        dist3 = ((x_val - parsedData.ix[clusters[m][j]][2]) ** 2)
                        dist4 = ((y_val - parsedData.ix[clusters[m][j]][3]) ** 2)
                        eu = (dist3 + dist4) ** .5
                        c += eu
                    avg = float(c / len(clusters[m]))
                    b_vals.append(avg)
                b = min(b_vals)
                s = float((b - a) / max(a, b))  
                sc += s 
        sc = sc / len(parsedData[0])
        return sc  
    

In [47]:
 silhoutte  # here we call over functoin  silhoutte 

<function __main__.silhoutte(clusters, parsedData=array([[  4.5799537 ,   2.02328985,   0.82315443, ...,  -5.2333601 ,
          3.69233373,   1.94677647],
       [  4.1820568 ,   2.32421058,   0.92980167, ...,  -5.56948353,
          3.36895663,   1.66670782],
       [  4.1477836 ,   2.17320989,   1.06224779, ...,  -5.5818858 ,
          3.60442882,   1.93620213],
       ...,
       [ -8.33967264, -13.2846668 ,  -1.34615581, ...,  -9.5650505 ,
          7.36895766,  -4.60840407],
       [ -8.59027707,   7.20808236,  -3.96995165, ...,  -0.33034336,
          6.41448931,  -4.23826982],
       [ -2.89575506,  13.53716692,   4.87929015, ...,  -3.97744592,
          9.37005611,   1.70085442]]))>