# Clustering retail stores spatially using their latitude and longitude coordinates

### Importing libs

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import math
import matplotlib.pyplot as plt

### Importing data(lat,long coordinates of the stores)

In [2]:
# Load the store coordinates. The path is a raw string so that the Windows
# backslashes are never interpreted as escape sequences (the previous plain
# literal relied on "\i", "\M" and "\l" being invalid escapes, which newer
# Python versions warn about). The resulting path value is unchanged.
df = pd.read_excel(r'H:\intern_lovelocal\ML_Project\lat_long_google_api.xlsx')

# Keep only the latitude/longitude columns and expose them as a 2-D array
# with one (latitude, longitude) row per store.
X = df.loc[:, ['store_latitude', 'store_longitude']]
points = np.array(X)
print(points)


[[19.0209926  72.8403817 ]
 [19.2353777  72.8641904 ]
 [19.1886025  72.85634766]
 ...
 [18.45992267 73.81853091]
 [28.63501809 77.08556006]
 [18.464077   73.867619  ]]


### Calculating the distance between retail stores with the Haversine formula

In [4]:
def distance(origin, destination): #found here https://gist.github.com/rochacbruno/2883505
    """Return the great-circle (Haversine) distance between two points, in km.

    Parameters
    ----------
    origin, destination : indexable
        Element 0 is read as the latitude and element 1 as the longitude,
        both in degrees.

    Returns
    -------
    float
        Distance in kilometres, using an Earth radius of 6371 km.
    """
    lat1, lon1 = origin[0], origin[1]
    lat2, lon2 = destination[0], destination[1]
    radius = 6371 # km

    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = math.sin(dlat/2) * math.sin(dlat/2) + math.cos(math.radians(lat1)) \
        * math.cos(math.radians(lat2)) * math.sin(dlon/2) * math.sin(dlon/2)

    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    # The argument order keeps NaN inputs propagating as NaN instead of being
    # silently clamped to a valid value.
    a = min(max(a, 0.0), 1.0)

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return radius * c

### Creating clusters

In [5]:
def create_clusters(number_of_clusters,points):
    """Cluster `points` with K-Means and return them tagged with their label.

    Parameters
    ----------
    number_of_clusters : int
        Passed to KMeans as `n_clusters`.
    points : 2-D array
        One row per point; the callers in this notebook pass
        (latitude, longitude) rows.

    Returns
    -------
    numpy.ndarray
        `points` with one extra trailing column holding each row's cluster
        label as assigned by KMeans.
    """
    # random_state is fixed so repeated runs produce the same clustering.
    kmeans = KMeans(n_clusters=number_of_clusters, random_state=0).fit(points)

    # Turn the 1-D label vector into a column so it can be appended to points.
    label_column = kmeans.labels_.reshape(-1, 1)
    return np.append(points, label_column, axis=1)

### Getting cluster centroids

In [16]:
def cluster_centroids(number_of_clusters,points):
    """Cluster `points` with K-Means and return labelled points and centroids.

    Parameters
    ----------
    number_of_clusters : int
        Passed to KMeans as `n_clusters`. (Previously this argument was
        overwritten with a hard-coded 11 inside the function, so any other
        value was silently ignored.)
    points : 2-D array
        One row per point; the caller in this notebook passes
        (latitude, longitude) rows.

    Returns
    -------
    tuple
        `(clusters, centroids)` where `clusters` is `points` with a trailing
        cluster-label column and `centroids` is `kmeans.cluster_centers_`.
    """
    # random_state is fixed so repeated runs produce the same clustering.
    kmeans = KMeans(n_clusters=number_of_clusters, random_state=0).fit(points)

    # Turn the 1-D label vector into a column so it can be appended to points.
    label_column = kmeans.labels_.reshape(-1, 1)
    clusters = np.append(points, label_column, axis=1)
    centroids = kmeans.cluster_centers_
    return clusters, centroids

### Validating the solution

In [6]:
def validate_solution(max_dist,clusters):
    """Check that every cluster satisfies the maximum pairwise distance.

    Parameters
    ----------
    max_dist : number
        Threshold forwarded to `validate_cluster` for each cluster.
    clusters : numpy.ndarray
        Rows whose first two columns are the point coordinates and whose
        third column (index 2) is the cluster label.

    Returns
    -------
    bool
        False as soon as one cluster fails `validate_cluster`, True otherwise.
    """
    labels = clusters[:, 2]

    # Iterate over every label that actually occurs. The previous
    # `range(max_label)` loop stopped one short and never validated the
    # cluster with the highest label.
    for label in np.unique(labels):
        # Rows of this cluster, coordinates only (label column dropped).
        two_d_cluster = clusters[labels == label][:, :2]
        if not validate_cluster(max_dist, two_d_cluster):
            return False
    return True

### Validating the clusters

In [None]:
def validate_cluster(max_dist,cluster):
    """Return True when no pair of points in `cluster` is farther than `max_dist`.

    The pairwise distances come from `distance`, each converted to an integer
    with `int(round(...))` before being compared against `max_dist`. The full
    distance matrix and a separator line are printed on every call.
    """
    def rounded_distance(ori, des):
        # Metric handed to cdist: distance between two rows, rounded to an int.
        return int(round(distance(ori, des)))

    pairwise = cdist(cluster, cluster, rounded_distance)
    print(pairwise)
    print('-' * 30)

    # The cluster is valid only if no entry exceeds the threshold.
    exceeds_limit = bool((pairwise > max_dist).any())
    return not exceeds_limit

### Get optimum number of clusters

In [None]:
if __name__ == '__main__':
    # Try each candidate cluster count from 2 to 39. For every count, print
    # the count followed by whether the resulting clustering passes
    # validate_solution with a maximum distance of 10 (distance() works in km).
    # Note: this is a maximum-distance feasibility check, not an elbow-method
    # analysis.
    for i in range(2, 40):
        print(i)
        is_valid = validate_solution(10, create_clusters(i, points))
        print(is_valid)

The code above gave 11 as the optimum number of clusters.

### Get cluster centroids for the optimum clusters got

In [17]:
# Re-run the clustering with the chosen cluster count and show the centroid
# coordinates of each cluster.
results, centroids = cluster_centroids(number_of_clusters=11, points=points)
print(centroids)

[[24.42615051 72.93411798]
 [19.19396304 72.88359209]
 [28.71938616 77.21987989]
 [24.06662515 86.40353931]
 [17.76687581 78.3491089 ]
 [12.80035239 77.80485137]
 [25.45952024 80.61005932]
 [25.07970141 83.72000511]
 [22.67560356 76.81881999]
 [18.41930331 73.96678983]
 [26.53761836 76.10917941]]


These are the centroids of the 11 clusters, which will be fed to the Google Places API to get the required data.