# Build a Clustering Model to Perform a Customer Geolocation Data Clustering with K-Means Algorithm

This is the flow process of geolocation data in the clustering algorithm :

![image alt](static/flow_process.png "Flow Process")

In [1]:
!pip install geopandas



## Geocoding : From Address to Longitude & Latitude

In [2]:
from geopandas.tools import geocode

In [3]:
# Customer Address Needed to Perform Requests for Fetching Geolocation Data
customer_address = "RIVIERA, CASABLANCA"

latitude, longitude = None, None

try:
    result = geocode(customer_address, provider="nominatim")
    geo_location = result.geometry.iloc[0]
    latitude = geo_location.y
    longitude = geo_location.x
except:
    pass

print(f"Your address : {customer_address}")
print(f"Your geolocation : {latitude, longitude}")



Your address : RIVIERA, CASABLANCA
Your geolocation : (33.5709378, -7.6363549)


## Import Geolocation Data

In [4]:
pip install pandas numpy sklearn



In [5]:
import pandas as pd

In [6]:
file_url = "dataset/coord.csv"

data = pd.read_csv(file_url, delimiter=";")
features = data[['latitude', 'longitude']]
print(features)

      latitude  longitude
0    33.546479  -7.655312
1    33.549269  -7.650409
2    33.540746  -7.641676
3    33.585636  -7.619398
4    33.598570  -7.647054
..         ...        ...
995  33.555675  -7.646576
996  33.570501  -7.632851
997  33.556785  -7.621489
998  33.565970  -7.651056
999  33.596105  -7.646280

[1000 rows x 2 columns]


## K-Means Model & Training

The basic pseudocode of the K-Means algorithm is :

>**begin** <br>
   specify the number k of clustering to assign.<br>
   randomly initialize k centroids.<br>
   **repeat**<br>
      **expectation:** Assign each point to its closet centroid.   
      **maximization:** Compute the new centroid (mean) of each cluster.<br>
   **until** The centroid position do not change.<br>
**end**

In [7]:
from sklearn.cluster import KMeans

In [8]:
# create kmeans model

kmeans = KMeans(
    init="random",
    n_clusters=16,
    n_init=10,
    max_iter=300,
    random_state=42
)

In [9]:
# training clustering model

kmeans.fit(features)

# saving resulting labels
labels = kmeans.labels_

In [10]:
# sending results back into dataframe and display it
data['cluster'] = labels

# display the number of mamber each clustering
_clusters = data.groupby('cluster')['Index'].count()
print(_clusters)

cluster
0     64
1     50
2     71
3     79
4     68
5     53
6     72
7     57
8     45
9     63
10    57
11    59
12    59
13    63
14    73
15    67
Name: Index, dtype: int64


## Clustering with Constrained Problem

In [11]:
pip install k-means-constrained



In [12]:
import numpy as np

In [13]:
file_url = "dataset/coord.csv"

data_2 = pd.read_csv(file_url, delimiter=";")

# convert our features dataframe into an X array
features_2 = data[['latitude', 'longitude']]
X = np.array(features_2)
print(X[:10])

[[33.5464789  -7.65531218]
 [33.54926861 -7.65040856]
 [33.54074579 -7.64167577]
 [33.58563566 -7.6193976 ]
 [33.59856997 -7.64705397]
 [33.59408516 -7.62597458]
 [33.56767724 -7.64328812]
 [33.5882301  -7.60901599]
 [33.54945347 -7.64815838]
 [33.57455478 -7.62984487]]


In [14]:
from k_means_constrained import KMeansConstrained

In [15]:
# create constrained kmeans model

clf = KMeansConstrained(
    n_clusters=16,
    size_min=39,
    size_max=63,
    random_state=0
)

In [16]:
# training kmeans constrained model with 
# min and max size number of each cluster 

clf.fit_predict(X)

# saving resulting labels
labels_2 = clf.labels_

In [17]:
# send back results into dataframe 
data['cluster_2'] = labels_2

# display the number of mamber each clustering
_clusters_2 = data.groupby('cluster_2')['Index'].count()
print(_clusters_2)

cluster_2
0     63
1     62
2     63
3     63
4     63
5     63
6     63
7     63
8     63
9     63
10    63
11    57
12    63
13    62
14    63
15    63
Name: Index, dtype: int64


## Visualization of the Result

In [18]:
pip install streamlit-folium



In [26]:
import folium
import streamlit as st
from streamlit_folium import folium_static

In [36]:
colors = ['red', 'blue', 'green', 'purple', 'orange', 'darkred',
          'lightred', 'beige', 'darkblue', 'darkgreen', 'cadetblue',
          'darkpurple', 'pink', 'lightblue', 'lightgreen', 'gray']

lat = data.iloc[0]['latitude']
lng = data.iloc[0]['longitude']


### Map containing results of Geolocation Data Clustering (First Model Results)

In [38]:
# Map containing results of Geolocation Data Clustering (First Model Results)

map = folium.Map(location=[latitude, longitude], zoom_start=13)

for _, row in data.iterrows():
    folium.CircleMarker(
        location=[row["latitude"], row["longitude"]],
        radius=12,
        weight=2,
        fill=True,
        fill_color=colors[int(row["cluster"])],
        color=colors[int(row["cluster"])]
    ).add_to(map)
    
# Displaying Map
folium_static(map)

DeltaGenerator(_root_container=0, _provided_cursor=None, _parent=None, _block_type=None, _form_data=None)

### Map containing results of Geolocation Data Constrained Clustering (min: 39, max: 63)

In [39]:
# Map containing results of Geolocation Data Constrained Clustering (min: 39, max: 63)

map_2 = folium.Map(location=[latitude, longitude], zoom_start=13)

for _, row in data.iterrows():
    folium.CircleMarker(
        location=[row["latitude"], row["longitude"]],
        radius=12,
        weight=2,
        fill=True,
        fill_color=colors[int(row["cluster_2"])],
        color=colors[int(row["cluster_2"])]
    ).add_to(map_2)

In [40]:
# Displaying Map 2
folium_static(map_2)

DeltaGenerator(_root_container=0, _provided_cursor=None, _parent=None, _block_type=None, _form_data=None)