In [1]:
import pandas as pd
import json
import requests
import folium
from sklearn.cluster import KMeans
from folium.plugins import FastMarkerCluster

In [2]:
url = 'https://demo.brewlytics.com/blocking/api/jobs?m=b5a3bb0c-589c-4312-a054-4ed29f8be09c&AOI_9562a686-409c-4e97-ab86-df3bd1312384=BOX(-220.429688%20-17.308688,-99.140625%2054.367759)&o=0fd17264-49dc-40fa-9aac-eed2193e682a'
headers = {'X-REQUEST-TOKEN': 'CuU7s34kJp3MjrbFyr8SuP'}
req= requests.get(url,headers= headers)

In [3]:
data=req.json()
data['types']

[{'id': 'string'},
 {'longitude': 'decimal'},
 {'latitude': 'decimal'},
 {'location': 'point'},
 {'elevation': 'decimal'},
 {'name': 'string'},
 {'owner': 'string'},
 {'pgm': 'string'},
 {'type': 'string'},
 {'met': 'bool'},
 {'currents': 'bool'},
 {'water_quality': 'bool'},
 {'dart': 'bool'}]

In [4]:
df = pd.DataFrame.from_dict(data['table'], orient = 'columns')
df.head(4)

Unnamed: 0,elevation,owner,latitude,pgm,name,location,id,type,longitude,currents,dart,water_quality,met
0,0.0,NDBC,5.0,TAO,5N 110W,POINT(-110 5),32315,tao,-110.0,,,,
1,0.0,NDBC,2.0,TAO,2N 110W,POINT(-110 2),32316,tao,-110.0,,,,
2,0.0,NDBC,-2.0,TAO,2S 110W,POINT(-110 -2),32317,tao,-110.0,,,,
3,0.0,NDBC,-5.0,TAO,5S 110W,POINT(-110 -5),32318,tao,-110.0,,,,


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288 entries, 0 to 287
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   elevation      230 non-null    float64
 1   owner          288 non-null    object 
 2   latitude       288 non-null    float64
 3   pgm            288 non-null    object 
 4   name           288 non-null    object 
 5   location       288 non-null    object 
 6   id             288 non-null    object 
 7   type           288 non-null    object 
 8   longitude      288 non-null    float64
 9   currents       254 non-null    object 
 10  dart           254 non-null    object 
 11  water_quality  254 non-null    object 
 12  met            254 non-null    object 
dtypes: float64(3), object(10)
memory usage: 29.4+ KB


In [6]:
df.shape

(288, 13)

In [7]:
df.drop_duplicates(keep = 'first', inplace = True)

In [8]:
missing = df.shape[0] - df.count()
missing

elevation        58
owner             0
latitude          0
pgm               0
name              0
location          0
id                0
type              0
longitude         0
currents         34
dart             34
water_quality    34
met              34
dtype: int64

In [9]:
#fills missing values for elevation with average of before and after elevation point
df.elevation = df.elevation.interpolate()

In [10]:
df.head(4)

Unnamed: 0,elevation,owner,latitude,pgm,name,location,id,type,longitude,currents,dart,water_quality,met
0,0.0,NDBC,5.0,TAO,5N 110W,POINT(-110 5),32315,tao,-110.0,,,,
1,0.0,NDBC,2.0,TAO,2N 110W,POINT(-110 2),32316,tao,-110.0,,,,
2,0.0,NDBC,-2.0,TAO,2S 110W,POINT(-110 -2),32317,tao,-110.0,,,,
3,0.0,NDBC,-5.0,TAO,5S 110W,POINT(-110 -5),32318,tao,-110.0,,,,


In [11]:
df.drop(columns=['location'],inplace=True)

In [12]:
locations = df[['latitude','longitude']]
location_list = locations.values.tolist()
location_list[7]

[16.003, -106.989]

In [13]:
len(location_list)

288

In [14]:
df.shape[0]

288

In [15]:
map = folium.Map(lostion=[-110, 2], zoom_start = 10)
for point in range(0,len(location_list)):
    folium.Marker(location_list[point],popup = df['name'][point]).add_to(map)


In [16]:
map

In [17]:
df.nunique()

elevation         76
owner             23
latitude         257
pgm                7
name             288
id               288
type               5
longitude        252
currents           2
dart               2
water_quality      2
met                2
dtype: int64

In [18]:
#taking id and name out of a new df to get dummy because they are both individual identifiers with 288 valoues and shouold not be used to cluster 
data = df[df.columns.difference(['id','name'],sort=False)]
data

Unnamed: 0,elevation,owner,latitude,pgm,type,longitude,currents,dart,water_quality,met
0,0.000,NDBC,5.000,TAO,tao,-110.000,,,,
1,0.000,NDBC,2.000,TAO,tao,-110.000,,,,
2,0.000,NDBC,-2.000,TAO,tao,-110.000,,,,
3,0.000,NDBC,-5.000,TAO,tao,-110.000,,,,
4,0.000,NDBC,-8.000,TAO,tao,-110.000,,,,
...,...,...,...,...,...,...,...,...,...,...
283,3.000,NOS,53.880,NOS/CO-OPS,fixed,-166.537,False,False,False,True
284,0.000,NOAA NOS PORTS,38.038,NOS/CO-OPS,fixed,-122.121,False,False,False,True
285,328.574,National Park Service - Lake Mead National Rec...,36.132,IOOS Partners,buoy,-114.412,False,False,False,False
286,3.000,NDBC,47.662,NDBC Meteorological/Ocean,fixed,-122.436,False,False,False,True


In [68]:
data =pd.get_dummies(data,dummy_na=True)

In [69]:
data

Unnamed: 0,elevation,latitude,longitude,cluster,"owner_APL-UW, University of Washington Applied Physics Laboratory",owner_California Polytechnic State University,owner_Central and Northern California Ocean Observing System,owner_Columbia River Inter-Tribal Fish Commission,owner_Environment and Climate Change Canada,owner_MBARI,...,currents_nan,dart_False,dart_True,dart_nan,water_quality_False,water_quality_True,water_quality_nan,met_False,met_True,met_nan
0,0.000,5.000,-110.000,0,0,0,0,0,0,0,...,1,0,0,1,0,0,1,0,0,1
1,0.000,2.000,-110.000,0,0,0,0,0,0,0,...,1,0,0,1,0,0,1,0,0,1
2,0.000,-2.000,-110.000,0,0,0,0,0,0,0,...,1,0,0,1,0,0,1,0,0,1
3,0.000,-5.000,-110.000,0,0,0,0,0,0,0,...,1,0,0,1,0,0,1,0,0,1
4,0.000,-8.000,-110.000,0,0,0,0,0,0,0,...,1,0,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,3.000,53.880,-166.537,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,1,0
284,0.000,38.038,-122.121,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,1,0
285,328.574,36.132,-114.412,1,0,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
286,3.000,47.662,-122.436,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,1,0


In [70]:

data_matrix= data.values


## Clustering with Kmeans

In [71]:
km=KMeans(n_clusters=4)
y = km.fit_predict(data_matrix)

In [72]:
df['cluster'] = y
df.head(5)

Unnamed: 0,elevation,owner,latitude,pgm,name,id,type,longitude,currents,dart,water_quality,met,cluster
0,0.0,NDBC,5.0,TAO,5N 110W,32315,tao,-110.0,,,,,1
1,0.0,NDBC,2.0,TAO,2N 110W,32316,tao,-110.0,,,,,1
2,0.0,NDBC,-2.0,TAO,2S 110W,32317,tao,-110.0,,,,,1
3,0.0,NDBC,-5.0,TAO,5S 110W,32318,tao,-110.0,,,,,1
4,0.0,NDBC,-8.0,TAO,8S 110W,32319,tao,-110.0,,,,,1


In [73]:
df.cluster.value_counts()

1    208
0     72
2      5
3      3
Name: cluster, dtype: int64

In [74]:

cluster_list = df['cluster'].tolist()

In [75]:
len(cluster_list),len(location_list)

(288, 288)

In [76]:
map = folium.Map(lostion=[-110, 2], zoom_start = 10)
for point in range(0,len(location_list)):
        if cluster_list[point] == 0:
            folium.Marker(location_list[point], popup = df['name'][point], icon=folium.Icon(color="green")).add_to(map)
        elif cluster_list[point] == 1:
            folium.Marker(location_list[point],popup = df['name'][point], icon=folium.Icon(color="red")).add_to(map)
        elif cluster_list[point] == 2:
            folium.Marker(location_list[point],popup = df['name'][point], icon=folium.Icon(color="orange")).add_to(map)
        else:
            folium.Marker(location_list[point],popup = df['name'][point],  icon=folium.Icon(color="blue")).add_to(map)
 

In [77]:
map

## clustering wtih DBSCAN

In [19]:
import numpy as np
from sklearn.cluster import DBSCAN

In [20]:
kms_per_radian = 6371.0088
epsilon = 1000/kms_per_radian
db = DBSCAN(eps = epsilon, min_samples = 1, algorithm = 'ball_tree',metric='haversine').fit(np.radians(locations))

In [23]:
dbscan_df= df.copy()

In [24]:

dbscan_df['cluster']= db.labels_

In [25]:
dbscan_df

Unnamed: 0,elevation,owner,latitude,pgm,name,id,type,longitude,currents,dart,water_quality,met,cluster
0,0.000,NDBC,5.000,TAO,5N 110W,32315,tao,-110.000,,,,,0
1,0.000,NDBC,2.000,TAO,2N 110W,32316,tao,-110.000,,,,,0
2,0.000,NDBC,-2.000,TAO,2S 110W,32317,tao,-110.000,,,,,0
3,0.000,NDBC,-5.000,TAO,5S 110W,32318,tao,-110.000,,,,,0
4,0.000,NDBC,-8.000,TAO,8S 110W,32319,tao,-110.000,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,3.000,NOS,53.880,NOS/CO-OPS,"9462620 - Unalaska, AK",unla2,fixed,-166.537,False,False,False,True,1
284,0.000,NOAA NOS PORTS,38.038,NOS/CO-OPS,"9415118 - Union Pacific Rail Road Bridge, Mart...",upbc1,fixed,-122.121,False,False,False,True,1
285,328.574,National Park Service - Lake Mead National Rec...,36.132,IOOS Partners,Lake Mead - Virgin Basin,vbba3,buoy,-114.412,False,False,False,False,1
286,3.000,NDBC,47.662,NDBC Meteorological/Ocean,"West Point, WA",wpow1,fixed,-122.436,False,False,False,True,1


In [28]:
dbscan_df.cluster.value_counts(sort=False)

0      9
1    222
2     26
3      6
4     10
5      7
6      7
7      1
Name: cluster, dtype: int64

In [30]:
dbscan_df['cluster'].nunique()

8

In [32]:


# Create a map
m = folium.Map()



# Create a dictionary to map class labels to colors
color_dict = {0: 'red',1: 'green', 2: 'blue', 3: 'orange',4:'yellow',5: 'indigo',6: 'violet', 7: 'purple'}

# Iterate through the data and add a marker for each location
for _, row in dbscan_df.iterrows():
    lat = row['latitude']
    lng = row['longitude']
    label = row['cluster']
    # Use the 'class' label to determine the color of the marker
    color = color_dict.get(label, 'black')  # use 'black' as the default color
    folium.Marker(location=[lat, lng], popup=label, icon=folium.Icon(color=color)).add_to(m)

# Display the map
m

    

  folium.Marker(location=[lat, lng], popup=label, icon=folium.Icon(color=color)).add_to(m)


## PCA adn Kmeans

In [82]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [83]:
scaler = StandardScaler()