In [2]:
import pandas as pd
import numpy as np
import random
import os


import geopandas as gpd
from shapely.geometry import Point
from sklearn.neighbors import BallTree

In [159]:
# Load the prepared classification dataset and show its (rows, columns) size.
classification_csv = "../../data/classification_ready_data.csv"
df = pd.read_csv(classification_csv)
df.shape

(1364460, 17)

In [160]:
# Number of distinct cluster labels present in the data.
cluster_labels = df['cluster_1']
cluster_labels.nunique()

38079

In [161]:
# Keep only the coordinates and the cluster label, ordered by cluster label
# with a fresh 0..n-1 index.
coord_columns = ['latitude', 'longitude', 'cluster_1']
df = df[coord_columns].sort_values(by=['cluster_1'], ignore_index=True)
df

Unnamed: 0,latitude,longitude,cluster_1
0,51.489442,-0.190292,0.0
1,51.489352,-0.190296,0.0
2,51.489352,-0.190296,0.0
3,51.489354,-0.190440,0.0
4,51.489354,-0.190440,0.0
...,...,...,...
1364455,55.543842,-4.660981,54492.0
1364456,55.543781,-4.660850,54492.0
1364457,55.543781,-4.660850,54492.0
1364458,55.543716,-4.660956,54492.0


In [162]:
# Collapse repeated (latitude, longitude, cluster_1) rows so every distinct
# point is kept once; the index is renumbered from 0.
df = df.drop_duplicates(ignore_index=True)
df

Unnamed: 0,latitude,longitude,cluster_1
0,51.489442,-0.190292,0.0
1,51.489352,-0.190296,0.0
2,51.489354,-0.190440,0.0
3,51.488749,-0.192048,0.0
4,51.488920,-0.191465,0.0
...,...,...,...
257398,55.543716,-4.660956,54492.0
257399,55.543978,-4.661291,54492.0
257400,55.543719,-4.660798,54492.0
257401,55.543842,-4.660981,54492.0


In [163]:
# Approximate mid point of every cluster: the plain arithmetic mean of the
# remaining columns (latitude, longitude) within each cluster_1 group.
# Assumption: points of one cluster lie so close together that the Earth can be
# treated as locally flat, so a planar centroid of the coordinates is adequate.
per_cluster = df.groupby(['cluster_1'], as_index=False)
df = per_cluster.mean()

df

Unnamed: 0,cluster_1,latitude,longitude
0,0.0,51.489206,-0.190764
1,1.0,51.519975,-0.211626
2,2.0,51.482212,-0.173765
3,3.0,51.501756,-0.191344
4,4.0,51.483387,-0.167298
...,...,...,...
38074,54488.0,55.803012,-4.051202
38075,54489.0,55.767877,-4.040314
38076,54490.0,55.739641,-3.973837
38077,54491.0,55.823578,-4.085047


In [164]:
# Attach a point geometry (x = longitude, y = latitude) to every cluster centroid.
# NOTE(review): no CRS is assigned here; presumably these are WGS84 degrees - confirm.
centroid_points = gpd.points_from_xy(df['longitude'], df['latitude'])
gdf_df = gpd.GeoDataFrame(df, geometry=centroid_points)
gdf_df

Unnamed: 0,cluster_1,latitude,longitude,geometry
0,0.0,51.489206,-0.190764,POINT (-0.19076 51.48921)
1,1.0,51.519975,-0.211626,POINT (-0.21163 51.51998)
2,2.0,51.482212,-0.173765,POINT (-0.17376 51.48221)
3,3.0,51.501756,-0.191344,POINT (-0.19134 51.50176)
4,4.0,51.483387,-0.167298,POINT (-0.16730 51.48339)
...,...,...,...,...
38074,54488.0,55.803012,-4.051202,POINT (-4.05120 55.80301)
38075,54489.0,55.767877,-4.040314,POINT (-4.04031 55.76788)
38076,54490.0,55.739641,-3.973837,POINT (-3.97384 55.73964)
38077,54491.0,55.823578,-4.085047,POINT (-4.08505 55.82358)


In [165]:
# Index the cluster centroids for nearest-neighbour lookup. Columns are ordered
# (longitude, latitude), so queries must use the same column order.
# NOTE(review): no metric is passed, so sklearn's default is used on raw degree
# values; presumably that is a Euclidean distance, which does not account for a
# degree of longitude being shorter than a degree of latitude - confirm whether
# that approximation is acceptable for this use case.
centroid_coords = gdf_df[['longitude', 'latitude']].values
tree = BallTree(centroid_coords, leaf_size=2)

In [166]:
# Example: find the cluster whose centroid is nearest to a single query point.
input_df = pd.DataFrame({'longitude': [-3.716685], 'latitude': [50.534589]})

# tree.query returns (distances, indices); only the indices are used. With k=1
# the index array has one column holding the row position of the nearest centroid.
query_coords = input_df[['longitude', 'latitude']].values
_, nearest_idx = tree.query(query_coords, k=1)

# Translate row positions in gdf_df into the corresponding cluster labels.
input_df['cluster_1'] = gdf_df['cluster_1'].iloc[nearest_idx[:, 0]].values
input_df

Unnamed: 0,longitude,latitude,cluster_1
0,-3.716685,50.534589,51280.0


In [168]:
import pickle

# Persist the fitted BallTree and the centroid table it was built from, so that
# tree indices can later be mapped back to cluster labels.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically instead of being left open until garbage collection.
with open('./models/ball_tree_cluster.tree', 'wb') as tree_file:
    pickle.dump(tree, tree_file)

with open('./models/gdf_df.data', 'wb') as centroids_file:
    pickle.dump(gdf_df, centroids_file)

In [173]:
# Cluster label assigned to the first (and only) example query point.
input_df['cluster_1'].iloc[0]

51280.0


#### Weather data for fake weather forecast info
[Demonstration purposes only]
This gives fake weather details to test the models.


In [3]:
# Load the weather CSV that serves as the source of dummy forecast rows.
# Note: this rebinds `df`, replacing the cluster table used in the cells above.
weather_csv = "./weather_data_gather/DATASETS/avg.csv"
df = pd.read_csv(weather_csv)

In [6]:
# Keep only the weather columns, in this order.
weather_columns = [
    'air_temperature',
    'cld_ttl_amt_id',
    'dewpoint',
    'ground_state_id',
    'rltv_hum',
    'wind_direction',
    'wind_speed',
]
df = df[weather_columns]

In [21]:
# Draw one pseudo-random row of the weather table as a fake forecast record.
# `random` is already imported in the notebook's import cell.
# A fixed seed makes the draw repeatable for a given table length.
random.seed(.324210)

# Take the bound from the frame itself rather than a hard-coded row count, so
# the draw can never point past the end of the data (and can reach every row).
SIZE = len(df)

# Positional lookup (.iloc) keeps the draw valid whatever the index labels are;
# the single-row frame is converted to a plain {column: value} dict.
df.iloc[[random.randrange(SIZE)]].to_dict('records')[0]

{'air_temperature': 11.2144911691491,
 'cld_ttl_amt_id': 7.483703290854838,
 'dewpoint': 10.283244681504305,
 'ground_state_id': 6.420100702227023,
 'rltv_hum': 93.75856187911543,
 'wind_direction': 135.1698437754195,
 'wind_speed': 11.01295497395501}

In [12]:
import pickle

# Persist the trimmed weather table used to produce fake forecast records.
# The `with` block guarantees the file handle is flushed and closed.
with open('./models/dummy_weather.df', 'wb') as weather_file:
    pickle.dump(df, weather_file)