In [269]:
import pandas as pd
import numpy as np
import folium
from folium import plugins
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import datetime
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

In [270]:
df = pd.read_csv('data/aout_2015.csv', sep=',',dtype={"latitude":float,"longitude":float,"wind_direction_true":float,"wind_speed":float,"present_weather":float,"sea_level_pressure":float,"air_temperature":float,"wetbulb_temperature":float,"sea_surface_temp":float},low_memory=False)

In [271]:
# delete index of all dataset
df = df.drop(df.columns[0],axis=1)
df_origin_data = df
df.head(10)

Unnamed: 0,latitude,longitude,wind_direction_true,wind_speed,present_weather,sea_level_pressure,air_temperature,wetbulb_temperature,sea_surface_temp,total_cloud_amount,cloud_height,wave_direction,wave_period,wave_height,swell_direction,swell_period,swell_height,timestamp
0,45.8,-142.7,310.0,16.5,3.0,1020.0,15.5,14.0,16.0,7.0,6,,8.0,4.5,31.0,8.0,4.5,2005-08-27 17:00:00+00:00
1,47.5,-17.5,240.0,25.0,3.0,1016.5,21.0,19.5,21.5,,,,5.0,4.0,23.0,6.0,,2005-08-24 00:00:00+00:00
2,59.0,-11.4,260.0,10.3,,1001.2,11.8,,13.1,,A,,7.0,4.5,,,,2005-08-26 10:00:00+00:00
3,18.4,118.4,330.0,18.0,61.0,994.8,28.4,27.0,32.0,8.0,3,,12.0,6.0,6.0,10.0,4.0,2005-08-12 06:00:00+00:00
4,39.3,-11.3,20.0,17.0,1.0,1018.6,21.0,19.0,20.8,3.0,6,,8.0,4.0,2.0,9.0,4.0,2005-08-19 18:00:00+00:00
5,42.4,5.8,310.0,15.4,1.0,1010.7,22.4,,18.6,3.0,6,,10.0,6.0,,,,2005-08-22 15:00:00+00:00
6,68.6,-111.1,250.0,5.1,2.0,1013.1,10.5,,,6.0,9,,22.0,40.0,,,,2005-08-17 21:00:00+00:00
7,12.8,52.2,200.0,17.0,,1005.0,25.0,23.0,21.0,,,,7.0,6.0,22.0,5.0,4.0,2005-08-21 18:00:00+00:00
8,57.1,-177.6,170.0,16.0,,1000.8,9.6,,9.9,,A,,10.0,4.5,,,,2005-08-05 16:49:00+00:00
9,53.1,-11.2,280.0,7.7,,1002.6,14.3,,15.6,,A,,9.0,4.0,,,,2005-08-24 20:00:00+00:00


In [272]:
def fill_NaN_and_String(df):
    df = df.replace('A',10)
    df.fillna(0,inplace=True)
    return df

In [273]:
def keep_only_temp(df):
    df = df.drop(["wind_direction_true","present_weather","sea_surface_temp","wetbulb_temperature","total_cloud_amount","cloud_height","wave_direction","wave_period","wave_height","swell_direction","swell_period","swell_height"],axis=1)
    return df

In [274]:
df = keep_only_temp(df)
df = df.dropna()
#df = fill_NaN_and_String(df)
df.head(10)

Unnamed: 0,latitude,longitude,wind_speed,sea_level_pressure,air_temperature,timestamp
0,45.8,-142.7,16.5,1020.0,15.5,2005-08-27 17:00:00+00:00
1,47.5,-17.5,25.0,1016.5,21.0,2005-08-24 00:00:00+00:00
2,59.0,-11.4,10.3,1001.2,11.8,2005-08-26 10:00:00+00:00
3,18.4,118.4,18.0,994.8,28.4,2005-08-12 06:00:00+00:00
4,39.3,-11.3,17.0,1018.6,21.0,2005-08-19 18:00:00+00:00
5,42.4,5.8,15.4,1010.7,22.4,2005-08-22 15:00:00+00:00
6,68.6,-111.1,5.1,1013.1,10.5,2005-08-17 21:00:00+00:00
7,12.8,52.2,17.0,1005.0,25.0,2005-08-21 18:00:00+00:00
8,57.1,-177.6,16.0,1000.8,9.6,2005-08-05 16:49:00+00:00
9,53.1,-11.2,7.7,1002.6,14.3,2005-08-24 20:00:00+00:00


In [275]:
print(len(df))


376146


In [276]:
sea_level = []
air_temp = []
data = []
latitude = []
longitude = []
wind_speed= []

In [277]:
def keep_by_day(day):
    global sea_level,air_temp,data,latitude,longitude,wind_speed
    sea_level = []
    air_temp = []
    data = []
    latitude = []
    longitude = []
    wind_speed= []
    for index, row in df.iterrows():
        storedTime = datetime.datetime.strptime(row["timestamp"][:-6], "%Y-%m-%d %H:%M:%S")
        if storedTime.day == day:
    #    if -8 < row['latitude'] < 62 and -86 < row['longitude'] < 12:
            sea_level.append(row['sea_level_pressure'])
            air_temp.append(row['air_temperature'])
            latitude.append(row['latitude'])
            longitude.append(row['longitude'])
            wind_speed.append(row['wind_speed'])
            data.append(row)

In [278]:
def kmeans_on_pressure_temp():
    global sea_level,air_temp,data,latitude,longitude,wind_speed
    latitude = np.array(latitude)
    longitude = np.array(longitude)
    sea_level = np.array(sea_level)
    air_temp = np.array(air_temp)
    air_pressure = np.concatenate(sea_level,air_temp)
    wind_speed = np.array(wind_speed)
    X = np.dstack((sea_level,air_pressusre))
    #X = StandardScaler().fit_transform(dataZone)
    kmeans = KMeans(n_clusters=nb_clusters,random_state=1,n_jobs=-1)
    kmeans.fit(X_modif)
    
    idx = np.argsort(kmeans.cluster_centers_.sum(axis=1))
    lut = np.zeros_like(idx)
    lut[idx] = np.arange(nb_clusters)
    #lut[kmeans.labels_]
    y_kmeans = lut[kmeans.labels_] #kmeans.predict(X)
    
    return y_kmeans

#centers = kmeans.cluster_centers_
#plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5);

In [279]:
def show_graph(y_kmeans):
    pos = np.column_stack((latitude, longitude))
    plt.scatter(pos[:,0], pos[:, 1], c=y_kmeans, s=50, cmap='viridis')
    plt.show()

In [280]:
nb_clusters = 8

In [281]:
# m = folium.Map([0,0],zoom_start=1.5,prefer_canvas=True)
# m

In [282]:
def tsne_algorithm():
    global sea_level,air_temp,data,latitude,longitude
    latitude = np.array(latitude)
    longitude = np.array(longitude)
    sea_level = np.array(sea_level)
    air_temp = np.array(air_temp)
    X = np.column_stack((latitude,longitude,sea_level, air_temp))
    X_embedded = TSNE(n_components=2,learning_rate=80,n_iter=300).fit_transform(X)
    plt.scatter(X_embedded[:,0], X_embedded[:, 1], cmap='viridis')
    plt.show()
#tsne_algorithm()

In [283]:
def plotDot(points, cluster):
    colors = ["#e6194B","#f58231","#ffe119","#bfef45","#3cb44b","#42d4f4","#911eb4","#000000"]
    # rouge,orange,jaune,vert clair,vert foncé, bleu, violet,noir
    for index,point in enumerate(points):
        for i in range(nb_clusters):
            if cluster[index] == i :
                folium.CircleMarker([point["latitude"], point["longitude"]],
                                radius=3,
                                weight=0.5, color=colors[i]).add_to(m)
            

In [284]:
nb_day = 31
for i in range(1,nb_day+1):
    title = "cluster_on_map_day{}.html".format(i)
    m = folium.Map([0, 0], zoom_start=1.5,prefer_canvas=True)
    keep_by_day(i)
    y_kmeans = kmeans_on_pressure_temp()
    plotDot(data,y_kmeans)
    m.save(title)


[[1008.5    7. ]
 [  28.2 1005.2]
 [  21.    25. ]
 ...
 [1015.6    6.1]
 [  13.6 1020.2]
 [   4.    14.4]]


ValueError: cannot reshape array of size 37503 into shape (2)

In [None]:
def show_scatter_graph(day):
    keep_by_day(day)
    y_kmeans = kmeans_on_pressure_temp()
    show_graph(y_kmeans)

In [None]:
show_scatter_graph(28)

In [None]:
#m = folium.Map([0, 0], zoom_start=1.5,prefer_canvas=True)
#keep_by_day(18)
#y_brc = birch_algorithm()
#plotDot(data,y_brc)
#m.save("test.html")