In [1]:
import pandas as pd
import datetime
import random
import os

from bokeh.plotting import figure
from bokeh.palettes import Category20
from bokeh.layouts import gridplot
from bokeh.io import output_notebook, show

# Send Bokeh output to the notebook's output cells.
output_notebook()

# pandas display settings: no cap on the number of columns shown, and a
# display width of 1000 characters.
display_options = {
    'display.max_columns': None,
    'display.width': 1000,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# The 20-colour variant of Bokeh's Category20 palette; indexed per cluster
# by the plotting cell further down.
palette = Category20[20]

In [2]:
# Step up from the current directory to its parent (presumably the project
# root, so that the `src.*` imports and the relative "outputs/" path used by
# later cells resolve - confirm against the repository layout).
# os.path.dirname is used instead of splitting on "/" by hand: the manual
# split produces an empty path (and a failing chdir) when the current
# directory sits directly under the filesystem root, and does not understand
# non-POSIX separators.
# NOTE: this cell is not idempotent - every re-run moves one more level up.
os.chdir(os.path.dirname(os.getcwd()))
print("working dir", os.getcwd())

working dir /home/tales/dev/mdc_analysis


In [3]:
from src.dao import csv_dao
from src.utils import geo
from src.plot import plot

from sklearn.cluster import KMeans

# Data Points

In [4]:
# Build one table of distinct (longitude, latitude) pairs across all users.
# Each user's frame is collected in a list and combined with a single
# pd.concat: DataFrame.append was removed in pandas 2.0, and appending plus
# de-duplicating the accumulated frame on every iteration is quadratic.
user_frames = []

for user_gps_file in csv_dao.list_user_gps_files():
    # The first four characters of the file name are used as the user id.
    userid = user_gps_file[0:4]
    try:
        user_gps_data = csv_dao.load_user_gps_csv(userid=userid)[["longitude", "latitude"]]
    except pd.errors.EmptyDataError:
        # Skip users whose CSV file has no data.
        continue
    # De-duplicating per user keeps the list of frames small before the final concat.
    user_frames.append(user_gps_data.drop_duplicates())

if user_frames:
    gps_df = pd.concat(user_frames).drop_duplicates().reset_index(drop=True)
else:
    # pd.concat raises on an empty list; fall back to an empty coordinate table.
    gps_df = pd.DataFrame(columns=["longitude", "latitude"])

In [5]:
# Preview the first rows of the combined coordinate table.
gps_df.head()

Unnamed: 0,longitude,latitude
0,6.56,46.51
1,6.565,46.52
2,6.564,46.52
3,6.565,46.521
4,6.5655,46.522


In [None]:
# Draw a random sample of 1/400 of the rows and keep their index labels.
# random_state pins the draw so that re-running the notebook selects the
# same points (0 matches the seed used for KMeans below).
sample_i = gps_df.sample(n=len(gps_df) // 400, random_state=0).index.tolist()
len(sample_i)

In [None]:
# Not feasible even when plotting a sample half this size.
# Slice the sampled rows once and reuse them for both axes.
sample_df = gps_df.loc[sample_i]
p = figure(title="All Data Points", plot_width=800, plot_height=600)
p.circle(x=sample_df["latitude"], y=sample_df["longitude"], size=2, alpha=0.2)
show(p)

In [6]:
n_clusters = 16
# Fit on the two coordinate columns only. This cell adds a "cluster" column
# to gps_df, so fitting on the whole frame would feed the previous labels
# back in as a third feature whenever the cell is run again.
coordinates = gps_df[["longitude", "latitude"]]
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(coordinates)
# One label per row, in row order.
gps_df["cluster"] = kmeans.labels_

In [None]:
# One small scatter plot per cluster, laid out in rows of `plots_per_row`.
plots_per_row = 4
grid_p = [[]]
for color_i, cluster in enumerate(gps_df["cluster"].drop_duplicates()):
    # Wrap around the palette so that more clusters than colours cannot
    # raise an IndexError.
    color = palette[color_i % len(palette)]
    cluster_df = gps_df[gps_df["cluster"] == cluster]

    # Every plot uses the same fixed axis ranges so the panels are comparable.
    p = figure(title="All Data Points - " + str(n_clusters) + " Clusters", plot_width=200, plot_height=200, x_range=(45.5, 48), y_range=(6.5, 11))
    p.circle(x=cluster_df["latitude"], y=cluster_df["longitude"], size=2, alpha=0.4, color=color)

    # Open a new row before the plot that starts it (indices 4, 8, 12, ...).
    # Checking for remainder 3 before appending put only three plots in the
    # first row and shifted every later row by one.
    if color_i > 0 and color_i % plots_per_row == 0:
        grid_p.append([])

    grid_p[-1].append(p)

show(gridplot(grid_p))

In [9]:
output_dir = "outputs/data_points_clusters"
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

# Files are numbered by the order in which cluster labels first appear in
# gps_df, which is not necessarily the label value itself; the label is
# still available in each file's "cluster" column.
for cluster_n, cluster in enumerate(gps_df["cluster"].drop_duplicates()):
    cluster_df = gps_df[gps_df["cluster"] == cluster]
    cluster_df.to_csv(output_dir + "/cluster_" + str(cluster_n) + ".csv", index=False)

In [7]:
# Number of rows in gps_df.
len(gps_df)

3672896