In [1]:
import pandas as pd
import random
import os
import gc

import matplotlib.pyplot as plt

import numpy as np

from bokeh.palettes import Spectral10
from bokeh.io import output_notebook, show

output_notebook()
                                                                                                                                                                                                                                                                                                                                                        
# Display options: show every column and use a wide output line.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# Default bokeh palette. NOTE: `palette` is rebound to a dict of named colours
# in the HOME centroid cell further down, so this value is only valid until then.
palette = Spectral10

# Render floats with three decimals in DataFrame output.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import bokeh
bokeh.__version__  # no visible effect here: only a cell's last expression is displayed

# Switch to the project root before the project-local `src` imports that follow
# (presumably they are resolved relative to the working directory - confirm).
# The root can be overridden with the MDC_ANALYSIS_HOME environment variable so the
# notebook also runs on machines where the original author's path does not exist.
os.chdir(os.environ.get("MDC_ANALYSIS_HOME", "/home/tales/dev/master/mdc_analysis/"))
print("working dir", os.getcwd())

from src.dao.csv_dao import load_user_gps_csv, load_user_gps_time_window, load_user_stop_regions_centroids
from src.dao.dbdao import DBDAO
from src.utils.math import normalize
from src.utils.time_utils import enrich_time_columns, local_time
from src.utils.geo import cluster_centroid, weighted_cluster_centroid
from src.data_processment.stop_region import MovingCentroidStopRegionFinder
from src.plot.plot import add_centroid_figure, add_calculated_centroid_figure
from src.plot import plot
from src.utils.geo import index_clusters
from src.plot import plot
from bokeh.layouts import gridplot

working dir /home/tales/dev/master/mdc_analysis


In [2]:
def load_users_gps_data(userids, cols=["userid", "latitude", "longitude", "tz", "time", "local_time", "horizontal_accuracy", "horizontal_dop", "speed"]):
    """Load the GPS CSV of several users and stack them into a single frame.

    Args:
        userids: iterable of user ids; each one is passed to ``load_user_gps_csv``.
        cols: columns to keep in the result, or the string ``"*"`` to keep every
            column. The default list is never mutated.

    Returns:
        One DataFrame holding the rows of all requested users, tagged with a
        ``userid`` column, passed through ``local_time`` and sorted by
        ``local_time``.
    """
    # Collect the per-user frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the accumulated frame on every iteration.
    frames = []
    for userid in userids:
        user_gps_df = load_user_gps_csv(userid)
        user_gps_df["userid"] = userid
        frames.append(user_gps_df)

    # pd.concat raises on an empty list, so keep the original "no users" behaviour
    # of handing an empty frame to local_time.
    df = pd.concat(frames) if frames else pd.DataFrame()

    df = local_time(df)

    if cols != "*":
        df = df[cols]

    return df.sort_values("local_time")

def load_user_gps_data(userid, cols=["userid", "latitude", "longitude", "tz", "time", "local_time", "horizontal_accuracy", "horizontal_dop", "speed"]):
    """Load one user's GPS CSV, tag it with the user id and sort it by ``local_time``.

    Args:
        userid: id passed to ``load_user_gps_csv``.
        cols: columns to keep, or the string ``"*"`` to keep all of them.

    Returns:
        The (optionally column-restricted) frame sorted by ``local_time``.
        NOTE(review): unlike ``load_users_gps_data`` this function does not call
        ``local_time``; it relies on the loaded CSV already having that column - confirm.
    """
    gps = load_user_gps_csv(userid)
    row_count = len(gps)
    gps["userid"] = [userid] * row_count

    selected = gps if cols == "*" else gps[cols]
    return selected.sort_values("local_time")

In [3]:
# Two fixed users plus three drawn from DBDAO().users_with_places().
# The draw is seeded so that Restart & Run All analyses the same users every time;
# change USER_SAMPLE_SEED to look at a different random selection.
USER_SAMPLE_SEED = 42
user_ids = [6085, 5965] + DBDAO().users_with_places().sample(3, random_state=USER_SAMPLE_SEED).tolist()
user_ids

[6085, 5965, 5924, 5960, 5964]

In [4]:
def places_home(userid):
    """Return the GPS points a user recorded during their HOME visits.

    Args:
        userid: id of the user to look up.

    Returns:
        The rows of the user's GPS data that fall inside any HOME visit window
        (see ``places``), or an empty DataFrame when ``DBDAO().places_home_df``
        returns no visit for the user.
    """
    # Query the visits once; the result is reused for both the emptiness check
    # and the matching below.
    home_visit_data = DBDAO().places_home_df(userid=userid)
    if len(home_visit_data) == 0:
        return pd.DataFrame()

    home_visit_data = home_visit_data.sort_values("time_start")

    # Only load the GPS trace once we know there is something to match it against.
    user_gps_data = load_user_gps_data(userid)
    return places(home_visit_data, user_gps_data)

def places(place_label_visit_data, user_gps_data):
    """Select the GPS points recorded during the given place visits.

    Args:
        place_label_visit_data: one row per visit with ``time_start``/``tz_start``
            and ``time_end``/``tz_end`` columns. ``local_time`` is applied to both
            pairs and is expected to provide ``local_time_start`` and
            ``local_time_end`` (those are the columns read below).
        user_gps_data: the user's GPS points with a ``local_time`` column.

    Returns:
        The GPS rows whose ``local_time`` lies inside a visit window, both ends
        included, in visit order. A point covered by several visits appears once
        per visit. An empty DataFrame is returned when there are no visits.
    """
    place_label_visit_data = local_time(place_label_visit_data, time_col="time_start", tz_col="tz_start")
    place_label_visit_data = local_time(place_label_visit_data, time_col="time_end", tz_col="tz_end")

    gps_time = user_gps_data["local_time"]

    # Gather one slice per visit and concatenate once: DataFrame.append was removed
    # in pandas 2.0 and re-copied the accumulated frame on every iteration.
    visit_slices = [
        user_gps_data[(gps_time >= row["local_time_start"]) & (gps_time <= row["local_time_end"])]
        for _, row in place_label_visit_data.iterrows()
    ]

    if not visit_slices:
        return pd.DataFrame()

    return pd.concat(visit_slices)

def plot_visit(visit_locations, color="navy", title="user visits", limit_points=500, p=None, size=8, legend=None, width=800, height=600):
    """Plot the distinct coordinates of a set of visit points.

    When more than ``limit_points`` rows are given, a random sample of exactly
    ``limit_points`` rows is plotted instead. Duplicate latitude/longitude pairs
    are dropped after sampling, so fewer points than the limit may be drawn.

    Args:
        visit_locations: frame with ``latitude`` and ``longitude`` columns.
        color, title, size, legend, width, height: forwarded to ``plot.plot_user_loc``.
        limit_points: maximum number of rows considered for plotting.
        p: existing figure to draw on, forwarded to ``plot.plot_user_loc``.

    Returns:
        Whatever ``plot.plot_user_loc`` returns.
    """
    exceeds_limit = len(visit_locations) > limit_points
    shown = visit_locations.sample(limit_points) if exceeds_limit else visit_locations

    coordinates = shown[["latitude", "longitude"]].drop_duplicates()

    return plot.plot_user_loc(
        user_data=coordinates,
        title=title,
        width=width,
        height=height,
        size=size,
        color=color,
        p=p,
        legend=legend,
        alpha=0.3,
    )

# Categoria HOME com inferência pela hora

In [5]:
def infer_home(userid, by=["time", "speed"], night_hours=(2, 4), speed_quantile=0.10):
    """Narrow a user's HOME visit points down to the ones most likely recorded at home.

    Args:
        userid: id of the user; the starting points come from ``places_home``.
        by: which filters to apply. ``"time"`` keeps points whose ``hour`` lies in
            ``night_hours``; ``"speed"`` keeps points without a speed value or with
            a speed at or below the ``speed_quantile`` quantile. When both are
            requested the time filter always runs first, so the speed quantile is
            computed on the time-filtered points. The default list is never mutated.
        night_hours: ``(first_hour, last_hour)``, both inclusive.
        speed_quantile: quantile of ``speed`` used as the upper bound.

    Returns:
        The filtered DataFrame. When the user has no HOME points the empty result
        of ``places_home`` is returned unchanged.
    """
    inferred = places_home(userid)

    # places_home returns a column-less frame for users without HOME visits;
    # the column lookups below would raise KeyError on it.
    if len(inferred) == 0:
        return inferred

    if "time" in by:
        first_hour, last_hour = night_hours
        # enrich_time_columns is expected to provide the "hour" column read here.
        inferred = enrich_time_columns(inferred)
        inferred = inferred[(inferred["hour"] >= first_hour) & (inferred["hour"] <= last_hour)]

    if "speed" in by:
        speed_cap = inferred["speed"].quantile(speed_quantile)
        # Points with a missing speed are kept rather than discarded.
        inferred = inferred[(inferred["speed"].isna()) | (inferred["speed"] <= speed_cap)]

    return inferred

def remove_outliers(data, quantile_threshold=0.05):
    """Drop points whose latitude or longitude lies in the outer quantiles.

    Args:
        data: frame with ``latitude`` and ``longitude`` columns.
        quantile_threshold: fraction trimmed from each end of both coordinates;
            with the default, a point is kept only when its latitude and its
            longitude both lie between their 5% and 95% quantiles (bounds included).

    Returns:
        An independent copy of the retained rows; ``data`` itself is not modified.
        An empty, column-less DataFrame is returned for empty input.
    """
    if len(data) == 0:
        return pd.DataFrame()

    latitude = data["latitude"]
    longitude = data["longitude"]

    lower_lat_value = latitude.quantile(quantile_threshold)
    higher_lat_value = latitude.quantile(1 - quantile_threshold)
    lower_lon_value = longitude.quantile(quantile_threshold)
    higher_lon_value = longitude.quantile(1 - quantile_threshold)

    inside_box = (
        (latitude >= lower_lat_value)
        & (latitude <= higher_lat_value)
        & (longitude >= lower_lon_value)
        & (longitude <= higher_lon_value)
    )

    # Copy AFTER filtering: callers add columns to the result, which triggers
    # SettingWithCopyWarning when the result is still a boolean-mask slice.
    return data[inside_box].copy()

### Centróide da categoria HOME

In [6]:
# Colour of every layer drawn on a user's HOME figure: the raw visit points and, for
# each point selection, its plain, accuracy-weighted and DOP-weighted centroid.
palette = {"visits": "gray", "visits_centroid": "#1c6000", "visits_w_ac": "#27a102", "visits_w_dop": "#1fea00",
                 "inf_time": "#005073", "inf_time_w_ac": "#107dac", "inf_time_w_dop": "#189ad3",
                 "inf_speed": "#ff5252", "inf_speed_w_ac": "#ff7b7b", "inf_speed_w_dop": "#ffbaba",
                 "inf_time_speed": "#ffb640", "inf_time_speed_w_ac": "#ffe833", "inf_time_speed_w_dop": "#fef678"}

# An inferred selection needs more than this many points before centroids are drawn for it.
MIN_POINTS_FOR_CENTROIDS = 2


def _add_quality_weights(points):
    """Add the ``h_acc_complement`` / ``h_dop_complement`` weight columns to ``points`` in place."""
    # normalize(..., complement=True) presumably maps a low error value to a high
    # weight - confirm against src.utils.math.normalize.
    points["h_acc_complement"] = normalize(points["horizontal_accuracy"], complement=True)
    points["h_dop_complement"] = normalize(points["horizontal_dop"], complement=True)


def _add_centroid_layers(p, cluster, key, centroid_key, legend_name, centroid_legend, alpha):
    """Draw the plain, accuracy-weighted and DOP-weighted centroids of ``cluster`` on figure ``p``."""
    add_centroid_figure(p, cluster=cluster, point_color=palette[centroid_key], fill_color=palette[centroid_key],
                        cluster_alpha=alpha, legend=centroid_legend, point_size=1)

    weighted_variants = (("_w_ac", "h_acc_complement", " W Ac"), ("_w_dop", "h_dop_complement", " W DOP"))
    for key_suffix, weight_column, legend_suffix in weighted_variants:
        layer_color = palette[key + key_suffix]
        add_calculated_centroid_figure(p, centroid=weighted_cluster_centroid(cluster, weight_column),
                                       legend=legend_name + legend_suffix, point_size=1,
                                       point_color=layer_color, fill_color=layer_color, cluster_alpha=alpha)


for user_id in user_ids:
    print("user_id", user_id)
    p_home = None

    home_data = remove_outliers(places_home(user_id))

    # Users without HOME points get no figure.
    if len(home_data) == 0:
        continue

    home_time_infer = remove_outliers(infer_home(user_id, ["time"]))
    home_speed_infer = remove_outliers(infer_home(user_id, ["speed"]))
    home_time_speed_infer = remove_outliers(infer_home(user_id, ["speed", "time"]))

    print(len(home_data), len(home_time_infer), len(home_speed_infer), len(home_time_speed_infer))

    # (points, palette key prefix, legend prefix) of every inferred selection that is
    # large enough to draw, in drawing order.
    inferred_layers = [
        layer for layer in (
            (home_time_infer, "inf_time", "inf time"),
            (home_speed_infer, "inf_speed", "inf speed"),
            (home_time_speed_infer, "inf_time_speed", "inf time speed"),
        )
        if len(layer[0]) > MIN_POINTS_FOR_CENTROIDS
    ]

    _add_quality_weights(home_data)
    for layer_points, _, _ in inferred_layers:
        _add_quality_weights(layer_points)

    p_home = plot_visit(home_data, color=palette["visits"], limit_points=800, title=str(user_id) + " HOME visits", legend="visits")
    _add_centroid_layers(p_home, home_data, key="visits", centroid_key="visits_centroid",
                         legend_name="visits", centroid_legend="Visits Centroid", alpha=0.3)

    for layer_points, layer_key, layer_legend in inferred_layers:
        _add_centroid_layers(p_home, layer_points, key=layer_key, centroid_key=layer_key,
                             legend_name=layer_legend, centroid_legend=layer_legend + " centroid", alpha=0.7)

    show(p_home)

user_id 6085
1064 253 297 91


user_id 5965
5666 940 1429 141


user_id 5924
283 0 37 1


user_id 5960
user_id 5964
2100 13 353 1


## Match Stop Region with HOME inferred points

In [27]:
def stop_regions_home(home_points, stop_regions):
    """Return the ids of the stop regions that contain at least one HOME point in time.

    Args:
        home_points: frame with a ``local_time`` column.
        stop_regions: frame with ``sr_id``, ``local_start_time`` and
            ``local_end_time`` columns, one row per stop region.

    Returns:
        List of ``sr_id`` values, in ``stop_regions`` order, of the regions for
        which some home point satisfies
        ``local_start_time < local_time < local_end_time`` (both bounds exclusive).
        Empty list when ``home_points`` is empty.
    """
    if len(home_points) == 0:
        return []

    # Loop-invariant: look the time column up once.
    home_times = home_points["local_time"]

    home_stop_regions = []
    for _, sr in stop_regions.iterrows():
        inside_window = (home_times > sr["local_start_time"]) & (home_times < sr["local_end_time"])
        if inside_window.any():
            home_stop_regions.append(sr["sr_id"])

    return home_stop_regions

def infer_close_sr_as_home(home_stop_regions, stop_regions, max_distance=20, distance_fn=None):
    """Return the stop regions that lie close to any known home stop region.

    Args:
        home_stop_regions: the home stop regions, either a DataFrame or an iterable
            of row-like objects, each exposing ``"latitude"`` and ``"longitude"``.
        stop_regions: DataFrame of candidate stop regions with ``latitude`` and
            ``longitude`` columns.
        max_distance: largest distance (inclusive) still considered "close", in
            the unit returned by ``distance_fn``.
            NOTE(review): the default 20 is presumably metres - confirm.
        distance_fn: callable ``(lat_a, lon_a, lat_b, lon_b) -> distance``.
            Defaults to ``geo.distance_epsg_4326`` from ``src.utils``.

    Returns:
        List with the rows of ``stop_regions`` that are within ``max_distance``
        of at least one home stop region. Each row appears at most once.
    """
    if distance_fn is None:
        # Imported here because only individual functions of this module are
        # imported at the top of the notebook, not the module itself.
        from src.utils import geo
        distance_fn = geo.distance_epsg_4326

    # Iterating a DataFrame directly yields its column names, so turn it into rows.
    if isinstance(home_stop_regions, pd.DataFrame):
        home_rows = [row for _, row in home_stop_regions.iterrows()]
    else:
        home_rows = list(home_stop_regions)

    sr_close_to_home = []

    # iterrows() yields (index, row) pairs; only the row is needed.
    for _, stop_region in stop_regions.iterrows():
        for home_stop_region in home_rows:
            d = distance_fn(stop_region["latitude"],
                            stop_region["longitude"],
                            home_stop_region["latitude"],
                            home_stop_region["longitude"])

            if d <= max_distance:
                # list.append mutates in place and returns None - never assign it back.
                sr_close_to_home.append(stop_region)
                # One match is enough; avoid reporting the same region repeatedly.
                break

    return sr_close_to_home

def plot_points_and_centroids(points, centroids, user_id, width=600, height=400):
    """Draw stop region centroids and GPS points of one user on the same figure.

    Args:
        points: GPS points, forwarded to ``plot.plot_user_loc``.
        centroids: stop region centroids, forwarded to
            ``plot.plot_stop_regions_centroids``.
        user_id: only used in the figure title.
        width, height: figure size passed to the centroid plot.

    Returns:
        The figure returned by ``plot.plot_user_loc`` after the points were added
        on top of the centroid figure.
    """
    figure_title = "User: {} - GPS points and Stop Region".format(str(user_id))
    centroid_figure = plot.plot_stop_regions_centroids(centroids, title=figure_title, width=width, height=height)
    return plot.plot_user_loc(user_data=points, p=centroid_figure)

In [8]:
user_id = 6085
user_sr = load_user_stop_regions_centroids(user_id)
print("{} stop regions".format(len(user_sr)))

# HOME visit points with the outer latitude/longitude quantiles trimmed.
home_points = remove_outliers(places_home(user_id))

# Ids of the stop regions whose time window contains at least one HOME point.
sr_ids = stop_regions_home(home_points, user_sr)
print("{} stop regions on HOME".format(len(sr_ids)))

# Evaluate the membership mask once and reuse it for both partitions and the plot.
is_home_sr = user_sr["sr_id"].isin(sr_ids)
home_sr = user_sr[is_home_sr]
not_home_sr = user_sr[~is_home_sr]

p = plot_points_and_centroids(home_points, home_sr, user_id, width=600, height=400)
show(p)

1398 stop regions
23 stop regions on HOME


In [9]:
# Same analysis as the previous cell, for user 5965.
user_id = 5965
user_sr = load_user_stop_regions_centroids(user_id)
print("{} stop regions".format(len(user_sr)))
# HOME visit points with the outer latitude/longitude quantiles trimmed.
home_points = remove_outliers(places_home(user_id))

# Ids of the stop regions whose time window contains at least one HOME point.
sr_ids = stop_regions_home(home_points, user_sr)
print("{} stop regions on HOME".format(len(sr_ids)))

# Plot the HOME points together with only the stop regions matched above.
p = plot_points_and_centroids(home_points, user_sr[user_sr["sr_id"].isin(sr_ids)], user_id, width=600, height=400)
show(p)

1362 stop regions
98 stop regions on HOME


#### The home inference can detect multiple places.



In [10]:
# Same analysis as the two previous cells, for user 6086.
user_id = 6086
user_sr = load_user_stop_regions_centroids(user_id)
print("{} stop regions".format(len(user_sr)))
# HOME visit points with the outer latitude/longitude quantiles trimmed.
home_points = remove_outliers(places_home(user_id))

# Ids of the stop regions whose time window contains at least one HOME point.
sr_ids = stop_regions_home(home_points, user_sr)
print("{} stop regions on HOME".format(len(sr_ids)))

# Plot the HOME points together with only the stop regions matched above.
p = plot_points_and_centroids(home_points, user_sr[user_sr["sr_id"].isin(sr_ids)], user_id, width=600, height=400)
show(p)

450 stop regions
28 stop regions on HOME


#### The home inference is robust to moving points that are labelled as the home place

## Matching stop regions against the HOME visit time windows seems to work better than using centroids


#### Closed Issue