In [3]:
import pandas as pd
import random
import os
import gc

import matplotlib.pyplot as plt

import numpy as np

from bokeh.palettes import Spectral10
from bokeh.io import output_notebook, show

# Have bokeh render its plots inline in the notebook output.
output_notebook()

# Show every column of wide frames and allow long display lines.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# Alias for the bokeh Spectral10 colour palette.
palette = Spectral10

# Render floats with three decimal places.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import bokeh
# Bare expression in the middle of the cell: evaluated, but not displayed.
bokeh.__version__

# NOTE(review): machine-specific absolute path. The `src.*` imports that follow
# presumably rely on this being the working directory - confirm before making
# it configurable.
os.chdir("/home/tales/dev/mdc_analysis/")
print("working dir", os.getcwd())

from src.dao.csv_dao import load_user_gps_csv, load_user_gps_time_window
from src.dao.dbdao import DBDAO
from src.utils.time_utils import enrich_time_columns, local_time
from src.data_processment.stop_region import MovingCentroidStopRegionFinder
from src.plot.plot import add_centroid_figure
from src.utils.geo import index_clusters

working dir /home/tales/dev/mdc_analysis


In [4]:
def load_users_gps_data(userids):
    """Load the GPS traces of several users into one frame ordered by local time.

    Parameters
    ----------
    userids : iterable
        Identifiers passed one by one to ``load_user_gps_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``userid, latitude, longitude, tz, time, local_time``, sorted
        by ``local_time``. Each row is tagged with the user it was loaded for.
    """
    frames = []
    for userid in userids:
        user_gps_df = load_user_gps_csv(userid)
        user_gps_df["userid"] = userid
        frames.append(user_gps_df)

    # Concatenate once: DataFrame.append was removed in pandas 2.0 and copied
    # the whole accumulated frame on every iteration.
    df = pd.concat(frames) if frames else pd.DataFrame()

    df = local_time(df)
    df = df[["userid", "latitude", "longitude", "tz", "time", "local_time"]].sort_values("local_time")

    return df

def load_user_gps_data(userid):
    """Load one user's GPS trace, tag it with the user id and order it by local time.

    The frame returned by ``load_user_gps_csv`` is expected to already carry a
    ``local_time`` column; this function only selects and sorts.

    Returns a frame with the columns
    ``userid, latitude, longitude, tz, time, local_time``.
    """
    wanted_columns = ["userid", "latitude", "longitude", "tz", "time", "local_time"]

    gps = load_user_gps_csv(userid)
    gps["userid"] = [userid for _ in range(len(gps))]

    return gps[wanted_columns].sort_values("local_time")

In [5]:
n_sample = 10

# Pinned set of users so every run analyses the same people. The previous
# version first drew a random sample from the database and then immediately
# overwrote it with this tuple, paying for a query whose result was discarded.
# To explore a fresh random sample instead, replace the tuple with:
#   DBDAO().users_with_places().sample(n_sample).tolist()
userids = (5950, 5936, 6037, 6104, 6177, 5973, 5980, 5942, 6026, 6073)

# Categoria HOME (pela tabela places)

In [19]:
from src.plot.plot import plot_user_loc
from bokeh.plotting import gridplot

def places_home(userid):
    """Return the user's GPS fixes that fall inside the visits from ``places_home_df``.

    The visit table is queried once. When it has no rows an empty
    ``pandas.DataFrame`` (no columns) is returned and the user's GPS file is
    not loaded at all.
    """
    home_visit_data = DBDAO().places_home_df(userid=userid)
    if len(home_visit_data) == 0:
        return pd.DataFrame()

    # Load the (potentially large) GPS trace only when there is something to match.
    user_gps_data = load_user_gps_data(userid)
    return places(home_visit_data.sort_values("time_start"), user_gps_data)

def places_home_friend(userid):
    """Return the user's GPS fixes that fall inside the visits from ``places_home_friend_df``.

    The visit table is queried once. When it has no rows an empty
    ``pandas.DataFrame`` (no columns) is returned and the user's GPS file is
    not loaded at all.
    """
    home_friend_visit_data = DBDAO().places_home_friend_df(userid=userid)
    if len(home_friend_visit_data) == 0:
        return pd.DataFrame()

    # Load the (potentially large) GPS trace only when there is something to match.
    user_gps_data = load_user_gps_data(userid)
    return places(home_friend_visit_data.sort_values("time_start"), user_gps_data)

def places_work(userid):
    """Return the user's GPS fixes that fall inside the visits from ``places_work_df``.

    The visit table is queried once. When it has no rows an empty
    ``pandas.DataFrame`` (no columns) is returned and the user's GPS file is
    not loaded at all.
    """
    work_visit_data = DBDAO().places_work_df(userid=userid)
    if len(work_visit_data) == 0:
        return pd.DataFrame()

    # Load the (potentially large) GPS trace only when there is something to match.
    user_gps_data = load_user_gps_data(userid)
    return places(work_visit_data.sort_values("time_start"), user_gps_data)

def places(place_label_visit_data, user_gps_data):
    """Collect the GPS fixes recorded during each visit in ``place_label_visit_data``.

    Parameters
    ----------
    place_label_visit_data : pandas.DataFrame
        One row per visit. ``local_time`` is applied to the
        ``time_start``/``tz_start`` and ``time_end``/``tz_end`` pairs, and the
        resulting ``local_time_start`` / ``local_time_end`` columns bound each visit.
    user_gps_data : pandas.DataFrame
        GPS fixes of the same user with a ``local_time`` column.

    Returns
    -------
    pandas.DataFrame
        Rows of ``user_gps_data`` whose ``local_time`` lies inside a visit
        (both bounds inclusive), visit after visit. A fix covered by several
        overlapping visits appears once per visit. An empty frame without
        columns is returned when there are no visits.
    """
    place_label_visit_data = local_time(place_label_visit_data, time_col="time_start", tz_col="tz_start")
    place_label_visit_data = local_time(place_label_visit_data, time_col="time_end", tz_col="tz_end")

    visit_windows = zip(
        place_label_visit_data["local_time_start"],
        place_label_visit_data["local_time_end"],
    )

    visit_slices = []
    for visit_start, visit_end in visit_windows:
        in_visit = user_gps_data["local_time"].between(visit_start, visit_end)
        visit_slices.append(user_gps_data[in_visit])

    if not visit_slices:
        return pd.DataFrame()

    # Concatenate once: DataFrame.append was removed in pandas 2.0 and copied
    # the whole accumulated frame on every iteration.
    return pd.concat(visit_slices)

def plot_visit(visit_locations, color="navy", title="user visits", limit_points=500, p=None, legend=None, width=400, height=300):
    """Draw the distinct latitude/longitude pairs of ``visit_locations`` via ``plot_user_loc``.

    When the frame holds more than ``limit_points`` rows, a random sample of
    that size is taken first; duplicates are dropped afterwards, so fewer than
    ``limit_points`` points may end up on the plot. ``p`` is forwarded to
    ``plot_user_loc`` together with the styling arguments, and whatever
    ``plot_user_loc`` returns is returned unchanged.
    """
    exceeds_limit = len(visit_locations) > limit_points
    points = visit_locations.sample(limit_points) if exceeds_limit else visit_locations

    coordinates = points[["latitude", "longitude"]].drop_duplicates()

    return plot_user_loc(
        coordinates,
        title,
        width=width,
        height=height,
        size=4,
        color=color,
        p=p,
        legend=legend,
    )

# Categoria HOME com inferência pela hora

In [7]:
def infer_home(userid, start_hour=3, end_hour=4):
    """Keep only the HOME fixes recorded within a given hour-of-day window.

    Parameters
    ----------
    userid
        User whose ``places_home`` fixes are filtered.
    start_hour, end_hour : int, optional
        Inclusive bounds applied to the ``hour`` column produced by
        ``enrich_time_columns``. The defaults (3 and 4) reproduce the original
        hard-coded window.

    Returns
    -------
    pandas.DataFrame
        The enriched HOME fixes whose ``hour`` lies in the window. When
        ``places_home`` yields an empty frame it is returned as is, because
        there is nothing to enrich or filter.
    """
    user_home_locations = places_home(userid)
    if user_home_locations.empty:
        return user_home_locations

    user_home_locations = enrich_time_columns(user_home_locations)
    in_window = user_home_locations["hour"].between(start_hour, end_hour)
    return user_home_locations[in_window]

In [None]:
# One map per user: HOME fixes taken from the places table (navy) overlaid with
# the subset inferred from the hour of day (magenta), laid out two per row.
home_plots = []

for userid in userids:
    print(userid)
    home_data = places_home(userid)
    if home_data.empty:
        # places_home returns a column-less frame for users without HOME
        # visits; plot_visit would fail selecting latitude/longitude from it.
        continue
    home_inferred_data = infer_home(userid)

    p_home = plot_visit(home_data, color="navy", title=str(userid) + " HOME visits", legend="from table visits")
    p_home = plot_visit(home_inferred_data, color="magenta", title=str(userid) + " HOME visits", p=p_home, legend="inferred")

    home_plots.append(p_home)

# Chunk into rows of two. Seeding the grid with [[]] and appending rows on
# demand left a permanent empty first row.
grid = [home_plots[start:start + 2] for start in range(0, len(home_plots), 2)]

show(gridplot(grid))

In [None]:
# Peek at the inferred HOME rows still bound to `home_inferred_data`, i.e. the
# value left behind by the last iteration of the plotting loop in the previous cell.
home_inferred_data.head()

# Categoria HOME OF FRIEND

In [None]:
# One row per user: distinct HOME fixes (navy) next to distinct HOME FRIEND
# fixes (green).
max_points = 500
grid = []
for userid in userids:
    user_home_locations = places_home(userid)
    user_home_friend_locations = places_home_friend(userid)

    # A side-by-side comparison needs both categories; skip users lacking either.
    if user_home_locations.empty or user_home_friend_locations.empty:
        continue

    user_home_locations = user_home_locations[["longitude", "latitude"]].drop_duplicates()
    user_home_friend_locations = user_home_friend_locations[["longitude", "latitude"]].drop_duplicates()

    print(userid, len(user_home_locations), len(user_home_friend_locations))

    if len(user_home_locations) > max_points:
        user_home_locations = user_home_locations.sample(max_points)

    # The original tested len(user_home_locations) here (copy-paste slip), so
    # this branch could never run after the sampling above.
    if len(user_home_friend_locations) > max_points:
        user_home_friend_locations = user_home_friend_locations.sample(max_points)

    p_home = plot_visit(user_home_locations, color="navy", title=str(userid) + " HOME visits")
    p_home_friend = plot_visit(user_home_friend_locations, color="green", title=str(userid) + " HOME FRIEND visits")

    # Exactly two figures per user, hence one grid row per user.
    grid.append([p_home, p_home_friend])

    # Release the per-user frames before loading the next user.
    gc.collect()

show(gridplot(grid))

# Analyzing user 6177 HOME FRIEND category

In [8]:
# GPS fixes recorded during user 6177's HOME FRIEND visits, passed through
# enrich_time_columns; the cell displays the five rows with the latest local_time.
user_6177_hf = enrich_time_columns(places_home_friend(6177))
user_6177_hf.sort_values(by="local_time").tail()

Unnamed: 0,userid,latitude,longitude,tz,time,local_time,hour,min,sec,weekday,local_datetime
182325,6177,46.55,6.75,-3600,1299406381,1299402781,9,13,1,Sunday,2011-03-06 09:13:01
19179,6177,46.55,6.75,-3600,1299406391,1299402791,9,13,11,Sunday,2011-03-06 09:13:11
216572,6177,46.55,6.75,-3600,1299406401,1299402801,9,13,21,Sunday,2011-03-06 09:13:21
29348,6177,46.56,6.752,-3600,1299406640,1299403040,9,17,20,Sunday,2011-03-06 09:17:20
5019,6177,46.56,6.752,-3600,1299406650,1299403050,9,17,30,Sunday,2011-03-06 09:17:30


In [9]:
# HOME FRIEND visits of user 6177 with local start/end times added, followed by
# the row of the visit with the largest time_end - time_start difference.
hof_6177_visits = local_time(
    local_time(DBDAO().places_home_friend_df(userid=6177), "time_start", "tz_start"),
    "time_end",
    "tz_end",
)

visit_durations = hof_6177_visits["time_end"] - hof_6177_visits["time_start"]
idx_max = visit_durations.idxmax()
hof_6177_visits.loc[idx_max]

userid                      6177
place_label                    2
placeid                        3
time_start            1283536753
time_end              1283587405
tz_start                   -7200
tz_end                     -7200
trusted_start                  t
trusted_end                    t
trusted_transition             t
local_time_start      1283529553
local_time_end        1283580205
Name: 31, dtype: object

In [17]:
# Isolate the GPS fixes of the longest HOME FRIEND visit of user 6177. The
# bounds are read from the visit row found earlier (hof_6177_visits / idx_max)
# instead of being hand-copied from its printed output; in the recorded run
# they are local_time 1283529553 .. 1283580205, both inclusive.
longest_visit = hof_6177_visits.loc[idx_max]
one_visit = user_6177_hf[
    user_6177_hf["local_time"].between(
        longest_visit["local_time_start"], longest_visit["local_time_end"]
    )
]

home_6177 = infer_home(6177)

# Stop regions within that single visit (region_radius=50, delta_time=300).
clusters = MovingCentroidStopRegionFinder(region_radius=50, delta_time=300).find_clusters(one_visit, verbose=False)

In [20]:
# Large figure of the selected HOME FRIEND visit, with the inferred HOME fixes
# passed to the same figure in magenta and one centroid marker added per cluster.
p = plot_visit(one_visit, width=800, height=600, legend="home of friend")
plot_visit(home_6177, color="magenta", legend="home", p=p)

for cluster in clusters:
    add_centroid_figure(cluster=cluster, figure=p)

show(p)