### Carregando os dados

In [1]:
import os
import pandas as pd
import ast
import geopy.distance as gp
import math
import datetime
import operator

# Change the working directory before importing from the `src` package; the
# relative "outputs/..." paths used further down are resolved from here too.
# NOTE(review): absolute, machine-specific path -- presumably the project root;
# consider making it configurable so the notebook runs on other machines.
os.chdir("/home/tales/dev/master/mdc_analysis/")
from src.dao import csv_dao
from src.poi_grabber import google_places

In [2]:
# Load the data through the project helpers; 5967 is passed as the user id.
# NOTE(review): `user_stop_regions` and `user_stop_regions_centroids` are not
# referenced by any later cell shown here -- possibly leftovers from exploration.
user_stop_regions = csv_dao.load_user_stop_regions(5967)
user_stop_regions_centroids = csv_dao.load_user_stop_regions_centroids(5967)
# `pois` is the POI table searched by group_stop_regions_with_pois below.
pois = google_places.load_all_google_places_data()

### Métodos auxiliares

In [3]:
def slice_geo_data2(data, center_lat, center_lon, lat_name, lon_name, tolerance):
    """Return the rows of ``data`` lying inside the search box around a point.

    The box limits come from ``get_search_boundaries(center_lat, center_lon,
    tolerance)``; all four edges are inclusive.

    Args:
        data: table indexable by ``lat_name`` / ``lon_name`` and by a boolean mask.
        center_lat, center_lon: centre of the search box.
        lat_name, lon_name: names of the latitude / longitude columns in ``data``.
        tolerance: forwarded unchanged to ``get_search_boundaries``.

    Returns:
        The subset of ``data`` whose coordinates fall within the box.
    """
    south, north, west, east = get_search_boundaries(center_lat, center_lon, tolerance)
    lat_values = data[lat_name]
    lon_values = data[lon_name]
    inside_box = (
        (lat_values >= south) & (lat_values <= north)
        & (lon_values >= west) & (lon_values <= east)
    )
    return data[inside_box]

def get_search_boundaries(center_lat, center_lon, 
                          tolerance, threshold=1):
    """Compute the bounding box used to pre-filter points around a centre.

    The box is obtained from ``get_lat_long_with_tolerance`` with
    ``adjust=True`` and with ``tolerance + threshold`` / ``tolerance - threshold``
    as the accepted upper / lower limits.

    Args:
        center_lat, center_lon: centre of the box.
        tolerance: target size passed on to ``get_lat_long_with_tolerance``.
        threshold: half-width of the accepted band around ``tolerance``.

    Returns:
        Tuple ``(min_lat, max_lat, min_lon, max_lon)``, each rounded to
        6 decimal places.
    """
    upper_limit = tolerance + threshold
    lower_limit = tolerance - threshold
    bounds = get_lat_long_with_tolerance(center_lat,
                                         center_lon,
                                         tolerance,
                                         upper_limit,
                                         lower_limit,
                                         True)
    # bounds is (min_lat, max_lat, min_lon, max_lon); keep that order.
    return tuple(round(value, 6) for value in bounds)
    
def get_lat_long_with_tolerance(center_lat, center_lon, 
                                tolerance, 
                                upper_limit, lower_limit, adjust=False):
    """Build a lat/lon box around a centre and retry until its corner distance fits.

    ``tolerance`` is converted to a degree offset using 111.32 km per degree and
    applied equally to latitude and longitude. The distance from the centre to
    the (max_lat, max_lon) corner is then measured with ``distance_epsg_4326``.
    If that distance is above ``upper_limit`` or below ``lower_limit``, the
    function calls itself with a corrected tolerance.

    Args:
        center_lat, center_lon: centre of the box.
        tolerance: size used to derive the degree offset.
        upper_limit, lower_limit: accepted range for the corner distance.
        adjust: when equal to ``True``, the correction applied by THIS call is
            proportional to the overshoot (divided by 1.4); otherwise it is a
            fixed step of 1. The recursive calls do not forward ``adjust``, so
            every later correction uses the fixed step.

    Returns:
        Tuple ``(min_lat, max_lat, min_lon, max_lon)``.
    """
    degrees_per_meter = (1 / 111.32) / 1000
    offset = tolerance * degrees_per_meter

    south = center_lat - offset
    west = center_lon - offset
    north = center_lat + offset
    east = center_lon + offset

    corner_distance = distance_epsg_4326(north, east, center_lat, center_lon)
    proportional = (adjust == True)

    if corner_distance > upper_limit:
        # Box too large: shrink the tolerance and try again.
        step = (corner_distance - upper_limit) / 1.4 if proportional else 1
        return get_lat_long_with_tolerance(center_lat, center_lon,
                                           tolerance - step,
                                           upper_limit, lower_limit)

    if corner_distance < lower_limit:
        # Box too small: grow the tolerance and try again.
        step = (lower_limit - corner_distance) / 1.4 if proportional else 1
        return get_lat_long_with_tolerance(center_lat, center_lon,
                                           tolerance + step,
                                           upper_limit, lower_limit)

    return south, north, west, east

def distance_epsg_4326(lat1, lon1, lat2, lon2):
    """Return the geopy distance between two (lat, lon) points, read via ``.m`` (meters)."""
    return gp.distance((lat1, lon1), (lat2, lon2)).m

In [4]:
# Selects the POI(s) closest to the centre point. Every POI whose distance is
# within `distance_tolerance` of the nearest one is returned -- default: 1 m.
def get_closest_pois(center_lat, center_lon, 
                    pois, distance_tolerance=1):
    """Return the ids of the POIs nearest to a centre point.

    Args:
        center_lat, center_lon: reference point.
        pois: DataFrame with ``place_id``, ``latitude`` and ``longitude`` columns.
        distance_tolerance: a POI is kept when its distance is at most this
            much larger than the smallest distance found (defaults to 1).

    Returns:
        List of ``place_id`` values; empty when ``pois`` has no rows. If a
        ``place_id`` occurs more than once, the distance of its last row is used.
    """
    if len(pois) == 0:
        return []

    # Keyed by place_id, so repeated ids collapse to a single entry.
    distances = {}
    for place_id, latitude, longitude in zip(pois["place_id"],
                                             pois["latitude"],
                                             pois["longitude"]):
        distances[place_id] = distance_epsg_4326(center_lat, center_lon,
                                                 latitude, longitude)

    # Only the minimum is needed, so a full sort is unnecessary.
    cutoff = min(distances.values()) + distance_tolerance
    return [place_id for place_id, distance in distances.items()
            if distance <= cutoff]


# Returns the elapsed time in minutes between the start and end timestamps
def get_time_spent_in_minutes(timestamp_start_time, timestamp_end_time):
    """Convert a timestamp difference to minutes.

    The difference is truncated to an integer before the division by 60, so
    the result is a float number of minutes.
    """
    elapsed = int(timestamp_end_time - timestamp_start_time)
    return elapsed / 60

# Computes the quartiles
def get_blox_plot_info(data):
    """Return the 0.25, 0.5 and 0.75 quantiles of ``data``.

    ``data`` is wrapped in a DataFrame first, so the result is a DataFrame
    indexed by the quantile levels.
    """
    quartile_levels = [0.25, 0.5, 0.75]
    return pd.DataFrame(data).quantile(quartile_levels)

    
# Groups stop regions with their closest POIs
def group_stop_regions_with_pois(stop_regions, 
                       pois, 
                       lat_name, lon_name, 
                       poi_distance_tolerance=1, 
                       search_tolerance=50):
    """Attach each stop region to its closest POI(s) and summarise time spent.

    For every row of ``stop_regions`` the POIs are first narrowed with
    ``slice_geo_data2`` and then reduced to the closest ones with
    ``get_closest_pois``. The time spent in the stop region and its ``sr_id``
    are recorded under each selected POI.

    Args:
        stop_regions: DataFrame with ``sr_id``, ``local_start_time``,
            ``local_end_time`` and the ``lat_name`` / ``lon_name`` columns.
        pois: POI table; must use the same ``lat_name`` / ``lon_name`` columns.
        lat_name, lon_name: coordinate column names.
        poi_distance_tolerance: forwarded to ``get_closest_pois``.
        search_tolerance: forwarded to ``slice_geo_data2``.

    Returns:
        Dict mapping POI id to ``[poi_id, quartiles, sr_ids]``, where
        ``quartiles`` is the result of ``get_blox_plot_info`` over the minutes
        spent and ``sr_ids`` lists the contributing stop regions.
    """
    result_pois = {}
    for _, row in stop_regions.iterrows():
        center_lat = round(row[lat_name], 6)
        center_lon = round(row[lon_name], 6)
        sr_id = row['sr_id']
        start_time = row['local_start_time']
        end_time = row['local_end_time']

        nearby_pois = slice_geo_data2(pois, center_lat, center_lon,
                                      lat_name, lon_name, search_tolerance)
        closest_pois = get_closest_pois(center_lat, center_lon,
                                        nearby_pois, poi_distance_tolerance)
        if not closest_pois:
            continue

        # The duration depends only on the stop region, so compute it once
        # rather than once per matched POI.
        time_spent_in_minutes = get_time_spent_in_minutes(start_time, end_time)
        for poi in closest_pois:
            entry = result_pois.setdefault(poi, [poi, [], []])
            entry[1].append(time_spent_in_minutes)
            entry[2].append(sr_id)

    # Replace each raw list of durations with its quartile summary.
    for entry in result_pois.values():
        entry[1] = get_blox_plot_info(entry[1])
    return result_pois

In [5]:
# Every entry name under outputs/stop_regions/ is used as a user id by the loop below.
users = os.listdir("outputs/stop_regions/")

In [6]:
# NOTE(review): not referenced by any later cell shown here -- possibly a leftover.
users_sr = []

In [7]:
# Build one output row per (user, POI) pair: the quartiles of the minutes spent
# plus the list of stop regions that contributed to them.
rows = []
n = 0
for n, user_id in enumerate(users, start=1):
    print("n:", n, "user_id:", user_id)
    user_sr_centroids = csv_dao.load_user_stop_regions_centroids(user_id)
    results = group_stop_regions_with_pois(user_sr_centroids,
                                           pois,
                                           "latitude",
                                           "longitude")

    # Each value is [place_id, quartiles DataFrame, list of sr_ids].
    for place_id, quartiles, sr_ids in results.values():
        row = {
            "user_id": user_id,
            "place_id": place_id,
            "q1": quartiles.loc[0.25].item(),
            "q2": quartiles.loc[0.5].item(),
            "q3": quartiles.loc[0.75].item(),
            "sr_list": sr_ids,
        }
        rows.append(row)

n: 1 user_id: 6189
n: 2 user_id: 5936
n: 3 user_id: 6087
n: 4 user_id: 5973
n: 5 user_id: 6085
n: 6 user_id: 6074
n: 7 user_id: 6012
n: 8 user_id: 5982
n: 9 user_id: 5948
n: 10 user_id: 5974
n: 11 user_id: 6090
n: 12 user_id: 6199
n: 13 user_id: 6068
n: 14 user_id: 6024
n: 15 user_id: 5976
n: 16 user_id: 6094
n: 17 user_id: 5941
n: 18 user_id: 5995
n: 19 user_id: 5962
n: 20 user_id: 6093
n: 21 user_id: 6033
n: 22 user_id: 6079
n: 23 user_id: 6038
n: 24 user_id: 6175
n: 25 user_id: 6042
n: 26 user_id: 5924
n: 27 user_id: 6083
n: 28 user_id: 6178
n: 29 user_id: 5958
n: 30 user_id: 6086
n: 31 user_id: 6100
n: 32 user_id: 5945
n: 33 user_id: 5925
n: 34 user_id: 5935
n: 35 user_id: 6172
n: 36 user_id: 5955
n: 37 user_id: 6073
n: 38 user_id: 5980
n: 39 user_id: 6010
n: 40 user_id: 5993
n: 41 user_id: 6037
n: 42 user_id: 5979
n: 43 user_id: 5966
n: 44 user_id: 5985
n: 45 user_id: 5967
n: 46 user_id: 5970
n: 47 user_id: 6169
n: 48 user_id: 6188
n: 49 user_id: 6097
n: 50 user_id: 6096
n: 51 use

In [8]:
# Preview 8 random rows of the result table; no random_state is passed, so the
# rows shown differ between runs.
pd.DataFrame(rows)[["user_id", "place_id", "q1", "q2", "q3", "sr_list"]].sample(8)

Unnamed: 0,user_id,place_id,q1,q2,q3,sr_list
13006,5947,ChIJSZAq3NPJjkcRCXtpAaTVYu0,14.245833,30.683333,46.65,"[5947_63, 5947_118, 5947_184, 5947_227, 5947_2..."
9084,5937,ChIJacHAyjLIjkcRSiCjYTIXlow,55.383333,55.383333,55.383333,[5937_85]
21445,6167,ChIJOTPFcZ0thEcR4lr7IqXKN8U,28.983333,28.983333,28.983333,[6167_87]
14217,6072,ChIJ____P0oujEcRdbuoYp1gLXA,28.133333,28.133333,28.133333,[6072_813]
2969,5995,ChIJ3TgzFCkujEcRoaM3zy-6l4o,8.15,8.15,8.15,[5995_439]
17823,5953,ChIJfUiU1KXLjkcRlrUrOxg4db8,10.25,10.25,10.25,[5953_616]
19417,6053,ChIJHWEEwTppjkcRhOeQ5oU6z5k,10.083333,10.083333,10.083333,[6053_26]
23119,5988,ChIJzy1UBzR7jEcR-GvDgA5urVM,89.866667,89.866667,89.866667,[5988_51]


In [10]:
# Write the per-user, per-POI quartile table to CSV without the row index.
# NOTE(review): presumably outputs/sr_time_quartiles/ must already exist -- confirm.
pd.DataFrame(rows)[["user_id", "place_id", "q1", "q2", "q3", "sr_list"]].to_csv("outputs/sr_time_quartiles/sr_time_quartiles.csv", index=False)