In [1]:
import os

# NOTE(review): hard-coded absolute path to one developer's checkout, so the notebook only
# runs unmodified on that machine. Presumably this chdir is what lets the `src.*` imports
# below resolve — confirm before reordering anything in this cell.
os.chdir("/home/tales/dev/master/mdc_analysis/")
print("working dir", os.getcwd())

import pandas as pd
import matplotlib.pyplot as plt

# NOTE(review): `gc` and `geo` are not used in the cells visible here.
import gc
from src.dao import csv_dao
from src.plot import plot
from src.utils import geo

from bokeh.io import output_notebook, show
# Configure bokeh so that show() renders figures inline in the notebook.
output_notebook()

# Display floats with 4 decimal places.
pd.set_option('display.float_format', lambda x: '%.4f' % x)

import warnings
# NOTE(review): this silences every warning, including library deprecation warnings.
warnings.filterwarnings('ignore')

# pandas is already imported above; this second import is a no-op.
import pandas as pd
# Let wide/long frames be displayed without truncation (up to 500 rows/columns).
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

working dir /home/tales/dev/master/mdc_analysis


# Stop Region nearest POIs 

## Loading

In [2]:
def valid_amenities(pois):
    """Return only the rows of ``pois`` whose "amenity" value is not missing.

    Args:
        pois: DataFrame with an "amenity" column.

    Returns:
        The filtered DataFrame; surviving rows keep their original index and order.
    """
    # notna() is the direct form of the original `isna() == False` comparison.
    return pois[pois["amenity"].notna()]

def unique_stop_regions(knn_pois):
    """Keep the rows of one stop region per distinct (lat_sr, lon_sr) pair.

    For every distinct coordinate pair, the "sr_id" of the first row carrying that pair
    is retained. The result contains every row of ``knn_pois`` whose "sr_id" is one of
    the retained ids, in the original order and with the original index.
    """
    first_row_per_location = knn_pois.drop_duplicates(subset=["lat_sr", "lon_sr"], keep="first")
    retained_ids = first_row_per_location["sr_id"].tolist()
    keep_mask = knn_pois["sr_id"].isin(retained_ids)
    return knn_pois[keep_mask]

def load_sr_knn_pois_for_all_users():
    """Load the stop-region / close-POI distance table of every user into one DataFrame.

    Users come from ``csv_dao.list_stop_region_usernames()``. Each user's table is read
    with ``csv_dao.load_sr_distance_to_close_pois(user)`` and tagged with a "user"
    column. Users whose file is missing (``FileNotFoundError``) are skipped silently.

    Returns:
        The per-user tables stacked in user order, each keeping its own row index, or an
        empty DataFrame when no user could be loaded.
    """
    frames = []

    for user in csv_dao.list_stop_region_usernames():
        try:
            user_knn_pois = csv_dao.load_sr_distance_to_close_pois(user)
        except FileNotFoundError:
            # No distance file for this user: skip it.
            continue
        print("User {} data loaded".format(user))

        user_knn_pois["user"] = user
        frames.append(user_knn_pois)

    # Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
    # re-copied the whole accumulated frame on every iteration.
    # pd.concat raises on an empty list, so fall back to the empty frame the original
    # implementation returned in that case.
    knn_pois = pd.concat(frames) if frames else pd.DataFrame()
    print("done!")
    return knn_pois

In [3]:
# Number of usernames the DAO lists (163 in the saved output). Users whose distance file
# is missing are skipped by load_sr_knn_pois_for_all_users, so fewer may actually be loaded.
len(csv_dao.list_stop_region_usernames())

163

In [4]:
# Load every user's stop-region / close-POI rows, then keep a single stop region per
# distinct (lat_sr, lon_sr) coordinate.
sr_knn_pois = load_sr_knn_pois_for_all_users()
unique_sr_knn_pois = unique_stop_regions(sr_knn_pois)

# Report how many distinct stop-region ids exist before and after de-duplication.
for template, frame in (
    ("All Stop Regions:    {}", sr_knn_pois),
    ("Unique Stop Regions: {}", unique_sr_knn_pois),
):
    print(template.format(len(frame["sr_id"].drop_duplicates())))

# Random preview of the de-duplicated rows (no random_state, so it varies per run).
unique_sr_knn_pois.sample(6)

User 6189 data loaded
User 5936 data loaded
User 6087 data loaded
User 5973 data loaded
User 6085 data loaded
User 6074 data loaded
User 6012 data loaded
User 5982 data loaded
User 5948 data loaded
User 5974 data loaded
User 6090 data loaded
User 6199 data loaded
User 6068 data loaded
User 6024 data loaded
User 5976 data loaded
User 6094 data loaded
User 5941 data loaded
User 5995 data loaded
User 5962 data loaded
User 6093 data loaded
User 6033 data loaded
User 6079 data loaded
User 6038 data loaded
User 6175 data loaded
User 6042 data loaded
User 5924 data loaded
User 6083 data loaded
User 6178 data loaded
User 5958 data loaded
User 6086 data loaded
User 6100 data loaded
User 5945 data loaded
User 5925 data loaded
User 5935 data loaded
User 6172 data loaded
User 5955 data loaded
User 6073 data loaded
User 5980 data loaded
User 6010 data loaded
User 5993 data loaded
User 6037 data loaded
User 5979 data loaded
User 5966 data loaded
User 5985 data loaded
User 5967 data loaded
User 5970 

Unnamed: 0,distance,osm_id,lat_sr,lon_sr,sr_id,position,user
17,2735.0997,822484525,46.2764,6.9336,5928_3263,17,5928
10,104.0058,1286683100,46.1962,6.1432,6038_320,10,6038
14,5221.1646,410725384,46.62,6.8199,5938_4634,14,5938
5,2073.4289,4665637464,46.5364,6.83,5938_6016,5,5938
18,3904.6022,410725339,46.5363,6.83,5938_2804,18,5938
16,2276.9579,323186523,46.5722,6.6299,6178_1960,16,6178


In [5]:
# Load the OSM POI table and keep only the rows with a non-missing "amenity".
pois = csv_dao.load_hot_osm_pois()
valid_pois = valid_amenities(pois)

# Random preview; no random_state is set, so the sampled rows differ on every run.
valid_pois.sample(6)

Unnamed: 0,osm_id,amenity,name,place,latitude,longitude,SRID,lon_4326,lat_4326
72099,2603741618,restaurant,Papa Joe's,,828483.6,5933470.07,900913,7.4424,46.9473
243798,4900260245,fast_food,Déliwish,,948735.98,6006014.5,900913,8.5226,47.3903
280570,2518991685,cafe,Planetarium,,1032695.22,6005983.29,900913,9.2769,47.3901
97585,2608198839,restaurant,Shanghai 3,,925684.13,5951075.78,900913,8.3156,47.0551
49762,3313987088,fast_food,Café Le Mix,,790338.83,5877628.49,900913,7.0997,46.6037
243462,2069927465,driving_school,Beerli,,948804.74,6006144.27,900913,8.5233,47.3911


## Merging dataframes

In [None]:
# Attach each POI's amenity and its lat_4326/lon_4326 coordinates to the stop-region rows.
# The inner join on osm_id drops rows whose POI is not in valid_pois (no amenity).
# NOTE(review): this overwrites unique_sr_knn_pois with its own merge result, so running the
# cell a second time merges again and produces suffixed columns (amenity_x / amenity_y, ...),
# which breaks the later cells that read "amenity". Restart the kernel before re-running.
unique_sr_knn_pois = unique_sr_knn_pois.merge(valid_pois[["osm_id", "amenity", "lat_4326", "lon_4326"]], how="inner", on="osm_id")
unique_sr_knn_pois.sort_values(by=["sr_id", "position"]).head()

## How far are the POIs from the Stop Regions?

In [None]:
# Split the rows by their "position" value: 0, 1 and 2 select, for every stop region, the
# POI treated as closest, second closest and third closest in the cells that follow.
#
# One boolean mask per position replaces the previous per-stop-region loop, which grew the
# three frames with DataFrame.append (removed in pandas 2.0, and quadratic in the number of
# stop regions). Rows come out in the order of unique_sr_knn_pois instead of being regrouped
# per stop region; the describe/hist/value_counts cells below do not depend on row order.
firsts = unique_sr_knn_pois[unique_sr_knn_pois["position"] == 0]
seconds = unique_sr_knn_pois[unique_sr_knn_pois["position"] == 1]
thirds = unique_sr_knn_pois[unique_sr_knn_pois["position"] == 2]

In [None]:
firsts["distance"].describe()

In [None]:
# Histogram of the distance from each stop region to its closest POI.
# NOTE(review): xlim only crops the visible x-range; the default bins are still computed
# over the full range of distances, so the bars inside 0-3000 may be very coarse. The same
# applies to the `seconds` and `thirds` histograms — consider passing bins=/range=.
firsts["distance"].plot.hist(xlim=(0, 3000))

In [None]:
seconds["distance"].describe()

In [None]:
seconds["distance"].plot.hist(xlim=(0, 3000))

In [None]:
thirds["distance"].describe()

In [None]:
thirds["distance"].plot.hist(xlim=(0, 3000))

## Amenity frequency of the closest POIs

That is, the kind of place the user usually goes to... :)

In [None]:
# Bar chart of the 35 most frequent amenities among the closest POI of each stop region.
f, ax = plt.subplots(figsize=(15,5))
closest_amenity_counts = firsts["amenity"].value_counts().head(35)
closest_amenity_counts.plot.bar(ax=ax, title="Frequency of point amenity for the CLOSEST POI")

In [None]:
# Bar chart of the 35 most frequent amenities among the three closest POIs of every stop
# region, pooled together.
f, ax = plt.subplots(figsize=(15,5))
# pd.concat replaces the chained DataFrame.append calls, which pandas 2.0 removed.
# The title typo ("closets") is fixed as well.
pd.concat([firsts, seconds, thirds])["amenity"].value_counts().head(35).plot.bar(ax=ax, title="Frequency of point amenity for 3 closest POIs")

## Variety of closer POIs

In [None]:
# Only POIs at most this far from the stop region count as "close".
# NOTE(review): the unit is whatever the "distance" column uses — presumably meters; confirm.
max_distance = 500
# Per stop region with at least one close POI: share held by its most frequent amenity.
most_common_frequency = []
# Per stop region: number of close POIs (zeros included).
len_close_pois = []

# A single groupby pass replaces re-filtering the whole frame once per stop region.
# sort=False keeps the stop regions in order of first appearance, as the original loop did.
for sr_id, knn_pois in unique_sr_knn_pois.groupby("sr_id", sort=False):
    use_knn_pois = knn_pois[knn_pois["distance"] <= max_distance]

    n_close = len(use_knn_pois)
    len_close_pois.append(n_close)

    if n_close == 0:
        continue
    # value_counts sorts by descending frequency, so the first entry is the top amenity's share.
    most_common_frequency.append(use_knn_pois["amenity"].value_counts(normalize=True).iloc[0])

In [None]:
pd.Series(most_common_frequency).plot.hist(title="Frequency of most common amenities close to Stop Regions. Distance <= {}".format(max_distance))

In [None]:
pd.Series(len_close_pois).plot.hist(title="Frequency of POIs closer to Stop Region. Distance <= {}".format(max_distance))

## Plotting Close Neighbors

In [None]:
unique_sr_knn_pois.head()

In [None]:
# Stop regions of a single user (6086). The original line indexed the never-assigned name
# user_unique_sr_knn_pois (NameError); the filter result has to be assigned to it instead.
# NOTE(review): the comparison assumes the "user" column holds ints; if usernames are
# loaded as strings this selects no rows — confirm the dtype.
user_unique_sr_knn_pois = unique_sr_knn_pois[unique_sr_knn_pois["user"] == 6086]

# One marker per distinct stop-region coordinate.
p = plot.plot_stop_regions_centroids(user_unique_sr_knn_pois[["lat_sr", "lon_sr"]].drop_duplicates(),
                                     fill_color="navy", title="", lat_col="lat_sr", lon_col="lon_sr",
                                     width=700, height=500)
show(p)

In [None]:
# Draw the POIs in green, passing the figure `p` built in the previous cell (presumably so
# they are overlaid on the stop regions — confirm against plot.plot_poi).
# NOTE(review): this plots unique_sr_knn_pois, i.e. the POIs of all users, whereas `p` holds
# only one user's stop regions — confirm whether user_unique_sr_knn_pois was intended.
print(len(unique_sr_knn_pois))
p = plot.plot_poi(unique_sr_knn_pois, lat_col="lat_4326", lon_col="lon_4326", title="", 
                      figure=p, width=700, height=500, color="green")

show(p)

In [None]:
# NOTE(review): disabled cell (plots the POIs of each stop region onto `p`, one by one).
# The two trailing prints reference `n_sample`, which is only assigned in the (also
# disabled) "POIs distribution" cell below, so they would raise NameError if this cell
# were re-enabled on its own.
# for sr_id in unique_sr_knn_pois["sr_id"].drop_duplicates():
#     print(sr_id)
#     sr_data = unique_sr_knn_pois[unique_sr_knn_pois["sr_id"] == sr_id]
#     p = plot.plot_poi(sr_data, lat_col="lat_4326", lon_col="lon_4326", title="", 
#                       figure=p, width=700, height=550, color="green")
    
# show(p)
# print("Sample size: {}".format(n_sample))
# print("{}% of valid amenities".format(round(float(n_sample) / len(valid_pois), 2)))

## POIs distribution

In [None]:
# NOTE(review): disabled cell (plots a random sample of valid POIs, one plot_poi call per
# amenity). If re-enabled: the last print labels n_sample / len(valid_pois) with "%", but
# the value is a fraction rounded to 2 decimals, not a percentage.
# p = None

# n_sample = 10000

# plot_pois = valid_pois.sample(n_sample)

# for amenity in plot_pois["amenity"].drop_duplicates():
#     amenity_data = plot_pois[plot_pois["amenity"] == amenity]
#     p = plot.plot_poi(amenity_data, title="", figure=p, width=400, height=300)
    
# show(p)
# print("Sample size: {}".format(n_sample))
# print("{}% of valid amenities".format(round(float(n_sample) / len(valid_pois), 2)))