Fuente de los datos: [Foursquare dataset "Global-scale Check-in Dataset with User Social Networks"](https://sites.google.com/site/yangdingqi/home/foursquare-dataset).

In [None]:
import sys
from pathlib import Path

# Base directory, three levels above the notebook's working directory.
AVES_ROOT = Path("..", "..", "..")

# Folder that holds the externally sourced Foursquare files.
DATA_PATH = AVES_ROOT.joinpath("data", "external", "foursquare")
DATA_PATH


In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import geopandas as gpd

# Figure resolution in dots per inch; the right value depends on your screen.
# (The original note cites 80 as the default value.)
mpl.rcParams.update({"figure.dpi": 120})
# This depends on which fonts are installed on the system.
# mpl.rcParams["font.family"] = "Fira Sans Extra Condensed"


In [None]:
# Study-area bounding box as [min_lon, min_lat, max_lon, max_lat].
bounds = [
    -70.790663,  # west (minimum longitude)
    -33.748251,  # south (minimum latitude)
    -70.709295,  # east (maximum longitude)
    -33.704992,  # north (maximum latitude)
]

In [None]:

from shapely.geometry import box
# Single-row GeoDataFrame whose only geometry is the bounding-box rectangle,
# tagged as WGS84 longitude/latitude.
zones = gpd.GeoDataFrame(
    geometry=[box(bounds[0], bounds[1], bounds[2], bounds[3])]
).set_crs("epsg:4326")
zones.head()


In [None]:
from aves.models.grid import H3Grid

# Build an H3 grid over the study area and keep only its GeoDataFrame.
grid = H3Grid.from_geodf(
    zones,
    extra_margin=0.05,
    grid_level=9,
).geodf
grid.plot()


In [None]:
import dask.dataframe as dd

# Lazily read the tab-separated POI table (the file has no header row).
checkin_pois = dd.read_csv(
    DATA_PATH / "dataset_WWW2019" / "raw_POIs.txt",
    names=["poi_id", "lat", "lon", "category", "country"],
    sep="\t",
    encoding="utf-8",
)
# Keep only the POIs inside the bounding box (edges included), then
# materialise the result as an in-memory pandas DataFrame.
checkin_pois = checkin_pois[
    (checkin_pois["lon"] >= bounds[0])
    & (checkin_pois["lon"] <= bounds[2])
    & (checkin_pois["lat"] >= bounds[1])
    & (checkin_pois["lat"] <= bounds[3])
].compute()

checkin_pois.head()


In [None]:
from aves.features.geo import to_point_geodataframe

# Turn the lon/lat columns of the POI table into point geometries.
pois = to_point_geodataframe(
    checkin_pois,
    latitude="lat",
    longitude="lon",
    drop=True,
)
pois


In [None]:
# Horizontal bar chart (log-scaled x axis) of the 75 most frequent POI categories.
(
    pois["category"]
    .value_counts()
    .sort_values()
    .tail(75)
    .plot(kind="barh", figsize=(7, 15), width=0.9, logx=True)
)


In [None]:
# Light grey grid cells as a backdrop, POIs drawn on top as small purple dots.
ax = grid.plot(facecolor="#efefef", edgecolor="white")
pois.plot(ax=ax, color="purple", marker=".", markersize=1, alpha=0.5)


In [None]:
# Attach to every POI the grid cell it falls within; POIs outside every cell
# are dropped by the inner join.
grid_pois = gpd.sjoin(
    left_df=pois,
    right_df=grid,
    how="inner",
    predicate="within",
)
grid_pois


In [None]:
from aves.visualization.figures import figure_from_geodataframe
from aves.visualization.maps import choropleth_map

fig, ax = figure_from_geodataframe(zones, height=7)

# Colour each grid cell by the number of POIs it contains.
choropleth_map(
    ax,
    grid.join(grid_pois.groupby("index_right").size().to_frame("n_pois")),
    "n_pois",
)


In [None]:
from aves.features.utils import logodds_ratio_with_uninformative_dirichlet_prior

# Cell x category table of POI counts (0 where a category is absent from a
# cell), passed through the log-odds-ratio helper.
zone_pois = logodds_ratio_with_uninformative_dirichlet_prior(
    grid_pois.groupby(["index_right", "category"]).size().unstack(fill_value=0)
)

# Diverging palette centred on 0, rows and columns clustered with Ward linkage.
sns.clustermap(zone_pois, cmap="PuOr_r", center=0, method="ward")


In [None]:
# Keep only the grid cells that contain POIs.
grid = grid.loc[zone_pois.index, :].copy()


In [None]:
from aves.visualization.figures import small_multiples_from_geodataframe

fig, axes = small_multiples_from_geodataframe(zones, 3, height=5)

# One map per category, each coloured by that category's column in zone_pois.
for ax, col in zip(axes, ("Home (private)", "Office", "Plaza")):
    choropleth_map(
        ax,
        grid.join(zone_pois),
        col,
        edgecolor="black",
        linewidth=0.1,
    )
    ax.set_title(col)


In [None]:
checkins = (
    dd.read_csv(
        DATA_PATH / "dataset_WWW2019" / "raw_Checkins_anonymized.txt",
        names=["user_id", "poi_id", "datetime", "delta_time"],
        sep="\t",
    )
    # We only care about check-ins at the POIs we already know; the inner join
    # drops every other check-in and brings in the POI attributes.
    .join(
        checkin_pois.set_index("poi_id"),
        how="inner",
        on="poi_id",
    )
    .compute()
)
checkins.head()


In [None]:
from aves.features.geo import to_point_geodataframe

# Turn the lon/lat columns of the check-in table into point geometries.
checkins_geo = to_point_geodataframe(
    checkins,
    latitude="lat",
    longitude="lon",
    drop=True,
)


In [None]:
# Horizontal bar chart (log-scaled x axis) of the 100 categories with the most check-ins.
(
    checkins_geo.groupby("category")
    .size()
    .sort_values()
    .tail(100)
    .plot(kind="barh", width=0.9, figsize=(7, 15), logx=True)
)


In [None]:
from aves.visualization.maps import bubble_map

fig, ax = figure_from_geodataframe(zones, height=7)

# Bubble per POI; "popularity" is the square root of its number of check-ins.
bubble_map(
    ax,
    pois.join(
        np.sqrt(checkins_geo.groupby("poi_id").size().to_frame("popularity")),
        on="poi_id",
    ),
    "popularity",
    alpha=0.1,
    scale=10,
    edgecolor="none",
)


In [None]:
# Attach to every check-in the grid cell it falls within; check-ins outside
# every cell are dropped by the inner join.
grid_checkins = gpd.sjoin(
    left_df=checkins_geo,
    right_df=grid,
    how="inner",
    predicate="within",
)
grid_checkins


In [None]:
fig, axes = small_multiples_from_geodataframe(zones, 3, height=5)

# Cell x category table of check-in counts (0 where a category is absent from
# a cell), passed through the log-odds-ratio helper.
grid_popularity = logodds_ratio_with_uninformative_dirichlet_prior(
    grid_checkins.groupby(["index_right", "category"]).size().unstack(fill_value=0)
)

# One map per category, each coloured by that category's column.
for ax, col in zip(axes, ("Home (private)", "Office", "Plaza")):
    choropleth_map(
        ax,
        grid.join(grid_popularity),
        col,
        edgecolor="black",
        linewidth=0.1,
    )
    ax.set_title(col)


Para mostrar viajes vamos a considerar los desplazamientos desde un lugar a otro. Usaremos un criterio simple, sin discriminar si los checkins se realizaron en el mismo día. Para ello haremos uso de dos pasos:

1. Definiremos una función _shift_ que concatena dos filas contiguas en la tabla.
2. Al ordenar la tabla por `user_id` y `datetime`, dos filas contiguas que tengan el mismo `user_id` representan un desplazamiento.

In [None]:
def shift(df):
    """Count consecutive check-in pairs ("trips") per user.

    Parameters
    ----------
    df : DataFrame
        Check-ins with at least the columns ``poi_id`` and ``user_id``. It must
        already be sorted by ``user_id`` and then chronologically, because rows
        are paired purely by position.

    Returns
    -------
    Series
        Named ``n_trips`` and indexed by ``(user_id, origin, destination)``;
        each value is how many times that user went from ``origin`` straight to
        ``destination``. Pairs spanning two users, and pairs whose two venues
        are the same, are not counted.
    """
    # Use a fresh positional index: the incoming index may hold repeated labels,
    # and aligning on repeated labels would pair unrelated rows.
    ordered = df.reset_index(drop=True)
    pairs = ordered[["user_id"]].assign(
        origin=ordered["poi_id"],
        # shift(-1) pulls the NEXT row up, so the origin is the earlier
        # check-in and the destination is the one that follows it.
        destination=ordered["poi_id"].shift(-1),
        user_id_d=ordered["user_id"].shift(-1),
    )
    # The last row has no successor; dropna removes it (and any incomplete row).
    pairs = pairs.dropna()
    same_user = pairs["user_id"] == pairs["user_id_d"]
    moved = pairs["origin"] != pairs["destination"]
    trips = (
        pairs[same_user & moved]
        .groupby(["user_id", "origin", "destination"])
        .size()
    )
    trips.name = "n_trips"
    return trips


In [None]:
# The check-in file is read without date parsing, so "datetime" holds plain
# strings, and string order is only chronological for ISO-like formats. Parse
# the column before sorting so that consecutive rows of a user really are
# consecutive in time (shift pairs rows purely by position).
# NOTE(review): the dataset page describes timestamps such as
# "Tue Apr 03 18:00:06 +0000 2012" — confirm pd.to_datetime parses them as expected.
user_trip_counts = (
    grid_checkins.assign(
        datetime=lambda x: pd.to_datetime(x["datetime"], utc=True)
    )
    .sort_values(["user_id", "datetime"])
    .pipe(shift)
    .reset_index()
)
user_trip_counts.head()


In [None]:
# Annotate each trip with the grid cell and category of both endpoints. The
# POI lookup columns are renamed before each join so they land with the
# origin_* / destination_* names directly.
user_trips_grid = user_trip_counts.join(
    grid_pois.set_index("poi_id")[["index_right", "category"]].rename(
        columns={"index_right": "origin_cell_id", "category": "origin_category"}
    ),
    on="origin",
).join(
    grid_pois.set_index("poi_id")[["index_right", "category"]].rename(
        columns={
            "index_right": "destination_cell_id",
            "category": "destination_category",
        }
    ),
    on="destination",
)
user_trips_grid.head()


Visualicemos los flujos en flowmap.blue. Visitar:

https://flowmap.blue/in-browser

Y luego copiar en cada caja el output de las celdas que contengan `to_csv`.

In [None]:
# Location table for flowmap.blue: one row per grid cell with id, name, lat, lon.
locations = (
    # Name the index explicitly so reset_index always yields an "id" column.
    # When grid has been re-indexed with a named Index, its index is no longer
    # anonymous, reset_index does not create a column called "index", and a
    # rename of "index" would silently do nothing.
    grid.rename_axis("id")
    .reset_index()
    .rename(columns={"h3_cell_id": "name"})
    # Cell centroids provide the coordinates of each location.
    .assign(lat=lambda x: x.centroid.y, lon=lambda x: x.centroid.x)
    .drop(columns="geometry")
)
locations


In [None]:
# Flow table for flowmap.blue: total trips between every pair of grid cells,
# with columns origin, dest and count.
flows = (
    user_trips_grid.groupby(["origin_cell_id", "destination_cell_id"])["n_trips"]
    .sum()
    .reset_index(name="count")
    .rename(columns={"origin_cell_id": "origin", "destination_cell_id": "dest"})
)
flows


In [None]:
# With no target path, to_csv returns the CSV text; print it for copy-pasting.
print(locations.to_csv(path_or_buf=None, index=False))


In [None]:
# With no target path, to_csv returns the CSV text; print it for copy-pasting.
print(flows.to_csv(path_or_buf=None, index=False))


Otro tipo de O-D: por actividad.

In [None]:
# Total trips leaving each category and arriving at each category, both in
# ascending order.
top_origins = (
    user_trips_grid.groupby("origin_category")["n_trips"]
    .sum()
    .sort_values()
)
top_destinations = (
    user_trips_grid.groupby("destination_category")["n_trips"]
    .sum()
    .sort_values()
)


In [None]:
# Trips per (origin category, destination category) pair, most frequent first.
# Trips that stay within the same category are left out.
popular_transitions = (
    user_trips_grid.loc[
        lambda t: t["origin_category"].ne(t["destination_category"])
    ]
    .groupby(["origin_category", "destination_category"])["n_trips"]
    .sum()
    .sort_values(ascending=False)
)
popular_transitions


In [None]:
# Clustered heatmap of log(1 + trips) between categories, restricted to
# categories with more than 5 trips as origin (rows) and as destination (columns).
sns.clustermap(
    popular_transitions.unstack(fill_value=0)
    # reindex instead of .loc[...][...]: popular_transitions leaves out
    # same-category trips while top_origins / top_destinations count them, so
    # a selected category can be missing from this table. Label-based
    # selection would raise KeyError; reindex adds it as an all-zero row or
    # column, and gives the same result when every label is present.
    .reindex(
        index=top_origins[top_origins > 5].index,
        columns=top_destinations[top_destinations > 5].index,
        fill_value=0,
    )
    .pipe(lambda x: np.log(1 + x)),
    cmap="inferno",
    figsize=(24, 24),
)
