In [13]:
import h3
import polars as pl
import pandas as pd

In [14]:
# Name substituted into the input file paths below.
state = 'Texas'

# Input locations: the intermediate check-ins CSV for `state`, and the
# poi-encoder region-embeddings Parquet file for the same `state`.
checkins_csv_path = f'/home/victor-hugo/Documentos/HAVANA-2.0/data/intermediate/checkins/{state}.csv'
poi_encoder_parquet_path = f'/home/victor-hugo/Documentos/HAVANA-2.0/data/intermediate/region_embeddings/poi-encoder/{state}/64_dimension_9_resolution.parquet'

# Both tables are loaded as polars DataFrames; the Parquet file is read
# through the pyarrow backend.
checkins_df = pl.read_csv(checkins_csv_path)
poi_encoder_df = pl.read_parquet(poi_encoder_parquet_path, use_pyarrow=True)

In [15]:
def generate_h3_cell(row: dict, resolution: int = 9) -> str:
    """
    Generate the H3 cell index for a row's latitude and longitude.

    Written as a callback for ``pl.struct([...]).map_elements(...)``, which
    hands each struct value to the function as a Python ``dict``. Any object
    supporting ``row["latitude"]`` and ``row["longitude"]`` works as well.

    Args:
        row (dict): Mapping holding the "latitude" and "longitude" values
            that are forwarded to ``h3.latlng_to_cell``.
        resolution (int): H3 resolution of the returned cell. Defaults to 9,
            the value that was previously hard-coded, so existing
            single-argument callers are unaffected.

    Returns:
        str: H3 cell index containing the given coordinates.
    """
    return h3.latlng_to_cell(row["latitude"], row["longitude"], resolution)

In [16]:
# Tag every check-in with an H3 cell id. Latitude and longitude are packed
# into a struct so generate_h3_cell receives both values for each row.
h3_cell_expr = (
    pl.struct(["latitude", "longitude"])
    .map_elements(generate_h3_cell, return_dtype=pl.String)
)
checkins_df = checkins_df.with_columns(region_id=h3_cell_expr)

# Check-ins whose region_id has no match in the poi-encoder table.
checkins_without_region = checkins_df.join(
    poi_encoder_df,
    on="region_id",
    how="anti"
)

# Drop every check-in that shares a (userid, placeid) pair with an unmatched
# check-in, then remove the helper column before saving.
checkins_with_region = checkins_df.join(
    checkins_without_region,
    on=['userid', 'placeid'],
    how="anti"
).drop('region_id')

# NOTE(review): this overwrites the per-state check-ins CSV in place, so the
# unfiltered data is no longer on disk after this cell runs.
checkins_csv_path = f'/home/victor-hugo/Documentos/HAVANA-2.0/data/intermediate/checkins/{state}.csv'
checkins_with_region.write_csv(checkins_csv_path)

# From here on `checkins_df` is a pandas DataFrame (it was a polars frame
# above), re-read from the file that was just written.
checkins_df = pd.read_csv(checkins_csv_path)

In [17]:
# Number of distinct places per user, largest counts first.
places_quantity_per_user = (
    checkins_df
    .groupby("userid")
    .agg(placeid=("placeid", "nunique"))
    .sort_values(by=["placeid"], ascending=False)
    .reset_index()
)

# Keep only users who checked in at two or more distinct places.
has_enough_places = places_quantity_per_user["placeid"].ge(2)
places_quantity_per_user = places_quantity_per_user.loc[has_enough_places]

valid_users = places_quantity_per_user["userid"].unique()

# Restrict the check-ins to those users.
is_valid_user = checkins_df["userid"].isin(valid_users)
checkins_df = checkins_df.loc[is_valid_user]

# Persist the filtered check-ins (without the index) to the per-state CSV.
filtered_checkins_path = f'/home/victor-hugo/Documentos/HAVANA-2.0/data/intermediate/checkins/{state}.csv'
checkins_df.to_csv(filtered_checkins_path, index=False)