In [43]:
import polars as pl
import h3
import ast
import pandas as pd

In [44]:
# Load the New York check-ins CSV into a Polars DataFrame.
checkins_df = pl.read_csv(source='/home/victor-hugo/Documentos/HAVANA-2.0/data/intermediate/checkins/New York.csv')

In [45]:
def generate_h3_cell(row: dict, resolution: int = 9) -> str:
    """
    Generate the H3 cell that contains a row's latitude/longitude.

    This is meant to be applied through ``map_elements`` on a Polars struct
    column, which hands each row to the function as a plain ``dict`` keyed by
    field name (not as a Series).

    No validation is performed here: the coordinate values are passed to
    ``h3.latlng_to_cell`` unchanged.

    Args:
        row (dict): Mapping with "latitude" and "longitude" entries.
        resolution (int): H3 resolution of the returned cell. Defaults to 9,
            the value that was previously hard-coded.

    Returns:
        str: H3 cell index as returned by ``h3.latlng_to_cell``.
    """
    return h3.latlng_to_cell(row["latitude"], row["longitude"], resolution)

In [46]:
# Attach a "region_id" column: each check-in's coordinates are packed into a
# struct and handed row by row to generate_h3_cell.
checkins_df = checkins_df.with_columns(
    pl.struct("latitude", "longitude")
    .map_elements(generate_h3_cell, return_dtype=pl.String)
    .alias("region_id")
)

In [47]:
# Load the POI-encoder region embeddings for New York (the file name suggests
# 64-dimensional vectors at H3 resolution 9 — TODO confirm against the producer).
poi_encoder_df = pl.read_parquet(
    source='/home/victor-hugo/Documentos/HAVANA-2.0/data/intermediate/region_embeddings/poi-encoder/New York/64_dimension_9_resolution.parquet',
    use_pyarrow=True,
)

In [48]:
# Check-ins whose region_id has no matching row in the region embeddings.
checkins_missing_embedding = checkins_df.join(
    poi_encoder_df,
    on="region_id",
    how="anti",
)

# Drop every check-in sharing a (userid, placeid) pair with one of those rows.
# NOTE(review): if each (userid, placeid) pair always maps to a single
# region_id, this is equivalent to one semi-join on "region_id" — confirm
# before simplifying.
checkins_with_embedding = checkins_df.join(
    checkins_missing_embedding,
    on=['userid', 'placeid'],
    how="anti",
)

# region_id was only needed for the filtering above, so it is not persisted.
# The result is written to the same path the check-ins were loaded from.
checkins_with_embedding.drop('region_id').write_csv(
    '/home/victor-hugo/Documentos/HAVANA-2.0/data/intermediate/checkins/New York.csv'
)

In [49]:
# Re-read the check-ins CSV, this time with pandas: from this point on
# `checkins_df` is a pandas DataFrame.
checkins_df = pd.read_csv(filepath_or_buffer='/home/victor-hugo/Documentos/HAVANA-2.0/data/intermediate/checkins/New York.csv')

In [50]:
# Distinct places visited per user, ordered from most to fewest, restricted to
# users who were seen at two or more different places.
places_quantity_per_user = (
    checkins_df.groupby("userid")[["placeid"]]
    .nunique()
    .sort_values(by="placeid", ascending=False)
    .reset_index()
    .loc[lambda counts: counts["placeid"] >= 2]
)

# Ids of the users that satisfy the minimum-places requirement.
valid_users = places_quantity_per_user["userid"].unique()

# Keep only the check-ins that belong to those users.
checkins_df = checkins_df.loc[checkins_df["userid"].isin(valid_users)]

In [51]:
# Persist the current check-ins to CSV, without writing the DataFrame index.
checkins_df.to_csv(
    path_or_buf='/home/victor-hugo/Documentos/HAVANA-2.0/data/intermediate/checkins/New York.csv',
    index=False,
)