In [1]:
import polars as pl

In [2]:
# Load the intermediate Florida check-ins CSV into a polars DataFrame.
# NOTE(review): `florida` is not referenced by any other cell visible in this
# notebook (get_statistics re-reads its input from disk) — confirm whether
# this load is still needed.
florida = pl.read_csv(
    '/home/victor-hugo/HAVANA-2.0/data/intermediate/checkins/Florida.csv'
)

In [13]:
# One-off conversion of the Texas hex2vec region-embedding file
# (64 dimensions, resolution 7, going by the file name) from CSV to Parquet.
# pyarrow is requested for both the read and the write.
pl.read_csv(
    '/home/victor-hugo/HAVANA-2.0/data/intermediate/region_embeddings/hex2vec/Texas/64_dimension_7_resolution.csv',
    use_pyarrow=True,
).write_parquet(
    '/home/victor-hugo/HAVANA-2.0/data/intermediate/region_embeddings/hex2vec/Texas/64_dimension_7_resolution.parquet',
    use_pyarrow=True,
)

In [24]:
def get_statistics(df_path: str, df_name: str) -> None:
    """Print summary statistics for a check-in CSV file.

    The file is read with polars and fully duplicated rows are dropped
    before anything is counted. The report is written to stdout; nothing
    is returned.

    Args:
        df_path: Path of the CSV file to read. It must contain ``userid``
            and ``placeid`` columns.
        df_name: Label shown in the header line of the report.
    """
    checkins = pl.read_csv(df_path).unique()

    # One row per user holding that user's number of (deduplicated) check-ins.
    checkins_per_user = checkins.group_by("userid").agg(
        pl.len().alias("qt_checkins")
    )

    total_checkins = checkins.height
    n_unique_users = checkins.get_column("userid").n_unique()
    number_pois = checkins.get_column("placeid").n_unique()
    median_checkins_per_user = (
        checkins_per_user.get_column("qt_checkins").median()
    )

    print(f"Statistics for {df_name}")
    print(f"Number of check-ins: {total_checkins}")
    print(f"Number of unique users: {n_unique_users}")
    print(f"Number of unique POIs: {number_pois}")
    print(f"Median number of check-ins per user: {median_checkins_per_user}")

# Texas

In [26]:
# Texas: CSV under data/raw/checkins_local.
get_statistics(
    df_name='Texas Reduced',
    df_path='/home/victor-hugo/HAVANA-2.0/data/raw/checkins_local/checkins_local_Texas.csv',
)

Statistics for Texas Reduced
Number of check-ins: 166065
Number of unique users: 550
Number of unique POIs: 56716
Median number of check-ins per user: 21.5


In [27]:
# Texas: CSV under data/raw/checkins_separated_categories.
get_statistics(
    df_name='Texas Big Data',
    df_path='/home/victor-hugo/HAVANA-2.0/data/raw/checkins_separated_categories/checkins_Texas.csv',
)

Statistics for Texas Big Data
Number of check-ins: 3351025
Number of unique users: 37522
Number of unique POIs: 135570
Median number of check-ins per user: 13.0


In [28]:
# Texas: CSV under data/intermediate/checkins.
get_statistics(
    df_name='Texas Filtered',
    df_path='/home/victor-hugo/HAVANA-2.0/data/intermediate/checkins/Texas.csv',
)

Statistics for Texas Filtered
Number of check-ins: 3345090
Number of unique users: 32323
Number of unique POIs: 135469
Median number of check-ins per user: 19.0


In [37]:
# Texas: texas_checkin_cleaned.csv at the project root.
get_statistics(
    df_name='Texas Artigo',
    df_path='/home/victor-hugo/HAVANA-2.0/texas_checkin_cleaned.csv',
)

Statistics for Texas Artigo
Number of check-ins: 3057075
Number of unique users: 19219
Number of unique POIs: 71575
Median number of check-ins per user: 49.0


# Florida

In [29]:
# Florida: CSV under data/raw/checkins_local.
get_statistics(
    df_name='Florida Reduced',
    df_path='/home/victor-hugo/HAVANA-2.0/data/raw/checkins_local/checkins_local_Florida.csv',
)

Statistics for Florida Reduced
Number of check-ins: 12297
Number of unique users: 323
Number of unique POIs: 5097
Median number of check-ins per user: 8.0


In [30]:
# Florida: CSV under data/raw/checkins_separated_categories.
get_statistics(
    df_name='Florida Big Data',
    df_path='/home/victor-hugo/HAVANA-2.0/data/raw/checkins_separated_categories/checkins_Florida.csv',
)

Statistics for Florida Big Data
Number of check-ins: 989677
Number of unique users: 20301
Number of unique POIs: 65009
Median number of check-ins per user: 7.0


In [31]:
# Florida: CSV under data/intermediate/checkins.
get_statistics(
    df_name='Florida Filtered',
    df_path='/home/victor-hugo/HAVANA-2.0/data/intermediate/checkins/Florida.csv',
)

Statistics for Florida Filtered
Number of check-ins: 985739
Number of unique users: 16839
Number of unique POIs: 64946
Median number of check-ins per user: 11.0


In [32]:
# Florida: florida_checkin_cleaned.csv at the project root.
get_statistics(
    df_name='Florida Artigo',
    df_path='/home/victor-hugo/HAVANA-2.0/florida_checkin_cleaned.csv',
)

Statistics for Florida Artigo
Number of check-ins: 859201
Number of unique users: 10036
Number of unique POIs: 31640
Median number of check-ins per user: 23.0


# California

In [33]:
# California: CSV under data/raw/checkins_local.
get_statistics(
    df_name='California Reduced',
    df_path='/home/victor-hugo/HAVANA-2.0/data/raw/checkins_local/checkins_local_California.csv',
)

Statistics for California Reduced
Number of check-ins: 103257
Number of unique users: 664
Number of unique POIs: 34772
Median number of check-ins per user: 19.0


In [34]:
# California: CSV under data/raw/checkins_separated_categories.
get_statistics(
    df_name='California Big Data',
    df_path='/home/victor-hugo/HAVANA-2.0/data/raw/checkins_separated_categories/checkins_California.csv',
)

Statistics for California Big Data
Number of check-ins: 2534352
Number of unique users: 36106
Number of unique POIs: 148314
Median number of check-ins per user: 11.0


In [35]:
# California: CSV under data/intermediate/checkins.
get_statistics(
    df_name='California Filtered',
    df_path='/home/victor-hugo/HAVANA-2.0/data/intermediate/checkins/California.csv',
)

Statistics for California Filtered
Number of check-ins: 2528690
Number of unique users: 30959
Number of unique POIs: 148199
Median number of check-ins per user: 15.0


In [36]:
# California: california_checkin_cleaned.csv at the project root.
get_statistics(
    df_name='California Artigo',
    df_path='/home/victor-hugo/HAVANA-2.0/california_checkin_cleaned.csv',
)

Statistics for California Artigo
Number of check-ins: 2247965
Number of unique users: 18083
Number of unique POIs: 76134
Median number of check-ins per user: 36.0
