In [1]:
import pandas as pd
from sklearn.utils import resample
import polars as pl

In [2]:
def filter_checkins(checkins_df: pd.DataFrame, min_places: int = 4) -> pd.DataFrame:
    """
    Filter checkins data, keeping only users who visited at least
    ``min_places`` distinct places.

    Args:
        checkins_df (pd.DataFrame): Checkins data; must contain the
            ``userid`` and ``placeid`` columns.
        min_places (int): Minimum number of distinct ``placeid`` values a
            user must have in order to be kept. Defaults to 4.

    Returns:
        pd.DataFrame: The rows of ``checkins_df`` that belong to the kept
            users, in their original order and with their original index.
    """
    # Distinct places per user; the result is indexed by userid.
    places_per_user = checkins_df.groupby("userid")["placeid"].nunique()

    # The threshold applies to distinct places, not to the number of check-ins.
    valid_users = places_per_user[places_per_user >= min_places].index

    return checkins_df[checkins_df["userid"].isin(valid_users)]

In [3]:
def get_statistics(df_path: str, df_name: str) -> None:
    """
    Print summary statistics for a check-ins CSV file.

    The file is read with polars and fully duplicated rows are dropped
    before anything is counted.

    Args:
        df_path (str): Path of the CSV file; it must contain the ``userid``,
            ``placeid`` and ``category`` columns.
        df_name (str): Label printed in the report header.

    Returns:
        None: The statistics are printed to stdout.
    """
    df = pl.read_csv(df_path).unique()

    # A single aggregation pass per user feeds every per-user statistic
    # below, instead of regrouping the frame once per statistic.
    per_user = df.group_by("userid").agg(
        pl.len().alias("qt_checkins"),
        pl.n_unique("category").alias("qt_categoria"),
    )
    checkins_per_user = per_user.select("qt_checkins")
    categories_per_user = per_user.select("qt_categoria")

    n_unique_users = df.select("userid").unique().shape[0]
    number_pois = df.select("placeid").unique().shape[0]

    median_checkins_per_user = checkins_per_user.median().item()
    max_checkins_per_user = checkins_per_user.max().item()
    min_checkins_per_user = checkins_per_user.min().item()
    median_category_per_user = categories_per_user.median().item()

    print(f"Statistics for {df_name}")
    print(f"Number of check-ins: {df.shape[0]}")
    print(f"Number of unique users: {n_unique_users}")
    print(f"Number of unique POIs: {number_pois}")
    print(f"Median number of check-ins per user: {median_checkins_per_user}")
    print(f"Max number of check-ins per user: {max_checkins_per_user}")
    print(f"Min number of check-ins per user: {min_checkins_per_user}")
    print(f"Median number of categories per user: {median_category_per_user}")

# Florida

In [4]:
state = "Florida"
# NOTE(review): absolute, machine-specific location; adjust when running elsewhere.
complete_checkins_path = f"/home/victor-hugo/HAVANA-2.0/data/intermediate/checkins/{state}_complete.csv"
checkins_df = pd.read_csv(complete_checkins_path)

In [5]:
n_samples = 30000
checkins_df_list = []

# Undersample each category to at most `n_samples` check-ins so that no
# class dominates the dataset.
for class_label in checkins_df['category'].unique():
    checkins_category = checkins_df[checkins_df['category'] == class_label]

    checkins_resampled = resample(
        checkins_category,
        replace=False,
        # Sampling without replacement cannot draw more rows than exist:
        # cap the request so a small category is kept whole instead of
        # making resample raise ValueError.
        n_samples=min(n_samples, len(checkins_category)),
        random_state=42
    )

    checkins_df_list.append(checkins_resampled)

checkins_df = pd.concat(checkins_df_list)

In [6]:
# Drop users that do not pass the distinct-places filter.
checkins_df = checkins_df.pipe(filter_checkins)

In [7]:
# Rows per category after filtering, most frequent first.
category_counts = checkins_df["category"].value_counts()
category_counts

category
Shopping         29033
Community        28647
Outdoors         28216
Food             28125
Entertainment    28060
Nightlife        27988
Travel           25075
Name: count, dtype: int64

In [8]:
# Write the current checkins_df to the per-state CSV, without the index column.
# NOTE(review): absolute, machine-specific location; adjust when running elsewhere.
filtered_checkins_path = f"/home/victor-hugo/HAVANA-2.0/data/intermediate/checkins/{state}.csv"
checkins_df.to_csv(filtered_checkins_path, index=False)

In [9]:
# Print the summary statistics computed from the per-state CSV file.
get_statistics(
    f"/home/victor-hugo/HAVANA-2.0/data/intermediate/checkins/{state}.csv",
    state,
)

Statistics for Florida
Number of check-ins: 195092
Number of unique users: 6659
Number of unique POIs: 39411
Median number of check-ins per user: 11.0
Max number of check-ins per user: 2843
Min number of check-ins per user: 4
Median number of categories per user: 4.0


# Texas

In [5]:
state = "Texas"
# NOTE(review): absolute, machine-specific location; adjust when running elsewhere.
complete_checkins_path = f"/home/victor-hugo/HAVANA-2.0/data/intermediate/checkins/{state}_complete.csv"
checkins_df = pd.read_csv(complete_checkins_path)

In [6]:
n_samples = 30000
checkins_df_list = []

# Undersample each category to at most `n_samples` check-ins so that no
# class dominates the dataset.
for class_label in checkins_df['category'].unique():
    checkins_category = checkins_df[checkins_df['category'] == class_label]

    checkins_resampled = resample(
        checkins_category,
        replace=False,
        # Sampling without replacement cannot draw more rows than exist:
        # cap the request so a small category is kept whole instead of
        # making resample raise ValueError.
        n_samples=min(n_samples, len(checkins_category)),
        random_state=42
    )

    checkins_df_list.append(checkins_resampled)

checkins_df = pd.concat(checkins_df_list)
# Drop users that do not pass the distinct-places filter.
checkins_df = filter_checkins(checkins_df)

# Rows per category after filtering, most frequent first.
display(checkins_df['category'].value_counts())

category
Shopping         28094
Outdoors         27842
Community        27226
Food             26772
Entertainment    26319
Nightlife        26311
Travel           24435
Name: count, dtype: int64

In [7]:
# Write the current checkins_df to the per-state CSV, without the index column.
# NOTE(review): absolute, machine-specific location; adjust when running elsewhere.
filtered_checkins_path = f"/home/victor-hugo/HAVANA-2.0/data/intermediate/checkins/{state}.csv"
checkins_df.to_csv(filtered_checkins_path, index=False)

In [8]:
# Print the summary statistics computed from the per-state CSV file.
get_statistics(
    f"/home/victor-hugo/HAVANA-2.0/data/intermediate/checkins/{state}.csv",
    state,
)

Statistics for Texas
Number of check-ins: 186974
Number of unique users: 9214
Number of unique POIs: 52388
Median number of check-ins per user: 9.0
Max number of check-ins per user: 2184
Min number of check-ins per user: 4
Median number of categories per user: 4.0


# California

In [10]:
state = "California"
# NOTE(review): absolute, machine-specific location; adjust when running elsewhere.
complete_checkins_path = f"/home/victor-hugo/HAVANA-2.0/data/intermediate/checkins/{state}_complete.csv"
checkins_df = pd.read_csv(complete_checkins_path)

In [11]:
n_samples = 30000
checkins_df_list = []

# Undersample each category to at most `n_samples` check-ins so that no
# class dominates the dataset.
for class_label in checkins_df['category'].unique():
    checkins_category = checkins_df[checkins_df['category'] == class_label]

    checkins_resampled = resample(
        checkins_category,
        replace=False,
        # Sampling without replacement cannot draw more rows than exist:
        # cap the request so a small category is kept whole instead of
        # making resample raise ValueError.
        n_samples=min(n_samples, len(checkins_category)),
        random_state=42
    )

    checkins_df_list.append(checkins_resampled)

checkins_df = pd.concat(checkins_df_list)

In [12]:
# Drop users that do not pass the distinct-places filter.
checkins_df = checkins_df.pipe(filter_checkins)

In [13]:
# Rows per category after filtering, most frequent first.
category_counts = checkins_df["category"].value_counts()
category_counts

category
Shopping         28140
Community        27355
Outdoors         27307
Food             26909
Nightlife        26735
Entertainment    26479
Travel           24583
Name: count, dtype: int64

In [14]:
# Write the current checkins_df to the per-state CSV, without the index column.
# NOTE(review): absolute, machine-specific location; adjust when running elsewhere.
filtered_checkins_path = f"/home/victor-hugo/HAVANA-2.0/data/intermediate/checkins/{state}.csv"
checkins_df.to_csv(filtered_checkins_path, index=False)

In [15]:
# Print the summary statistics computed from the per-state CSV file.
get_statistics(
    f"/home/victor-hugo/HAVANA-2.0/data/intermediate/checkins/{state}.csv",
    state,
)

Statistics for California
Number of check-ins: 187500
Number of unique users: 8755
Number of unique POIs: 58087
Median number of check-ins per user: 10.0
Max number of check-ins per user: 1310
Min number of check-ins per user: 4
Median number of categories per user: 4.0
