In [2]:
import os
import pandas as pd

# Load the two input tables that the cells below pass to GetSubsequence and
# compute_and_plot_topk_visit_ratio. Both paths are relative to the working directory.
GeoLife_df = pd.read_csv('Data/Output/all_users_context_combined.csv')
MoreUser_df = pd.read_csv('Data/MoreUser/all.csv')

In [None]:

def GetSubsequence(df, topk=3, savePath="./Data/Output/split_by_topk_"):
    """Split each user's rows by whether their grid is one of that user's top-k grids.

    For every ``userID`` group, the ``topk`` most frequent values of the
    ``grid`` column are determined with ``value_counts(dropna=False)`` (so a
    missing grid competes like any other value). Rows whose grid is in that
    set go to the "high frequency" output, all other rows to the "low
    frequency" output. Row order within a user is the input order; users
    appear in the merged files in order of first appearance.

    Files written (``<root>`` is ``savePath`` with ``topk`` appended):

    * ``<root>/high_freq/user_<userID>_high_top<topk>.csv``
    * ``<root>/low_freq/user_<userID>_low_top<topk>.csv``
    * ``<root>/Routine_top<topk>.csv``     -- all high-frequency rows
    * ``<root>/Nonroutine_top<topk>.csv``  -- all low-frequency rows

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``userID`` and ``grid`` columns. A column named
        ``"Unnamed: 0"`` is dropped from the output. The caller's frame is
        never modified.
    topk : int, default 3
        Number of most frequent grids per user that count as high frequency.
    savePath : str, default "./Data/Output/split_by_topk_"
        Prefix of the output root directory; ``topk`` is appended to it.

    Returns
    -------
    None
        Results are written to disk and the output locations are printed.

    Notes
    -----
    With pandas' default ``groupby`` settings, rows whose ``userID`` is
    missing form no group and are therefore not written anywhere.
    An input without any user group (e.g. an empty frame) produces
    header-only merged files instead of raising.
    """
    if "Unnamed: 0" in df.columns:
        df = df.drop(columns=["Unnamed: 0"])

    out_root = savePath + f"{topk}"
    high_dir = os.path.join(out_root, "high_freq")
    low_dir = os.path.join(out_root, "low_freq")
    os.makedirs(high_dir, exist_ok=True)
    os.makedirs(low_dir, exist_ok=True)

    high_all = []
    low_all = []

    for user_id, g in df.groupby("userID", sort=False):
        grid_counts = g["grid"].value_counts(dropna=False)
        topk_grids = grid_counts.head(topk).index.tolist()

        # Boolean filtering keeps the rows in their original relative order,
        # so no helper index column or re-sorting is needed. Build the mask
        # once and use it for both halves.
        is_high = g["grid"].isin(topk_grids)
        g_high = g[is_high]
        g_low = g[~is_high]

        g_high.to_csv(
            os.path.join(high_dir, f"user_{user_id}_high_top{topk}.csv"),
            index=False
        )
        g_low.to_csv(
            os.path.join(low_dir, f"user_{user_id}_low_top{topk}.csv"),
            index=False
        )

        high_all.append(g_high)
        low_all.append(g_low)

    # pd.concat raises ValueError on an empty list; fall back to a zero-row
    # frame with the input's columns so the merged files are still produced.
    no_rows = df.iloc[0:0]
    high_all_df = pd.concat(high_all, ignore_index=True) if high_all else no_rows
    low_all_df = pd.concat(low_all, ignore_index=True) if low_all else no_rows

    routine_path = os.path.join(out_root, f"Routine_top{topk}.csv")
    nonroutine_path = os.path.join(out_root, f"Nonroutine_top{topk}.csv")
    high_all_df.to_csv(routine_path, index=False)
    low_all_df.to_csv(nonroutine_path, index=False)

    print("Done.")
    print("Per-user files saved to:")
    print("  ", high_dir)
    print("  ", low_dir)
    print("Merged files saved to:")
    print("  ", routine_path)
    print("  ", nonroutine_path)


In [None]:
# Split the GeoLife table by each user's 3 most frequent grids. With the default
# savePath the CSVs are written under ./Data/Output/split_by_topk_3.
GetSubsequence(GeoLife_df, topk=3)

In [None]:
# Same top-3 split for the MoreUser table; the CSVs are written under
# ./Data/MoreUser/split_by_topk_3.
GetSubsequence(MoreUser_df, topk=3, savePath="./Data/MoreUser/split_by_topk_")
# Recorded runtime of this cell: about 54.0 s.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def _topk_share(grids, topk, dropna):
    """Return the share of counted entries in ``grids`` taken by its ``topk`` most frequent values."""
    counts = grids.value_counts(dropna=dropna)
    total = counts.sum()
    # Nothing counted (e.g. every grid missing while dropna=True): report 0.0
    # instead of dividing by zero.
    return counts.head(topk).sum() / total if total > 0 else 0.0


def compute_and_plot_topk_visit_ratio(
    df,
    user_col="userID",
    grid_col="grid",
    topk=3,
    bins=20,
    dropna=True
):
    """Compute each user's top-k visit ratio and plot its distribution.

    For every group of ``user_col`` the rows are counted per ``grid_col``
    value. The visit ratio is the count of the ``topk`` most frequent values
    divided by the count of all values (0.0 when nothing was counted). A
    histogram of the ratios is shown with ``plt.show()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named by ``user_col`` and ``grid_col``.
    user_col : str, default "userID"
        Column identifying the user.
    grid_col : str, default "grid"
        Column identifying the visited grid.
    topk : int, default 3
        Number of most frequent grids that make up the numerator.
    bins : int, default 20
        Passed to the histogram as its ``bins`` argument.
    dropna : bool, default True
        Forwarded to ``value_counts``. The default ignores rows with a
        missing grid, as before; pass ``False`` to count a missing grid as a
        value of its own.

    Returns
    -------
    pandas.DataFrame
        One row per user with the columns ``user_col`` and ``"visit_ratio"``,
        in ``groupby``'s default (sorted key) order. The two columns are
        present even when ``df`` contains no users.
    """
    records = [
        {user_col: user_id, "visit_ratio": _topk_share(g[grid_col], topk, dropna)}
        for user_id, g in df.groupby(user_col)
    ]

    # Naming the columns explicitly keeps "visit_ratio" available when there
    # are no records; pd.DataFrame([]) would have no columns and the
    # histogram lookup below would raise KeyError.
    result_df = pd.DataFrame(records, columns=[user_col, "visit_ratio"])

    fig, ax = plt.subplots()
    ax.hist(result_df["visit_ratio"], bins=bins)
    ax.set_xlabel(f"Top-{topk} Visit Ratio")
    ax.set_ylabel("Number of Users")
    ax.set_title(f"Distribution of Top-{topk} Visit Ratio Across Users")
    plt.show()

    return result_df


In [None]:
# Histogram of per-user top-3 visit ratios for GeoLife (topk defaults to 3).
# The call is the cell's last expression, so the returned per-user table is displayed too.
compute_and_plot_topk_visit_ratio(GeoLife_df)

In [None]:
# GeoLife again, now using each user's 5 most frequent grids as the numerator.
compute_and_plot_topk_visit_ratio(GeoLife_df, topk=5)

In [None]:
# Histogram of per-user top-3 visit ratios for the MoreUser table (topk defaults to 3).
compute_and_plot_topk_visit_ratio(MoreUser_df)

In [None]:
# MoreUser table, using each user's 5 most frequent grids as the numerator.
compute_and_plot_topk_visit_ratio(MoreUser_df, topk=5)