In [None]:
import pandas as pd
from datetime import datetime, timedelta
from openlocationcode import openlocationcode as olc

# Filter out users and POIs with low frequency
def do_filter(df, poi_min_freq=10, user_min_freq=10):
    """Drop check-ins of rarely visited POIs and of rarely active users.

    The two thresholds are applied one after the other, in a single pass:
    POIs are filtered first, and user activity is then counted only on the
    rows that survived the POI filter. The input frame is left untouched.

    Args:
        df: Check-in table containing at least the 'Uid' and 'Pid' columns.
        poi_min_freq: Minimum number of check-in rows (non-null 'Uid') a POI
            must have to be kept.
        user_min_freq: Minimum number of remaining check-in rows (non-null
            'Pid') a user must have to be kept.

    Returns:
        A new DataFrame with the surviving rows; the temporary 'PoiFreq' and
        'UserFreq' helper columns are removed before returning.
    """
    frequent_pois = (
        df.assign(PoiFreq=lambda d: d.groupby('Pid')['Uid'].transform('count'))
          .loc[lambda d: d['PoiFreq'] >= poi_min_freq]
    )

    # User counts are computed after the POI filter, so they reflect only
    # check-ins at POIs that were kept.
    active_users = (
        frequent_pois
        .assign(UserFreq=lambda d: d.groupby('Uid')['Pid'].transform('count'))
        .loc[lambda d: d['UserFreq'] >= user_min_freq]
    )

    return active_users.drop(columns=['PoiFreq', 'UserFreq'])


def get_pluscode(latitude, longitude, prefix_len=6):
    """Return the leading characters of a coordinate's Plus Code.

    The full code is produced by ``olc.encode`` and then truncated, so that
    nearby coordinates sharing the same prefix fall into the same region key.

    Args:
        latitude: Latitude passed to ``olc.encode``.
        longitude: Longitude passed to ``olc.encode``.
        prefix_len: Number of leading characters to keep. Defaults to 6,
            the value this notebook has always used.

    Returns:
        The first ``prefix_len`` characters of the encoded Plus Code.
    """
    return olc.encode(latitude, longitude)[:prefix_len]

import os  # needed below to derive and create the output directory

file_name = "datasets/NYC.txt"

# Tab-separated check-in dump without a header row; column names are supplied here.
df = pd.read_csv(file_name, sep="\t", encoding='latin-1', header=None, names=[
    "User ID", "Venue ID", "Venue Category ID", "Venue Category Name", "Latitude", "Longitude", "Timezone Offset", "UTC Time"
])

# Coarse spatial region of every check-in (truncated Plus Code).
# Iterating the two coordinate columns directly avoids building a Series per row.
df["Region"] = [
    get_pluscode(lat, lon) for lat, lon in zip(df["Latitude"], df["Longitude"])
]

df["UTC Time"] = pd.to_datetime(df["UTC Time"], format="%a %b %d %H:%M:%S %z %Y")

# "Timezone Offset" is in minutes; shift the UTC timestamp by it to get the
# local wall-clock time, stored at minute resolution as text.
local_time = df["UTC Time"] + pd.to_timedelta(df["Timezone Offset"], unit="min")
df["Local Time"] = local_time.dt.strftime("%Y-%m-%d %H:%M")

# Rename by name (not by position) and keep only the columns used downstream.
df = df.rename(columns={
    "User ID": "Uid",
    "Venue ID": "Pid",
    "Venue Category Name": "Catname",
    "Local Time": "Time",
})
df = df[["Uid", "Pid", "Catname", "Region", "Time"]]

filtered_df = do_filter(df, poi_min_freq=10, user_min_freq=10)

# "datasets/NYC.txt" -> directory "datasets/NYC/" and file stem "NYC".
# splitext only strips the final extension, so dots elsewhere in the path are safe.
dataset_dir, _ = os.path.splitext(file_name)
datapath = dataset_dir + "/"
outname = os.path.basename(dataset_dir)

# The per-dataset directory does not exist on a fresh checkout; to_csv would fail without it.
os.makedirs(datapath, exist_ok=True)
filtered_df.to_csv(f"{datapath}{outname}.csv", index=False)

In [None]:
import pandas as pd
import random
import os

dataset = "NYC"
data_dir = f"datasets/{dataset}"
df = pd.read_csv(f"{data_dir}/{dataset}.csv")

# Fixed seed so the shuffled ID assignment -- and every file derived from
# data.csv -- is identical on each run of this cell. A private generator is
# used so the global `random` state is left alone.
ID_MAP_SEED = 42
id_rng = random.Random(ID_MAP_SEED)

# For each categorical column: shuffle its distinct values, number them from 1,
# and replace the column by those integer IDs.
id_maps = {}
for column in ("Uid", "Pid", "Catname", "Region"):
    originals = list(df[column].unique())
    id_rng.shuffle(originals)
    id_maps[column] = {orig: new_id for new_id, orig in enumerate(originals, start=1)}
    df[column] = df[column].map(id_maps[column])

# Keep the individual mapping names available to later cells.
uid_map = id_maps["Uid"]
pid_map = id_maps["Pid"]
cat_map = id_maps["Catname"]
reg_map = id_maps["Region"]

os.makedirs(data_dir, exist_ok=True)

# One lookup table per column: uid_mapping.csv, pid_mapping.csv,
# catname_mapping.csv and region_mapping.csv.
for column, mapping in id_maps.items():
    pd.DataFrame(
        list(mapping.items()),
        columns=[f"Original_{column}", f"Mapped_{column}"],
    ).to_csv(f"{data_dir}/{column.lower()}_mapping.csv", index=False)

df.to_csv(f"{data_dir}/data.csv", index=False)

In [None]:
import pandas as pd
from collections import Counter, defaultdict
import networkx as nx
from datetime import datetime

dataset = 'NYC'
file_name = f"datasets/{dataset}/data.csv"

df = pd.read_csv(file_name)
df["Time"] = pd.to_datetime(df["Time"])

# The per-user sequences built below are used to find previous/next POIs, so
# rows must be in chronological order. Sort on the full timestamp *before* it
# is reduced to the hour of day; a stable sort keeps the file order for
# check-ins that share the same timestamp.
df = df.sort_values("Time", kind="mergesort").reset_index(drop=True)

# From here on only the hour of day (0-23) is needed.
df["Time"] = df["Time"].dt.hour

# One row per user with that user's visited POIs and categories, in visit order.
poi_sequence = df.groupby("Uid").agg({
    "Pid": list,
    "Catname": list
}).reset_index()


def get_forward_neighbors(df, column, min_freq=1):
    """List, for every item, the items that directly follow it in a sequence.

    Args:
        df: Frame whose ``column`` holds one sequence (e.g. a list) per row.
        column: Name of the sequence column; also used as the name of the
            item column in the result.
        min_freq: A successor is kept only if the transition was observed at
            least this many times across all sequences.

    Returns:
        DataFrame with one row per distinct item found in any sequence and
        two columns: ``column`` (the item) and "neighbors" (its successors,
        most frequent first; an empty list when there are none).
    """
    transitions = defaultdict(Counter)
    seen_items = set()

    for seq in df[column]:
        seen_items.update(seq)
        # Pair every element with the one right after it.
        for src, dst in zip(seq, seq[1:]):
            transitions[src][dst] += 1

    rows = []
    for item in seen_items:
        # Items that never precede anything have no entry in `transitions`.
        successors = transitions[item] if item in transitions else {}
        kept = {dst: n for dst, n in successors.items() if n >= min_freq}
        # The sort is stable: equally frequent successors stay in first-seen order.
        ranked = sorted(kept, key=kept.get, reverse=True)
        rows.append((item, ranked))

    return pd.DataFrame(rows, columns=[column, "neighbors"])

def get_neighbors(df, column, min_freq=1):
    """List, for every item, the items adjacent to it in a sequence.

    Adjacency is symmetric: both the element right before and the element
    right after an item count as its neighbors.

    Args:
        df: Frame whose ``column`` holds one sequence (e.g. a list) per row.
        column: Name of the sequence column; also used as the name of the
            item column in the result.
        min_freq: A neighbor is kept only if the adjacency was observed at
            least this many times across all sequences.

    Returns:
        DataFrame with one row per distinct item found in any sequence and
        two columns: ``column`` (the item) and "neighbors" (adjacent items,
        most frequent first; an empty list when there are none).
    """
    adjacency = defaultdict(Counter)
    seen_items = set()

    for seq in df[column]:
        seen_items.update(seq)
        # Each consecutive pair contributes one count in both directions.
        for left, right in zip(seq, seq[1:]):
            adjacency[left][right] += 1
            adjacency[right][left] += 1

    rows = []
    for item in seen_items:
        # Items from single-element sequences have no entry in `adjacency`.
        counts = adjacency[item] if item in adjacency else {}
        kept = {other: n for other, n in counts.items() if n >= min_freq}
        # The sort is stable: equally frequent neighbors stay in first-seen order.
        ranked = sorted(kept, key=kept.get, reverse=True)
        rows.append((item, ranked))

    return pd.DataFrame(rows, columns=[column, "neighbors"])

def _first_value(series):
    """Return the first entry of a group's column, whatever its value."""
    return series.iloc[0]


def _values_seen_at_least(values, min_count=1):
    """Distinct values occurring at least ``min_count`` times, in first-seen order."""
    return [value for value, count in Counter(values).items() if count >= min_count]


# One row per POI: visiting users, category, region and the observed "Time" values.
poi_info = (
    df.groupby("Pid")
      .agg({
          "Uid": list,
          "Catname": _first_value,
          "Region": _first_value,
          "Time": list,
      })
      .reset_index()
)

# Collapse the per-POI lists to their distinct entries (threshold of 1 keeps everything seen).
for list_column in ("Uid", "Time"):
    poi_info[list_column] = poi_info[list_column].apply(_values_seen_at_least)

# Attach adjacency information derived from the per-user POI sequences.
poi_neighbors = get_neighbors(poi_sequence, "Pid", 1)
neighbor_lookup = poi_neighbors.set_index("Pid")["neighbors"]
poi_info["neighbors"] = poi_info["Pid"].map(neighbor_lookup)

forward_neighbors = get_forward_neighbors(poi_sequence, "Pid", 1)
forward_lookup = forward_neighbors.set_index("Pid")["neighbors"]
poi_info["forward_neighbors"] = poi_info["Pid"].map(forward_lookup)

poi_info.to_csv(f"datasets/{dataset}/poi_info.csv", index=False)

In [None]:
import pandas as pd

dataset = "NYC"
file_name = f"datasets/{dataset}/data.csv"

# Only the user, POI and timestamp columns are needed for the split.
df = pd.read_csv(file_name).loc[:, ['Uid', 'Pid', 'Time']]

# Order all check-ins globally by their 'Time' value.
# NOTE(review): 'Time' is compared as read from the CSV (text); presumably it is
# formatted 'YYYY-MM-DD HH:MM', so this matches chronological order -- confirm.
df = df.sort_values(by='Time')

# The earliest 80% of rows form the training set, the remainder the test set.
train_size = int(0.8 * len(df))
train_df, test_df = df.iloc[:train_size], df.iloc[train_size:]

def romove_users_pois_test(df_train, df_test):
    """Keep only test rows whose user and POI both occur in the training data.

    Args:
        df_train: Training check-ins with 'Uid' and 'Pid' columns.
        df_test: Test check-ins with 'Uid' and 'Pid' columns.

    Returns:
        The subset of ``df_test`` (original index preserved) in which 'Uid'
        appears in ``df_train['Uid']`` and 'Pid' appears in ``df_train['Pid']``.
    """
    known_user = df_test['Uid'].isin(df_train['Uid'].unique())
    known_poi = df_test['Pid'].isin(df_train['Pid'].unique())
    return df_test[known_user & known_poi]

# Discard test rows whose user or POI never occurs in the training portion.
test_df = romove_users_pois_test(train_df, test_df)

# Users that still have at least one row in the filtered test portion.
test_uids = test_df['Uid'].unique()

# For those users, take every row of the full (sorted) table, i.e. their
# complete history and not only the rows that fell into the test slice.
is_test_user = df['Uid'].isin(test_uids)
expanded_df = df.loc[is_test_user]

train_df.to_csv(f'datasets/{dataset}/train_data.csv', index=False)
expanded_df.to_csv(f'datasets/{dataset}/test_data.csv', index=False)


In [None]:
import pandas as pd
import random

def generate_train_sequences(df: pd.DataFrame, window_size: int, step_size: int, mask_prob: float,
                             max_history: int = 80, min_short_len: int = 10, seed=None) -> pd.DataFrame:
    """Build sliding-window training samples from per-user check-in histories.

    For every user the check-ins are sorted by time and cut to the most recent
    ``max_history`` events. Windows of ``window_size`` events are then taken
    from the end of the history backwards, moving ``step_size`` events at a
    time. In each window the last event is the prediction target and the
    preceding events are the input. With probability ``mask_prob`` a random
    input event is removed and used as the target instead, and the original
    target is appended to the input.

    Users with fewer than ``window_size`` events contribute a single sample
    (all but the last event -> last event) if they have at least
    ``min_short_len`` events, and nothing otherwise.

    Args:
        df: Check-ins with 'Uid', 'Pid' and 'Time' columns; 'Time' must be
            parseable by ``pd.to_datetime``. The frame is not modified.
        window_size: Number of events per window, target included.
        step_size: Distance, in events, between consecutive window ends.
        mask_prob: Probability of swapping the target with a random input event.
        max_history: Only this many of a user's most recent events are used.
        min_short_len: Minimum history length for users shorter than one window.
        seed: Optional seed for the masking decisions. When ``None`` the
            global ``random`` module state is used, as before.

    Returns:
        DataFrame with columns 'Uid', 'Pids', 'Times', 'Target', 'Target_time'.
        Times are formatted as '%Y-%m-%d %H:%M' strings. The frame is empty
        (but has these columns) when no user yields a sample.
    """
    columns = ['Uid', 'Pids', 'Times', 'Target', 'Target_time']
    # A private generator keeps seeded runs from disturbing global random state.
    rng = random if seed is None else random.Random(seed)

    df = df.copy()
    df['Time'] = pd.to_datetime(df['Time'])

    results = []

    for uid, group in df.groupby('Uid'):
        group = group.sort_values('Time')

        # Plain lists make the many window slices below cheap.
        pids = group['Pid'].tolist()
        times = group['Time'].tolist()

        # Keep only the most recent `max_history` events.
        excess = len(pids) - max_history
        if excess > 0:
            pids = pids[excess:]
            times = times[excess:]

        n = len(pids)

        if n < window_size:
            # Too short for a full window: one sample from the whole history.
            if n >= min_short_len:
                results.append({
                    'Uid': uid,
                    'Pids': pids[:-1],
                    'Times': times[:-1],
                    'Target': pids[-1],
                    'Target_time': times[-1]
                })
            continue

        # `last` is the index of the window's final (target) event; walk
        # backwards from the newest event while a full window still fits.
        for last in range(n - 1, window_size - 2, -step_size):
            first = last - window_size + 1

            input_pids = pids[first:last]
            input_times = times[first:last]
            target_pid = pids[last]
            target_time = times[last]

            # The random draw comes first so that the sequence of random
            # numbers consumed does not depend on the window contents.
            if rng.random() < mask_prob and len(input_pids) >= 1:
                drop_idx = rng.randint(0, len(input_pids) - 1)
                masked_pid = input_pids[drop_idx]
                masked_time = input_times[drop_idx]
                # Remove the masked event and move the original target to the end of the input.
                input_pids = input_pids[:drop_idx] + input_pids[drop_idx + 1:] + [target_pid]
                input_times = input_times[:drop_idx] + input_times[drop_idx + 1:] + [target_time]
                target_pid = masked_pid
                target_time = masked_time

            results.append({
                'Uid': uid,
                'Pids': input_pids,
                'Times': input_times,
                'Target': target_pid,
                'Target_time': target_time
            })

    train = pd.DataFrame(results, columns=columns)
    if train.empty:
        # Nothing to format; the .dt accessor would fail on an empty object column.
        return train

    train['Times'] = train['Times'].apply(lambda x: [t.strftime('%Y-%m-%d %H:%M') for t in x])
    train['Target_time'] = train['Target_time'].dt.strftime('%Y-%m-%d %H:%M')

    return train

def _eval_record(uid, pids, times):
    """Build one sample from an ordered slice: all but the last event -> last event."""
    return {
        'Uid': uid,
        'Pids': pids[:-1],
        'Times': times[:-1],
        'Target': pids[-1],
        'Target_time': times[-1]
    }


def _eval_frame(records):
    """Turn sample records into a DataFrame with times formatted as strings."""
    frame = pd.DataFrame(records, columns=['Uid', 'Pids', 'Times', 'Target', 'Target_time'])
    if frame.empty:
        # Nothing to format; the .dt accessor would fail on an empty object column.
        return frame
    frame['Times'] = frame['Times'].apply(lambda x: [t.strftime('%Y-%m-%d %H:%M') for t in x])
    frame['Target_time'] = frame['Target_time'].dt.strftime('%Y-%m-%d %H:%M')
    return frame


def generate_test_sequences(test_df: pd.DataFrame, window_size: int):
    """Build one validation and one test sample per user from check-in histories.

    Each user's check-ins are sorted by time. The test sample predicts the
    user's last event from the events before it; the validation sample
    predicts the second-to-last event in the same way, so it never sees the
    test target.

    * Fewer than ``window_size`` events: both samples use the whole available
      history, provided the user has more than 2 events; otherwise the user
      is skipped.
    * At least ``window_size`` events: the test sample uses the last
      ``window_size`` events. A validation sample (the ``window_size`` events
      ending at the second-to-last one) is added only if the user has at
      least ``window_size + 1`` events.

    Args:
        test_df: Check-ins with 'Uid', 'Pid' and 'Time' columns; 'Time' must
            be parseable by ``pd.to_datetime``. The frame is not modified.
        window_size: Number of events per sample, target included.

    Returns:
        ``(val_df, test_df)``: two DataFrames with columns 'Uid', 'Pids',
        'Times', 'Target', 'Target_time'; times are '%Y-%m-%d %H:%M' strings.
        Either frame may be empty (but still has these columns).
    """
    test_df = test_df.copy()
    test_df['Time'] = pd.to_datetime(test_df['Time'])

    val_records = []
    test_records = []

    for uid, group in test_df.groupby('Uid'):
        group = group.sort_values('Time')
        pids = group['Pid'].tolist()
        times = group['Time'].tolist()
        n = len(pids)

        if n < window_size:
            if n > 2:
                test_records.append(_eval_record(uid, pids, times))
                # Validation: same history with the final (test) event removed.
                val_records.append(_eval_record(uid, pids[:-1], times[:-1]))
            continue

        if n >= window_size + 1:
            # Window of `window_size` events ending at the second-to-last event.
            val_start = n - window_size - 1
            val_end = val_start + window_size
            val_records.append(_eval_record(uid, pids[val_start:val_end], times[val_start:val_end]))

        test_start = n - window_size
        test_records.append(_eval_record(uid, pids[test_start:], times[test_start:]))

    return _eval_frame(val_records), _eval_frame(test_records)



import os  # needed below to create the output directory

dataset = "NYC"

# Sequence-generation settings used for every split.
WINDOW_SIZE = 50   # events per sample, target included
STEP_SIZE = 10     # stride between training windows
MASK_PROB = 0.1    # chance of swapping the target with a random input event

train = pd.read_csv(f"datasets/{dataset}/train_data.csv")
test = pd.read_csv(f"datasets/{dataset}/test_data.csv")
all_data = pd.read_csv(f"datasets/{dataset}/data.csv")

train = generate_train_sequences(train, WINDOW_SIZE, STEP_SIZE, MASK_PROB)
val, test = generate_test_sequences(test, WINDOW_SIZE)

# Same procedure on the complete data set; only its test part is saved.
val_all, test_all = generate_test_sequences(all_data, WINDOW_SIZE)

# Nothing earlier in the notebook creates the "data" sub-directory, and
# to_csv does not create missing directories.
os.makedirs(f"datasets/{dataset}/data", exist_ok=True)

test_all.to_csv(f"datasets/{dataset}/data/test_all.csv", index=False)

train.to_csv(f"datasets/{dataset}/data/train.csv", index=False)
val.to_csv(f"datasets/{dataset}/data/val.csv", index=False)
test.to_csv(f"datasets/{dataset}/data/test.csv", index=False)


In [None]:
import pandas as pd

dataset = "NYC"
file_name = f"datasets/{dataset}/data.csv"

# Only the user, POI and timestamp columns are kept for these statistics.
df = pd.read_csv(file_name).loc[:, ['Uid', 'Pid', 'Time']]

# Number of check-in rows per user.
user_history_length = df.groupby('Uid').size()

# Mean number of check-ins per user.
average_history_length = user_history_length.mean()
print(average_history_length)

# How many users share each history length, and the length shared by the most users.
history_length_counts = user_history_length.value_counts()
most_frequent_length = history_length_counts.idxmax()
print(most_frequent_length)