In [None]:
import pandas as pd
from datetime import timedelta
import random
import os
import numpy as np

# How many of the most frequently visited grids count as "frequent".
# Same value as the default of the ``N_top_frequent`` parameters used below.
N_TOP_FREQUENT = 3

# Gap thresholds, listed from the largest to the smallest, that decide which
# wording bucket (months / weeks / days / same day) a time gap falls into.
TIME_LIMITS = dict(
    month=timedelta(days=30),
    week=timedelta(weeks=1),
    day=timedelta(hours=24),
)

def get_fuzzy_time_expression(time_delta, target_time):
    """Return a loose, human-style phrase for how far away ``target_time`` is.

    The wording bucket is chosen by comparing ``time_delta`` against the
    module-level ``TIME_LIMITS`` thresholds, from the largest downwards.

    Args:
        time_delta: Gap until the target moment. Must be comparable with
            ``datetime.timedelta`` and expose ``.days``.
        target_time: The moment being described. Must provide ``weekday()``,
            ``date()`` and ``hour`` (e.g. ``datetime`` or ``pandas.Timestamp``).

    Returns:
        str: One of
            * ``"in the next month"`` / ``"in about N months"``
            * ``"in about N week(s), specifically next <Weekday>"``
            * ``"in about N day(s), on YYYY-MM-DD"``
            * ``"later today, <part of day>"``
    """
    if time_delta > TIME_LIMITS['month']:
        months = round(time_delta.days / 30)
        return f"in about {months} months" if months > 1 else "in the next month"

    if time_delta > TIME_LIMITS['week']:
        weeks = round(time_delta.days / 7)
        # Gaps of 8-10 days round to exactly one week; use the singular noun
        # instead of producing "1 weeks".
        week_unit = "week" if weeks == 1 else "weeks"
        # English names are spelled out so the text does not depend on locale.
        weekday_names = (
            "Monday", "Tuesday", "Wednesday", "Thursday",
            "Friday", "Saturday", "Sunday",
        )
        weekday = weekday_names[target_time.weekday()]
        return f"in about {weeks} {week_unit}, specifically next {weekday}"

    if time_delta > TIME_LIMITS['day']:
        days = time_delta.days
        # A gap between one and two days has ``.days == 1``; avoid "1 days".
        day_unit = "day" if days == 1 else "days"
        return f"in about {days} {day_unit}, on {target_time.date().strftime('%Y-%m-%d')}"

    # Gap of at most one day: describe only the part of the day.
    # NOTE(review): "later today" is emitted even when the target falls on the
    # next calendar day (e.g. a gap that crosses midnight) - confirm intended.
    hour = target_time.hour
    if 5 <= hour < 12:
        period = "in the morning"
    elif 12 <= hour < 18:
        period = "in the afternoon"
    elif 18 <= hour < 22:
        period = "in the evening"
    else:
        period = "late at night"
    return f"later today, {period}"

def generate_context_for_df(df_input: pd.DataFrame, N_top_frequent: int = 3) -> pd.DataFrame:
    """Attach randomly generated "future move" sentences to a stay table.

    Works on a copy of ``df_input``. ``stime`` / ``etime`` are parsed with
    ``pd.to_datetime`` and two columns, ``context_fuzzy`` and
    ``context_precise``, are added (initialised to ``None``).

    Rows whose ``grid`` is not among the ``N_top_frequent`` most common grids
    are the possible *targets*. Every index label except the last one is a
    possible *source*. The loop repeatedly draws a random source, then a
    random target whose index label is greater than the source's, and writes
    on the source row a sentence announcing the move to the target's grid.
    Each source and each target is used at most once; sources with no later
    target are discarded.

    ``context_precise`` always carries the exact target start time.
    ``context_fuzzy`` uses a vague phrase from ``get_fuzzy_time_expression``
    when ``random.random()`` is at least 0.1 / 0.3 / 0.5 / 0.7 for gaps above
    a month / a week / a day / otherwise, and equals the precise sentence in
    the remaining cases.

    Args:
        df_input: Stay records with columns ``grid``, ``stime``, ``etime``
            and ``userID``.
        N_top_frequent: Number of most frequent grids excluded from targets.

    Returns:
        pd.DataFrame: The annotated copy. It is returned without any
        sentences when the frame is empty or has fewer than
        ``N_top_frequent`` rows.
    """
    result = df_input.copy()

    for time_col in ('stime', 'etime'):
        result[time_col] = pd.to_datetime(result[time_col])
    for context_col in ('context_fuzzy', 'context_precise'):
        result[context_col] = None

    if result.empty or len(result) < N_top_frequent:
        return result

    grid_counts = result['grid'].value_counts()
    frequent_grids = grid_counts.nlargest(N_top_frequent).index.tolist()
    print(frequent_grids)

    is_frequent = result['grid'].isin(frequent_grids)
    pending_targets = result.index[~is_frequent.to_numpy()].tolist()
    remaining_sources = result.index[:-1].tolist()

    # (TIME_LIMITS key, chance of keeping the precise wording), checked from
    # the largest threshold down; gaps of a day or less fall back to 0.7.
    precise_chances = (('month', 0.1), ('week', 0.3), ('day', 0.5))

    while pending_targets and remaining_sources:
        source_idx = random.choice(remaining_sources)

        # Only targets positioned after the source (by index label) qualify.
        later_targets = [idx for idx in pending_targets if idx > source_idx]
        if not later_targets:
            remaining_sources.remove(source_idx)
            continue

        target_idx = random.choice(later_targets)
        source_row = result.loc[source_idx]
        target_row = result.loc[target_idx]

        from_grid = source_row['grid']
        to_grid = target_row['grid']
        user = source_row['userID']
        arrival = target_row['stime']
        gap = arrival - source_row['etime']

        move_clause = f"User {user} will move from grid {from_grid} to grid {to_grid}, "
        arrival_stamp = arrival.strftime('%Y-%m-%d %H:%M:%S')
        context_precise = move_clause + f"at {arrival_stamp}."

        precise_chance = 0.7
        for limit_name, chance in precise_chances:
            if gap > TIME_LIMITS[limit_name]:
                precise_chance = chance
                break

        # Exactly one random draw per generated sentence.
        fuzzy_phrase = None
        if random.random() >= precise_chance:
            fuzzy_phrase = get_fuzzy_time_expression(gap, arrival)

        if fuzzy_phrase:
            context_fuzzy = move_clause + f"arriving around {fuzzy_phrase}."
        else:
            context_fuzzy = context_precise

        result.loc[source_idx, 'context_fuzzy'] = context_fuzzy
        result.loc[source_idx, 'context_precise'] = context_precise

        pending_targets.remove(target_idx)
        remaining_sources.remove(source_idx)

    return result

def process_data_directory(
    data_dir: str,
    individual_output_dir: str,
    combined_output_filepath: str,
    N_top_frequent: int = 3
):
    """Generate context columns for every CSV in a folder and save the results.

    Each ``*.csv`` file in ``data_dir`` is read with its first column as the
    index, passed through ``generate_context_for_df`` and written under the
    same file name into ``individual_output_dir``. All successfully processed
    frames are then concatenated and written to ``combined_output_filepath``.

    Files are visited in sorted name order so the combined output is
    reproducible. Files with fewer than two rows are skipped. A failure on one
    file is reported on stdout and does not stop the remaining files.

    Args:
        data_dir: Folder containing the per-user stay CSV files.
        individual_output_dir: Folder for the per-file outputs; created if
            it does not exist.
        combined_output_filepath: Path of the merged CSV; its parent folder
            is created if it does not exist.
        N_top_frequent: Forwarded to ``generate_context_for_df``.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    if not os.path.exists(data_dir):
        print(f"Error: The specified input folder path could not be found -> {data_dir}")
        return

    if not os.path.exists(individual_output_dir):
        os.makedirs(individual_output_dir, exist_ok=True)
        print(f"Create an output folder: {individual_output_dir}")

    all_users_data = []

    # os.listdir() returns entries in arbitrary order; sort for stable output.
    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith(".csv"):
            continue

        filepath = os.path.join(data_dir, filename)
        print(f"\n--- Processing file: {filename} ---")

        # Per-file boundary: any failure is reported and the loop moves on.
        try:
            df_raw = pd.read_csv(filepath, index_col=0)

            if df_raw.empty or len(df_raw) < 2:
                print("Skipping: The data is empty or there are not enough rows.")
                continue

            # The fallback must run BEFORE context generation, because
            # generate_context_for_df reads the 'userID' column itself and
            # would raise KeyError on files that lack it.
            if 'userID' not in df_raw.columns:
                print("Warning: 'userID' column is missing. Try to infer the UserID from the filename.")
                # Drop the extension first so "123.csv" yields "123".
                user_id = os.path.splitext(filename)[0].split('_')[0]
                df_raw.insert(0, 'userID', user_id)

            df_processed = generate_context_for_df(df_raw, N_top_frequent)

            output_filepath_single = os.path.join(individual_output_dir, filename)
            df_processed.to_csv(output_filepath_single, index=True)
            print(f" Save individual user data to: {output_filepath_single}")
            print(" Includes context columns: context_fuzzy and context_precise")

            all_users_data.append(df_processed)

        except Exception as e:
            print(f"An error occurred while processing file {filename}: {e}")
            continue

    if all_users_data:
        # Make sure the destination folder of the merged file exists.
        combined_dir = os.path.dirname(combined_output_filepath)
        if combined_dir:
            os.makedirs(combined_dir, exist_ok=True)

        df_all = pd.concat(all_users_data)
        df_all.to_csv(combined_output_filepath, index=True)
        print(f"\n All user data has been successfully merged and saved to:{combined_output_filepath}")
    else:
        print("\n No new CSV files were successfully processed and merged.")

if __name__ == "__main__":
    # Script entry point: read per-user stay CSVs, write one annotated CSV
    # per input file plus a single merged CSV. Paths are relative to the
    # current working directory.
    INPUT_FOLDER = "./Data/Output/Stays"
    INDIVIDUAL_OUTPUT_FOLDER = "./Data/Output/Context"
    COMBINED_OUTPUT_FILE = "./Data/Output/all_users_context_combined.csv"

    process_data_directory(
        data_dir=INPUT_FOLDER,
        individual_output_dir=INDIVIDUAL_OUTPUT_FOLDER,
        combined_output_filepath=COMBINED_OUTPUT_FILE,
        # Pass the module constant explicitly so editing N_TOP_FREQUENT
        # actually takes effect instead of relying on the parameter default.
        N_top_frequent=N_TOP_FREQUENT,
    )

In [None]:
# Load the train and test stay tables (paths relative to the working
# directory). No index column is specified, so pandas assigns a default index.
train_df = pd.read_csv(filepath_or_buffer="./Data/train_all_stay.csv")
test_df = pd.read_csv(filepath_or_buffer="./Data/test_all_stay.csv")

