In [None]:
import pandas as pd
import numpy as np

In [None]:
# Root directory under which this notebook looks for its input table.
output_path = '/data'
# Load the stay table from the parquet dataset at "<output_path>/stay".
# NOTE(review): the cells below read 'user_id', 'visit_freq', 'home_dist',
# 'work_dist', 'stay_gid10' and the amenity columns from frames derived from
# this one — confirm the stored schema provides them.
stay_locations = pd.read_parquet(f"{output_path}/stay")

## Calculating K-visitation

In [None]:
def calculate_k_places(places_df, amenity_list, smallest_values,
                       sort_column='home_dist', ascending=True,
                       k_type='k_dist', min_places_per_user=1):
    """
    Flag each user's K-places with a greedy, rank-ordered accumulation.

    Rows are sorted by ``user_id`` and then by ``sort_column``. For every user
    the amenity columns are summed cumulatively in that order until every
    amenity total reaches its required minimum.

    Scenarios:
    1. Complete: requirements are satisfied at some row - that row and all
       rows ranked before it are flagged 1, later rows stay 0.
    2. Incomplete: requirements are never satisfied - ALL of the user's rows
       are flagged 1.

    Parameters:
    -----------
    places_df : DataFrame
        Places dataframe with ``user_id``, ``sort_column`` and the amenity
        columns. It is not modified; a sorted copy is returned.
    amenity_list : list
        List of amenity column names. Missing values in these columns are
        treated as 0.
    smallest_values : Series or array
        Minimum required value for each amenity, matched to ``amenity_list``
        by position (not by label).
    sort_column : str
        Column to sort by ('home_dist' for K-dist, 'visit_freq' for K-freq)
    ascending : bool
        Sort order (True for distance, False for frequency)
    k_type : str
        Name of the indicator column; the status column is ``{k_type}_status``.
    min_places_per_user : int
        Minimum number of places to flag for a user whose requirements are
        met. With the default of 1 this never changes the greedy result,
        because a complete user always has at least one flagged place. Users
        with fewer rows than this simply get all of their rows flagged.

    Returns:
    --------
    DataFrame : Sorted copy of ``places_df`` (fresh RangeIndex, amenity NaNs
        filled with 0) with two added columns: ``{k_type}`` (int8, 0/1) and
        ``{k_type}_status`` ('complete' / 'incomplete'; rows whose ``user_id``
        is missing are skipped by the grouping and keep 'unassigned' with
        indicator 0).
    """
    # tqdm only drives the progress bar. The notebook never imports it at the
    # top, so resolve it here and degrade to a plain iterator when it is not
    # installed instead of failing with a NameError.
    try:
        from tqdm.auto import tqdm
    except ImportError:
        def tqdm(iterable, **_kwargs):
            return iterable

    print(f"Calculating {k_type} places (sorted by {sort_column})...")

    # Sort data: users grouped together, places ranked inside each user
    fvp_sorted = places_df.sort_values(
        by=['user_id', sort_column],
        ascending=[True, ascending]
    ).reset_index(drop=True)

    # Fill missing amenity values
    fvp_sorted[amenity_list] = fvp_sorted[amenity_list].fillna(0)

    # Work in float64 throughout: an integer accumulator cannot absorb float
    # amenity columns in place (NumPy refuses the float -> int cast).
    thresholds = np.asarray(smallest_values, dtype=float)
    amenity_counts = fvp_sorted[amenity_list].to_numpy(dtype=float)

    min_selected = 1 if min_places_per_user is None else int(min_places_per_user)

    # Initialize result arrays
    n_rows = len(fvp_sorted)
    k_indicator = np.zeros(n_rows, dtype=np.int8)
    k_status = np.full(n_rows, 'unassigned', dtype=object)

    # Group by user for processing
    user_groups = fvp_sorted.groupby('user_id')
    total_users = len(user_groups)

    print(f"Processing {total_users} users...")

    for _, user_data in tqdm(user_groups, desc=f"Processing {k_type}", unit="users"):
        # After reset_index the index labels are also row positions.
        rows = user_data.index.to_numpy()

        # Running amenity totals in rank order, one row of totals per place
        running_totals = np.cumsum(amenity_counts[rows], axis=0)
        satisfied = np.all(running_totals >= thresholds, axis=1)

        if satisfied.any():
            # SCENARIO 1: COMPLETE - flag everything up to the first row at
            # which all requirements hold (argmax returns the first True).
            n_selected = max(int(np.argmax(satisfied)) + 1, min_selected)
            k_indicator[rows[:n_selected]] = 1
            k_status[rows] = 'complete'
        else:
            # SCENARIO 2: INCOMPLETE - flag ALL of this user's places
            k_indicator[rows] = 1
            k_status[rows] = 'incomplete'

    # Add results to dataframe
    status_col = f'{k_type}_status'
    fvp_sorted[f'{k_type}'] = k_indicator
    fvp_sorted[status_col] = k_status

    # Summary statistics (status is constant within a user, so first() is safe)
    total_k_places = k_indicator.sum()
    per_user_status = fvp_sorted.groupby('user_id')[status_col].first()
    users_complete = (per_user_status == 'complete').sum()
    users_incomplete = (per_user_status == 'incomplete').sum()

    print(f"{k_type} calculation complete!")
    print(f"Total {k_type} places identified: {total_k_places}")
    print(f"Users with complete coverage: {users_complete}")
    print(f"Users with incomplete coverage: {users_incomplete}")

    return fvp_sorted

In [None]:
# Wrapper function for calculating both K-dist and K-freq
def calculate_both_k_places(places_df, amenity_list, smallest_values):
    """
    Run the K-dist pass and then the K-freq pass over the same places.

    The first pass ranks places by 'home_dist' ascending and writes the
    'k_dist' columns; its output is fed to the second pass, which ranks by
    'visit_freq' descending and writes the 'k_freq' columns. The frame
    returned by the second pass is the result.
    """
    print("üîÑ CALCULATING K-PLACES (CORRECTED)")
    print("=" * 50)

    # Arguments that are identical for both passes
    common_args = dict(
        amenity_list=amenity_list,
        smallest_values=smallest_values,
        min_places_per_user=1,
    )

    # (sort column, ascending?, output column prefix), in execution order
    ranking_passes = (
        ('home_dist', True, 'k_dist'),
        ('visit_freq', False, 'k_freq'),
    )

    current = places_df
    for pass_number, (rank_column, rank_ascending, label) in enumerate(ranking_passes):
        if pass_number > 0:
            # Visual separator between the two passes' log output
            print("\n" + "-" * 50)
        current = calculate_k_places(
            places_df=current,
            sort_column=rank_column,
            ascending=rank_ascending,
            k_type=label,
            **common_args,
        )

    return current

In [None]:
# Amenity categories whose columns are accumulated by the K-place calculation
amenity_list = [
    'CIVIC_RELIGION',
    'CULTURE',
    'DINING',
    'EDUCATION',
    'FITNESS',
    'GROCERIES',
    'HEALTHCARE',
    'RETAIL',
    'SERVICE',
    'TRANSPORT',
]

# Required minimum per amenity: one of each, in amenity_list order
smallest_values = np.full(len(amenity_list), 1, dtype=int)

# Drop single-visit locations (keep visit_freq > 1) before running the calculation
stay_1plus = stay_locations.loc[stay_locations['visit_freq'] > 1].copy()

places_k = calculate_both_k_places(stay_1plus, amenity_list, smallest_values)

## K-visitations excluding work

In [None]:
def _count_users_by_status(places, status_col):
    """Return (complete, incomplete, excluded) user counts for one status column.

    A user counts as excluded only when every one of their rows is
    'excluded'; otherwise the status of their calculated rows is used.
    """
    user_keys = places['user_id'].to_numpy()
    status = places[status_col]
    is_excluded = status.eq('excluded')
    fully_excluded = is_excluded.groupby(user_keys).all()
    # Blank out excluded rows so first() (which skips nulls) reports the
    # status of the rows that actually took part in the calculation.
    calculated_status = status.mask(is_excluded).groupby(user_keys).first()
    return (
        (calculated_status == 'complete').sum(),
        (calculated_status == 'incomplete').sum(),
        fully_excluded.sum(),
    )


def calculate_k_places_exclude_work(places_df, amenity_list, smallest_values, work_distance_threshold=200):
    """
    Calculate K-places excluding places close to work locations

    This function:
    1. Excludes places whose 'work_dist' is at or below
       work_distance_threshold (rows with missing 'work_dist' are kept)
    2. Calculates K-dist and K-freq on the remaining places
    3. Writes the results back onto every input row with suffix '_nw'
       (no work); excluded rows get indicator 0 and status 'excluded'

    Results are mapped back by row position, so the input does not need a
    unique (user_id, stay_gid10) key and rows are never duplicated.

    Parameters:
    -----------
    places_df : DataFrame
        Places dataframe with user_id, work_dist, home_dist, visit_freq and
        the amenity columns. It is not modified.
    amenity_list : list
        List of amenity column names
    smallest_values : Series or array
        Minimum required values for each amenity
    work_distance_threshold : int
        Distance threshold used to exclude work-related places, in the same
        unit as 'work_dist' (the log messages label it as meters)

    Returns:
    --------
    DataFrame : Copy of the input in its original row order, with a fresh
        RangeIndex and the columns 'k_dist_nw', 'k_dist_nw_status',
        'k_freq_nw', 'k_freq_nw_status'. These columns are present even when
        no place survives the exclusion.
    """

    print(f"üîÑ CALCULATING K-PLACES EXCLUDING WORK (within {work_distance_threshold}m)")
    print("=" * 70)

    # New frame with a RangeIndex, so row positions double as row labels
    places_result = places_df.reset_index(drop=True)

    # Keep places that are either:
    # 1. Far from work (> threshold distance), OR
    # 2. Have missing work distance (no work location identified)
    work_dist = places_result['work_dist']
    keep_mask = ((work_dist > work_distance_threshold) | work_dist.isna()).to_numpy()
    kept_rows = np.flatnonzero(keep_mask)

    total_places_before = len(places_result)
    total_places_after = len(kept_rows)
    excluded_places = total_places_before - total_places_after
    # Guard the percentage against an empty input frame
    exclusion_rate = excluded_places / total_places_before * 100 if total_places_before else 0.0

    print(f"Places before work exclusion: {total_places_before}")
    print(f"Places after work exclusion: {total_places_after}")
    print(f"Places excluded (within {work_distance_threshold}m of work): {excluded_places}")
    print(f"Exclusion rate: {exclusion_rate:.1f}%")

    # Check if users still have places after work exclusion
    users_before = places_result['user_id'].nunique()
    users_after = places_result['user_id'].iloc[kept_rows].nunique()

    print(f"Users before work exclusion: {users_before}")
    print(f"Users after work exclusion: {users_after}")
    print(f"Users lost (no non-work places): {users_before - users_after}")

    # Every row starts as excluded; calculated rows are overwritten below.
    k_columns = ('k_dist_nw', 'k_freq_nw')
    for k_col in k_columns:
        places_result[k_col] = np.zeros(total_places_before, dtype=np.int64)
        places_result[f'{k_col}_status'] = np.full(total_places_before, 'excluded', dtype=object)

    if total_places_after == 0:
        print("‚ùå No places remain after work exclusion!")
        return places_result

    # Carry each kept row's position through the two sorting passes so the
    # results can be written back without relying on any key column.
    source_row_col = '__k_nw_source_row__'
    places_non_work = (
        places_result.iloc[kept_rows]
        .reset_index(drop=True)
        .assign(**{source_row_col: kept_rows})
    )

    # Calculate K-dist on non-work places (sorted by distance from home, ascending)
    print(f"\nüìç Calculating K-dist (no work) on {len(places_non_work)} places...")
    places_kdist_nw = calculate_k_places(
        places_df=places_non_work,
        amenity_list=amenity_list,
        smallest_values=smallest_values,
        sort_column='home_dist',
        ascending=True,
        k_type='k_dist_nw',
        min_places_per_user=1
    )

    print("\n" + "-" * 50)

    # Calculate K-freq on non-work places (sorted by frequency, descending)
    print(f"\nüîÑ Calculating K-freq (no work) on {len(places_non_work)} places...")
    places_both_nw = calculate_k_places(
        places_df=places_kdist_nw,
        amenity_list=amenity_list,
        smallest_values=smallest_values,
        sort_column='visit_freq',
        ascending=False,
        k_type='k_freq_nw',
        min_places_per_user=1
    )

    # Write results back onto the original rows
    print(f"\nüîó Merging results back to original dataset...")

    source_rows = places_both_nw[source_row_col].to_numpy()
    for k_col in k_columns:
        for column in (k_col, f'{k_col}_status'):
            values = places_result[column].to_numpy().copy()
            values[source_rows] = places_both_nw[column].to_numpy()
            places_result[column] = values

    # Summary statistics
    print(f"\nüìä SUMMARY STATISTICS (NO WORK):")
    print("=" * 40)

    for label, k_col in (('K-dist', 'k_dist_nw'), ('K-freq', 'k_freq_nw')):
        k_places = (places_result[k_col] == 1).sum()
        n_complete, n_incomplete, n_excluded = _count_users_by_status(
            places_result, f'{k_col}_status'
        )

        print(f"{label} (no work) places: {k_places}")
        print(f"Users complete: {n_complete}")
        print(f"Users incomplete: {n_incomplete}")
        print(f"Users excluded: {n_excluded}")

    return places_result


# Run the no-work calculation with a 200 m work-distance threshold
print("Running K-places calculation excluding work areas...")
places_k_with_nw = calculate_k_places_exclude_work(
    places_df=places_k,
    amenity_list=amenity_list,
    smallest_values=smallest_values,
    work_distance_threshold=200,
)

In [None]:
# Function to compare all K-place variants
def compare_k_place_variants(places_df):
    """
    Compare statistics across all K-place calculation variants

    For each variant whose indicator and status columns are both present,
    the function reports the number of flagged places, the number of users
    per status and the mean number of flagged places per user. Variants with
    a missing column are reported and skipped.

    A user is counted as excluded only when all of their rows carry the
    status 'excluded'; otherwise the status of their non-excluded rows
    decides between complete and incomplete.

    Parameters:
    -----------
    places_df : DataFrame
        Places dataframe with 'user_id' and any of the 'k_dist', 'k_freq',
        'k_dist_nw', 'k_freq_nw' indicator/status column pairs.

    Returns:
    --------
    DataFrame : One row per available variant with the columns 'Variant',
        'Total K-places', 'Users Complete', 'Users Incomplete',
        'Users Excluded', 'Total Users', 'Avg K-places/User'. Empty when no
        variant is available.
    """

    print("üìà COMPARISON OF K-PLACE VARIANTS")
    print("=" * 60)

    variants = {
        'K-dist (all)': ('k_dist', 'k_dist_status'),
        'K-freq (all)': ('k_freq', 'k_freq_status'),
        'K-dist (no work)': ('k_dist_nw', 'k_dist_nw_status'),
        'K-freq (no work)': ('k_freq_nw', 'k_freq_nw_status')
    }

    user_keys = places_df['user_id'].to_numpy()
    comparison_stats = []

    for variant_name, (k_col, status_col) in variants.items():
        # Both columns are read below, so both must exist
        if k_col not in places_df.columns or status_col not in places_df.columns:
            print(f"‚ö†Ô∏è  {variant_name}: Columns not found")
            continue

        # Calculate statistics
        total_k_places = (places_df[k_col] == 1).sum()

        status = places_df[status_col]
        is_excluded = status.eq('excluded')
        # Excluded only if no row of the user took part in the calculation
        fully_excluded = is_excluded.groupby(user_keys).all()
        # first() skips nulls, so masking excluded rows yields the status of
        # the user's calculated rows regardless of row order
        user_stats = status.mask(is_excluded).groupby(user_keys).first()

        users_complete = (user_stats == 'complete').sum()
        users_incomplete = (user_stats == 'incomplete').sum()
        users_excluded = fully_excluded.sum()
        total_users = len(fully_excluded)

        avg_k_places_per_user = places_df.groupby('user_id')[k_col].sum().mean()

        comparison_stats.append({
            'Variant': variant_name,
            'Total K-places': total_k_places,
            'Users Complete': users_complete,
            'Users Incomplete': users_incomplete,
            'Users Excluded': users_excluded,
            'Total Users': total_users,
            'Avg K-places/User': avg_k_places_per_user
        })

    # Display comparison table
    comparison_df = pd.DataFrame(comparison_stats)

    print("\nComparison Table:")
    print("-" * 80)
    print(f"{'Variant':<20} {'K-places':<10} {'Complete':<10} {'Incomplete':<12} {'Excluded':<10} {'Avg/User':<10}")
    print("-" * 80)

    for row in comparison_stats:
        print(f"{row['Variant']:<20} {row['Total K-places']:<10} {row['Users Complete']:<10} "
              f"{row['Users Incomplete']:<12} {row['Users Excluded']:<10} {row['Avg K-places/User']:<10.1f}")

    return comparison_df

In [None]:
# Build the cross-variant comparison table
print("\n" + 80 * "=")
comparison_results = compare_k_place_variants(places_k_with_nw)

# Print the per-place results of the first three users as a spot check
print("\nüìã SAMPLE RESULTS:")
sample_cols = [
    'user_id', 'stay_gid10', 'visit_freq', 'home_dist', 'work_dist',
    'k_dist', 'k_dist_status',
    'k_freq', 'k_freq_status',
    'k_dist_nw', 'k_dist_nw_status',
    'k_freq_nw', 'k_freq_nw_status',
]

sample_users = places_k_with_nw['user_id'].unique()[:3]
for user_id in sample_users:
    print(f"\nUser: {user_id}")
    # Row filter and column selection in one .loc instead of chained indexing
    user_sample = places_k_with_nw.loc[places_k_with_nw['user_id'] == user_id, sample_cols]
    print(user_sample.to_string(index=False))