<h3><strong>STEP 1: Load Data from Folders</strong></h3>

In [1]:
import pandas as pd
import os
import glob

def load_geo_data(base_path):
    """
    Load geolocation data from all fold directories
    """
    # List all the fold directories
    fold_dirs = [f for f in os.listdir(base_path) if f.startswith('fold=')]
    print("Found folds:", fold_dirs)
    
    # Load and combine all data, adding a 'fold' column
    all_data_frames = []
    
    for fold_dir in fold_dirs:
        fold_number = int(fold_dir.split('=')[1])  # Extract the fold integer
        fold_path = os.path.join(base_path, fold_dir)
        
        # Find all parquet files with the specific naming pattern in this fold directory
        parquet_pattern = os.path.join(fold_path, "part-*.snappy.parquet")
        parquet_files = glob.glob(parquet_pattern)
        print(f"Found {len(parquet_files)} parquet files in {fold_dir}")
        
        # Load all parquet files for this fold
        fold_dfs = []
        for file_path in parquet_files:
            df = pd.read_parquet(file_path)
            fold_dfs.append(df)
        
        # Combine all files for this fold
        if fold_dfs:
            fold_df = pd.concat(fold_dfs, ignore_index=True)
            fold_df['fold'] = fold_number
            all_data_frames.append(fold_df)
    
    # Combine all folds into a single DataFrame
    if all_data_frames:
        geo_df = pd.concat(all_data_frames, ignore_index=True)
        print(f"Full dataset shape: {geo_df.shape}")
        print(f"Unique clients per fold: {geo_df.groupby('fold')['client_id'].nunique()}")
        
        # Display basic info about the loaded data
        print("\nFirst few rows of the combined data:")
        print(geo_df.head())
        print("\nColumn names:")
        print(geo_df.columns.tolist())
        
        return geo_df
    else:
        print("No data found. Please check the file paths.")
        return None

# Execute Step 1
base_path = r"/home/opc/datasets/detail/detail/detail/geo"
geo_df = load_geo_data(base_path)

# Save the raw loaded data
if geo_df is not None:
    geo_df.to_pickle("geo_data_raw.pkl")
    print("Raw data saved to geo_data_raw.pkl")

Found folds: ['fold=0', 'fold=1', 'fold=2', 'fold=3', 'fold=4']
Found 92 parquet files in fold=0
Found 92 parquet files in fold=1
Found 92 parquet files in fold=2
Found 92 parquet files in fold=3
Found 92 parquet files in fold=4
Full dataset shape: (66295724, 6)
Unique clients per fold: fold
0    14863
1    14409
2    14396
3    14380
4    14525
Name: client_id, dtype: int64

First few rows of the combined data:
                                           client_id  \
0  397d52b11ce72699a5482e5397f63a03af1636c85e9e61...   
1  397d52b11ce72699a5482e5397f63a03af1636c85e9e61...   
2  397d52b11ce72699a5482e5397f63a03af1636c85e9e61...   
3  397d52b11ce72699a5482e5397f63a03af1636c85e9e61...   
4  397d52b11ce72699a5482e5397f63a03af1636c85e9e61...   

                  event_time  geohash_4  geohash_5  geohash_6  fold  
0 2022-03-25 13:39:51.569288      41342     406579    1018528     0  
1 2022-06-11 07:21:32.505485      41342     406579    1018528     0  
2 2022-08-08 21:25:07.461323      413

<h3><strong>STEP 2: Preprocess Data (Explode Arrays and Convert Timestamps)</strong></h3>

In [4]:
import numpy as np
import pandas as pd
from pandas.api.types import is_list_like

# 1) choose a row that has multiple events if one exists
def _is_multi_event(x):
    return is_list_like(x) and not isinstance(x, (str, bytes)) and len(x) > 1

row_idx = 0
multi = geo_df[geo_df["event_time"].apply(_is_multi_event)]
if not multi.empty:
    row_idx = int(multi.index[0])  # use the first multi event row

# 2) run your existing function on that row
test_result = test_corrected_explosion(geo_df, row_index=row_idx)

# 3) only try to decode plausible geohashes, compute the right denominator
alphabet = set("0123456789bcdefghjkmnpqrstuvwxyz")
def looks_like_geohash(v):
    if pd.isna(v):
        return False
    s = str(v).strip().lower()
    if len(s) == 0 or len(s) > 12:
        return False
    return set(s).issubset(alphabet)

# pick up to 20 candidates from the exploded rows
cands = test_result["geohash_6"].head(20)
valid_mask = cands.apply(looks_like_geohash)
subset = cands[valid_mask]

print(f"\nDecoding up to {len(cands)} values, {len(subset)} look like valid base32 geohash strings")

# robust geohash import
try:
    import geohash as gh
except Exception:
    try:
        import Geohash as gh
    except Exception:
        import pygeohash as gh

def decode_geohash_value(val):
    try:
        s = str(val).strip().lower()
        lat, lon = gh.decode(s)
        return lat, lon
    except Exception as e:
        return (np.nan, np.nan)

success = 0
total = max(1, len(subset))  # avoid zero division
for i, val in enumerate(subset):
    latlon = decode_geohash_value(val)
    ok = not pd.isna(latlon[0])
    success += int(ok)
    print(f"geohash_6[{i}]: {val} -> {latlon} {'OK' if ok else 'FAIL'}")

print(f"\nSuccessful decodes: {success}/{total}")

Testing corrected explosion on first row...
Original row shape: (1, 6)
Number of events in this client: 1
Exploded row shape: (1, 6)

Decoding up to 1 values, 1 look like valid base32 geohash strings
geohash_6[0]: 1018528 -> (-89.99656677246094, -132.7471160888672) OK

Successful decodes: 1/1


In [5]:
import pandas as pd
import numpy as np
import os
import glob
from pandas.api.types import is_list_like

# paths, set to where your data lives
INPUT_PICKLE = "/home/opc/datasets/detail/geo_data_raw.pkl"
OUTPUT_DIR   = "/home/opc/datasets/detail/processing_batches"
COMBINED_OUT = "/home/opc/datasets/detail/geo_data_preprocessed.pkl"

def _to_dt_list(x):
    # accepts scalar unix seconds, Timestamp, or list of either
    if isinstance(x, pd.Timestamp):
        return [x]
    if not is_list_like(x) or isinstance(x, (str, bytes)):
        return [pd.to_datetime(x, unit="s", errors="coerce")]
    return [ts if isinstance(ts, pd.Timestamp) else pd.to_datetime(ts, unit="s", errors="coerce") for ts in x]

def _to_list(x):
    # ensures list for geohash columns
    if x is None or (isinstance(x, float) and pd.isna(x)):
        return []
    if is_list_like(x) and not isinstance(x, (str, bytes)):
        return list(x)
    return [x]

# Explode all nested arrays for the entire dataset
def preprocess_geo_data_corrected(geo_df, batch_size=2500, output_dir=OUTPUT_DIR):
    os.makedirs(output_dir, exist_ok=True)

    total_rows = len(geo_df)
    num_batches = (total_rows + batch_size - 1) // batch_size
    print(f"Processing {total_rows} rows in {num_batches} batches of size {batch_size}")

    all_batch_files = []

    for batch_num in range(num_batches):
        print(f"Processing batch {batch_num+1}/{num_batches}")
        start_idx = batch_num * batch_size
        end_idx = min((batch_num + 1) * batch_size, total_rows)
        batch_df = geo_df.iloc[start_idx:end_idx].copy()

        # convert event_time safely, only on this batch
        batch_df["event_time"] = batch_df["event_time"].apply(_to_dt_list)

        # ensure geohash columns are list like for this batch
        for col in ["geohash_4", "geohash_5", "geohash_6"]:
            if col in batch_df.columns:
                batch_df[col] = batch_df[col].apply(_to_list)

        # explode using a simple Python loop per row, ok in batches
        exploded_rows = []
        for _, row in batch_df.iterrows():
            n_events = len(row["event_time"])
            for i in range(n_events):
                exploded_rows.append({
                    "client_id": row["client_id"],
                    "event_time": row["event_time"][i],
                    "geohash_4": row["geohash_4"][i] if i < len(row["geohash_4"]) else None,
                    "geohash_5": row["geohash_5"][i] if i < len(row["geohash_5"]) else None,
                    "geohash_6": row["geohash_6"][i] if i < len(row["geohash_6"]) else None,
                    "fold": row["fold"],
                })

        batch_exploded = pd.DataFrame(exploded_rows)

        batch_filename = os.path.join(output_dir, f"batch_{batch_num+1}_of_{num_batches}.pkl")
        batch_exploded.to_pickle(batch_filename)
        all_batch_files.append(batch_filename)

        print(f"Batch {batch_num+1} saved to {batch_filename}")
        print(f"  Exploded from {len(batch_df)} to {len(batch_exploded)} rows")

        del batch_df, batch_exploded, exploded_rows

    return all_batch_files

def combine_batches(batch_files, output_file=COMBINED_OUT):
    print("Combining all batches...")
    all_batches = [pd.read_pickle(bf) for bf in batch_files]
    geo_df_exploded = pd.concat(all_batches, ignore_index=True)
    print(f"Combined dataset shape: {geo_df_exploded.shape}")
    geo_df_exploded.to_pickle(output_file)
    print(f"Combined data saved to {output_file}")
    return geo_df_exploded

# load if needed
try:
    geo_df
except NameError:
    geo_df = pd.read_pickle(INPUT_PICKLE)

# tune batch size if RAM is tight
batch_files = preprocess_geo_data_corrected(geo_df, batch_size=2500, output_dir=OUTPUT_DIR)
geo_df_exploded = combine_batches(batch_files, output_file=COMBINED_OUT)
print("Step 2 completed successfully!")

Processing 66295724 rows in 26519 batches of size 2500
Processing batch 1/26519
Batch 1 saved to /home/opc/datasets/detail/processing_batches/batch_1_of_26519.pkl
  Exploded from 2500 to 2500 rows
Processing batch 2/26519
Batch 2 saved to /home/opc/datasets/detail/processing_batches/batch_2_of_26519.pkl
  Exploded from 2500 to 2500 rows
Processing batch 3/26519
Batch 3 saved to /home/opc/datasets/detail/processing_batches/batch_3_of_26519.pkl
  Exploded from 2500 to 2500 rows
Processing batch 4/26519
Batch 4 saved to /home/opc/datasets/detail/processing_batches/batch_4_of_26519.pkl
  Exploded from 2500 to 2500 rows
Processing batch 5/26519
Batch 5 saved to /home/opc/datasets/detail/processing_batches/batch_5_of_26519.pkl
  Exploded from 2500 to 2500 rows
Processing batch 6/26519
Batch 6 saved to /home/opc/datasets/detail/processing_batches/batch_6_of_26519.pkl
  Exploded from 2500 to 2500 rows
Processing batch 7/26519
Batch 7 saved to /home/opc/datasets/detail/processing_batches/batch_

<h3><strong>STEP 3: Decode Geohashes and Prepare for Feature Engineering</strong></h3>

In [6]:
# Check the processed PKL file before Step 3
import pandas as pd
import numpy as np
import os

PROCESSED_PATH = "/home/opc/TAP/models/notebooks/geo_data_preprocessed.pkl"  # update if you saved elsewhere

def inspect_processed_data(file_path=PROCESSED_PATH):
    """
    Inspect the processed data to verify the explosion was done correctly
    """
    try:
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            return None

        geo_df_processed = pd.read_pickle(file_path)
        n = len(geo_df_processed)
        print(f"Successfully loaded processed data with {n} rows")

        if n == 0:
            print("Dataset is empty")
            return geo_df_processed

        print("\n=== DATASET INFO ===")
        print(geo_df_processed.info())

        print("\n=== FIRST 10 ROWS ===")
        print(geo_df_processed.head(10))

        print("\n=== GEOHASH COLUMNS CHECK ===")
        for col in ['geohash_4', 'geohash_5', 'geohash_6']:
            if col not in geo_df_processed.columns:
                print(f"{col}: column missing")
                continue
            # find a non null sample if possible
            sample_idx = geo_df_processed[col].first_valid_index()
            sample_value = geo_df_processed[col].iloc[0] if sample_idx is None else geo_df_processed.at[sample_idx, col]
            print(f"{col}: {type(sample_value)}, sample: {sample_value}")
            if isinstance(sample_value, (list, np.ndarray)):
                print(f"  WARNING: {col} still contains arrays")
            else:
                print(f"  OK: {col} contains individual values")

        print("\n=== DATA TYPES ===")
        print(geo_df_processed.dtypes)

        print("\n=== MISSING VALUES ===")
        print(geo_df_processed.isnull().sum())

        return geo_df_processed

    except Exception as e:
        print(f"Error loading file: {e}")
        return None

# Execute the inspection
processed_data = inspect_processed_data(PROCESSED_PATH)

# If the data still has arrays, we need to fix it before proceeding
if processed_data is not None and len(processed_data) > 0:
    has_arrays = False
    for col in ['geohash_4', 'geohash_5', 'geohash_6']:
        if col in processed_data.columns:
            sample_idx = processed_data[col].first_valid_index()
            if sample_idx is None:
                continue
            sample = processed_data.at[sample_idx, col]
            if isinstance(sample, (list, np.ndarray)):
                has_arrays = True
                print(f"\nERROR: {col} still contains arrays. Step 2 did not work correctly.")
    if has_arrays:
        print("The current data still has arrays in the geohash columns, cannot decode yet.")
    else:
        print("\nData looks good, proceed to Step 3.")

File not found: /home/opc/TAP/models/notebooks/geo_data_preprocessed.pkl


In [7]:
import pandas as pd
import numpy as np
import os
from tqdm.auto import tqdm
import pygeohash as gh  # pip install pygeohash


# paths, adjust if you saved elsewhere
PROCESSED_IN  = "/home/opc/TAP/models/notebooks/geo_data_preprocessed.pkl"
PROCESSED_OUT = "/home/opc/TAP/models/notebooks/geo_data_fully_processed.pkl"

alphabet = set("0123456789bcdefghjkmnpqrstuvwxyz")
def looks_like_geohash(v):
    if pd.isna(v):
        return False
    s = str(v).strip().lower()
    if len(s) == 0 or len(s) > 12:
        return False
    return set(s).issubset(alphabet)

def decode_one(val):
    if not looks_like_geohash(val):
        return (np.nan, np.nan)
    try:
        return gh.decode(str(val).strip())
    except Exception:
        return (np.nan, np.nan)

def decode_geohashes_batch(geo_df_exploded, batch_size=100_000, output_file=PROCESSED_OUT):
    print("STEP 3, decoding geohashes and preparing data")
    total_rows = len(geo_df_exploded)
    num_batches = (total_rows + batch_size - 1) // batch_size
    print(f"Processing {total_rows} rows in {num_batches} batches of size {batch_size}")

    processed_batches = []
    total_dropped = 0

    for batch_num in range(num_batches):
        print(f"Processing batch {batch_num + 1}/{num_batches}")
        start_idx = batch_num * batch_size
        end_idx = min((batch_num + 1) * batch_size, total_rows)
        batch_df = geo_df_exploded.iloc[start_idx:end_idx].copy()

        # single pass decode with progress bar
        tqdm.pandas(desc="Decoding geohash_6")
        coords = batch_df["geohash_6"].progress_apply(decode_one)
        batch_df["latitude"]  = coords.apply(lambda t: t[0])
        batch_df["longitude"] = coords.apply(lambda t: t[1])

        # drop invalid coordinates
        before = len(batch_df)
        batch_df = batch_df.dropna(subset=["latitude", "longitude"])
        dropped = before - len(batch_df)
        total_dropped += dropped
        if dropped > 0:
            pct = 100.0 * dropped / max(1, before)
            print(f"  Dropped {dropped} rows with invalid coordinates [{pct:.1f} percent]")

        processed_batches.append(batch_df)
        del batch_df, coords

    geo_df_processed = pd.concat(processed_batches, ignore_index=True)
    geo_df_processed = geo_df_processed.sort_values(["client_id", "event_time"])
    print(f"Final dataset shape: {geo_df_processed.shape}")
    print(f"Total dropped rows: {total_dropped} [{100.0 * total_dropped / max(1, total_rows):.1f} percent]")

    # save
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    geo_df_processed.to_pickle(output_file)
    print(f"Fully processed data saved to {output_file}")

    return geo_df_processed

def analyze_processed_data(geo_df_processed):
    print("\n=== DATA ANALYSIS ===")
    print(f"Total events: {len(geo_df_processed):,}")
    print(f"Unique clients: {geo_df_processed['client_id'].nunique():,}")
    min_time = geo_df_processed['event_time'].min()
    max_time = geo_df_processed['event_time'].max()
    print(f"Time range: {min_time} to {max_time}")
    print(f"Duration: {max_time - min_time}")
    events_per_client = geo_df_processed.groupby('client_id').size()
    print(f"Average events per client: {events_per_client.mean():.1f}")
    print(f"Min events per client: {events_per_client.min()}")
    print(f"Max events per client: {events_per_client.max()}")
    print(f"Latitude range: {geo_df_processed['latitude'].min():.4f} to {geo_df_processed['latitude'].max():.4f}")
    print(f"Longitude range: {geo_df_processed['longitude'].min():.4f} to {geo_df_processed['longitude'].max():.4f}")
    return events_per_client

# run
geo_df_exploded = pd.read_pickle(PROCESSED_IN)
print(f"Loaded preprocessed data with {len(geo_df_exploded):,} rows")

# tune batch_size smaller if memory is limited, for example 25_000
geo_df_processed = decode_geohashes_batch(geo_df_exploded, batch_size=100_000, output_file=PROCESSED_OUT)
events_per_client = analyze_processed_data(geo_df_processed)

print("\nSample of processed data:")
print(geo_df_processed[['client_id', 'event_time', 'geohash_6', 'latitude', 'longitude', 'fold']].head(10))
print("\nStep 3 completed successfully, ready for feature engineering in Step 4")

ModuleNotFoundError: No module named 'tqdm'

In [None]:
# Step 3 Results Inspection
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def inspect_step3_results(file_path="/home/opc/TAP/models/notebooks/geo_data_fully_processed.pkl", sample_size=10000):
    """
    Comprehensive inspection of Step 3 results
    """
    print("=== STEP 3 RESULTS INSPECTION ===")
    
    try:
        # Load the processed data
        geo_df_processed = pd.read_pickle(file_path)
        print(f"Loaded processed data with {len(geo_df_processed):,} rows")
        
        # 1. Basic Information
        print("\n1. BASIC DATASET INFO:")
        print(f"Shape: {geo_df_processed.shape}")
        print("\nColumns and data types:")
        print(geo_df_processed.dtypes)
        
        # 2. Check for missing values
        print("\n2. MISSING VALUES CHECK:")
        missing_values = geo_df_processed.isnull().sum()
        print(missing_values)
        
        # 3. Verify coordinate ranges
        print("\n3. COORDINATE RANGES:")
        print(f"Latitude range: [{geo_df_processed['latitude'].min():.6f}, {geo_df_processed['latitude'].max():.6f}]")
        print(f"Longitude range: [{geo_df_processed['longitude'].min():.6f}, {geo_df_processed['longitude'].max():.6f}]")
        
        # 4. Sample of decoded coordinates
        print("\n4. SAMPLE OF DECODED COORDINATES:")
        sample_df = geo_df_processed.sample(min(sample_size, len(geo_df_processed)))
        print(sample_df[['geohash_6', 'latitude', 'longitude']].head(10))
        
        # 5. Check coordinate distribution
        print("\n5. COORDINATE DISTRIBUTION:")
        fig, axes = plt.subplots(1, 2, figsize=(12, 5))
        
        # Latitude distribution
        sns.histplot(geo_df_processed['latitude'].dropna(), ax=axes[0], kde=True)
        axes[0].set_title('Latitude Distribution')
        
        # Longitude distribution
        sns.histplot(geo_df_processed['longitude'].dropna(), ax=axes[1], kde=True)
        axes[1].set_title('Longitude Distribution')
        
        plt.tight_layout()
        plt.savefig('coordinate_distribution.png')
        print("Coordinate distribution plots saved as 'coordinate_distribution.png'")
        
        # 6. Check for impossible coordinates
        print("\n6. IMPOSSIBLE COORDINATE CHECK:")
        # Check for coordinates that are clearly invalid
        invalid_lat = ((geo_df_processed['latitude'] < -90) | (geo_df_processed['latitude'] > 90)).sum()
        invalid_lon = ((geo_df_processed['longitude'] < -180) | (geo_df_processed['longitude'] > 180)).sum()
        print(f"Invalid latitudes (outside [-90, 90]): {invalid_lat}")
        print(f"Invalid longitudes (outside [-180, 180]): {invalid_lon}")
        
        # 7. Check for coordinates that are exactly 0,0 (often indicates decoding errors)
        zero_coords = ((geo_df_processed['latitude'] == 0) & (geo_df_processed['longitude'] == 0)).sum()
        print(f"Coordinates at (0, 0): {zero_coords}")
        
        # 8. Check client distribution
        print("\n7. CLIENT DISTRIBUTION:")
        client_counts = geo_df_processed['client_id'].value_counts()
        print(f"Total clients: {len(client_counts):,}")
        print(f"Average events per client: {client_counts.mean():.1f}")
        print(f"Min events per client: {client_counts.min()}")
        print(f"Max events per client: {client_counts.max()}")
        
        # 9. Time range check
        print("\n8. TIME RANGE:")
        print(f"Earliest event: {geo_df_processed['event_time'].min()}")
        print(f"Latest event: {geo_df_processed['event_time'].max()}")
        
        # 10. Fold distribution
        print("\n9. FOLD DISTRIBUTION:")
        fold_counts = geo_df_processed['fold'].value_counts().sort_index()
        print(fold_counts)
        
        return geo_df_processed
        
    except FileNotFoundError:
        print(f"File {file_path} not found. Please run Step 3 first.")
        return None
    except Exception as e:
        print(f"Error during inspection: {e}")
        return None

# Execute the inspection
processed_data = inspect_step3_results()

if processed_data is not None:
    print("\n=== INSPECTION COMPLETE ===")
    print("Based on this inspection, we can determine if Step 3 was successful")
    print("and whether we can proceed to Step 4 (Feature Engineering).")
    
    # Check if there are any critical issues that need to be addressed
    missing_coords = processed_data[['latitude', 'longitude']].isnull().sum().sum()
    if missing_coords > 0:
        print(f"\nWARNING: Found {missing_coords} missing coordinates. This might affect feature engineering.")
    
    invalid_coords = ((processed_data['latitude'] < -90) | (processed_data['latitude'] > 90) | 
                     (processed_data['longitude'] < -180) | (processed_data['longitude'] > 180)).sum()
    if invalid_coords > 0:
        print(f"WARNING: Found {invalid_coords} invalid coordinates outside valid ranges.")
    
    if missing_coords == 0 and invalid_coords == 0:
        print("\n✓ Step 3 completed successfully. Data is ready for Step 4 (Feature Engineering).")
    else:
        print("\n⚠ Step 3 completed with some issues. We may need to clean the data before Step 4.")