<h3><strong>SECTION 1: Load Processed Data</strong></h3>

In [1]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
from tqdm import tqdm
import os

def load_processed_data(file_path="geo_data_fully_processed.pkl"):
    """Load the processed data from Step 3"""
    print("Loading processed data...")
    geo_df = pd.read_pickle(file_path)
    print(f"Loaded {len(geo_df):,} events")
    return geo_df

# Execute Section 1
df = load_processed_data()
print("Section 1 completed: Data loaded successfully")
print(f"Data shape: {df.shape}")

Loading processed data...
Loaded 66,295,724 events
Section 1 completed: Data loaded successfully
Data shape: (66295724, 8)


<h3><strong>SECTION 1b: Extract 10% Sample of the Dataset</strong></h3>

In [2]:
import pandas as pd
import numpy as np

def extract_data_sample(df, sample_fraction=0.1, random_state=42, method='random'):
    """
    Extract a sample from the dataset for initial testing
    
    Parameters:
    df: DataFrame to sample from
    sample_fraction: Fraction of data to sample (0.0 to 1.0)
    random_state: Random seed for reproducibility
    method: Sampling method ('random', 'client_based', or 'time_based')
    """
    print(f"Extracting {sample_fraction*100}% sample using {method} method...")
    
    if method == 'random':
        # Simple random sampling
        sample_df = df.sample(frac=sample_fraction, random_state=random_state)
        
    elif method == 'client_based':
        # Sample by clients to maintain client behavior patterns
        unique_clients = df['client_id'].unique()
        sample_clients = np.random.choice(unique_clients, 
                                         size=int(len(unique_clients) * sample_fraction), 
                                         replace=False)
        sample_df = df[df['client_id'].isin(sample_clients)]
        
    elif method == 'time_based':
        # Sample by time period to maintain temporal patterns
        time_range = df['event_time'].max() - df['event_time'].min()
        sample_start = df['event_time'].min() + time_range * np.random.random()
        sample_end = sample_start + time_range * sample_fraction
        sample_df = df[(df['event_time'] >= sample_start) & (df['event_time'] <= sample_end)]
    
    print(f"Original dataset: {len(df):,} rows")
    print(f"Sample dataset: {len(sample_df):,} rows")
    print(f"Sample represents {len(sample_df)/len(df)*100:.2f}% of original data")
    
    # Verify we have a reasonable number of clients in the sample
    if 'client_id' in df.columns:
        original_clients = df['client_id'].nunique()
        sample_clients = sample_df['client_id'].nunique()
        print(f"Original clients: {original_clients:,}")
        print(f"Sample clients: {sample_clients:,}")
    
    return sample_df

# Execute Section 1b: Extract 10% sample
# Choose one of the sampling methods:
# method='random' - Simple random sampling (fastest)
# method='client_based' - Sample by clients (preserves client behavior patterns)
# method='time_based' - Sample by time period (preserves temporal patterns)

sample_df = extract_data_sample(df, sample_fraction=0.1, random_state=42, method='client_based')

# Replace the original dataframe with the sample for further processing
df = sample_df.copy()

print("Section 1b completed: 10% sample extracted")

Extracting 10.0% sample using client_based method...
Original dataset: 66,295,724 rows
Sample dataset: 6,745,765 rows
Sample represents 10.18% of original data
Original clients: 72,573
Sample clients: 7,257
Section 1b completed: 10% sample extracted


<h3><strong>SECTION 2: Basic Movement Features</strong></h3>

In [3]:
# Calculate basic movement features like velocity and distance between points
def calculate_basic_movement_features(df):
    print("Calculating basic movement features...")
    
    # Sort by client and time
    df = df.sort_values(['client_id', 'event_time']).copy()
    
    # Calculate time differences
    df['time_diff_hours'] = df.groupby('client_id')['event_time'].diff().dt.total_seconds() / 3600
    
    # Calculate distance between consecutive points
    df['prev_lat'] = df.groupby('client_id')['latitude'].shift()
    df['prev_lon'] = df.groupby('client_id')['longitude'].shift()
    
    # Calculate distance using geodesic
    tqdm.pandas(desc="Calculating distances")
    df['distance_km'] = df.progress_apply(
        lambda row: geodesic((row['prev_lat'], row['prev_lon']), 
                            (row['latitude'], row['longitude'])).km 
        if not pd.isna(row['prev_lat']) else 0, axis=1
    )
    
    # Calculate velocity (km/h)
    df['velocity_kmh'] = df['distance_km'] / df['time_diff_hours']
    df['velocity_kmh'] = df['velocity_kmh'].replace([np.inf, -np.inf], np.nan)
    
    # Clean up temporary columns
    df = df.drop(['prev_lat', 'prev_lon'], axis=1)
    
    return df

# Execute Section 2
df = calculate_basic_movement_features(df)
print("Section 2 completed: Basic movement features calculated")
print(df[['client_id', 'event_time', 'distance_km', 'velocity_kmh']].head())

Calculating basic movement features...


Calculating distances: 100%|██████████| 6745765/6745765 [17:37<00:00, 6377.11it/s]  


Section 2 completed: Basic movement features calculated
                                                  client_id  \
66121636  0001edbc5ab720f70a615ed9e8429df9b6c3f3c3999a51...   
66121637  0001edbc5ab720f70a615ed9e8429df9b6c3f3c3999a51...   
66121638  0001edbc5ab720f70a615ed9e8429df9b6c3f3c3999a51...   
66121639  0001edbc5ab720f70a615ed9e8429df9b6c3f3c3999a51...   
66121640  0001edbc5ab720f70a615ed9e8429df9b6c3f3c3999a51...   

                  event_time  distance_km  velocity_kmh  
66121636 2022-01-08 08:23:53          0.0           NaN  
66121637 2022-01-09 06:20:43          0.0           0.0  
66121638 2022-01-10 06:17:25          0.0           0.0  
66121639 2022-01-11 14:31:30          0.0           0.0  
66121640 2022-01-14 07:57:30          0.0           0.0  


<h3><strong>SECTION 3: Suspicious Location Detection</strong></h3>

In [4]:
def is_in_ocean(lat, lon):
    """Check if coordinates are in major ocean areas"""
    # Pacific Ocean (approximate bounding box)
    if (-60 <= lat <= 60) and (-180 <= lon <= -70) or (130 <= lon <= 180):
        return True
    
    # Atlantic Ocean
    if (-60 <= lat <= 60) and (-70 <= lon <= 20):
        return True
    
    # Indian Ocean
    if (-60 <= lat <= 30) and (20 <= lon <= 120):
        return True
    
    return False

def is_in_polar_region(lat, lon):
    """Check if coordinates are in polar regions"""
    # Antarctica
    if lat < -60:
        return True
    
    # Remote Arctic
    if lat > 75 and abs(lon) > 150:  # Very remote arctic areas
        return True
        
    return False

def is_in_desert(lat, lon):
    """Check if coordinates are in major desert areas"""
    # Sahara Desert
    if (15 <= lat <= 30) and (-15 <= lon <= 40):
        return True
    
    # Arabian Desert
    if (20 <= lat <= 30) and (35 <= lon <= 60):
        return True
    
    # Gobi Desert
    if (40 <= lat <= 45) and (90 <= lon <= 115):
        return True
    
    # Australian Outback
    if (-30 <= lat <= -20) and (120 <= lon <= 140):
        return True
    
    return False

def is_in_international_waters(lat, lon):
    """
    Check if coordinates are far from any coastline
    Simplified version - assumes ocean coordinates far from land are suspicious
    """
    if is_in_ocean(lat, lon):
        # If it's in ocean and far from the edges of continents, consider it international waters
        if (abs(lon) > 100 and abs(lat) < 30) or (abs(lon) < 20 and lat < -40):
            return True
    return False

print("Section 3 completed: Suspicious location helper functions defined")

Section 3 completed: Suspicious location helper functions defined


<h3><strong>SECTION 4: Add Suspicious Location Features</strong></h3>

In [5]:
# SECTION 4: Add Suspicious Location Features
def add_suspicious_location_features(df):
    print("Adding suspicious location features...")
    
    tqdm.pandas(desc="Checking ocean locations")
    df['is_ocean'] = df.progress_apply(lambda row: is_in_ocean(row['latitude'], row['longitude']), axis=1)
    
    tqdm.pandas(desc="Checking polar regions")
    df['is_polar'] = df.progress_apply(lambda row: is_in_polar_region(row['latitude'], row['longitude']), axis=1)
    
    tqdm.pandas(desc="Checking desert areas")
    df['is_desert'] = df.progress_apply(lambda row: is_in_desert(row['latitude'], row['longitude']), axis=1)
    
    tqdm.pandas(desc="Checking international waters")
    df['is_international_waters'] = df.progress_apply(
        lambda row: is_in_international_waters(row['latitude'], row['longitude']), axis=1
    )
    
    # Composite suspicious location score
    df['suspicious_location_score'] = (
        df['is_ocean'].astype(int) * 0.4 +
        df['is_polar'].astype(int) * 0.3 +
        df['is_desert'].astype(int) * 0.2 +
        df['is_international_waters'].astype(int) * 0.1
    )
    
    return df

# Execute Section 4
df = add_suspicious_location_features(df)
print("Section 4 completed: Suspicious location features added")
print(df[['client_id', 'is_ocean', 'is_polar', 'is_desert', 'is_international_waters', 'suspicious_location_score']].head())

Adding suspicious location features...


Checking ocean locations: 100%|██████████| 6745765/6745765 [01:10<00:00, 96121.94it/s] 
Checking polar regions: 100%|██████████| 6745765/6745765 [01:09<00:00, 96523.67it/s] 
Checking desert areas: 100%|██████████| 6745765/6745765 [01:09<00:00, 97159.65it/s] 
Checking international waters: 100%|██████████| 6745765/6745765 [01:10<00:00, 95244.15it/s] 


Section 4 completed: Suspicious location features added
                                                  client_id  is_ocean  \
66121636  0001edbc5ab720f70a615ed9e8429df9b6c3f3c3999a51...     False   
66121637  0001edbc5ab720f70a615ed9e8429df9b6c3f3c3999a51...     False   
66121638  0001edbc5ab720f70a615ed9e8429df9b6c3f3c3999a51...     False   
66121639  0001edbc5ab720f70a615ed9e8429df9b6c3f3c3999a51...     False   
66121640  0001edbc5ab720f70a615ed9e8429df9b6c3f3c3999a51...     False   

          is_polar  is_desert  is_international_waters  \
66121636      True      False                    False   
66121637      True      False                    False   
66121638      True      False                    False   
66121639      True      False                    False   
66121640      True      False                    False   

          suspicious_location_score  
66121636                        0.3  
66121637                        0.3  
66121638                        0.3  
6612

<h3><strong>SECTION 5: Calculate home location (most common coordinates)</strong></h3>

In [6]:
def calculate_client_baselines(df):
    """Calculate client-specific behavioral baselines"""
    print("Calculating client behavioral baselines...")
    
    # Group by client
    client_groups = df.groupby('client_id')
    
    # Calculate home location (most common coordinates)
    home_locations = client_groups.agg({
        'latitude': lambda x: x.mode()[0] if len(x.mode()) > 0 else x.iloc[0],
        'longitude': lambda x: x.mode()[0] if len(x.mode()) > 0 else x.iloc[0]
    }).reset_index()
    home_locations.columns = ['client_id', 'home_latitude', 'home_longitude']
    
    # Calculate average velocity per client
    avg_velocity = client_groups['velocity_kmh'].mean().reset_index()
    avg_velocity.columns = ['client_id', 'avg_velocity_kmh']
    
    # Calculate velocity standard deviation per client
    velocity_std = client_groups['velocity_kmh'].std().reset_index()
    velocity_std.columns = ['client_id', 'velocity_std_kmh']
    
    # Merge all client baseline features
    client_baselines = home_locations.merge(avg_velocity, on='client_id').merge(velocity_std, on='client_id')
    
    return client_baselines

# Execute Section 5
client_baselines = calculate_client_baselines(df)
print("Section 5 completed: Client behavioral baselines calculated")
print(client_baselines.head())

Calculating client behavioral baselines...
Section 5 completed: Client behavioral baselines calculated
                                           client_id  home_latitude  \
0  0001edbc5ab720f70a615ed9e8429df9b6c3f3c3999a51...     -89.978027   
1  0002ddd816198d32474486d54f4bfe4f7b361119b5dc45...     -33.486328   
2  0008482a86bca0ad595a949f9e314e157a288c332db269...     -89.978027   
3  000f4309610cc90124943138fdf6d50a2b5967a9ba79b2...     -75.322266   
4  000fff599dacf53fffcf4663ab370d21921d1988f8cfad...     -89.978027   

   home_longitude  avg_velocity_kmh  velocity_std_kmh  
0     -134.934082        403.656888       3731.853505  
1     -118.652344        992.204393       3945.654443  
2     -134.934082      14731.870949     266677.079131  
3     -122.167969          0.095655          0.165679  
4     -134.934082        179.631911       1584.836514  


<h3><strong>SECTION 6: Calculate distance from home</strong></h3>

In [7]:
def add_client_baseline_features(df, client_baselines):
    """Add client baseline features to the main dataframe"""
    print("Adding client baseline features...")
    
    # Merge client baselines
    df = df.merge(client_baselines, on='client_id', how='left')
    
    # Calculate distance from home
    tqdm.pandas(desc="Calculating distance from home")
    df['distance_from_home_km'] = df.progress_apply(
        lambda row: geodesic((row['home_latitude'], row['home_longitude']), 
                            (row['latitude'], row['longitude'])).km, axis=1
    )
    
    # Calculate velocity anomalies (z-score)
    df['velocity_z_score'] = (df['velocity_kmh'] - df['avg_velocity_kmh']) / df['velocity_std_kmh']
    df['velocity_z_score'] = df['velocity_z_score'].replace([np.inf, -np.inf], np.nan)
    
    return df

# Execute Section 6
df = add_client_baseline_features(df, client_baselines)
print("Section 6 completed: Client baseline features added")
print(df[['client_id', 'home_latitude', 'home_longitude', 'distance_from_home_km', 'velocity_z_score']].head())

Adding client baseline features...


Calculating distance from home: 100%|██████████| 6745765/6745765 [16:09<00:00, 6960.49it/s] 


Section 6 completed: Client baseline features added
                                           client_id  home_latitude  \
0  0001edbc5ab720f70a615ed9e8429df9b6c3f3c3999a51...     -89.978027   
1  0001edbc5ab720f70a615ed9e8429df9b6c3f3c3999a51...     -89.978027   
2  0001edbc5ab720f70a615ed9e8429df9b6c3f3c3999a51...     -89.978027   
3  0001edbc5ab720f70a615ed9e8429df9b6c3f3c3999a51...     -89.978027   
4  0001edbc5ab720f70a615ed9e8429df9b6c3f3c3999a51...     -89.978027   

   home_longitude  distance_from_home_km  velocity_z_score  
0     -134.934082                    0.0               NaN  
1     -134.934082                    0.0         -0.108165  
2     -134.934082                    0.0         -0.108165  
3     -134.934082                    0.0         -0.108165  
4     -134.934082                    0.0         -0.108165  


<h3><strong>SECTION 7: Time-based Features</strong></h3>

In [8]:
def add_temporal_features(df):
    """Add time-based features"""
    print("Adding temporal features...")
    
    # Time of day features
    df['hour_of_day'] = df['event_time'].dt.hour
    df['is_night'] = ((df['hour_of_day'] >= 22) | (df['hour_of_day'] <= 6)).astype(int)
    df['is_business_hours'] = ((df['hour_of_day'] >= 9) & (df['hour_of_day'] <= 17)).astype(int)
    
    # Day of week features
    df['day_of_week'] = df['event_time'].dt.dayofweek
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
    
    # Month feature
    df['month'] = df['event_time'].dt.month
    
    return df

# Execute Section 7
df = add_temporal_features(df)
print("Section 7 completed: Temporal features added")
print(df[['client_id', 'event_time', 'hour_of_day', 'is_night', 'is_weekend']].head())

Adding temporal features...
Section 7 completed: Temporal features added
                                           client_id          event_time  \
0  0001edbc5ab720f70a615ed9e8429df9b6c3f3c3999a51... 2022-01-08 08:23:53   
1  0001edbc5ab720f70a615ed9e8429df9b6c3f3c3999a51... 2022-01-09 06:20:43   
2  0001edbc5ab720f70a615ed9e8429df9b6c3f3c3999a51... 2022-01-10 06:17:25   
3  0001edbc5ab720f70a615ed9e8429df9b6c3f3c3999a51... 2022-01-11 14:31:30   
4  0001edbc5ab720f70a615ed9e8429df9b6c3f3c3999a51... 2022-01-14 07:57:30   

   hour_of_day  is_night  is_weekend  
0            8         0           1  
1            6         1           1  
2            6         1           0  
3           14         0           0  
4            7         0           0  


<h3><strong>SECTION 8: Composite Suspicion Score</strong></h3>

In [11]:
def calculate_composite_suspicion_score(df):
    """Calculate a composite suspicion score combining all features"""
    print("Calculating composite suspicion score...")
    
    # Normalize features for scoring
    max_distance = df['distance_from_home_km'].max()
    df['distance_score'] = df['distance_from_home_km'] / max_distance if max_distance > 0 else 0
    
    # Velocity anomaly score (absolute z-score, capped at 3)
    df['velocity_anomaly_score'] = np.abs(df['velocity_z_score']).clip(0, 3) / 3
    
    # Time anomaly score (higher weight for night and weekend)
    df['time_anomaly_score'] = (df['is_night'] * 0.6 + df['is_weekend'] * 0.4)
    
    # Composite score (weights can be adjusted)
    df['composite_suspicion_score'] = (
        df['suspicious_location_score'] * 0.3 +
        df['distance_score'] * 0.25 +
        df['velocity_anomaly_score'] * 0.25 +
        df['time_anomaly_score'] * 0.2
    )
    
    return df

# Execute Section 8
df = calculate_composite_suspicion_score(df)
print("Section 8 completed: Composite suspicion score calculated")
print(df[['client_id', 'suspicious_location_score', 'distance_score', 
          'velocity_anomaly_score', 'time_anomaly_score', 'composite_suspicion_score']].head())

Calculating composite suspicion score...
Section 8 completed: Composite suspicion score calculated
                                           client_id  \
0  0001edbc5ab720f70a615ed9e8429df9b6c3f3c3999a51...   
1  0001edbc5ab720f70a615ed9e8429df9b6c3f3c3999a51...   
2  0001edbc5ab720f70a615ed9e8429df9b6c3f3c3999a51...   
3  0001edbc5ab720f70a615ed9e8429df9b6c3f3c3999a51...   
4  0001edbc5ab720f70a615ed9e8429df9b6c3f3c3999a51...   

   suspicious_location_score  distance_score  velocity_anomaly_score  \
0                        0.3             0.0                     NaN   
1                        0.3             0.0                0.036055   
2                        0.3             0.0                0.036055   
3                        0.3             0.0                0.036055   
4                        0.3             0.0                0.036055   

   time_anomaly_score  composite_suspicion_score  
0                 0.4                        NaN  
1                 1.0        

<h3><strong>SECTION 9: Save Results</strong></h3>

In [12]:
def save_engineered_features(df, output_file="geo_data_with_features.pkl"):
    """Save the dataframe with all engineered features"""
    print("Saving engineered features...")
    df.to_pickle(output_file)
    print(f"Features saved to {output_file}")
    return df

# Execute Section 9
df = save_engineered_features(df)
print("Section 9 completed: Engineered features saved")
print("All feature engineering steps completed successfully!")
print(f"Final dataset shape: {df.shape}")

Saving engineered features...
Features saved to geo_data_with_features.pkl
Section 9 completed: Engineered features saved
All feature engineering steps completed successfully!
Final dataset shape: (6745765, 32)
