In [1]:
import pandas as pd
import random
from typing import List
from shapely.geometry.base import BaseGeometry
import shapely
from shapely.geometry import Point, MultiPoint
import duckdb
import numpy as np
from typing import Any, Dict, Optional
from shapely.geometry import shape
import json
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

In [2]:
from datetime import datetime
from dateutil import rrule

START_DATE = datetime(2025, 3, 1)
END_DATE = datetime(2025, 6, 30)

# One ISO-formatted ("YYYY-MM-DD") date per monthly recurrence between
# START_DATE and END_DATE (inclusive), starting at START_DATE.
all_months = [
    month_start.strftime("%Y-%m-%d")
    for month_start in rrule.rrule(rrule.MONTHLY, dtstart=START_DATE, until=END_DATE)
]

In [3]:
from datasets import Dataset, load_dataset

# Load the "joefee/cell-service-data" dataset through the Hugging Face `datasets` library.
# The result is indexed by split name further down (ds['train']).
ds = load_dataset("joefee/cell-service-data")

In [4]:
# Convert the 'train' split to a pandas DataFrame; `df` is the frame used by all later cells
df = ds['train'].to_pandas()

In [5]:
# Browse the dataset (the bare expression is rendered as this cell's output)
df

Unnamed: 0,timestamp,unique_cell,measurement_type_name,in_outdoor_state,value,latitude,longitude,signal_level
0,2025-03-01 15:48:22.455,d66c4660d6f77433b503d2e0159ce7053bdd76dc,,,-91.0,52.941137,-1.180972,-112.207920
1,2025-03-01 03:15:47.436,bb8abb3cfc1d2eb47edba5f216a0078d829575f2,,Probably Indoor,-9.0,52.756933,-1.517308,-120.000000
2,2025-03-01 12:26:07.000,3fb387a3995398fc5602cc0e431ce935044c594e,,Probably Outdoor,-108.0,52.770351,-1.208205,-108.719592
3,2025-03-01 09:49:03.000,e662f0bf857466dafbb8da554abfec2e86335deb,,Surely Outdoor,46.8,52.644803,-1.189687,-103.070615
4,2025-03-01 00:56:44.711,0fed16bc81fbd0ef53e6071cf9fd8449d810fdfc,,Surely Indoor,-7.0,52.887214,-1.534917,-96.205530
...,...,...,...,...,...,...,...,...
31504611,2025-06-30 09:21:01.666,5a6685a51c70f12cc7b03db00a18db684e60835c,,Probably Indoor,1.4,52.641313,-1.095236,-86.808643
31504612,2025-06-30 06:38:45.485,71f3829f43969786e0adb953a0390146e67d1b3b,,,-109.0,52.771422,-1.197493,-102.676974
31504613,2025-06-30 15:24:45.000,334868f2087c2f62b8cf9d658ba7ffec4bf9ca77,,Probably Outdoor,-105.0,52.645504,-1.126884,
31504614,2025-06-30 01:56:10.540,0f96898a83142aa9cd14d8f27dfa1aa529af2fca,,Probably Indoor,-14.0,52.627893,-1.336682,-104.649658


In [6]:
# Old dataset, unused
# df = pd.read_csv("hf://datasets/joefee/service-data/np_obs_jit_jf_tf_tw.csv")

In [7]:
# Bin signal_level into five named categories.
# Reference for the thresholds (UK Ofcom):
# https://www.ofcom.org.uk/siteassets/resources/documents/phones-telecoms-and-internet/comparing-service-quality/2025/map-your-mobile-2025-threshold-methodology.pdf
signal_level_edges = [-np.inf, -105, -95, -82, -74, np.inf]
signal_level_names = ["1. Very Weak", "2. Weak", "3. Moderate", "4. Strong", "5. Very Strong"]

# Rows whose signal_level is missing get a missing category as well.
df["signal_level_category"] = pd.cut(
    df["signal_level"],
    bins=signal_level_edges,
    labels=signal_level_names,
)

In [8]:
# Identify cells with sufficient data points.
# A cell qualifies only if, for every month in `all_months`, it has at least
# `min_points_required` observations with a non-null signal_level
# (rows without signal_level cannot be assigned a signal_level_category).
min_points_required = 30

# The threshold is interpolated from `min_points_required` so that the query and the
# per-level check in the training loop always use the same value.
sufficient_data_cells = duckdb.query(f"""
WITH monthly_count AS (
    SELECT
        unique_cell, 
        date_trunc('month', CAST(timestamp AS timestamp)) as month, 
        COUNT(*) as count
    FROM df
    WHERE signal_level IS NOT NULL
    GROUP BY unique_cell, month HAVING COUNT(*) >= {min_points_required}
)
SELECT unique_cell 
FROM monthly_count 
GROUP BY unique_cell
HAVING COUNT(DISTINCT month) = {len(all_months)}
""").to_df()['unique_cell'].tolist()

In [9]:
# Drop known outlier cells from the list of cells with sufficient data,
# then report how many cells remain versus how many exist in the dataset.
outlier_cells = ["4dc7c9ec434ed06502767136789763ec11d2c4b7"]
sufficient_data_cells = list(
    filter(lambda cell: cell not in outlier_cells, sufficient_data_cells)
)

print(f"Number of cells with sufficient data: {len(sufficient_data_cells)}")
print(f"Total number of cells: {df['unique_cell'].nunique()}")

Number of cells with sufficient data: 5244
Total number of cells: 36631


In [10]:
def generate_svm_boundary_geom(df, **args) -> Optional[Dict[str, Any]]:
    """
    Fit a One-Class SVM to the points in ``df`` and return its decision boundary
    as a GeoJSON MultiPolygon mapping.

    The boundary is obtained by evaluating the SVM decision function on a regular
    grid around the data, contouring it at level 0, polygonizing the contour lines
    and keeping the polygons whose representative point has a non-negative decision
    value. The kept polygons are merged with ``unary_union``.

    All problems are reported with ``print`` and result in ``None``; the function
    does not raise for invalid input or failed geometry operations.

    Args:
        df (pd.DataFrame): DataFrame with 'longitude' and 'latitude' columns.
                        Values must be convertible to float (numbers or Decimal
                        objects). The input frame is not modified.
        **args: Keyword arguments passed directly to svm.OneClassSVM.

    Returns:
        dict: GeoJSON mapping of the resulting MultiPolygon, or None if unsuccessful.
    """
    import numpy as np
    import pandas as pd

    # --- 0. Validate the input and extract coordinates ---
    # Done before the heavier imports below so that bad input fails fast.
    if not isinstance(df, pd.DataFrame) or not all(col in df.columns for col in ['longitude', 'latitude']):
        print("Error: Input df must be a pandas DataFrame with 'longitude' and 'latitude' columns.")
        return None

    if len(df) < 2:
        print("Warning: Need at least 2 data points for SVM.")
        return None

    # Cast only the two coordinate columns (this yields a new object, so the caller's
    # frame is untouched). The cast resolves the Decimal vs float TypeError.
    try:
        coords = df[['longitude', 'latitude']].astype(float).to_numpy()
    except (TypeError, ValueError) as e:
        print(f"Error converting coordinate columns to float: {e}")
        return None

    import matplotlib.pyplot as plt
    from sklearn import svm
    from shapely.geometry import LineString, Polygon, MultiPolygon, mapping
    from shapely.ops import polygonize, unary_union

    # Polygons at or below this area are treated as degenerate and ignored.
    min_area = 1e-9

    # --- 1. Train the SVM ---
    try:
        clf = svm.OneClassSVM(**args)
        clf.fit(coords)
    except Exception as e:
        print(f"Error during SVM training: {e}")
        return None

    # --- 2. Create mesh grid for contouring ---
    x_min, x_max = coords[:, 0].min(), coords[:, 0].max()
    y_min, y_max = coords[:, 1].min(), coords[:, 1].max()

    # Pad the grid by one full data range on each side; fall back to a fixed
    # margin when all points share (almost) the same coordinate.
    x_range = x_max - x_min
    y_range = y_max - y_min
    x_margin = x_range if x_range > 1e-9 else 0.1
    y_margin = y_range if y_range > 1e-9 else 0.1

    x_min -= x_margin
    x_max += x_margin
    y_min -= y_margin
    y_max += y_margin

    resolution = 500  # grid points per axis
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, resolution),
                        np.linspace(y_min, y_max, resolution))
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    try:
        Z = clf.decision_function(grid_points).reshape(xx.shape)
    except Exception as e:
        print(f"Error during SVM decision function evaluation: {e}")
        return None

    # --- 3. Extract contour lines at level 0 (the boundary) ---
    # The figure is only a vehicle for the contouring routine; always close it.
    fig, ax = plt.subplots()
    try:
        cs = ax.contour(xx, yy, Z, levels=[0])
    except Exception as e:
        print(f"Error during contour generation: {e}")
        return None
    finally:
        plt.close(fig)

    if not cs.allsegs or not cs.allsegs[0]:
        print("Warning: No contour lines found at level 0.")
        return None

    segments = cs.allsegs[0]
    lines = [LineString(seg) for seg in segments if len(seg) >= 2]

    if not lines:
        print("Warning: No valid LineStrings created from contour segments.")
        return None

    # --- 4. Polygonize the lines ---
    try:
        all_polygons = list(polygonize(lines))
    except Exception as e:
        print(f"Error during polygonization: {e}")
        return None

    if not all_polygons:
        print("Warning: Polygonization did not yield any polygons.")
        return None

    # --- 5. Repair and classify polygons ---
    positive_polygons = []
    for p in all_polygons:
        candidate = p
        if not candidate.is_valid:
            # Attempt to buffer by 0 to fix potential self-intersections. The repair
            # happens before classification so invalid polygons are not silently lost.
            candidate = p.buffer(0)
            if not (candidate.is_valid and isinstance(candidate, Polygon)):
                print(f"Warning: Skipping invalid polygon generated during classification step even after buffer(0). Area: {p.area}")
                continue

        if not candidate.area > min_area:
            continue

        try:
            rep_point = candidate.representative_point()
            decision_val = clf.decision_function([[rep_point.x, rep_point.y]])[0]
        except Exception as e:
            print(f"Warning: Error checking decision function for a polygon point: {e}")
            continue

        # Non-negative decision value => the polygon lies inside the SVM boundary.
        if decision_val >= 0:
            positive_polygons.append(candidate)

    if not positive_polygons:
        print("Warning: No valid polygons were classified as inside the SVM boundary.")
        return None

    # --- 6. Unary Union to merge positive polygons ---
    # Built outside the try block so the fallback below can always rely on it.
    valid_positive_polygons = [poly for poly in positive_polygons if poly.is_valid and isinstance(poly, Polygon)]
    if not valid_positive_polygons:
        print("Warning: No valid polygons remaining before unary union.")
        return None

    try:
        result_geom = unary_union(valid_positive_polygons)
    except Exception as e:
        # Catch potential errors during unary_union (often related to complex topology)
        print(f"Error during unary union: {e}")
        # As a fallback, try creating a MultiPolygon directly from the valid positive polygons
        # This might result in overlaps instead of proper union, but is better than nothing.
        print("Attempting fallback: creating MultiPolygon from individual positive polygons.")
        try:
            result_geom = MultiPolygon(valid_positive_polygons)
            if not result_geom.is_valid:
                print("Warning: Fallback MultiPolygon is invalid.")
                # Try buffer(0) on the multipolygon as a last resort
                buffered_result = result_geom.buffer(0)
                if buffered_result.is_valid:
                    print("Fallback MultiPolygon fixed with buffer(0).")
                    result_geom = buffered_result
                else:
                    print("Error: Fallback MultiPolygon remains invalid even after buffer(0). Cannot proceed.")
                    return None
        except Exception as fallback_e:
            print(f"Error during fallback MultiPolygon creation: {fallback_e}")
            return None

    # --- 7. Format output as MultiPolygon GeoJSON mapping ---
    # Normalise whatever the union produced into a flat list of valid Polygons.
    geoms_to_wrap = []
    if isinstance(result_geom, Polygon):
        if result_geom.is_valid:
            geoms_to_wrap = [result_geom]
    elif isinstance(result_geom, MultiPolygon):
        # Filter out invalid geoms within the MultiPolygon if any
        geoms_to_wrap = [g for g in result_geom.geoms if g.is_valid and isinstance(g, Polygon)]
    elif hasattr(result_geom, 'geoms'):  # Handle GeometryCollection
        print(f"Warning: unary_union resulted in a GeometryCollection. Filtering for valid Polygons.")
        geoms_to_wrap = [g for g in result_geom.geoms if g.is_valid and isinstance(g, Polygon)]

    if not geoms_to_wrap:
        print("Warning: No valid polygons found in the final geometry after union/cleanup.")
        return None

    # Create the final MultiPolygon
    final_multi_poly = MultiPolygon(geoms_to_wrap)

    # Final validity check
    if final_multi_poly.is_valid:
        return mapping(final_multi_poly)

    # Try one last buffer(0) fix
    print("Warning: Final MultiPolygon is invalid. Attempting buffer(0) fix.")
    buffered_final = final_multi_poly.buffer(0)
    if buffered_final.is_valid and isinstance(buffered_final, (Polygon, MultiPolygon)):
        # Re-wrap if buffer resulted in a single Polygon
        if isinstance(buffered_final, Polygon):
            final_multi_poly = MultiPolygon([buffered_final])
        else:
            final_multi_poly = buffered_final
        print("Final MultiPolygon fixed with buffer(0).")
        return mapping(final_multi_poly)

    print("Error: Final MultiPolygon remains invalid even after buffer(0).")
    return None

In [11]:
# Cell IDs to process; the main loop only handles cells that are both in
# sufficient_data_cells and in this list.
allowed_cells = ["5a6685a51c70f12cc7b03db00a18db684e60835c"]

In [18]:
import os

metric = "recall"  # Options: "accuracy", "precision", "recall", "f1"

# Scoring function for each supported metric. Validating the choice up front avoids
# failing half-way through the loop when a score of None is formatted.
metric_functions = {
    "accuracy": accuracy_score,
    "precision": lambda y_true, y_pred: precision_score(y_true, y_pred, zero_division=0),
    "recall": lambda y_true, y_pred: recall_score(y_true, y_pred, zero_division=0),
    "f1": lambda y_true, y_pred: f1_score(y_true, y_pred, zero_division=0),
}
if metric not in metric_functions:
    raise ValueError(f"Unknown metric '{metric}'; expected one of {sorted(metric_functions)}")
score_fn = metric_functions[metric]

# Signal level categories used to train each SVM boundary.
# Level N is trained on category N and every stronger category, so lower levels
# cover progressively more of the signal range.
levels = {
    5: ["5. Very Strong"],
    4: ["4. Strong", "5. Very Strong"],
    3: ["3. Moderate", "4. Strong", "5. Very Strong"],
    2: ["2. Weak", "3. Moderate", "4. Strong", "5. Very Strong"],
    1: ["1. Very Weak", "2. Weak", "3. Moderate", "4. Strong", "5. Very Strong"],
}

# Optional buffer radius (in coordinate units) added around each SVM geometry; 0 disables it
buffer = 0.000

for cell_id in [x for x in sufficient_data_cells if x in allowed_cells]:

    my_geojson = {
        "type": "FeatureCollection",
        "features": []
    }

    # Best (score, geometry) found so far across all training months
    my_best_convex_hull: Optional[tuple[float, BaseGeometry]] = None
    my_best_svm_hyperplanes: Dict[Any, tuple[float, BaseGeometry]] = {}

    # Select data for this cell only
    df_cell = df[df['unique_cell'] == cell_id]
    # Month of every observation, computed once per cell instead of twice per month
    cell_periods = pd.to_datetime(df_cell['timestamp']).dt.to_period('M')
    print(f"---")
    print(f"Processing cell {cell_id}")

    # Exclude last month (it has no following month to test on)
    for month in all_months[:-1]:

        # Random colour for each month (currently not used by the export below)
        colour = "#"+''.join([random.choice('0123456789ABCDEF') for j in range(6)])

        # Identify train/test split
        # Use one month for training, next month for testing
        train_period = pd.to_datetime(month).to_period('M')
        test_period = train_period + 1
        # Label the test month by its first day, matching the training month label
        test_month = test_period.start_time.strftime("%Y-%m-%d")
        df_train = df_cell[cell_periods == train_period]
        df_test = df_cell[cell_periods == test_period]

        print(f"---Training month: {month}, Testing month: {test_month}---")
        print(f"Train: {len(df_train)}, Test: {len(df_test)}")

        if df_train.empty or df_test.empty:
            print(f"Skipping month {month} for cell {cell_id}: no training or no test data")
            continue

        # Construct a convex hull with shapely using train_df
        points_train = [Point(xy) for xy in zip(df_train['longitude'], df_train['latitude'])]
        points_test = [Point(xy) for xy in zip(df_test['longitude'], df_test['latitude'])]

        # Find center of mass among these points
        center_of_mass = MultiPoint(points_train).centroid

        # For each point, calculate distance to center of mass
        distances = [point.distance(center_of_mass) for point in points_train]

        # Find the 95% percentile distance
        threshold_distance = pd.Series(distances).quantile(0.95)

        # Filter points to only those within the threshold distance
        filtered_points = [point for point, distance in zip(points_train, distances) if distance <= threshold_distance]

        # Calculate the convex hull of the filtered points
        convex_hull = MultiPoint(filtered_points).convex_hull

        # Validate against test_df: which test points fall within the convex hull
        y_pred = [bool(convex_hull.contains(point)) for point in points_test]
        print(f"Convex Hull: {len(filtered_points)} filtered points, {sum(y_pred)} out of {len(points_test)} test points within hull")

        # Every test point is a real observation of this cell, so all labels are True.
        # NOTE: with all-True labels, precision is 1.0 whenever any point is inside the hull.
        y_true = [True] * len(points_test)
        current_metric_value = score_fn(y_true, y_pred)

        # Keep best convex hull based on selected metric
        if my_best_convex_hull is None or current_metric_value > my_best_convex_hull[0]:
            my_best_convex_hull = (current_metric_value, convex_hull)
            print(f"New best convex hull based on {metric}: {current_metric_value:.4f}")

        # Now generate SVM boundaries from the (unfiltered) training points of each level
        svm_args = {
            'kernel': 'rbf',
            'nu': float(0.08),  # Lower value allow more flexible boundary
            'gamma': float(len(df_train) * 30)  # Higher value allow more complex decision boundary
        }

        # All signal levels are used for testing, so the test categories are shared by every level
        test_categories = df_test["signal_level_category"]

        # Generate SVM boundaries for each level
        for key, level in levels.items():
            df_subset_level_train = df_train[df_train["signal_level_category"].isin(level)]

            if len(df_subset_level_train) < min_points_required:
                print(f"Skipping SVM for cell {cell_id} at levels {level} due to insufficient points ({len(df_subset_level_train)} < {min_points_required})")
                continue

            # Run SVM boundary generation
            geom_level = generate_svm_boundary_geom(df_subset_level_train, **svm_args)

            if geom_level is None:
                print(f"Warning: SVM boundary generation failed for cell {cell_id} at levels {level}")
                continue

            # If the function returned a GeoJSON mapping (dict), convert to shapely geometry
            if isinstance(geom_level, dict):
                try:
                    geom_obj = shape(geom_level)
                except Exception as e:
                    print(f"Warning: could not convert generated mapping to shapely geometry for cell {cell_id}, levels {level}: {e}")
                    continue
            else:
                geom_obj = geom_level

            # Add buffer radius to have better coverage
            if buffer > 0:
                try:
                    geom_obj = geom_obj.buffer(buffer)
                except Exception as e:
                    print(f"Warning: buffering geometry failed for cell {cell_id}, levels {level}: {e}")

            # Make sure the geometry can be serialized to a GeoJSON mapping before keeping it
            try:
                shapely.geometry.mapping(geom_obj)
            except Exception as e:
                print(f"Warning: could not create mapping for geometry for cell {cell_id}, levels {level}: {e}")
                continue

            # Predicted: test point lies inside the boundary.
            # Actual: test point's category belongs to this level (missing category => False).
            y_pred = [bool(geom_obj.contains(point)) for point in points_test]
            y_true = test_categories.isin(level).tolist()
            current_metric_value = score_fn(y_true, y_pred)

            # Store best SVM hyperplane based on selected metric
            previous_best = my_best_svm_hyperplanes.get(key)
            if previous_best is None or current_metric_value > previous_best[0]:
                old_metric_value = previous_best[0] if previous_best is not None else -1
                print(f"New best SVM hyperplane for levels {level} based on {metric}: {old_metric_value} -> {current_metric_value:.4f}")
                my_best_svm_hyperplanes[key] = (current_metric_value, geom_obj)
            else:
                print(f"Skip. SVM hyperplane for levels {level} did not improve based on {metric}: {current_metric_value:.4f} (best: {previous_best[0]:.4f})")

    if my_best_convex_hull is None:
        print(f"Warning: No convex hull could be computed for cell {cell_id}; nothing exported")
        continue

    # Append best convex hull to list
    my_geojson['features'].append({
        "type": "Feature",
        "properties": {
            "cell_id": cell_id,
            "fill": "#3300FF",
        },
        "geometry": shapely.geometry.mapping(my_best_convex_hull[1])
    })

    # Ensure successive hyperplanes are fully overlapping
    # For example, level 5 should be fully within level 4, and level 4 within level 3, etc.
    # Walking from the highest level down, each level becomes the union of itself and the
    # (already merged) level above it, so the unions accumulate.
    ordered_keys = sorted(my_best_svm_hyperplanes.keys(), reverse=True)
    for higher_key, key in zip(ordered_keys, ordered_keys[1:]):
        higher_geom = my_best_svm_hyperplanes[higher_key][1]
        current_geom = my_best_svm_hyperplanes[key][1]
        # Perform union to ensure full coverage
        try:
            combined_geom = higher_geom.union(current_geom)
            my_best_svm_hyperplanes[key] = (my_best_svm_hyperplanes[key][0], combined_geom)
        except Exception as e:
            print(f"Warning: Could not union geometries for levels {higher_key} and {key} for cell {cell_id}: {e}")

    # Append the best SVM hyperplanes to list
    for key, value in my_best_svm_hyperplanes.items():
        my_geojson['features'].append({
            "type": "Feature",
            "properties": {
                "cell_id": cell_id,
                "svm_boundary_levels": key,
                "fill": "#FF0000",
            },
            "geometry": shapely.geometry.mapping(value[1]),
        })

    # Export geojson for this cell
    os.makedirs('cells', exist_ok=True)
    with open(f"cells/cell_{cell_id}.geojson", "w") as f:
        json.dump(my_geojson, f, indent=2)

---
Processing cell 5a6685a51c70f12cc7b03db00a18db684e60835c
---Training month: 2025-03-01, Testing month: 2025-04-30---
Train: 3520, Test: 3548
Convex Hull: 3344 filtered points, 3424 out of 3548 test points within hull
New best convex hull based on recall: 0.9651
New best SVM hyperplane for levels ['5. Very Strong'] based on recall: -1 -> 0.9077
New best SVM hyperplane for levels ['4. Strong', '5. Very Strong'] based on recall: -1 -> 0.8922
New best SVM hyperplane for levels ['3. Moderate', '4. Strong', '5. Very Strong'] based on recall: -1 -> 0.9309
New best SVM hyperplane for levels ['2. Weak', '3. Moderate', '4. Strong', '5. Very Strong'] based on recall: -1 -> 0.9394
New best SVM hyperplane for levels ['1. Very Weak', '2. Weak', '3. Moderate', '4. Strong', '5. Very Strong'] based on recall: -1 -> 0.9392
---Training month: 2025-04-01, Testing month: 2025-05-31---
Train: 3548, Test: 7015
Convex Hull: 3370 filtered points, 6630 out of 7015 test points within hull
Skip. SVM hyperplan