In [2]:
!sudo apt-get install -y pciutils
!curl -fsSL https://ollama.com/install.sh | sh # download ollama api
from IPython.display import clear_output
import os
import threading
import subprocess
import requests
import json
import numpy as np
from typing import Dict, List, Any, Tuple
import pandas as pd

def ollama():
    os.environ['OLLAMA_HOST'] = '0.0.0.0:11434'
    os.environ['OLLAMA_ORIGINS'] = '*'
    subprocess.Popen(["ollama", "serve"])

ollama_thread = threading.Thread(target=ollama)
ollama_thread.start()

!ollama pull llama3.1:8b
clear_output()
!pip install -U lightrag[ollama] geopy shapely

from lightrag.core.generator import Generator
from lightrag.core.component import Component
from lightrag.core.model_client import ModelClient
from lightrag.components.model_client import OllamaClient
from geopy.distance import geodesic
from shapely.geometry import Point, Polygon, LineString, shape
from shapely.validation import explain_validity
import time

# Enhanced template for GeoJSON anomaly detection
geojson_anomaly_template = r"""<SYS>
You are an expert geospatial data analyst specializing in detecting anomalies in GeoJSON data.
Analyze the provided GeoJSON data and statistical analysis to identify potential anomalies.

Focus on detecting:
1. Geometric anomalies (invalid coordinates, self-intersecting polygons, etc.)
2. Statistical outliers in properties/attributes
3. Spatial outliers (features in unexpected locations)
4. Data consistency issues
5. Missing or malformed data

Provide specific, actionable findings with coordinates and feature IDs where applicable.
</SYS>

Statistical Analysis:
{{stats}}

GeoJSON Data Summary:
{{geojson_summary}}

Detected Issues:
{{detected_issues}}

User Query: {{input_str}}

Analysis:"""

class GeoJSONAnomalyDetector(Component):
    def __init__(self, model_client: ModelClient, model_kwargs: dict):
        super().__init__()
        self.generator = Generator(
            model_client=model_client,
            model_kwargs=model_kwargs,
            template=geojson_anomaly_template,
        )

    def analyze_geojson(self, geojson_data: Dict) -> Dict[str, Any]:
        """Perform statistical and geometric analysis on GeoJSON data"""
        issues = []
        stats = {}

        try:
            features = geojson_data.get('features', [])
            stats['total_features'] = len(features)

            # Analyze geometric properties
            coordinates_list = []
            properties_data = []
            geometry_types = []

            for i, feature in enumerate(features):
                # Geometry analysis
                geometry = feature.get('geometry', {})
                geom_type = geometry.get('type', '')
                geometry_types.append(geom_type)

                # Validate geometry using Shapely
                try:
                    shape_obj = shape(geometry)
                    if not shape_obj.is_valid:
                        issues.append(f"Invalid geometry in feature {i}: {explain_validity(shape_obj)}")
                except Exception as e:
                    issues.append(f"Geometry parsing error in feature {i}: {str(e)}")

                # Extract coordinates for analysis
                coords = geometry.get('coordinates', [])
                if coords:
                    flat_coords = self._flatten_coordinates(coords)
                    coordinates_list.extend(flat_coords)

                # Properties analysis
                properties = feature.get('properties', {})
                properties_data.append(properties)

            # Statistical analysis of coordinates
            if coordinates_list:
                coords_array = np.array(coordinates_list)
                if coords_array.shape[1] >= 2:
                    lons = coords_array[:, 0]
                    lats = coords_array[:, 1]

                    stats['longitude'] = {
                        'min': float(np.min(lons)),
                        'max': float(np.max(lons)),
                        'mean': float(np.mean(lons)),
                        'std': float(np.std(lons))
                    }
                    stats['latitude'] = {
                        'min': float(np.min(lats)),
                        'max': float(np.max(lats)),
                        'mean': float(np.mean(lats)),
                        'std': float(np.std(lats))
                    }

                    # Detect coordinate outliers
                    lon_outliers = np.abs(lons - np.mean(lons)) > 3 * np.std(lons)
                    lat_outliers = np.abs(lats - np.mean(lats)) > 3 * np.std(lats)

                    if np.any(lon_outliers):
                        outlier_indices = np.where(lon_outliers)[0]
                        issues.append(f"Longitude outliers detected at indices: {outlier_indices.tolist()}")

                    if np.any(lat_outliers):
                        outlier_indices = np.where(lat_outliers)[0]
                        issues.append(f"Latitude outliers detected at indices: {outlier_indices.tolist()}")

                    # Check for invalid coordinate ranges
                    if np.any(lons < -180) or np.any(lons > 180):
                        issues.append("Invalid longitude values found (outside -180 to 180 range)")

                    if np.any(lats < -90) or np.any(lats > 90):
                        issues.append("Invalid latitude values found (outside -90 to 90 range)")

            # Analyze geometry types
            stats['geometry_types'] = dict(pd.Series(geometry_types).value_counts())

            # Analyze properties
            if properties_data:
                props_df = pd.DataFrame(properties_data)
                stats['properties'] = {}

                for col in props_df.columns:
                    if props_df[col].dtype in ['int64', 'float64']:
                        col_stats = {
                            'mean': float(props_df[col].mean()) if not props_df[col].isna().all() else None,
                            'std': float(props_df[col].std()) if not props_df[col].isna().all() else None,
                            'min': float(props_df[col].min()) if not props_df[col].isna().all() else None,
                            'max': float(props_df[col].max()) if not props_df[col].isna().all() else None,
                            'null_count': int(props_df[col].isna().sum())
                        }
                        stats['properties'][col] = col_stats

                        # Detect statistical outliers in properties
                        if col_stats['std'] and col_stats['std'] > 0:
                            outliers = np.abs((props_df[col] - col_stats['mean']) / col_stats['std']) > 3
                            if outliers.any():
                                outlier_indices = props_df[outliers].index.tolist()
                                issues.append(f"Statistical outliers in property '{col}' at feature indices: {outlier_indices}")
                    else:
                        # Categorical analysis
                        unique_vals = props_df[col].nunique()
                        null_count = props_df[col].isna().sum()
                        stats['properties'][col] = {
                            'unique_values': int(unique_vals),
                            'null_count': int(null_count),
                            'type': 'categorical'
                        }

        except Exception as e:
            issues.append(f"Analysis error: {str(e)}")

        return {
            'statistics': stats,
            'issues': issues,
            'summary': f"Analyzed {stats.get('total_features', 0)} features with {len(issues)} potential issues detected."
        }

    def _flatten_coordinates(self, coords):
        """Recursively flatten coordinate arrays"""
        result = []

        def _flatten(arr):
            for item in arr:
                if isinstance(item, (list, tuple)):
                    if len(item) >= 2 and all(isinstance(x, (int, float)) for x in item[:2]):
                        result.append(item[:2])  # Take only lon, lat
                    else:
                        _flatten(item)

        _flatten(coords)
        return result

    def call(self, input_data: Dict) -> str:
        geojson_data = input_data.get('geojson', {})
        query = input_data.get('query', 'Analyze this GeoJSON data for anomalies')

        # Perform analysis
        analysis = self.analyze_geojson(geojson_data)

        # Prepare template variables
        template_vars = {
            'input_str': query,
            'stats': json.dumps(analysis['statistics'], indent=2),
            'geojson_summary': analysis['summary'],
            'detected_issues': '\n'.join([f"- {issue}" for issue in analysis['issues']])
        }

        return self.generator.call(template_vars)

    async def acall(self, input_data: Dict) -> str:
        geojson_data = input_data.get('geojson', {})
        query = input_data.get('query', 'Analyze this GeoJSON data for anomalies')

        # Perform analysis
        analysis = self.analyze_geojson(geojson_data)

        # Prepare template variables
        template_vars = {
            'input_str': query,
            'stats': json.dumps(analysis['statistics'], indent=2),
            'geojson_summary': analysis['summary'],
            'detected_issues': '\n'.join([f"- {issue}" for issue in analysis['issues']])
        }

        return await self.generator.acall(template_vars)

# Initialize the anomaly detector
from lightrag.components.model_client import OllamaClient
from IPython.display import Markdown, display

model = {
    "model_client": OllamaClient(),
    "model_kwargs": {"model": "llama3.1:8b"}
}

anomaly_detector = GeoJSONAnomalyDetector(**model)

# Example usage with your GeoJSON data
# Replace 'your_geojson_data' with your actual GeoJSON data
def detect_geojson_anomalies(geojson_data, custom_query=None):
    """
    Detect anomalies in GeoJSON data

    Args:
        geojson_data: Dict containing GeoJSON data
        custom_query: Optional custom query for specific analysis
    """
    query = custom_query or "Please analyze this GeoJSON data and identify any anomalies, outliers, or data quality issues. Focus on geometric validity, coordinate accuracy, and statistical outliers in properties."

    input_data = {
        'geojson': geojson_data,
        'query': query
    }

    result = anomaly_detector(input_data)
    display(Markdown(f"**GeoJSON Anomaly Analysis:**\n\n{result.data}"))
    return result

# Load and analyze your GeoJSON data
print("Loading MunshiNagarData.geojson...")

# Option 1: Load from file
with open('MunshiNagarData.geojson', 'r') as f:
    your_geojson_data = json.load(f)

print(f"Loaded GeoJSON with {len(your_geojson_data.get('features', []))} features")

# Run anomaly detection
print("Running basic anomaly detection...")
detect_geojson_anomalies(your_geojson_data)

print("\n" + "="*50 + "\n")

# Example with custom query for geographic regions and property values
print("Running detailed analysis focusing on geographic and property anomalies...")
detect_geojson_anomalies(
    your_geojson_data,
    "Focus on detecting features that might be in wrong geographic regions or have suspicious property values. Pay special attention to any coordinates that seem out of place for Munshi Nagar area and any property values that are statistical outliers."
)

print("GeoJSON Anomaly Detection system is ready!")
print("Use detect_geojson_anomalies(your_geojson_data) to analyze your data.")

Loading MunshiNagarData.geojson...
Loaded GeoJSON with 212 features
Running basic anomaly detection...


**GeoJSON Anomaly Analysis:**

Based on the provided GeoJSON data and statistical analysis, I've identified several potential issues:

**Geometric Anomalies**

1. **Self-intersection error**: Features 0, 2 have self-intersecting polygons, which can be a sign of invalid geometry.
	* Feature 0: Geometry parsing error at [72.8359824775752, 19.1220712286071]
	* Feature 2: Geometry parsing error at [72.8331855051968, 19.1223147076067]

**Statistical Outliers**

2. **Longitude outliers**: Index 55 has an unusual longitude value (75.3622) compared to the mean (72.8404).
	* Feature ID: 55
	* Longitude: 75.3622

3. **Latitude outliers**: Index 55 has a latitude value (90.65) that's significantly higher than the mean (19.1637) and maximum allowed value (90).

**Spatial Outliers**

4. **Invalid location**: Feature ID: 55
	* Latitude: 90.65 (outside -90 to 90 range)
	* Longitude: 75.3622

**Data Consistency Issues**

5. **Missing data**: No numpy.rec module found during analysis, which might indicate missing or incomplete libraries in the environment.

**Additional Findings**

6. **Invalid coordinates**: Several features have invalid latitude values outside the -90 to 90 range.
	* Notably: Features with IDs 55 and others (not specified) have latitude values outside this range.

To address these issues, I recommend:

1. Re-examine the GeoJSON data for geometric validity and correct any self-intersection errors.
2. Verify the coordinate accuracy of features with outliers (e.g., feature ID 55).
3. Review the libraries and environment to ensure numpy.rec is properly installed.
4. Investigate missing or invalid latitude values and correct them accordingly.

Here's a summary of the findings in GeoJSON format:
```json
{
  "issues": [
    {
      "feature_id": 0,
      "error_type": "geometry_parsing_error",
      "coordinates": [72.8359824775752, 19.1220712286071]
    },
    {
      "feature_id": 2,
      "error_type": "self_intersection",
      "coordinates": [72.8331855051968, 19.1223147076067]
    },
    {
      "feature_id": 55,
      "error_type": "longitude_outlier",
      "longitude": 75.3622
    },
    {
      "feature_id": 55,
      "error_type": "latitude_outlier",
      "latitude": 90.65
    },
    {
      "error_type": "missing_data",
      "library": "numpy.rec"
    }
  ]
}
```
These findings should help you identify and address the potential issues in your GeoJSON data.



Running detailed analysis focusing on geographic and property anomalies...


**GeoJSON Anomaly Analysis:**

To detect anomalies in the provided GeoJSON data, I'll perform the following analysis:

**1. Geometric Anomalies**

*   Feature 0 has a geometry parsing error due to an invalid 'float' object being iterable.
    *   Suggested action: Validate the input data for correct data types.

*   Features 1 and 2 have self-intersections, indicating geometric anomalies.
    *   Coordinates of these features:
        -   Feature 1: [72.8359824775752, 19.1220712286071]
        -   Feature 2: [72.8331855051968, 19.1223147076067]

**2. Statistical Outliers in Properties/Attributes**

*   Longitude outliers detected at index 55.
    *   Coordinate of the feature with an outlier longitude value:
        -   Index 55: [75.3622, latitude\_value] (Note: The exact coordinate is not provided due to missing data)

*   Latitude outliers detected at index 55.
    *   Coordinate of the feature with an outlier latitude value:
        -   Index 55: [longitude\_value, 90.65] (Note: The exact coordinate is not provided due to missing data)

**3. Spatial Outliers**

*   Invalid latitude values found outside the -90 to 90 range.
    *   Feature IDs where invalid latitude values are detected:
        -   Features with latitude value greater than 90
        -   Features with latitude value less than -90

**4. Data Consistency Issues**

*   Analysis error due to a missing module 'numpy.rec'
    *   Suggested action: Install the required numpy module for analysis.

**5. Missing or Malformed Data**

*   Invalid latitude values found outside the -90 to 90 range.
    *   Feature IDs where invalid latitude values are detected:
        -   Features with latitude value greater than 90
        -   Features with latitude value less than -90

GeoJSON Anomaly Detection system is ready!
Use detect_geojson_anomalies(your_geojson_data) to analyze your data.


New code with corrected json file

In [1]:
!sudo apt-get install -y pciutils
!curl -fsSL https://ollama.com/install.sh | sh # download ollama api
from IPython.display import clear_output
import os
import threading
import subprocess
import requests
import json
import numpy as np
from typing import Dict, List, Any, Tuple, Optional
import pandas as pd
import copy
from datetime import datetime

def ollama():
    os.environ['OLLAMA_HOST'] = '0.0.0.0:11434'
    os.environ['OLLAMA_ORIGINS'] = '*'
    subprocess.Popen(["ollama", "serve"])

ollama_thread = threading.Thread(target=ollama)
ollama_thread.start()

!ollama pull llama3.1:8b
clear_output()
!pip install -U lightrag[ollama] geopy shapely

from lightrag.core.generator import Generator
from lightrag.core.component import Component
from lightrag.core.model_client import ModelClient
from lightrag.components.model_client import OllamaClient
from geopy.distance import geodesic
from shapely.geometry import Point, Polygon, LineString, shape, mapping
from shapely.validation import explain_validity, make_valid
from shapely.ops import unary_union
import time

# Enhanced template for GeoJSON anomaly detection and correction
geojson_anomaly_template = r"""<SYS>
You are an expert geospatial data analyst specializing in detecting and correcting anomalies in GeoJSON data.
Analyze the provided GeoJSON data and statistical analysis to identify potential anomalies and suggest corrections.

Focus on detecting:
1. Geometric anomalies (invalid coordinates, self-intersecting polygons, etc.)
2. Statistical outliers in properties/attributes
3. Spatial outliers (features in unexpected locations)
4. Data consistency issues
5. Missing or malformed data

Provide specific, actionable findings with coordinates and feature IDs where applicable.
Also suggest correction strategies for each identified issue.
</SYS>

Statistical Analysis:
{{stats}}

GeoJSON Data Summary:
{{geojson_summary}}

Detected Issues:
{{detected_issues}}

Applied Corrections:
{{corrections_applied}}

User Query: {{input_str}}

Analysis and Recommendations:"""

class GeoJSONAnomalyDetector(Component):
    def __init__(self, model_client: ModelClient, model_kwargs: dict):
        super().__init__()
        self.generator = Generator(
            model_client=model_client,
            model_kwargs=model_kwargs,
            template=geojson_anomaly_template,
        )
        self.corrections_log = []

    def analyze_geojson(self, geojson_data: Dict) -> Dict[str, Any]:
        """Perform statistical and geometric analysis on GeoJSON data"""
        issues = []
        stats = {}

        try:
            features = geojson_data.get('features', [])
            stats['total_features'] = len(features)

            # Analyze geometric properties
            coordinates_list = []
            properties_data = []
            geometry_types = []

            for i, feature in enumerate(features):
                # Geometry analysis
                geometry = feature.get('geometry', {})
                geom_type = geometry.get('type', '')
                geometry_types.append(geom_type)

                # Validate geometry using Shapely
                try:
                    shape_obj = shape(geometry)
                    if not shape_obj.is_valid:
                        issues.append(f"Invalid geometry in feature {i}: {explain_validity(shape_obj)}")
                except Exception as e:
                    issues.append(f"Geometry parsing error in feature {i}: {str(e)}")

                # Extract coordinates for analysis
                coords = geometry.get('coordinates', [])
                if coords:
                    flat_coords = self._flatten_coordinates(coords)
                    coordinates_list.extend(flat_coords)

                # Properties analysis
                properties = feature.get('properties', {})
                properties_data.append(properties)

            # Statistical analysis of coordinates
            if coordinates_list:
                coords_array = np.array(coordinates_list)
                if coords_array.shape[1] >= 2:
                    lons = coords_array[:, 0]
                    lats = coords_array[:, 1]

                    stats['longitude'] = {
                        'min': float(np.min(lons)),
                        'max': float(np.max(lons)),
                        'mean': float(np.mean(lons)),
                        'std': float(np.std(lons))
                    }
                    stats['latitude'] = {
                        'min': float(np.min(lats)),
                        'max': float(np.max(lats)),
                        'mean': float(np.mean(lats)),
                        'std': float(np.std(lats))
                    }

                    # Detect coordinate outliers
                    lon_outliers = np.abs(lons - np.mean(lons)) > 3 * np.std(lons)
                    lat_outliers = np.abs(lats - np.mean(lats)) > 3 * np.std(lats)

                    if np.any(lon_outliers):
                        outlier_indices = np.where(lon_outliers)[0]
                        issues.append(f"Longitude outliers detected at indices: {outlier_indices.tolist()}")

                    if np.any(lat_outliers):
                        outlier_indices = np.where(lat_outliers)[0]
                        issues.append(f"Latitude outliers detected at indices: {outlier_indices.tolist()}")

                    # Check for invalid coordinate ranges
                    if np.any(lons < -180) or np.any(lons > 180):
                        issues.append("Invalid longitude values found (outside -180 to 180 range)")

                    if np.any(lats < -90) or np.any(lats > 90):
                        issues.append("Invalid latitude values found (outside -90 to 90 range)")

            # Analyze geometry types
            stats['geometry_types'] = dict(pd.Series(geometry_types).value_counts())

            # Analyze properties
            if properties_data:
                props_df = pd.DataFrame(properties_data)
                stats['properties'] = {}

                for col in props_df.columns:
                    if props_df[col].dtype in ['int64', 'float64']:
                        col_stats = {
                            'mean': float(props_df[col].mean()) if not props_df[col].isna().all() else None,
                            'std': float(props_df[col].std()) if not props_df[col].isna().all() else None,
                            'min': float(props_df[col].min()) if not props_df[col].isna().all() else None,
                            'max': float(props_df[col].max()) if not props_df[col].isna().all() else None,
                            'null_count': int(props_df[col].isna().sum())
                        }
                        stats['properties'][col] = col_stats

                        # Detect statistical outliers in properties
                        if col_stats['std'] and col_stats['std'] > 0:
                            outliers = np.abs((props_df[col] - col_stats['mean']) / col_stats['std']) > 3
                            if outliers.any():
                                outlier_indices = props_df[outliers].index.tolist()
                                issues.append(f"Statistical outliers in property '{col}' at feature indices: {outlier_indices}")
                    else:
                        # Categorical analysis
                        unique_vals = props_df[col].nunique()
                        null_count = props_df[col].isna().sum()
                        stats['properties'][col] = {
                            'unique_values': int(unique_vals),
                            'null_count': int(null_count),
                            'type': 'categorical'
                        }

        except Exception as e:
            issues.append(f"Analysis error: {str(e)}")

        return {
            'statistics': stats,
            'issues': issues,
            'summary': f"Analyzed {stats.get('total_features', 0)} features with {len(issues)} potential issues detected."
        }

    def correct_geojson(self, geojson_data: Dict, correction_options: Dict = None) -> Dict[str, Any]:
        """
        Correct anomalies in GeoJSON data and return corrected version

        Args:
            geojson_data: Original GeoJSON data
            correction_options: Dictionary of correction preferences

        Returns:
            Dictionary containing corrected GeoJSON and correction log
        """
        if correction_options is None:
            correction_options = {
                'fix_invalid_geometries': True,
                'remove_coordinate_outliers': True,
                'fix_invalid_coordinates': True,
                'remove_property_outliers': False,  # Conservative default
                'fill_missing_properties': True,
                'standardize_properties': True
            }

        corrected_data = copy.deepcopy(geojson_data)
        corrections = []
        features_to_remove = []

        if 'features' not in corrected_data:
            corrected_data['features'] = []

        # Get statistics for outlier detection
        analysis = self.analyze_geojson(geojson_data)
        stats = analysis['statistics']

        for i, feature in enumerate(corrected_data['features']):
            feature_corrections = []

            # Fix geometry issues
            if correction_options.get('fix_invalid_geometries', True):
                geometry = feature.get('geometry', {})
                if geometry:
                    try:
                        shape_obj = shape(geometry)
                        if not shape_obj.is_valid:
                            # Try to fix invalid geometry
                            fixed_shape = make_valid(shape_obj)
                            if fixed_shape.is_valid:
                                corrected_data['features'][i]['geometry'] = mapping(fixed_shape)
                                feature_corrections.append(f"Fixed invalid geometry")
                            else:
                                features_to_remove.append(i)
                                feature_corrections.append(f"Removed unfixable invalid geometry")
                    except Exception as e:
                        features_to_remove.append(i)
                        feature_corrections.append(f"Removed geometry due to parsing error: {str(e)}")

            # Fix coordinate issues
            if correction_options.get('fix_invalid_coordinates', True):
                geometry = feature.get('geometry', {})
                coords = geometry.get('coordinates', [])
                if coords:
                    fixed_coords, coord_fixes = self._fix_coordinates(coords)
                    if coord_fixes:
                        corrected_data['features'][i]['geometry']['coordinates'] = fixed_coords
                        feature_corrections.extend(coord_fixes)

            # Handle coordinate outliers
            if correction_options.get('remove_coordinate_outliers', True) and 'longitude' in stats and 'latitude' in stats:
                geometry = feature.get('geometry', {})
                if geometry and self._is_coordinate_outlier(geometry, stats):
                    features_to_remove.append(i)
                    feature_corrections.append("Removed feature with outlier coordinates")

            # Fix property issues
            properties = feature.get('properties', {})
            if properties and correction_options.get('standardize_properties', True):
                fixed_props, prop_fixes = self._fix_properties(properties, stats.get('properties', {}), correction_options)
                if prop_fixes:
                    corrected_data['features'][i]['properties'] = fixed_props
                    feature_corrections.extend(prop_fixes)

            if feature_corrections:
                corrections.append(f"Feature {i}: {'; '.join(feature_corrections)}")

        # Remove flagged features (in reverse order to maintain indices)
        for idx in sorted(set(features_to_remove), reverse=True):
            del corrected_data['features'][idx]
            corrections.append(f"Removed feature {idx}")

        # Add metadata about corrections
        if 'properties' not in corrected_data:
            corrected_data['properties'] = {}

        corrected_data['properties']['correction_metadata'] = {
            'corrected_at': datetime.now().isoformat(),
            'original_feature_count': len(geojson_data.get('features', [])),
            'corrected_feature_count': len(corrected_data.get('features', [])),
            'corrections_applied': len(corrections),
            'correction_options': correction_options
        }

        return {
            'corrected_geojson': corrected_data,
            'corrections_log': corrections,
            'correction_summary': f"Applied {len(corrections)} corrections, removed {len(features_to_remove)} features"
        }

    def _fix_coordinates(self, coords) -> Tuple[List, List]:
        """Fix coordinate arrays recursively"""
        fixes = []

        def fix_coord_array(arr):
            if not isinstance(arr, (list, tuple)):
                return arr

            # If this looks like a coordinate pair [lon, lat]
            if len(arr) >= 2 and all(isinstance(x, (int, float)) for x in arr[:2]):
                lon, lat = arr[0], arr[1]
                fixed_lon, fixed_lat = lon, lat

                # Fix longitude range
                if lon < -180:
                    fixed_lon = -180
                    fixes.append(f"Clamped longitude {lon} to -180")
                elif lon > 180:
                    fixed_lon = 180
                    fixes.append(f"Clamped longitude {lon} to 180")

                # Fix latitude range
                if lat < -90:
                    fixed_lat = -90
                    fixes.append(f"Clamped latitude {lat} to -90")
                elif lat > 90:
                    fixed_lat = 90
                    fixes.append(f"Clamped latitude {lat} to 90")

                result = [fixed_lon, fixed_lat]
                if len(arr) > 2:
                    result.extend(arr[2:])  # Preserve additional dimensions
                return result
            else:
                # Recursively fix nested arrays
                return [fix_coord_array(item) for item in arr]

        fixed_coords = fix_coord_array(coords)
        return fixed_coords, fixes

    def _is_coordinate_outlier(self, geometry: Dict, stats: Dict) -> bool:
        """Check if geometry contains outlier coordinates"""
        try:
            coords = geometry.get('coordinates', [])
            flat_coords = self._flatten_coordinates(coords)

            if not flat_coords or 'longitude' not in stats or 'latitude' not in stats:
                return False

            lon_mean, lon_std = stats['longitude']['mean'], stats['longitude']['std']
            lat_mean, lat_std = stats['latitude']['mean'], stats['latitude']['std']

            for coord in flat_coords:
                if len(coord) >= 2:
                    lon, lat = coord[0], coord[1]

                    # Check if coordinate is more than 3 standard deviations away
                    if (abs(lon - lon_mean) > 3 * lon_std or
                        abs(lat - lat_mean) > 3 * lat_std):
                        return True

            return False
        except:
            return False

    def _fix_properties(self, properties: Dict, prop_stats: Dict, options: Dict) -> Tuple[Dict, List]:
        """Fix property issues"""
        fixed_props = copy.deepcopy(properties)
        fixes = []

        for key, value in properties.items():
            if key in prop_stats:
                stat_info = prop_stats[key]

                # Handle missing values
                if pd.isna(value) and options.get('fill_missing_properties', True):
                    if stat_info.get('type') != 'categorical' and stat_info.get('mean') is not None:
                        fixed_props[key] = stat_info['mean']
                        fixes.append(f"Filled missing {key} with mean value")
                    elif stat_info.get('type') == 'categorical':
                        fixed_props[key] = "Unknown"
                        fixes.append(f"Filled missing {key} with 'Unknown'")

                # Handle outliers in numeric properties
                elif (options.get('remove_property_outliers', False) and
                      stat_info.get('type') != 'categorical' and
                      stat_info.get('std') and stat_info.get('mean') is not None):

                    if isinstance(value, (int, float)):
                        z_score = abs((value - stat_info['mean']) / stat_info['std'])
                        if z_score > 3:
                            # Replace with median or mean
                            fixed_props[key] = stat_info['mean']
                            fixes.append(f"Replaced outlier {key} value {value} with mean")

        return fixed_props, fixes

    def _flatten_coordinates(self, coords):
        """Recursively flatten coordinate arrays"""
        result = []

        def _flatten(arr):
            for item in arr:
                if isinstance(item, (list, tuple)):
                    if len(item) >= 2 and all(isinstance(x, (int, float)) for x in item[:2]):
                        result.append(item[:2])  # Take only lon, lat
                    else:
                        _flatten(item)

        _flatten(coords)
        return result

    def call(self, input_data: Dict) -> str:
        geojson_data = input_data.get('geojson', {})
        query = input_data.get('query', 'Analyze this GeoJSON data for anomalies')

        # Perform analysis
        analysis = self.analyze_geojson(geojson_data)

        # Apply corrections if requested
        correction_result = None
        if input_data.get('apply_corrections', False):
            correction_options = input_data.get('correction_options', {})
            correction_result = self.correct_geojson(geojson_data, correction_options)

        # Prepare template variables
        template_vars = {
            'input_str': query,
            'stats': json.dumps(analysis['statistics'], indent=2),
            'geojson_summary': analysis['summary'],
            'detected_issues': '\n'.join([f"- {issue}" for issue in analysis['issues']]),
            'corrections_applied': (
                '\n'.join([f"- {correction}" for correction in correction_result['corrections_log']])
                if correction_result else "No corrections applied"
            )
        }

        return self.generator.call(template_vars)

    async def acall(self, input_data: Dict) -> str:
        geojson_data = input_data.get('geojson', {})
        query = input_data.get('query', 'Analyze this GeoJSON data for anomalies')

        # Perform analysis
        analysis = self.analyze_geojson(geojson_data)

        # Apply corrections if requested
        correction_result = None
        if input_data.get('apply_corrections', False):
            correction_options = input_data.get('correction_options', {})
            correction_result = self.correct_geojson(geojson_data, correction_options)

        # Prepare template variables
        template_vars = {
            'input_str': query,
            'stats': json.dumps(analysis['statistics'], indent=2),
            'geojson_summary': analysis['summary'],
            'detected_issues': '\n'.join([f"- {issue}" for issue in analysis['issues']]),
            'corrections_applied': (
                '\n'.join([f"- {correction}" for correction in correction_result['corrections_log']])
                if correction_result else "No corrections applied"
            )
        }

        return await self.generator.acall(template_vars)

# Initialize the anomaly detector
from lightrag.components.model_client import OllamaClient
from IPython.display import Markdown, display

model = {
    "model_client": OllamaClient(),
    "model_kwargs": {"model": "llama3.1:8b"}
}

anomaly_detector = GeoJSONAnomalyDetector(**model)

def detect_and_correct_geojson_anomalies(geojson_data, custom_query=None, apply_corrections=True, correction_options=None):
    """
    Detect and optionally correct anomalies in GeoJSON data

    Args:
        geojson_data: Dict containing GeoJSON data
        custom_query: Optional custom query for specific analysis
        apply_corrections: Whether to apply automatic corrections
        correction_options: Dictionary of correction preferences

    Returns:
        Dictionary containing analysis results and corrected data (if requested)
    """
    if correction_options is None:
        correction_options = {
            'fix_invalid_geometries': True,
            'remove_coordinate_outliers': True,
            'fix_invalid_coordinates': True,
            'remove_property_outliers': False,
            'fill_missing_properties': True,
            'standardize_properties': True
        }

    query = custom_query or "Please analyze this GeoJSON data and identify any anomalies, outliers, or data quality issues. Focus on geometric validity, coordinate accuracy, and statistical outliers in properties."

    input_data = {
        'geojson': geojson_data,
        'query': query,
        'apply_corrections': apply_corrections,
        'correction_options': correction_options
    }

    # Get AI analysis
    result = anomaly_detector(input_data)
    display(Markdown(f"**GeoJSON Anomaly Analysis:**\n\n{result.data}"))

    # Get corrected data if requested
    corrected_result = None
    if apply_corrections:
        corrected_result = anomaly_detector.correct_geojson(geojson_data, correction_options)

        print(f"\n{'='*60}")
        print("CORRECTION SUMMARY:")
        print(f"{'='*60}")
        print(f"Original features: {len(geojson_data.get('features', []))}")
        print(f"Corrected features: {len(corrected_result['corrected_geojson'].get('features', []))}")
        print(f"Corrections applied: {len(corrected_result['corrections_log'])}")

        if corrected_result['corrections_log']:
            print(f"\nDetailed corrections:")
            for i, correction in enumerate(corrected_result['corrections_log'][:10], 1):  # Show first 10
                print(f"{i}. {correction}")

            if len(corrected_result['corrections_log']) > 10:
                print(f"... and {len(corrected_result['corrections_log']) - 10} more corrections")

    return {
        'analysis': result,
        'corrected_data': corrected_result,
        'original_data': geojson_data
    }

def save_corrected_geojson(correction_result, filename="corrected_data.geojson"):
    """Save corrected GeoJSON data to file"""
    if correction_result and 'corrected_data' in correction_result and correction_result['corrected_data']:
        corrected_geojson = correction_result['corrected_data']['corrected_geojson']

        with open(filename, 'w') as f:
            json.dump(corrected_geojson, f, indent=2)

        print(f"Corrected GeoJSON saved to: {filename}")
        return filename
    else:
        print("No corrected data available to save")
        return None

# Load and analyze your GeoJSON data
print("Loading MunshiNagarData.geojson...")

# Load from file
with open('MunshiNagarData.geojson', 'r') as f:
    your_geojson_data = json.load(f)

print(f"Loaded GeoJSON with {len(your_geojson_data.get('features', []))} features")

# Run anomaly detection and correction
print("Running comprehensive anomaly detection and correction...")
result = detect_and_correct_geojson_anomalies(
    your_geojson_data,
    custom_query="Analyze this Munshi Nagar GeoJSON data for any anomalies and apply appropriate corrections. Focus on geometric validity, coordinate accuracy, and data consistency.",
    apply_corrections=True
)

# Save corrected data
if result['corrected_data']:
    corrected_filename = save_corrected_geojson(result, "MunshiNagarData_corrected.geojson")
    print(f"\nCorrected GeoJSON data has been saved as: {corrected_filename}")

print("\n" + "="*80)
print("Enhanced GeoJSON Anomaly Detection and Correction system is ready!")
print("Available functions:")
print("- detect_and_correct_geojson_anomalies(data, query, apply_corrections, options)")
print("- save_corrected_geojson(result, filename)")
print("="*80)

Collecting shapely
  Downloading shapely-2.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting lightrag[ollama]
  Downloading lightrag-0.1.0b6-py3-none-any.whl.metadata (14 kB)
Collecting backoff<3.0.0,>=2.2.1 (from lightrag[ollama])
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Collecting jsonlines<5.0.0,>=4.0.0 (from lightrag[ollama])
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Collecting numpy<2.0.0,>=1.26.4 (from lightrag[ollama])
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ollama<0.3.0,>=0.2.1 (from lightrag[ollama])
  Downloading ollama-0.2.1-py3-none-any.whl.metadata (4.2 kB)
Collecting python-dotenv<2.0.0,>=1.0.1 (from lightrag[ollama])
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collec

INFO:backoff:Backing off call(...) for 0.1s (ollama._types.ResponseError: model 'llama3.1:8b' not found)


Loading MunshiNagarData.geojson...
Loaded GeoJSON with 212 features
Running comprehensive anomaly detection and correction...


INFO:backoff:Backing off call(...) for 0.2s (ollama._types.ResponseError: model 'llama3.1:8b' not found)
INFO:backoff:Backing off call(...) for 2.3s (ollama._types.ResponseError: model 'llama3.1:8b' not found)
INFO:backoff:Backing off call(...) for 2.4s (ollama._types.ResponseError: model 'llama3.1:8b' not found)
ERROR:backoff:Giving up call(...) after 5 tries (ollama._types.ResponseError: model 'llama3.1:8b' not found)
ERROR:lightrag.core.generator:Error calling the model: model 'llama3.1:8b' not found


**GeoJSON Anomaly Analysis:**

None


CORRECTION SUMMARY:
Original features: 212
Corrected features: 210
Corrections applied: 5

Detailed corrections:
1. Feature 0: Removed geometry due to parsing error: 'float' object is not iterable
2. Feature 1: Fixed invalid geometry
3. Feature 2: Fixed invalid geometry; Clamped latitude 90.65 to 90; Removed feature with outlier coordinates
4. Removed feature 2
5. Removed feature 0
Corrected GeoJSON saved to: MunshiNagarData_corrected.geojson

Corrected GeoJSON data has been saved as: MunshiNagarData_corrected.geojson

Enhanced GeoJSON Anomaly Detection and Correction system is ready!
Available functions:
- detect_and_correct_geojson_anomalies(data, query, apply_corrections, options)
- save_corrected_geojson(result, filename)


Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25
