# Processing eBird data with GeoPandas and vectorization for performance

In [None]:
!pip install geopandas shapely
!pip install --upgrade pandas numpy

Collecting pandas
  Downloading pandas-2.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting numpy
  Downloading numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Downloading pandas-2.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m79.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.8/16.8 MB[0m [31m82.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy, pandas
  Attempting uninstall: numpy
    Foun

In [None]:
# required imports
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Point, LineString
from shapely.ops import nearest_points
import logging
from typing import Optional
import warnings
warnings.filterwarnings('ignore')

In [None]:
import zipfile
import os
from datetime import datetime

In [None]:
# Mount Google Drive so the Capstone inputs/outputs under /content/drive/ are reachable.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Location on Drive of the zipped eBird export; its tab-separated .txt member is
# extracted and loaded in the following cells.
zip_path = '/content/drive/MyDrive/Capstone/ebd_US-DC-001_201901_202505_relApr-2025.zip'

TODO: add code to list the file names contained in the archive at `zip_path`.

In [None]:
# Unpack only the observations text file from the archive into a local working folder.
extract_to = "extracted_files"
with zipfile.ZipFile(zip_path) as zip_ref:  # default mode is read-only
    zip_ref.extract(
        member="ebd_US-DC-001_201901_202505_relApr-2025.txt",
        path=extract_to,
    )

In [None]:
# Load the extracted file.
# Only the columns used by the rest of the notebook are read (`usecols`): this keeps
# memory down on a ~2M-row file and guarantees that the frame saved in the next cell
# is already the reduced 12-column version, regardless of cell execution order.
# The path is built from `extract_to` so it always matches where the archive member
# was actually extracted.
ebird_dc_df = pd.read_csv(
    os.path.join(extract_to, "ebd_US-DC-001_201901_202505_relApr-2025.txt"),
    sep='\t',
    usecols=[
        'COMMON NAME', 'SCIENTIFIC NAME', 'OBSERVATION COUNT', 'COUNTRY', 'STATE', 'COUNTY',
        'LOCALITY', 'LATITUDE', 'LONGITUDE', 'OBSERVATION DATE',
        'TIME OBSERVATIONS STARTED', 'DURATION MINUTES',
    ],
)

# # Assign zone index to each detection
# ebird_dc_df["zone_index"] = ebird_dc_df.apply(
#     lambda row: assign_zone_to_point(row["LONGITUDE"], row["LATITUDE"], zones), axis=1
# )

# # Save to CSV
# ebird_dc_df.to_csv("/content/drive/MyDrive/Capstone/ebd_US-DC-001_201901_202505_relApr-2025.csv", index=False)


In [None]:
# Write the observations frame to Drive (without the index column). This path is the
# default `input_file` of OptimizedBatchProcessor.
ebird_dc_df.to_csv(
    path_or_buf="/content/drive/MyDrive/Capstone/filtered_ebd_US-DC-001_201901_202505.csv",
    index=False,
)

In [None]:
# (rows, columns) of the loaded observations frame.
ebird_dc_df.shape

(2003460, 12)

In [None]:
# Columns retained for the I-95 analysis; every other column is dropped.
columns_to_keep = [
    'COMMON NAME',
    'SCIENTIFIC NAME',
    'OBSERVATION COUNT',
    'COUNTRY',
    'STATE',
    'COUNTY',
    'LOCALITY',
    'LATITUDE',
    'LONGITUDE',
    'OBSERVATION DATE',
    'TIME OBSERVATIONS STARTED',
    'DURATION MINUTES',
]
# Label-based selection of all rows and exactly the listed columns, in this order.
ebird_dc_df = ebird_dc_df.loc[:, columns_to_keep]

## I-95 coordinates processing

In [None]:
# Read the I-95 waypoints and put them in route order before pairing the coordinates.
i95_coordinates = pd.read_csv('/content/drive/MyDrive/Capstone/i95_modified.csv')
i95_sorted = i95_coordinates.sort_values(by='Overall_Sequence')
# List of (latitude, longitude) tuples following the sorted sequence.
i95_coords = [
    (lat, lon)
    for lat, lon in zip(i95_sorted['Latitude'], i95_sorted['Longitude'])
]

In [None]:
# Configure logging once for the notebook; all pipeline messages go through `logger`.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class OptimizedBatchProcessor:
    """
    Chunked pipeline that annotates eBird observations with their distance to I-95.

    The input CSV is read in chunks of ``batch_size`` rows. For every chunk, rows with
    missing, non-numeric or out-of-range coordinates are discarded, the remaining
    points are projected to ``projected_crs`` and their distance (in miles) to the
    I-95 polyline is computed in one vectorized GeoPandas call. The distance is stored
    in a new ``i95_distance`` column, optionally filtered by ``distance_threshold``,
    and all surviving rows are written to ``output_file``.
    """

    # Exact number of meters in one international mile.
    METERS_PER_MILE = 1609.344

    # Inclusive coordinate bounds (roughly the continental US) used as a sanity filter.
    LATITUDE_BOUNDS = (24, 50)
    LONGITUDE_BOUNDS = (-130, -65)

    def __init__(self,
                 input_file: str = "/content/drive/MyDrive/Capstone/filtered_ebd_US-DC-001_201901_202505.csv",
                 output_file: str = "/content/drive/MyDrive/Capstone/filtered_DC.csv",
                 batch_size: int = 100000,
                 distance_threshold: Optional[float] = None,
                 i95_coords: Optional[list] = None,
                 projected_crs: str = 'EPSG:5070'):
        """
        Args:
            input_file: CSV with at least ``LATITUDE`` and ``LONGITUDE`` columns.
            output_file: Destination CSV for the annotated rows.
            batch_size: Number of CSV rows processed per chunk.
            distance_threshold: Maximum distance to I-95 in miles; ``None`` keeps
                every row with valid coordinates.
            i95_coords: Ordered ``(latitude, longitude)`` pairs describing I-95.
            projected_crs: Projected CRS (meter units) in which distances are
                measured. The default, EPSG:5070 (NAD83 / Conus Albers), keeps
                distance distortion small across the continental US. Web Mercator
                (EPSG:3857) is not suitable here: it inflates ground distances by
                about 1/cos(latitude), i.e. roughly 28% around 39 degrees north.

        Raises:
            ValueError: If fewer than two I-95 coordinates are supplied.
        """
        self.input_file = input_file
        self.output_file = output_file
        self.batch_size = batch_size
        self.distance_threshold = distance_threshold
        self.projected_crs = projected_crs

        # Build the projected highway geometry once; it is reused for every batch.
        self._setup_highway_geometry(i95_coords)

        # Statistics tracking
        self._reset_statistics()

    def _reset_statistics(self) -> None:
        """Zero the per-run counters reported by ``run_pipeline``."""
        self.total_rows_processed = 0
        self.total_rows_saved = 0
        self.batch_count = 0

    def _setup_highway_geometry(self, i95_coords):
        """
        Build the I-95 polyline and project it to ``self.projected_crs``.

        Args:
            i95_coords: Iterable of ``(latitude, longitude)`` pairs in route order.

        Raises:
            ValueError: If no coordinates are given, or fewer than the two points
                needed to form a line.
        """
        # Materialize first so generators and arrays are handled like lists.
        coords = list(i95_coords) if i95_coords is not None else []
        if not coords:
            raise ValueError("I-95 coordinates must be provided")
        if len(coords) < 2:
            raise ValueError("At least two I-95 coordinates are required to build a line")

        logger.info("Setting up highway geometry with spatial indexing...")

        # Shapely expects (x, y) == (longitude, latitude), so swap each input pair.
        self.highway_line = LineString([(lon, lat) for lat, lon in coords])

        # Wrap in a GeoDataFrame so the geographic CRS is attached for reprojection.
        highway_gdf = gpd.GeoDataFrame(geometry=[self.highway_line], crs='EPSG:4326')

        # Reproject to the metric CRS used for all distance measurements.
        self.highway_gdf_projected = highway_gdf.to_crs(self.projected_crs)
        self.highway_line_projected = self.highway_gdf_projected.geometry.iloc[0]

        logger.info("Highway geometry setup complete")

    def calculate_distances_vectorized(self, obs_gdf: "gpd.GeoDataFrame") -> np.ndarray:
        """
        Distance in miles from every observation point to the I-95 polyline.

        Args:
            obs_gdf: Point geometries with a CRS set (EPSG:4326 in this pipeline).

        Returns:
            1-D array of distances in miles, in the same row order as ``obs_gdf``.
        """
        # Observations must be in the same projected CRS as the highway line.
        obs_projected = obs_gdf.to_crs(self.projected_crs)

        # One vectorized call covers all points; the result is in CRS units (meters).
        distances_meters = obs_projected.geometry.distance(self.highway_line_projected)

        return (distances_meters / self.METERS_PER_MILE).to_numpy()

    def process_batch_optimized(self, batch_df: pd.DataFrame) -> pd.DataFrame:
        """
        Clean one chunk and attach the ``i95_distance`` column (miles).

        Rows are dropped when LATITUDE/LONGITUDE is missing, not numeric, or outside
        the class-level coordinate bounds. When ``distance_threshold`` is set, rows
        farther from I-95 than the threshold are dropped as well.

        Args:
            batch_df: Chunk containing ``LATITUDE`` and ``LONGITUDE`` columns.

        Returns:
            The surviving rows plus ``i95_distance``; an empty, column-less DataFrame
            when no row has usable coordinates.
        """
        # Filter out rows with missing coordinates early.
        valid_coords = batch_df.dropna(subset=['LATITUDE', 'LONGITUDE'])

        if len(valid_coords) == 0:
            logger.warning("No valid coordinates in batch")
            return pd.DataFrame()

        # Coerce to numbers on a copy; unparsable values become NaN and fail the
        # range check below because between() is False for NaN.
        valid_coords = valid_coords.copy()
        valid_coords['LATITUDE'] = pd.to_numeric(valid_coords['LATITUDE'], errors='coerce')
        valid_coords['LONGITUDE'] = pd.to_numeric(valid_coords['LONGITUDE'], errors='coerce')

        in_bounds = (
            valid_coords['LATITUDE'].between(*self.LATITUDE_BOUNDS) &
            valid_coords['LONGITUDE'].between(*self.LONGITUDE_BOUNDS)
        )
        valid_coords = valid_coords[in_bounds]

        if len(valid_coords) == 0:
            logger.warning("No valid coordinates after filtering")
            return pd.DataFrame()

        # Build all point geometries in one vectorized call (x=longitude, y=latitude).
        geometry = gpd.points_from_xy(valid_coords['LONGITUDE'], valid_coords['LATITUDE'])
        obs_gdf = gpd.GeoDataFrame(valid_coords, geometry=geometry, crs='EPSG:4326')

        distances = self.calculate_distances_vectorized(obs_gdf)

        # Attach distances positionally; the geometry column is not carried over.
        result_df = valid_coords.assign(i95_distance=distances)

        # Apply distance filter if specified
        if self.distance_threshold is not None:
            result_df = result_df[result_df['i95_distance'] <= self.distance_threshold]

        return result_df

    def _process_one_batch(self, batch_df: pd.DataFrame) -> pd.DataFrame:
        """Process one chunk, updating the run counters and logging row counts."""
        self.batch_count += 1
        self.total_rows_processed += len(batch_df)

        logger.info(f"Processing batch {self.batch_count} with {len(batch_df)} rows")

        processed_batch = self.process_batch_optimized(batch_df)
        self.total_rows_saved += len(processed_batch)

        logger.info(f"Batch {self.batch_count}: {len(batch_df)} -> {len(processed_batch)} rows")

        return processed_batch

    def run_pipeline(self) -> dict:
        """
        Read ``input_file`` in chunks, process each chunk and write ``output_file``.

        Counters are reset at the start, so the returned statistics always describe
        this run only, even when the same processor is run more than once. No output
        file is written when no row survives.

        Returns:
            Dict with ``total_rows_processed``, ``total_rows_saved``,
            ``total_batches``, ``output_file`` and ``filter_efficiency`` (percentage
            of processed rows that were saved; 0 when nothing was processed).

        Raises:
            Exception: Any error from reading, processing or writing is logged and
                re-raised unchanged.
        """
        self._reset_statistics()

        logger.info(f"Starting optimized pipeline: {self.input_file}")
        logger.info(f"Batch size: {self.batch_size}")
        logger.info(f"Distance threshold: {self.distance_threshold}")

        all_processed_data = []

        try:
            # The context manager closes the underlying file handle even on error.
            with pd.read_csv(self.input_file, chunksize=self.batch_size) as chunk_iter:
                for chunk in chunk_iter:
                    processed_chunk = self._process_one_batch(chunk)
                    if not processed_chunk.empty:
                        all_processed_data.append(processed_chunk)

            # Combine and save results
            if all_processed_data:
                final_df = pd.concat(all_processed_data, ignore_index=True)
                final_df.to_csv(self.output_file, index=False)
                logger.info(f"Saved {len(final_df)} rows to {self.output_file}")
            else:
                logger.warning("No data to save")

        except Exception as e:
            logger.error(f"Pipeline error: {str(e)}")
            raise

        # Calculate statistics
        stats = {
            'total_rows_processed': self.total_rows_processed,
            'total_rows_saved': self.total_rows_saved,
            'total_batches': self.batch_count,
            'output_file': self.output_file,
            'filter_efficiency': (self.total_rows_saved / self.total_rows_processed * 100)
                                if self.total_rows_processed > 0 else 0
        }

        logger.info("Optimized pipeline completed!")
        logger.info(f"Statistics: {stats}")
        return stats


In [None]:
# Pipeline input/output locations as plain strings. There must be no trailing comma
# after the literals: a trailing comma would bind a 1-tuple instead of a str.
input_file: str = "/content/drive/MyDrive/Capstone/filtered_ebd_US-DC-001_201901_202505.csv"
output_file: str = "/content/drive/MyDrive/Capstone/filtered_DC.csv"

In [None]:

if __name__ == "__main__":
    # Keep observations whose computed I-95 distance is at most 25 (miles) and write
    # the annotated rows to Drive, reading the input in chunks of 100,000 rows.
    processor = OptimizedBatchProcessor(
        i95_coords=i95_coords,
        distance_threshold=25,
        batch_size=100_000,
        output_file="/content/drive/MyDrive/Capstone/filtered_DC.csv",
        input_file="/content/drive/MyDrive/Capstone/filtered_ebd_US-DC-001_201901_202505.csv",
    )

    # run_pipeline returns the summary statistics dict for the run.
    results = processor.run_pipeline()
    print("Optimized processing complete!", f"Results: {results}", sep="\n")

Optimized processing complete!
Results: {'total_rows_processed': 2003460, 'total_rows_saved': 2003460, 'total_batches': 21, 'output_file': '/content/drive/MyDrive/Capstone/filtered_DC.csv', 'filter_efficiency': 100.0}


In [None]:
# Read the pipeline output back from Drive to inspect what was written.
output_df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Capstone/filtered_DC.csv"
)

In [None]:
# (rows, columns) of the pipeline output read back from Drive.
output_df.shape

(2003460, 13)

In [None]:
# Column labels of the pipeline output; 'i95_distance' is the column the pipeline adds.
output_df.columns

Index(['COMMON NAME', 'SCIENTIFIC NAME', 'OBSERVATION COUNT', 'COUNTRY',
       'STATE', 'COUNTY', 'LOCALITY', 'LATITUDE', 'LONGITUDE',
       'OBSERVATION DATE', 'TIME OBSERVATIONS STARTED', 'DURATION MINUTES',
       'i95_distance'],
      dtype='object')

In [None]:
# Frequency of each distinct I-95 distance value, most common first.
output_df["i95_distance"].value_counts()

Unnamed: 0_level_0,count
i95_distance,Unnamed: 1_level_1
0.086041,314712
1.715896,205746
0.010148,147701
0.059427,137707
0.011724,84115
...,...
0.029660,1
0.045132,1
0.055755,1
0.001420,1
