In [None]:
# Updated notebook using the new modular structure
# preprocessing_v10.ipynb

# Cell 1: Imports
import pandas as pd
import numpy as np
import os
import sys

# Make the package under ../src importable from this notebook.
# Adjust the relative location if the package moves.
project_root = os.path.abspath(os.path.join(os.pardir, "src"))
if sys.path.count(project_root) == 0:
    # Register the directory only once so re-running the cell does not pile up duplicates.
    sys.path.append(project_root)

# Import the new modular package
from data_processor import process_transit_data, TransitDataProcessor, FlexibleDateProcessor

# # Cell 2: One-shot processing through the convenience wrapper (disabled)
# date = '20131018'
data_folder = "../data/processed/"

# # A single call that stands in for the individual per-step function calls
# result = process_transit_data(date, data_folder, save_data=True)

# # Cell 3: Pull the individual tables out of the result for follow-up analysis (disabled)
# shapes_df = result['shapes']
# routes_df = result['routes']
# route_versions_df = result['route_versions']
# shape_variants_df = result['shape_variants']
# shape_variant_activations_df = result['shape_variant_activations']
# temporary_changes_df = result['temporary_changes']

# print(f"Routes: {routes_df.shape}")
# print(f"Route versions: {route_versions_df.shape}")
# print(f"Shape variants: {shape_variants_df.shape}")
# print(f"Shape variant activations: {shape_variant_activations_df.shape}")

# Cell 4: Class-based interface, for more control over each run
dates_to_process = [
    "20131018",
    "20131021",
    "20131025",
]
processor = TransitDataProcessor(data_folder)

# # Dry run that skips saving (handy while testing)
# result = processor.process_date(date, save_data=False)

# Batch run: handle every listed date in order, saving after each one.
for date in dates_to_process:
    print(f"\nProcessing {date}...")
    processor.process_date(date, save_data=True)

# Cell 5: Individual function usage (if you need more granular control)
from data_processor import (
    load_gtfs_data, load_processed_data,
    build_service_date_mappings, build_latest_routes
)

# Bind the feed date explicitly instead of relying on the value the batch loop
# above happened to leave in `date`. A `for` loop target stays bound after the
# loop ends, so the old code silently depended on that loop having run.
# The value is unchanged: the last entry of `dates_to_process`.
date = dates_to_process[-1]

# Load the raw GTFS tables for that date, plus the already-processed tables on disk.
routes_txt, trips_txt, shapes_txt, calendar_txt, calendar_dates_txt = load_gtfs_data(date)
processed_data = load_processed_data(data_folder)

# Build the mappings derived from trips and the service calendar.
trip_dates, trip_first_date = build_service_date_mappings(trips_txt, calendar_txt)

# Continue with individual processing steps as needed...


Processing 20131018...
Processing transit data for date: 20131018
1. Loading GTFS data...
2. Loading existing processed data...
3. Building service date mappings...
4. Processing routes...
No duplicate route_id found in routes_df.
5. Processing route versions...
6. Processing shape variants...
Removed 7629 duplicate rows where only exception_type differed (NaN vs non-NaN).
Updated shape_variants_df:
Shape: (1544, 6)

Updated shape_variant_activations_df:
Shape: (61329, 3)

Summary:
Total unique shape variants: 1544
Total shape variant activations: 61329
New variants added: 49
Shape variant IDs added: 101495 - 101543
New activations added: 1747
7. Saving processed data...
routes_df saved to ../data/processed/routes.csv
route_versions_df saved to ../data/processed/route_versions.csv
shape_variants_df saved to ../data/processed/shape_variants.csv
shape_variant_activations_df saved to ../data/processed/shape_variant_activations.csv
Processing completed successfully!

Processing 20131021..

  trips_df = pd.read_csv(trips_path)


2. Loading existing processed data...
3. Building service date mappings...
4. Processing routes...
No duplicate route_id found in routes_df.
5. Processing route versions...
6. Processing shape variants...
Removed 8899 duplicate rows where only exception_type differed (NaN vs non-NaN).
Updated shape_variants_df:
Shape: (1580, 6)

Updated shape_variant_activations_df:
Shape: (62380, 3)

Summary:
Total unique shape variants: 1580
Total shape variant activations: 62380
New variants added: 36
Shape variant IDs added: 101544 - 101579
New activations added: 1051
7. Saving processed data...
routes_df saved to ../data/processed/routes.csv
route_versions_df saved to ../data/processed/route_versions.csv
shape_variants_df saved to ../data/processed/shape_variants.csv
shape_variant_activations_df saved to ../data/processed/shape_variant_activations.csv
Processing completed successfully!

Processing 20131025...
Processing transit data for date: 20131025
1. Loading GTFS data...


  trips_df = pd.read_csv(trips_path)


2. Loading existing processed data...
3. Building service date mappings...
4. Processing routes...
No duplicate route_id found in routes_df.
5. Processing route versions...
6. Processing shape variants...
Removed 10729 duplicate rows where only exception_type differed (NaN vs non-NaN).
Updated shape_variants_df:
Shape: (1606, 6)

Updated shape_variant_activations_df:
Shape: (63494, 3)

Summary:
Total unique shape variants: 1606
Total shape variant activations: 63494
New variants added: 26
Shape variant IDs added: 101580 - 101605
New activations added: 1114
7. Saving processed data...
routes_df saved to ../data/processed/routes.csv
route_versions_df saved to ../data/processed/route_versions.csv
shape_variants_df saved to ../data/processed/shape_variants.csv
shape_variant_activations_df saved to ../data/processed/shape_variant_activations.csv
Processing completed successfully!


  trips_df = pd.read_csv(trips_path)


In [1]:
import os
import sys

# Make the package under ../src importable from this notebook.
# Adjust the relative location if the package moves.
project_root = os.path.abspath(os.path.join(os.pardir, "src"))
if sys.path.count(project_root) == 0:
    # Register the directory only once so re-running the cell does not pile up duplicates.
    sys.path.append(project_root)

In [3]:
from data_processor import FlexibleDateProcessor
# Processor rooted at the processed-data folder.
processor = FlexibleDateProcessor("../data/processed/")

# Alternative call shapes, kept for reference (disabled):
# # a single date, named after the raw data folder
# processor.process_dates('20131018')
# # seven days counted from a start date
# processor.process_dates({'start': '20131018', 'days': 7})

# Run the full month of October 2013 and keep the result, keyed by date, for inspection.
processing_result = processor.process_dates(
    dict(start="20131001", end="20131031")
)

Processing 31 date(s): 20131001 to 20131031

--- Processing 20131001 (1/31) ---
Processing transit data for date: 20131001
1. Loading GTFS data...
✗ Failed to process 20131001: [Errno 2] No such file or directory: '../data/raw/20131001\\routes.txt'

--- Processing 20131002 (2/31) ---
Processing transit data for date: 20131002
1. Loading GTFS data...
✗ Failed to process 20131002: [Errno 2] No such file or directory: '../data/raw/20131002\\routes.txt'

--- Processing 20131003 (3/31) ---
Processing transit data for date: 20131003
1. Loading GTFS data...
✗ Failed to process 20131003: [Errno 2] No such file or directory: '../data/raw/20131003\\routes.txt'

--- Processing 20131004 (4/31) ---
Processing transit data for date: 20131004
1. Loading GTFS data...
✗ Failed to process 20131004: [Errno 2] No such file or directory: '../data/raw/20131004\\routes.txt'

--- Processing 20131005 (5/31) ---
Processing transit data for date: 20131005
1. Loading GTFS data...
✗ Failed to process 20131005: [Er

  updated_shapes_df = pd.concat([shapes_df, missing_shapes], ignore_index=True)


Added 210973 shape records to shapes_df.
New shapes_df shape: (210973, 6)

=== After update Shapes Summary ===
Total shape records: 210,973
Unique shapes: 1,151
Average points per shape: 183.3
Points per shape range: 6 - 957
8. Saving processed data...
shapes_df saved to ../data/processed/shapes.csv
routes_df saved to ../data/processed/routes.csv
route_versions_df saved to ../data/processed/route_versions.csv
shape_variants_df saved to ../data/processed/shape_variants.csv
shape_variant_activations_df saved to ../data/processed/shape_variant_activations.csv
Processing completed successfully!
✓ Successfully processed 20131018

--- Processing 20131019 (19/31) ---
Processing transit data for date: 20131019
1. Loading GTFS data...
✗ Failed to process 20131019: [Errno 2] No such file or directory: '../data/raw/20131019\\routes.txt'

--- Processing 20131020 (20/31) ---
Processing transit data for date: 20131020
1. Loading GTFS data...
✗ Failed to process 20131020: [Errno 2] No such file or di

  trips_df = pd.read_csv(trips_path)


2. Loading existing processed data...
3. Building service date mappings...
4. Processing routes...
No duplicate route_id found in routes_df.
5. Processing route versions...
6. Processing shape variants...
Removed 8899 duplicate rows where only exception_type differed (NaN vs non-NaN).
return_df:  Index(['version_id', 'route_id', 'direction_id', 'is_main', 'shape_id',
       'trip_headsign', 'date', 'exception_type'],
      dtype='object')
Updated shape_variants_df:
Shape: (1178, 6)

Updated shape_variant_activations_df:
Shape: (43932, 3)

Summary:
Total unique shape variants: 1178
Total shape variant activations: 43932
New variants added: 27
Shape variant IDs added: 101151 - 101177
New activations added: 5178
7. Updating shapes data...

=== Before update Shapes Summary ===
Total shape records: 210,973
Unique shapes: 1,151
Average points per shape: 183.3
Points per shape range: 6 - 957
Found 8 missing shape_ids that need to be added.
Found 8 missing shape_ids in shapes_df.
Missing shape

  trips_df = pd.read_csv(trips_path)


2. Loading existing processed data...
3. Building service date mappings...
4. Processing routes...
No duplicate route_id found in routes_df.
5. Processing route versions...
6. Processing shape variants...
Removed 10729 duplicate rows where only exception_type differed (NaN vs non-NaN).
return_df:  Index(['version_id', 'route_id', 'direction_id', 'is_main', 'shape_id',
       'trip_headsign', 'date', 'exception_type'],
      dtype='object')
Updated shape_variants_df:
Shape: (1249, 6)

Updated shape_variant_activations_df:
Shape: (51223, 3)

Summary:
Total unique shape variants: 1249
Total shape variant activations: 51223
New variants added: 71
Shape variant IDs added: 101178 - 101248
New activations added: 7291
7. Updating shapes data...

=== Before update Shapes Summary ===
Total shape records: 212,207
Unique shapes: 1,159
Average points per shape: 183.09
Points per shape range: 6 - 957
Found 44 missing shape_ids that need to be added.
Found 44 missing shape_ids in shapes_df.
Missing s

In [17]:
# Show which entries the 20131018 run stored under its "data" key.
(
    processing_result["20131018"]["data"]
    .keys()
)

dict_keys(['shapes', 'routes', 'route_versions', 'shape_variants', 'shape_variant_activations', 'temporary_changes', 'latest_routes', 'shape_variant_data'])