# TransitVision: Transit Data Analysis and Prediction

This notebook demonstrates the key functionality of the TransitVision package for analyzing transit data and making predictions about ridership patterns.

## Setup and Configuration

First, let's import the necessary modules and set up the environment.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os

# Import TransitVision modules
from transitvision.data_processing.transit_data_processor import TransitDataProcessor
from transitvision.data_processing.feedback_processor import FeedbackProcessor
from transitvision.analysis.transit_analyzer import TransitAnalyzer
from transitvision.analysis.sentiment_analyzer import SentimentAnalyzer
from transitvision.prediction.ridership_model import RidershipModel
from transitvision.prediction.remote_work_impact import RemoteWorkImpactModel
from transitvision.utils.logger import setup_logger
from transitvision.utils.data_utils import load_data, save_data
from transitvision.utils.visualization import set_plot_style

# Set up logger (project helper): INFO level with console output enabled
logger = setup_logger(level="INFO", console=True)

# Set visualization style through the project helper
# NOTE(review): the argument names mirror seaborn's theme options; presumably
# this wraps seaborn -- confirm in transitvision.utils.visualization
set_plot_style(style="whitegrid", context="notebook", palette="viridis")

# Display settings
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline
# Default figure size (width, height) in inches for every subsequent plot
plt.rcParams['figure.figsize'] = (12, 6)
# Never truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)

ModuleNotFoundError: No module named 'transitvision'

## Generate Sample Data

For this demonstration, we'll generate synthetic transit data to work with.

In [None]:
def generate_transit_data(n_days=90, n_routes=5, n_stops_per_route=10, seed=42):
    """Generate synthetic stop-level transit data for demonstration.

    One row is produced per (day, route, stop) combination, starting on
    2023-01-01 and iterating days, then routes, then stops.

    Args:
        n_days: Number of consecutive service days to simulate.
        n_routes: Number of routes, named ``Route_1`` .. ``Route_<n_routes>``.
        n_stops_per_route: Number of stops on every route, named
            ``Stop_1`` .. ``Stop_<n_stops_per_route>``.
        seed: Value passed to ``np.random.seed``. Note that this reseeds
            NumPy's *global* random generator as a side effect.

    Returns:
        A ``pandas.DataFrame`` with the columns ``service_date``,
        ``route_id``, ``stop_id``, ``ridership``, ``capacity``, ``delay``,
        ``temperature``, ``precipitation``, ``is_holiday``,
        ``remote_work_percent``, ``service_month``, ``service_day``,
        ``service_dayofweek`` and ``is_weekend``. When any of the size
        arguments is zero (or negative) the frame is empty but still carries
        all of these columns.
    """
    base_columns = [
        'service_date', 'route_id', 'stop_id', 'ridership', 'capacity',
        'delay', 'temperature', 'precipitation', 'is_holiday',
        'remote_work_percent',
    ]

    np.random.seed(seed)

    start_date = pd.Timestamp('2023-01-01')
    data = []

    for day_index in range(n_days):
        date = start_date + pd.Timedelta(days=day_index)

        # Weekend modifier for ridership
        is_weekend = date.dayofweek >= 5
        weekend_factor = 0.7 if is_weekend else 1.0

        # Monthly seasonality: a yearly sinusoid that peaks in April (x1.2)
        # and bottoms out in October (x0.8)
        monthly_factor = 1.0 + 0.2 * np.sin((date.month - 1) * np.pi / 6)

        # Weather effect (one random factor shared by every stop that day)
        weather_factor = np.random.uniform(0.8, 1.2)

        # Remote work percentage rises linearly from 20% towards 30%
        remote_work_pct = 20 + 10 * (day_index / n_days)

        # Mean temperature follows a weekly sinusoid around 20; per-row
        # Gaussian noise is added below
        weekly_temperature = 20 + 10 * np.sin((date.dayofweek - 1) * np.pi / 7)

        for route_idx in range(1, n_routes + 1):
            route = f"Route_{route_idx}"
            route_factor = 0.8 + 0.1 * route_idx

            # Remote work hits higher-numbered (busier) routes harder
            remote_work_impact = 1.0 - (0.01 * remote_work_pct * route_factor)

            for stop_idx in range(1, n_stops_per_route + 1):
                stop = f"Stop_{stop_idx}"
                stop_factor = 0.9 + 0.02 * stop_idx

                # Calculate base ridership and apply the daily modifiers
                base_ridership = 100 * route_factor * stop_factor
                ridership = base_ridership * weekend_factor * monthly_factor * weather_factor
                ridership = ridership * remote_work_impact

                # Add multiplicative noise; clamp because the remote-work
                # impact can turn negative for very large route indices
                ridership = max(0, int(ridership * np.random.normal(1, 0.1)))

                # Capacity is loosely tied to ridership, with a floor of 150
                # before the +/-10% jitter (so it is never zero)
                capacity = int(max(ridership * 1.5, 150) * np.random.uniform(0.9, 1.1))

                # Delay grows with the ridership/capacity ratio
                utilization = ridership / capacity
                delay_base = 2 * utilization * np.random.exponential(1)
                delay = round(max(0, delay_base), 1)

                # NOTE: the order of the random draws below is part of the
                # seeded output; keep temperature, precipitation, holiday.
                temperature = round(weekly_temperature + np.random.normal(0, 3), 1)
                precipitation = max(0, round(np.random.exponential(0.5), 2))
                # Every weekend day is flagged, plus ~5% of weekdays at random
                is_holiday = is_weekend or np.random.random() < 0.05

                data.append({
                    'service_date': date,
                    'route_id': route,
                    'stop_id': stop,
                    'ridership': ridership,
                    'capacity': capacity,
                    'delay': delay,
                    'temperature': temperature,
                    'precipitation': precipitation,
                    'is_holiday': is_holiday,
                    'remote_work_percent': round(remote_work_pct, 1),
                })

    # Explicit columns keep the schema intact even when no rows were generated
    df = pd.DataFrame(data, columns=base_columns)
    # No-op for populated frames; gives an empty frame a datetime dtype so the
    # .dt accessor below works
    df['service_date'] = pd.to_datetime(df['service_date'])

    # Add time features
    df['service_month'] = df['service_date'].dt.month
    df['service_day'] = df['service_date'].dt.day
    df['service_dayofweek'] = df['service_date'].dt.dayofweek
    df['is_weekend'] = df['service_dayofweek'] >= 5

    return df

def generate_feedback_data(transit_data, n_feedback=500, seed=42):
    """Generate synthetic rider feedback derived from transit data.

    Rows are sampled from ``transit_data``; each sampled row yields one
    feedback entry whose sentiment worsens with the row's delay and its
    ridership/capacity ratio.

    Args:
        transit_data: DataFrame providing at least the columns
            ``service_date``, ``route_id``, ``stop_id``, ``ridership``,
            ``capacity`` and ``delay``.
        n_feedback: Number of feedback entries to produce. When it exceeds
            the number of rows in ``transit_data`` the rows are sampled with
            replacement; otherwise without.
        seed: Value passed to ``np.random.seed`` (this reseeds NumPy's global
            generator) and used as ``random_state`` for the row sampling.

    Returns:
        A ``pandas.DataFrame`` with the columns ``feedback_text``,
        ``feedback_date``, ``route_id``, ``stop_id``, ``rating`` and
        ``sentiment``. It is empty (but keeps these columns) when
        ``n_feedback`` is 0 or ``transit_data`` has no rows.
    """
    columns = ['feedback_text', 'feedback_date', 'route_id', 'stop_id', 'rating', 'sentiment']

    np.random.seed(seed)

    # Nothing to sample from, or nothing requested
    if n_feedback == 0 or len(transit_data) == 0:
        return pd.DataFrame(columns=columns)

    # Sample from transit data to get realistic dates and routes. Sampling
    # without replacement fails once n_feedback exceeds the population, so
    # fall back to replacement only in that case.
    sampled_data = transit_data.sample(
        n=n_feedback,
        replace=n_feedback > len(transit_data),
        random_state=seed,
    )

    # Positive feedback templates
    positive_templates = [
        "The {route} was on time and clean. Very satisfied with the service.",
        "Driver was friendly and helpful. {route} was punctual as always.",
        "Great experience on {route} today. Comfortable ride and efficient service.",
        "Love the new schedule for {route}, makes my commute much easier.",
        "The bus was clean and not crowded. Very pleasant ride on {route}.",
        "Excellent service on {route} this morning. Right on schedule!",
        "The {route} driver was very professional and courteous.",
        "I appreciate the reliability of {route}. Always a good experience."
    ]

    # Negative feedback templates
    negative_templates = [
        "The {route} was late again. Very frustrating for daily commuters.",
        "Bus was overcrowded and uncomfortable. {route} needs more frequent service.",
        "Driver was rude and unhelpful. Poor experience on {route} today.",
        "The {route} was dirty and had a bad smell. Please improve cleaning.",
        "Disappointed with {route} service. Too many delays and no communication.",
        "The air conditioning wasn't working on {route}. Terrible experience in this heat.",
        "Why is {route} always late? Need better schedule adherence.",
        "The {route} bus broke down mid-journey. Needs better maintenance."
    ]

    # Neutral feedback templates
    neutral_templates = [
        "Average experience on {route}. Nothing special to report.",
        "The {route} was slightly delayed but overall okay.",
        "Regular service on {route} today. No issues to mention.",
        "Standard experience on {route}. Could use some minor improvements.",
        "The {route} was adequate for my needs today.",
        "Typical journey on {route}. Neither good nor bad.",
        "The {route} was moderately crowded but manageable.",
        "Satisfactory service on {route}, though there's room for improvement."
    ]

    # sentiment -> (possible ratings, rating probabilities, text templates)
    sentiment_profiles = {
        "positive": ([4, 5], [0.3, 0.7], positive_templates),
        "negative": ([1, 2], [0.7, 0.3], negative_templates),
        "neutral": ([3, 4], [0.7, 0.3], neutral_templates),
    }

    data = []

    for _, row in sampled_data.iterrows():
        utilization = row['ridership'] / row['capacity']

        # Sentiment score: 0.5 at zero delay and 50% utilization, lowered by
        # delay and crowding, plus Gaussian noise. The score is not clamped.
        sentiment_score = 0.5 - (row['delay'] / 15) - (utilization - 0.5)
        sentiment_score += np.random.normal(0, 0.3)

        # Determine sentiment category
        if sentiment_score > 0.3:
            sentiment = "positive"
        elif sentiment_score < -0.3:
            sentiment = "negative"
        else:
            sentiment = "neutral"

        # Draw order (rating first, then template) is part of the seeded output
        ratings, weights, templates = sentiment_profiles[sentiment]
        rating = np.random.choice(ratings, p=weights)
        template = np.random.choice(templates)

        # Format feedback text
        feedback_text = template.format(route=row['route_id'])

        # Inject a single-character typo into ~10% of the entries
        if np.random.random() < 0.1:
            words = feedback_text.split()
            if len(words) > 3:
                word_idx = np.random.randint(0, len(words))
                word = words[word_idx]
                if len(word) > 3:
                    # Only interior characters are replaced, so the first
                    # letter and any trailing punctuation survive
                    char_idx = np.random.randint(1, len(word) - 1)
                    letters = list(word)
                    letters[char_idx] = np.random.choice(list('abcdefghijklmnopqrstuvwxyz'))
                    words[word_idx] = ''.join(letters)
                    feedback_text = ' '.join(words)

        data.append({
            'feedback_text': feedback_text,
            'feedback_date': row['service_date'],
            'route_id': row['route_id'],
            'stop_id': row['stop_id'],
            'rating': rating,
            'sentiment': sentiment
        })

    return pd.DataFrame(data, columns=columns)

# Generate the datasets
transit_data = generate_transit_data()
feedback_data = generate_feedback_data(transit_data)

# Display sample data
print(f"Transit data shape: {transit_data.shape}")
print(f"Feedback data shape: {feedback_data.shape}")

transit_data.head()

## Data Processing

Let's process the transit data using our data processing modules.

In [None]:
# Create transit data processor
# The config values name columns of the generated transit data.
# NOTE(review): what the processor does with each config key is defined in
# TransitDataProcessor, which is not visible here -- confirm before changing
transit_processor = TransitDataProcessor(
    config={
        "time_columns": [],
        "categorical_columns": ["route_id", "stop_id"],
        "numerical_columns": ["ridership", "capacity", "delay", "temperature", "precipitation", "remote_work_percent"],
        "date_columns": ["service_date"],
        "drop_na_columns": ["route_id", "stop_id", "ridership"]
    }
)

# Process the transit data; the result feeds every later analysis and
# modelling cell
processed_transit_data = transit_processor.process_data(transit_data)

# Display the processed data (shape, then the first rows as cell output)
print(f"Processed transit data shape: {processed_transit_data.shape}")
processed_transit_data.head()

In [None]:
# Create feedback processor
# The *_column values name columns of the generated feedback data.
# NOTE(review): the unit of "min_feedback_length" (characters vs. words) is
# decided inside FeedbackProcessor -- confirm
feedback_processor = FeedbackProcessor(
    config={
        "text_column": "feedback_text",
        "date_column": "feedback_date",
        "rating_column": "rating",
        "route_column": "route_id",
        "min_feedback_length": 5
    }
)

# Process the feedback data; the result is the input to sentiment analysis
processed_feedback_data = feedback_processor.process_data(feedback_data)

# Display the processed data (shape, then the first rows as cell output)
print(f"Processed feedback data shape: {processed_feedback_data.shape}")
processed_feedback_data.head()

## Transit Data Analysis

Now let's analyze the transit data to extract insights.

In [None]:
# Create transit analyzer, telling it which columns of the processed transit
# data hold the date, route, stop, ridership, capacity and delay values
transit_analyzer = TransitAnalyzer(
    config={
        "date_column": "service_date",
        "route_column": "route_id",
        "stop_column": "stop_id",
        "ridership_column": "ridership",
        "capacity_column": "capacity",
        "delay_column": "delay"
    }
)

# Analyze ridership patterns over time
# The result is kept in `ridership_patterns` but is not read again in this
# notebook; the plot below is computed from the data directly
ridership_patterns = transit_analyzer.analyze_ridership_patterns(
    data=processed_transit_data,
    time_grouping="daily"
)

# Plot ridership trends at the same daily grouping
transit_analyzer.plot_ridership_trends(
    data=processed_transit_data,
    time_grouping="daily"
)

In [None]:
# Compare routes by ridership
# The result is kept in `route_comparison` but is not read again in this
# notebook
route_comparison = transit_analyzer.compare_routes(
    data=processed_transit_data,
    metric="ridership",
    time_period="weekly"
)

# Plot route comparison as a box plot of the ridership metric
# NOTE(review): this call takes the full data set and no time_period, so it
# is presumably not restricted to the weekly view above -- confirm
transit_analyzer.plot_performance_comparison(
    data=processed_transit_data,
    metric="ridership",
    plot_type="boxplot"
)

In [None]:
# Analyze performance metrics (delay)
# The result is kept in `delay_analysis` but is not read again in this
# notebook
delay_analysis = transit_analyzer.analyze_performance_metrics(
    data=processed_transit_data,
    metric="delay"
)

# Plot delay comparison, this time as a violin plot
transit_analyzer.plot_performance_comparison(
    data=processed_transit_data,
    metric="delay",
    plot_type="violin"
)

## Sentiment Analysis

Let's analyze the sentiment in the rider feedback.

In [None]:
# Create sentiment analyzer; the *_column values name columns of the
# feedback data
# NOTE(review): how "topic_extraction_method" and "num_topics" are applied is
# defined inside SentimentAnalyzer -- confirm
sentiment_analyzer = SentimentAnalyzer(
    config={
        "text_column": "feedback_text",
        "date_column": "feedback_date",
        "rating_column": "rating",
        "route_column": "route_id",
        "topic_extraction_method": "keyword",
        "num_topics": 5
    }
)

# Analyze sentiment of the processed feedback
sentiment_results = sentiment_analyzer.analyze_sentiment(processed_feedback_data)

# Extract topics; the call returns a pair that later cells use as the data
# (`topic_results`) and the topic mapping (`topics`) for plotting
topic_results, topics = sentiment_analyzer.extract_topics(sentiment_results)

# Plot sentiment distribution over all feedback
sentiment_analyzer.plot_sentiment_distribution(sentiment_results)

In [None]:
# Plot sentiment distribution by route (same plot as before, grouped by the
# route_id column)
sentiment_analyzer.plot_sentiment_distribution(
    data=sentiment_results,
    groupby="route_id"
)

In [None]:
# Plot sentiment trends over time, grouped weekly
sentiment_analyzer.plot_sentiment_over_time(
    data=sentiment_results,
    time_grouping="weekly"
)

In [None]:
# Plot topic distribution using both values returned by extract_topics
sentiment_analyzer.plot_topic_distribution(
    data=topic_results,
    topics=topics
)

# Print extracted topics
# `topics` is iterated as a mapping from a sentiment label (a string) to an
# iterable of keywords; keywords are numbered from 1 within each sentiment
print("Extracted Topics:")
for sentiment, keywords in topics.items():
    print(f"\n{sentiment.capitalize()} sentiment topics:")
    for i, keyword in enumerate(keywords, 1):
        print(f"  {i}. {keyword}")

## Ridership Prediction

Now let's build a model to predict transit ridership.

In [None]:
# Prepare data for modeling
from sklearn.model_selection import train_test_split

# Select features and target: calendar, weather, holiday and remote-work
# columns produced by generate_transit_data
features = [
    'service_month', 'service_day', 'service_dayofweek', 'is_weekend',
    'temperature', 'precipitation', 'is_holiday', 'remote_work_percent'
]

# Add route and stop dummy variables (one indicator column per distinct
# route_id / stop_id value, prefixed "route_" / "stop_")
route_dummies = pd.get_dummies(processed_transit_data['route_id'], prefix='route')
stop_dummies = pd.get_dummies(processed_transit_data['stop_id'], prefix='stop')

# Combine features column-wise; all three pieces come from
# processed_transit_data, so X shares its row index
X = pd.concat([processed_transit_data[features], route_dummies, stop_dummies], axis=1)
y = processed_transit_data['ridership']

# Split into train and test sets: 20% held out, fixed seed for repeatability
# NOTE: train_test_split shuffles rows by default, so this is a random split,
# not a chronological one
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

In [None]:
# Create ridership model
# NOTE(review): the model_params names match scikit-learn's random forest
# hyper-parameters; presumably they are forwarded to it -- confirm in
# RidershipModel
ridership_model = RidershipModel(
    model_type="random_forest",
    model_params={
        "n_estimators": 100,
        "max_depth": 15,
        "min_samples_split": 5,
        "random_state": 42
    },
    scale_features=True
)

# Train the model on the 80% training split
ridership_model.fit(X_train, y_train)

# Evaluate the model on the held-out split; plot=True also requests a plot
evaluation = ridership_model.evaluate(X_test, y_test, plot=True)

# Print evaluation metrics (`evaluation` is iterated as a metric -> value
# mapping)
for metric, value in evaluation.items():
    print(f"{metric}: {value}")

In [None]:
# Plot predictions for a single route/stop pair
route_to_plot = 'Route_1'
stop_to_plot = 'Stop_1'

mask = (
    (processed_transit_data['route_id'] == route_to_plot)
    & (processed_transit_data['stop_id'] == stop_to_plot)
)

# Compute one chronological ordering and apply it positionally to the data,
# the features and the target. Sorting only `route_data` (as before) leaves
# the time axis misaligned with the features/target whenever the processed
# data is not already in date order.
chronological = np.argsort(
    processed_transit_data.loc[mask, 'service_date'].to_numpy(),
    kind='stable',
)

# `route_data` stays sorted by date; the forecast cell further down relies
# on its last rows being the most recent ones
route_data = processed_transit_data[mask].iloc[chronological]
route_features = X[mask].iloc[chronological].reset_index(drop=True)
route_ridership = y[mask].iloc[chronological].reset_index(drop=True)

# Plot actual vs predicted ridership against the service date
ridership_model.plot_predictions(
    route_features,
    route_ridership,
    time_column=route_data['service_date'].reset_index(drop=True)
)

In [None]:
# Get feature importance; the result is indexed below by its 'importance'
# and 'feature' columns
feature_importance = ridership_model.get_feature_importance()

# Plot feature importances
# NOTE(review): this cell applies no explicit top-20 cut -- every row of
# `feature_importance` is passed in; any limit to 20 features would have to
# come from plot_feature_importance's own defaults -- confirm
ridership_model.plot_feature_importance(
    importance=feature_importance['importance'].values,
    feature_names=feature_importance['feature'].values
)

## Remote Work Impact Analysis

Let's analyze how remote work patterns affect transit ridership.

In [None]:
# Create remote work impact model
# NOTE(review): "alpha" / "l1_ratio" match scikit-learn's ElasticNet
# hyper-parameters; presumably they are forwarded to it -- confirm in
# RemoteWorkImpactModel
remote_work_model = RemoteWorkImpactModel(
    model_type="elastic_net",
    model_params={
        "alpha": 0.1,
        "l1_ratio": 0.5,
        "random_state": 42
    },
    remote_work_column="remote_work_percent",
    time_features=['service_month', 'service_day', 'service_dayofweek']
)

# Train the model on the same split as the ridership model; X_train also
# contains the weather, holiday and route/stop dummy columns
remote_work_model.fit(X_train, y_train)

# Evaluate the model on the held-out split; plot=True also requests a plot
remote_work_evaluation = remote_work_model.evaluate(X_test, y_test, plot=True)

# Print evaluation metrics (iterated as a metric -> value mapping)
for metric, value in remote_work_evaluation.items():
    print(f"{metric}: {value}")

In [None]:
# Sensitivity analysis around a single baseline row (the first test sample):
# remote-work share goes from 0 to 100 in steps of 10, and both values of the
# weekend and holiday flags are supplied
sensitivity_results = remote_work_model.sensitivity_analysis(
    X=X_test.head(1),
    remote_work_values=list(range(0, 101, 10)),
    feature_values={"is_weekend": [0, 1], "is_holiday": [0, 1]},
)

# Plot the remote-work impact, one group per is_weekend value
remote_work_model.plot_remote_work_impact(
    sensitivity_results=sensitivity_results, group_by="is_weekend"
)

In [None]:
# Scenario definitions: (name, remote-work percent, is_weekend, is_holiday),
# expanded into the dict form expected by scenario_analysis
scenarios = [
    {
        "name": label,
        "remote_work_percent": share,
        "is_weekend": weekend,
        "is_holiday": holiday,
    }
    for label, share, weekend, holiday in (
        ("Current Trend", 30.0, 0, 0),
        ("Return to Office", 15.0, 0, 0),
        ("Fully Remote", 80.0, 0, 0),
        ("Hybrid (3 days in office)", 40.0, 0, 0),
        ("Weekend", 30.0, 1, 0),
        ("Holiday", 30.0, 0, 1),
    )
]

# Evaluate every scenario against the first test sample as the baseline row
scenario_results = remote_work_model.scenario_analysis(
    X=X_test.head(1),
    scenario_configs=scenarios,
)

# Compare the scenarios in a single plot
remote_work_model.plot_scenario_comparison(scenario_results)

In [None]:
# Forecast ridership with changing remote work patterns
# The trend list has exactly 24 entries, one per forecast step
forecast_data = remote_work_model.time_series_forecast(
    X=X_test.iloc[:1],  # Use first test sample as baseline
    steps=24,  # Forecast for 24 time periods
    remote_work_trend=[30, 35, 40, 45, 50, 55, 60, 65, 70, 70, 70, 65,
                        60, 55, 50, 45, 40, 35, 30, 30, 30, 35, 40, 45],  # Trend pattern
    time_col="forecast_period"
)

# Create historical data for plotting: the last 24 rows of `route_data`
# (defined in the prediction-plot cell for Route_1 / Stop_1)
# NOTE(review): the forecast baseline is the first *test* sample, which is
# not necessarily a Route_1 / Stop_1 row, so history and forecast may
# describe different route/stop pairs -- confirm this is intended
historical_data = route_data.iloc[-24:].copy().reset_index(drop=True)
# Number the periods consecutively: history first, forecast continuing after
# it. This overwrites whatever time_series_forecast stored in
# 'forecast_period'.
historical_data['forecast_period'] = range(len(historical_data))
forecast_data['forecast_period'] = range(len(historical_data), len(historical_data) + len(forecast_data))

# Plot forecast: ridership against the shared forecast_period axis
remote_work_model.plot_forecast(
    historical_data=historical_data,
    forecast_data=forecast_data,
    time_col="forecast_period",
    value_col="ridership"
)

## Conclusion

In this notebook, we've demonstrated the key functionality of the TransitVision package:

1. Data processing for transit and feedback data
2. Transit data analysis and visualization
3. Sentiment analysis of rider feedback
4. Ridership prediction using machine learning
5. Remote work impact analysis and forecasting

The TransitVision package provides a comprehensive set of tools for transit agencies to analyze their data and make informed decisions about service planning, customer satisfaction, and future ridership patterns.