In [2]:
# Cell 1: Import Libraries
"""
Import all necessary libraries for data processing and analysis
"""
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

ModuleNotFoundError: No module named 'seaborn'

In [None]:
# Cell 2: Define Helper Functions
"""
Helper functions for data processing and standardization
"""

def get_month_name(month_num):
    """
    Translate a month number into its full month name.

    Args:
        month_num: Month number (1-12); anything accepted by int() works,
            e.g. 3, "3" or 3.0.
    Returns:
        str: Full month name as produced by the '%B' format code
            (e.g., "January").
    Raises:
        ValueError: If the value cannot be converted to an int or falls
            outside the 1-12 range.
    """
    month_index = int(month_num)
    # The year and day are placeholders; only the month drives the output.
    first_of_month = datetime(year=2000, month=month_index, day=1)
    return format(first_of_month, '%B')

def create_date_column(df, year_col='YEAR', month_col='MONTH'):
    """
    Create a standardized date column (first day of each month).

    Args:
        df (pd.DataFrame): Frame holding the year and month columns.
        year_col (str): Name of the column containing the year.
        month_col (str): Name of the column containing the month number.
    Returns:
        pd.Series: Datetime values aligned to df's index, one per row,
            with the day fixed to 1.
    """
    # pd.to_datetime assembles dates from columns *named* year/month/day,
    # so the source columns are re-keyed explicitly. Passing the frame with
    # its original column names only works when they happen to be spelled
    # "year"/"month", which made year_col/month_col unusable otherwise.
    date_parts = pd.DataFrame({
        'year': df[year_col],
        'month': df[month_col],
        'day': 1,
    })
    return pd.to_datetime(date_parts)

In [None]:
# Cell 4: Data Standardization
"""
Standardize date formats across all datasets
"""

# Month-name -> month-number lookup for the "YYYY Month" labels parsed below.
month_map = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4,
    'May': 5, 'June': 6, 'July': 7, 'August': 8,
    'September': 9, 'October': 10, 'November': 11, 'December': 12
}


def _standardize_label_dates(df):
    """
    Split the combined "YYYY Month" label stored in df['MONTH'] into a YEAR
    column and a numeric MONTH column, then add the standardized 'date' column.

    The frame is modified in place. The parsing step is skipped when MONTH is
    already numeric, so re-running the cell rebuilds 'date' instead of failing
    on the .str accessor.

    Labels that do not match the pattern, or whose month name is not a key of
    month_map, are left as missing YEAR/MONTH values rather than raising.

    Args:
        df (pd.DataFrame): Frame with a MONTH column holding either the raw
            "YYYY Month" text or already-converted month numbers.
    """
    if not pd.api.types.is_numeric_dtype(df['MONTH']):
        label_parts = df['MONTH'].str.strip().str.extract(r'(\d{4})\s+(\w+)')
        df['YEAR'] = label_parts[0]
        df['MONTH'] = label_parts[1].map(month_map)
    df['date'] = create_date_column(df)


# Process Bus MDBF data (YEAR and MONTH are used as they are)
bus_mdbf['date'] = create_date_column(bus_mdbf)

# Process Light Rail MDBF data
_standardize_label_dates(lightrail_mdbf)

# Process Rail MDBF data
_standardize_label_dates(rail_mdbf)

In [None]:
# Cell 5: Data Validation and Initial Analysis
"""
Validate the standardized data and perform initial analysis
"""

def print_dataset_info(df, name):
    """
    Print a short validation report for one standardized dataset.

    The report lists the earliest and latest value of the 'date' column, the
    row count, the per-column count of null values, and the first rows of
    the frame.

    Args:
        df (pd.DataFrame): Dataset to describe; must contain a 'date' column.
        name (str): Label shown in the report heading.
    """
    print(f"\n{name} Dataset Information:")

    date_values = df['date']
    earliest, latest = date_values.min(), date_values.max()
    print(f"Date Range: {earliest} to {latest}")

    row_count = len(df)
    print(f"Total Records: {row_count}")

    null_counts = df.isnull().sum()
    print(f"Missing Values:\n{null_counts}")

    print("\nSample Data:")
    preview = df.head()
    print(preview)

# Report on every standardized dataset in turn
print_dataset_info(df=bus_mdbf, name="Bus MDBF")
print_dataset_info(df=lightrail_mdbf, name="Light Rail MDBF")
print_dataset_info(df=rail_mdbf, name="Rail MDBF")

In [None]:
# Cell 6: Save Standardized Data
"""
Save the standardized datasets
"""

try:
    # Write each standardized frame next to the other MDBF files, without the index
    bus_mdbf.to_csv(
        path_or_buf='data/MDBF/BUS_MDBF_DATA_standardized.csv',
        index=False,
    )
    lightrail_mdbf.to_csv(
        path_or_buf='data/MDBF/LIGHTRAIL_MDBF_DATA_standardized.csv',
        index=False,
    )
    rail_mdbf.to_csv(
        path_or_buf='data/MDBF/RAIL_MDBF_DATA_standardized.csv',
        index=False,
    )
except Exception as save_error:
    # Best-effort: report the problem and let the rest of the notebook run
    print(f"Error saving files: {save_error}")
else:
    print("All standardized files saved successfully!")

In [None]:
# Cell 7: Basic Visualization
"""
Create initial visualizations to explore the data
"""

# One figure/axes pair shared by all three transit types
fig, ax = plt.subplots(figsize=(15, 8))

# Bus and Rail are drawn straight from their frames
ax.plot(bus_mdbf['date'], bus_mdbf['MDBF'], label='Bus')
ax.plot(rail_mdbf['date'], rail_mdbf['MDBF'], label='Rail')

# Light Rail is reduced to the mean MDBF per date before plotting
lightrail_avg = lightrail_mdbf.groupby('date')['MDBF'].mean()
ax.plot(lightrail_avg.index, lightrail_avg.values, label='Light Rail (Avg)')

ax.set_title('MDBF Trends by Transit Type')
ax.set_xlabel('Date')
ax.set_ylabel('Mean Distance Between Failure')
ax.legend()
ax.tick_params(axis='x', labelrotation=45)
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Cell 8: Statistical Summary
"""
Generate statistical summaries for each transit type
"""

def generate_statistics(df, name):
    """
    Print summary statistics of the 'MDBF' column for one transit type.

    Mean, median, standard deviation, minimum and maximum are printed with
    thousands separators and two decimal places. Nothing is returned.

    Args:
        df (pd.DataFrame): Dataset containing an 'MDBF' column.
        name (str): Label shown in the heading.
    """
    mdbf = df['MDBF']
    # All figures are computed up front, before anything is printed
    summary = [
        ('Mean MDBF', mdbf.mean()),
        ('Median MDBF', mdbf.median()),
        ('Std Dev MDBF', mdbf.std()),
        ('Min MDBF', mdbf.min()),
        ('Max MDBF', mdbf.max()),
    ]

    print(f"\n{name} Statistics:")
    for label, amount in summary:
        print(f"{label}: {amount:,.2f}")

# Summarize MDBF for every transit type
generate_statistics(df=bus_mdbf, name="Bus")
generate_statistics(df=rail_mdbf, name="Rail")
generate_statistics(df=lightrail_mdbf, name="Light Rail")