In [4]:
# Simple Transport Demand Prediction - Error-Free Version
# This version focuses on analysis and modeling without complex plotting

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

class SimpleTransportPredictor:
    """Pipeline that predicts seats sold per ride from per-ticket records.

    The stages are meant to be run in order (``run_complete_analysis`` does
    exactly that): load the CSV, aggregate tickets into rides, engineer
    features, print summary statistics, build the feature matrix, train and
    compare regressors, and print business recommendations.

    Attributes set by the pipeline:
        raw_data: ticket-level frame read by ``load_data``.
        processed_data: ride-level frame produced by ``create_target_variable``
            and extended by ``feature_engineering``.
        feature_names: column names of the matrix returned by
            ``prepare_features_for_modeling``.
        encoder: fitted ``OneHotEncoder`` used for the categorical columns.
        models: per-model metrics dict filled by ``train_models``.
        best_model / best_model_name: estimator with the highest test R².
        X_test / y_test: hold-out split kept by ``train_models``.
    """

    # Hour assumed when a departure time cannot be parsed.
    DEFAULT_TRAVEL_HOUR = 7
    # Payment method assumed when a ride has no recorded payment method.
    DEFAULT_PAYMENT_METHOD = 'Mpesa'

    def __init__(self, data_path):
        """Initialize the predictor with data path.

        Args:
            data_path: path handed to ``pandas.read_csv`` by ``load_data``.
        """
        self.data_path = data_path
        self.raw_data = None
        self.processed_data = None
        self.models = {}
        self.best_model = None
        # Declared up front so every attribute the pipeline sets exists on a
        # fresh instance instead of appearing only after train_models().
        self.best_model_name = None
        self.feature_names = None
        self.encoder = None
        self.X_test = None
        self.y_test = None

    @staticmethod
    def _most_common(values, default):
        """Return the first mode of ``values``, or ``default`` if there is none."""
        modes = values.mode()
        return modes.iloc[0] if len(modes) > 0 else default

    def load_data(self):
        """Load the CSV at ``self.data_path`` and print basic information.

        Returns:
            The loaded ticket-level DataFrame (also stored in ``self.raw_data``).
        """
        print("=" * 60)
        print("TRANSPORT DEMAND PREDICTION PROJECT")
        print("=" * 60)

        # Load data
        self.raw_data = pd.read_csv(self.data_path)
        print(f"\n📊 Dataset loaded successfully!")
        print(f"Shape: {self.raw_data.shape}")
        print(f"Columns: {list(self.raw_data.columns)}")
        print(f"\nFirst 5 rows:")
        print(self.raw_data.head())

        return self.raw_data

    def create_target_variable(self):
        """Aggregate tickets into rides and derive ``seats_sold`` per ride.

        A ride is identified by ride_id + travel_date + travel_time +
        travel_from. The key is written to ``self.raw_data['ride_identifier']``.

        Returns:
            Ride-level DataFrame with ``seats_sold`` and ``occupancy_rate``
            columns (also stored in ``self.processed_data``).
        """
        print("\n" + "="*50)
        print("CREATING TARGET VARIABLE")
        print("="*50)

        # Create unique ride identifier
        self.raw_data['ride_identifier'] = (
            self.raw_data['ride_id'].astype(str) + '_' +
            self.raw_data['travel_date'].astype(str) + '_' +
            self.raw_data['travel_time'].astype(str) + '_' +
            self.raw_data['travel_from'].astype(str)
        )

        # Aggregate to get seats sold per ride
        aggregated = self.raw_data.groupby('ride_identifier').agg({
            'ride_id': 'first',
            'travel_date': 'first',
            'travel_time': 'first',
            'travel_from': 'first',
            'travel_to': 'first',
            'car_type': 'first',
            'max_capacity': 'first',
            # Most frequent payment method of the ride; mode is computed once.
            'payment_method': lambda methods: self._most_common(
                methods, self.DEFAULT_PAYMENT_METHOD
            ),
            'seat_number': 'count'  # Non-null tickets per ride = seats sold
        }).reset_index()

        # Rename seat_number count to seats_sold (our target variable)
        aggregated = aggregated.rename(columns={'seat_number': 'seats_sold'})

        # Calculate occupancy rate (percentage of capacity)
        aggregated['occupancy_rate'] = (aggregated['seats_sold'] / aggregated['max_capacity']) * 100

        self.processed_data = aggregated

        print(f"✅ Aggregated dataset created with {len(aggregated)} unique rides")
        print(f"Target variable: seats_sold (range: {aggregated['seats_sold'].min()} - {aggregated['seats_sold'].max()})")
        print(f"Average occupancy rate: {aggregated['occupancy_rate'].mean():.2f}%")

        return self.processed_data

    def feature_engineering(self):
        """Add model features to the ride-level frame.

        Adds ``travel_hour``, ``is_morning``, ``is_bus``, ``is_mpesa`` and
        ``route_popularity`` to a copy of ``self.processed_data``.

        Returns:
            The extended DataFrame (also stored in ``self.processed_data``).
        """
        print("\n" + "="*50)
        print("FEATURE ENGINEERING")
        print("="*50)

        df = self.processed_data.copy()

        # Extract hour from travel_time: the first run of digits in the value.
        try:
            hours = (
                df['travel_time']
                .astype(str)
                .str.extract(r'(\d+)', expand=False)
                .astype(float)
            )
            # Assign the filled result back; an in-place fillna on a column
            # selection does not update the frame under pandas copy-on-write.
            df['travel_hour'] = hours.fillna(self.DEFAULT_TRAVEL_HOUR)
        except (KeyError, AttributeError, ValueError, TypeError) as exc:
            # Best-effort fallback kept from the original behaviour, but the
            # failure is now reported instead of being silently swallowed.
            print(f"⚠️ Could not parse travel_time ({exc!r}); "
                  f"defaulting travel_hour to {self.DEFAULT_TRAVEL_HOUR}")
            df['travel_hour'] = self.DEFAULT_TRAVEL_HOUR

        # Create simple time categories
        df['is_morning'] = (df['travel_hour'] < 12).astype(int)
        df['is_bus'] = (df['car_type'] == 'Bus').astype(int)
        df['is_mpesa'] = (df['payment_method'] == 'Mpesa').astype(int)

        # Route popularity (number of rides departing from each origin)
        route_counts = df['travel_from'].value_counts()
        df['route_popularity'] = df['travel_from'].map(route_counts)

        print("✅ Features created:")
        print("   - travel_hour: Hour of departure")
        print("   - is_morning: 1 if departure before noon, 0 otherwise")
        print("   - is_bus: 1 if Bus, 0 if shuttle")
        print("   - is_mpesa: 1 if Mpesa payment, 0 if Cash")
        print("   - route_popularity: Frequency of the route")

        self.processed_data = df
        return df

    def analyze_data(self):
        """Print summary statistics of the ride-level frame.

        Returns:
            ``self.processed_data`` unchanged.
        """
        print("\n" + "="*50)
        print("DATA ANALYSIS")
        print("="*50)

        df = self.processed_data

        print("📊 BASIC STATISTICS:")
        print(f"   Total unique rides: {len(df)}")
        print(f"   Average seats sold: {df['seats_sold'].mean():.2f}")
        print(f"   Average occupancy rate: {df['occupancy_rate'].mean():.2f}%")
        print(f"   Seats sold range: {df['seats_sold'].min()} - {df['seats_sold'].max()}")

        print("\n🛣️ TOP 5 ROUTES BY AVERAGE SEATS SOLD:")
        route_performance = df.groupby('travel_from')['seats_sold'].mean().sort_values(ascending=False)
        for i, (route, avg_seats) in enumerate(route_performance.head().items(), 1):
            print(f"   {i}. {route}: {avg_seats:.2f} seats")

        print("\n🚌 VEHICLE TYPE PERFORMANCE:")
        vehicle_performance = df.groupby('car_type').agg({
            'seats_sold': ['count', 'mean'],
            'occupancy_rate': 'mean'
        }).round(2)
        print(vehicle_performance)

        print("\n💳 PAYMENT METHOD ANALYSIS:")
        payment_performance = df.groupby('payment_method').agg({
            'seats_sold': ['count', 'mean'],
            'occupancy_rate': 'mean'
        }).round(2)
        print(payment_performance)

        return df

    def prepare_features_for_modeling(self):
        """Build the feature matrix and target vector.

        Numerical columns are used as-is (NaN replaced by 0); ``travel_from``
        is one-hot encoded with the first category dropped. The fitted encoder
        and the resulting column names are kept on the instance so later
        stages can label features correctly.

        Returns:
            Tuple ``(X, y, feature_names)``.
        """
        print("\n" + "="*50)
        print("PREPARING FEATURES FOR MODELING")
        print("="*50)

        df = self.processed_data.copy()

        # Select numerical features
        numerical_features = ['max_capacity', 'travel_hour', 'route_popularity',
                             'is_morning', 'is_bus', 'is_mpesa']

        # Select categorical features for encoding
        categorical_features = ['travel_from']

        # Create feature matrix
        X_numerical = df[numerical_features].fillna(0)

        # One-hot encode categorical variables
        if len(categorical_features) > 0:
            encoder = OneHotEncoder(drop='first', sparse_output=False)
            X_categorical = encoder.fit_transform(df[categorical_features])

            # Combine numerical and categorical features
            X = np.hstack([X_numerical.values, X_categorical])
            feature_names = numerical_features + list(encoder.get_feature_names_out(categorical_features))
            self.encoder = encoder
        else:
            X = X_numerical.values
            feature_names = numerical_features
            self.encoder = None

        # Remember the real column names for evaluate_models().
        self.feature_names = list(feature_names)

        # Target variable
        y = df['seats_sold'].values

        print(f"✅ Feature matrix created: {X.shape}")
        print(f"✅ Target variable: {y.shape}")
        print(f"✅ Features used: {feature_names}")

        return X, y, feature_names

    def train_models(self, X, y):
        """Train several regressors and record their hold-out metrics.

        Args:
            X: feature matrix.
            y: target vector.

        Returns:
            Dict mapping model name to a dict with keys ``model``, ``mae``,
            ``mse``, ``rmse``, ``r2``, ``cv_mean``, ``cv_std`` and
            ``predictions`` (also stored in ``self.models``).
        """
        print("\n" + "="*50)
        print("TRAINING MACHINE LEARNING MODELS")
        print("="*50)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Define models
        models = {
            'Linear Regression': LinearRegression(),
            'Ridge Regression': Ridge(alpha=1.0),
            'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
            'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42)
        }

        results = {}

        print("Training models...")
        for name, model in models.items():
            print(f"\n🔄 Training {name}...")

            # Train model
            model.fit(X_train, y_train)

            # Make predictions
            y_pred = model.predict(X_test)

            # Calculate metrics
            mae = mean_absolute_error(y_test, y_pred)
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            r2 = r2_score(y_test, y_pred)

            # Cross-validation on the training split only
            try:
                cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
                cv_mean = cv_scores.mean()
                cv_std = cv_scores.std()
            except Exception as exc:
                # Best-effort: keep the original fallback values, but say so,
                # because the reported "CV score" is then just the test R².
                print(f"   ⚠️ Cross-validation failed ({exc!r}); reporting test R² instead")
                cv_mean = r2
                cv_std = 0

            results[name] = {
                'model': model,
                'mae': mae,
                'mse': mse,
                'rmse': rmse,
                'r2': r2,
                'cv_mean': cv_mean,
                'cv_std': cv_std,
                'predictions': y_pred
            }

            print(f"   MAE: {mae:.3f}")
            print(f"   RMSE: {rmse:.3f}")
            print(f"   R²: {r2:.3f}")
            print(f"   CV Score: {cv_mean:.3f} ± {cv_std:.3f}")

        self.models = results
        self.X_test, self.y_test = X_test, y_test

        # Find best model.
        # NOTE(review): selection uses the hold-out R², so the reported R² of
        # the winner is optimistically biased; consider selecting on cv_mean.
        best_model_name = max(results.keys(), key=lambda k: results[k]['r2'])
        self.best_model = results[best_model_name]['model']
        self.best_model_name = best_model_name

        print(f"\n🏆 Best model: {best_model_name} (R² = {results[best_model_name]['r2']:.3f})")

        return results

    def evaluate_models(self):
        """Print a metric comparison table and the best model's top features.

        Feature importances are labelled with the names recorded by
        ``prepare_features_for_modeling``; generic ``route_<i>`` labels are
        used only when those names are unavailable or do not match the model.

        Returns:
            DataFrame with one formatted row per trained model.
        """
        print("\n" + "="*50)
        print("MODEL EVALUATION & COMPARISON")
        print("="*50)

        # Create comparison dataframe
        comparison_data = []
        for model_name, metrics in self.models.items():
            comparison_data.append({
                'Model': model_name,
                'MAE': f"{metrics['mae']:.3f}",
                'RMSE': f"{metrics['rmse']:.3f}",
                'R²': f"{metrics['r2']:.3f}",
                'CV Score': f"{metrics['cv_mean']:.3f} ± {metrics['cv_std']:.3f}"
            })

        comparison_df = pd.DataFrame(comparison_data)
        print("\n📊 MODEL COMPARISON:")
        print(comparison_df.to_string(index=False))

        # Feature importance for tree-based models
        if hasattr(self.best_model, 'feature_importances_'):
            print(f"\n🎯 FEATURE IMPORTANCE ({self.best_model_name}):")
            importances = self.best_model.feature_importances_

            stored_names = getattr(self, 'feature_names', None)
            if stored_names is not None and len(stored_names) == len(importances):
                # Real names, e.g. the one-hot columns produced by the encoder.
                feature_names = list(stored_names)
            else:
                base_names = ['max_capacity', 'travel_hour', 'route_popularity',
                              'is_morning', 'is_bus', 'is_mpesa']
                feature_names = base_names + [
                    f'route_{i}' for i in range(len(importances) - len(base_names))
                ]

            feature_importance = sorted(
                zip(feature_names[:len(importances)], importances),
                key=lambda pair: pair[1],
                reverse=True,
            )

            for feature, importance in feature_importance[:10]:  # Top 10 features
                print(f"   {feature}: {importance:.4f}")

        return comparison_df

    def generate_insights(self):
        """Print business metrics and recommendations.

        Returns:
            Dict with ``avg_occupancy``, ``top_route`` and ``model_performance``
            (the best model's test R²).
        """
        print("\n" + "="*50)
        print("BUSINESS INSIGHTS & RECOMMENDATIONS")
        print("="*50)

        df = self.processed_data

        # Key metrics
        avg_occupancy = df['occupancy_rate'].mean()
        high_demand_routes = df.groupby('travel_from')['seats_sold'].mean().sort_values(ascending=False)

        print("📊 KEY BUSINESS METRICS:")
        print(f"   Average occupancy rate: {avg_occupancy:.1f}%")
        print(f"   Total rides analyzed: {len(df):,}")
        print(f"   Average seats sold per ride: {df['seats_sold'].mean():.1f}")

        print("\n🎯 STRATEGIC RECOMMENDATIONS:")

        print("\n1. ROUTE OPTIMIZATION:")
        print(f"   • Top performing route: {high_demand_routes.index[0]} ({high_demand_routes.iloc[0]:.1f} seats avg)")
        print(f"   • Lowest performing route: {high_demand_routes.index[-1]} ({high_demand_routes.iloc[-1]:.1f} seats avg)")

        # Only recommend rebalancing when the best route sells more than
        # twice as many seats on average as the worst one.
        if high_demand_routes.iloc[0] / high_demand_routes.iloc[-1] > 2:
            print("   • Consider increasing frequency on top routes")
            print("   • Review scheduling for underperforming routes")

        print("\n2. CAPACITY MANAGEMENT:")
        underutilized = df[df['occupancy_rate'] < 30]
        # Report only when more than 20% of rides are below 30% occupancy.
        if len(underutilized) > len(df) * 0.2:
            print(f"   • {len(underutilized)} rides ({len(underutilized)/len(df)*100:.1f}%) are underutilized")
            print("   • Consider using smaller vehicles for low-demand routes")

        print("\n3. REVENUE OPTIMIZATION:")
        print("   • Implement dynamic pricing based on demand patterns")
        print("   • Offer early bird discounts for off-peak hours")
        print("   • Create loyalty programs for frequent routes")

        return {
            'avg_occupancy': avg_occupancy,
            'top_route': high_demand_routes.index[0],
            'model_performance': self.models[self.best_model_name]['r2']
        }

    def run_complete_analysis(self):
        """Run every pipeline stage in order and print a final summary.

        Returns:
            ``self``, so results can be read from the instance attributes.
        """
        print("🚀 Starting Transport Demand Analysis...")

        # 1. Load data
        self.load_data()

        # 2. Create target variable
        self.create_target_variable()

        # 3. Feature engineering
        self.feature_engineering()

        # 4. Analyze data
        self.analyze_data()

        # 5. Prepare features for modeling
        X, y, feature_names = self.prepare_features_for_modeling()

        # 6. Train models
        self.train_models(X, y)

        # 7. Evaluate models
        self.evaluate_models()

        # 8. Generate insights
        insights = self.generate_insights()

        print("\n" + "="*60)
        print("🎉 ANALYSIS COMPLETE!")
        print("="*60)
        print(f"✅ Best Model: {self.best_model_name}")
        print(f"✅ Model Performance: R² = {self.models[self.best_model_name]['r2']:.3f}")
        print(f"✅ Business Impact: {insights['avg_occupancy']:.1f}% average occupancy")
        print(f"✅ Key Insight: {insights['top_route']} is the top performing route")

        return self

# USAGE EXAMPLE:
# ================
# predictor = SimpleTransportPredictor('train_revised.csv')
# results = predictor.run_complete_analysis()
# 
# # Access results
# print(f"Best model R²: {predictor.models[predictor.best_model_name]['r2']:.3f}")

if __name__ == "__main__":
    # Show a short banner followed by the two statements needed to run the
    # pipeline; the empty entry produces the blank separator line.
    print("\n".join((
        "Simple Transport Demand Prediction",
        "=================================",
        "This version avoids plotting errors and focuses on core analysis.",
        "",
        "To run:",
        "predictor = SimpleTransportPredictor('train_revised.csv')",
        "results = predictor.run_complete_analysis()",
    )))

Simple Transport Demand Prediction
This version avoids plotting errors and focuses on core analysis.

To run:
predictor = SimpleTransportPredictor('train_revised.csv')
results = predictor.run_complete_analysis()


In [5]:
# Build the predictor for the training CSV and run every pipeline stage.
predictor = SimpleTransportPredictor('train_revised.csv')
# run_complete_analysis() returns the predictor itself.
results = predictor.run_complete_analysis()

# Report the test-set R² of whichever model was selected as best.
best_model_metrics = predictor.models[predictor.best_model_name]
print(f"Best model R²: {best_model_metrics['r2']:.3f}")

🚀 Starting Transport Demand Analysis...
TRANSPORT DEMAND PREDICTION PROJECT

📊 Dataset loaded successfully!
Shape: (51645, 10)
Columns: ['ride_id', 'seat_number', 'payment_method', 'payment_receipt', 'travel_date', 'travel_time', 'travel_from', 'travel_to', 'car_type', 'max_capacity']

First 5 rows:
   ride_id seat_number payment_method payment_receipt travel_date travel_time  \
0     1442         15A          Mpesa      UZUEHCBUSO    17-10-17        7:15   
1     5437         14A          Mpesa      TIHLBUSGTE    19-11-17        7:12   
2     5710          8B          Mpesa      EQX8Q5G19O    26-11-17        7:05   
3     5777         19A          Mpesa      SGP18CL0ME    27-11-17        7:10   
4     5778         11A          Mpesa      BM97HFRGL9    27-11-17        7:12   

  travel_from travel_to car_type  max_capacity  
0      Migori   Nairobi      Bus            49  
1      Migori   Nairobi      Bus            49  
2      Keroka   Nairobi      Bus            49  
3    Homa Bay   