In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from scipy.optimize import minimize

class CapacityIndependentPricingSystem:
    def __init__(self):
        self.demand_model = None
        self.price_elasticity_model = None
        self.fitted = False
        
    def fit_demand_model(self, historical_data):
        """
        Fit demand model using historical waitlist data
        
        Parameters:
        -----------
        historical_data :
            Contains columns: 'train_id', 'class', 'days_remaining', 
            'day_of_week', 'waitlist','fromStnCode','toStnCode' and optionally 'base_fare'
        """
        # Create advanced features
        X = self._create_features(historical_data)
        y = historical_data['waitlist']
        
        # Build a demand prediction model
        self.demand_model = Pipeline([
            ('scaler', StandardScaler()),
            ('gb', GradientBoostingRegressor(
                n_estimators=200, 
                learning_rate=0.05,
                max_depth=4,
                subsample=0.8,
                random_state=42
            ))
        ])
        
        # Fit the model
        self.demand_model.fit(X, y)
        
        # Get estimated capacity from waitlist data
        self._estimate_capacity(historical_data)
        
        # Train price elasticity model based on waitlist patterns
        self._train_price_elasticity_model(historical_data)
        
        self.fitted = True
        return self
    
    def _estimate_capacity(self, historical_data):
        """
        Estimate capacity from waitlist patterns
        
        Logic: When waitlist is near zero, the train is likely at capacity
        We can use the booking pattern to estimate the total capacity
        """
        # Find periods where waitlist is close to zero (equilibrium)
        equilibrium_data = historical_data[(historical_data['waitlist'] >= -2) & 
                                          (historical_data['waitlist'] <= 2)]
        
        if len(equilibrium_data) >= 3:
            # For each train/class combination, estimate capacity
            capacity_estimates = {}
            
            # Group by train and class
            if 'trainNumber' in historical_data.columns and 'classCode' in historical_data.columns:
                for (train, train_class), group in historical_data.groupby(['trainNumber', 'classCode']):
                    # Estimate full booking pattern curve
                    # If we had actual booking data, we'd use that directly
                    
                    # For now,we make an educated guess based on common patterns
                    # Typical 2A coach has 48-72 seats
                    # Typical 3A coach has 64-80 berths
                    # Sleeper class typically has 72-80 berths
                    
                    if train_class == '2A':
                        capacity_estimates[(train, train_class)] = 54
                    elif train_class == '3A':
                        capacity_estimates[(train, train_class)] = 72
                    elif train_class == 'SL':
                        capacity_estimates[(train, train_class)] = 80
                    elif train_class=='CC':
                        capacity_estimates[(train,train_class)]=80
                    else:
                        # Default for unknown classes
                        capacity_estimates[(train, train_class)] = 60
            else:
                # If no train/class info such as the presence of 2S or 3S as a class, use overall estimate.
                capacity_estimates['default'] = 60
                
            self.capacity_estimates = capacity_estimates
        else:
            # Not enough equilibrium data, use defaults
            self.capacity_estimates = {'default': 60}  # Default capacity estimate
    
    def _get_capacity(self, train_id, train_class):
        """Get estimated capacity for a train/class combination"""
        if (train_id, train_class) in self.capacity_estimates:
            return self.capacity_estimates[(train_id, train_class)]
        else:
            return self.capacity_estimates.get('default', 60)
    
    def _train_price_elasticity_model(self, historical_data):
        
        # Infer price elasticity from waitlist patterns
        
        # Without actual price variation data, we estimate elasticity based on:
        # 1. Days before departure (closer = less elastic, travel is much more certain)
        # 2. Waitlist trends (high waitlist = less elastic, people are not sensitive to price changes)
        # 3. Day of week patterns (weekends = less elastic, people are more likely to travel to and fro)
        
        # Create elasticity estimation model
        features = self._create_features(historical_data)
        
        # Create synthetic elasticity based on domain knowledge
        # This is a PLACEHOLDER for actual elasticity data
        elasticity = np.zeros(len(historical_data))
        
        for i, row in historical_data.iterrows():
            days = row['daysRemaining']
            waitlist = row['waitlist']
            
            #(closer = less elastic)
            if days <= 7:
                base_elasticity = -0.3  # Very inelastic
            elif days <= 15:
                base_elasticity = -0.7
            elif days <= 30:
                base_elasticity = -1.2
            else:
                base_elasticity = -1.8  # much more elastic
            
            # Adjust based on waitlist (higher = less elastic)
            waitlist_factor = min(max(waitlist / 10, -0.5), 0.5)
            
            # Final elasticity (constrained between -1.9 and -0.1)
            elasticity[i] = max(min(base_elasticity + waitlist_factor, -0.1), -1.9)
        
        # Train elasticity model
        self.price_elasticity_model = Pipeline([
            ('scaler', StandardScaler()),
            ('gb', GradientBoostingRegressor(n_estimators=100, max_depth=3))
        ])
        
        self.price_elasticity_model.fit(features, elasticity)
    
    def _create_features(self, data):
        """
        Create rich feature set for modeling
        """
        X = pd.DataFrame()
        
        # Lead time features
        X['daysRemaining'] = data['daysRemaining']
        X['daysRemaining_log'] = np.log1p(data['daysRemaining'])
        X['daysRemaining_Squared'] = data['daysRemaining'] ** 2
        
        # Time-based features
        # Cyclical encoding of day of week (1-7)
        X['day_sin'] = np.sin(2 * np.pi * data['day_of_week'] / 7)
        X['day_cos'] = np.cos(2 * np.pi * data['day_of_week'] / 7)
        
        # Special periods if available
        if 'is_holiday_period' in data.columns:
            X['is_holiday_period'] = data['is_holiday_period']
        
        if 'is_weekend' in data.columns:
            X['is_weekend'] = data['is_weekend']
        else:
            X['is_weekend'] = (data['day_of_week'] >= 5).astype(int)
        
        # Train-specific features
        if 'train_id' in data.columns:
            train_dummies = pd.get_dummies(data['train_id'], prefix='train')
            X = pd.concat([X, train_dummies], axis=1)
        
        if 'class' in data.columns:
            class_dummies = pd.get_dummies(data['class'], prefix='class')
            X = pd.concat([X, class_dummies], axis=1)
        
        # Seasonality features if date is available
        if 'date' in data.columns:
            data['date'] = pd.to_datetime(data['date'])
            X['month'] = data['date'].dt.month
            X['month_sin'] = np.sin(2 * np.pi * X['month'] / 12)
            X['month_cos'] = np.cos(2 * np.pi * X['month'] / 12)
        
        return X
    
    def predict_waitlist(self, query_data):
        """
        Predict waitlist based on query parameters
        """
        if not self.fitted:
            raise ValueError("Model hasn't been fitted yet")
        
        X = self._create_features(query_data)
        return self.demand_model.predict(X)
    
    def estimate_price_elasticity(self, query_data):
        """
        Estimate price elasticity for the given scenario
        """
        if self.price_elasticity_model is not None:
            X = self._create_features(query_data)
            return self.price_elasticity_model.predict(X)
        else:
            # Rule-based fallback for elasticity estimates
            days_remaining = query_data['days_remaining'].values[0]
            
            # Elasticity typically increases (becomes less elastic) closer to departure
            if days_remaining <= 7:
                return -0.3  # Very inelastic (last minute)
            elif days_remaining <= 15:
                return -0.7  # Somewhat inelastic
            elif days_remaining <= 30:
                return -1.2  # Moderately elastic
            else:
                return -1.8  # More elastic (advance planning)
    
    def calculate_demand_index(self, waitlist):
        """
        Convert waitlist to a demand index from 0-100
        
        A high positive waitlist means very high demand (near 100)
        A large negative waitlist means very low demand (near 0)
        Waitlist near zero means moderate demand (around 50)
        """
        # Sigmoid function to convert waitlist to 0-100 scale
        return 100 / (1 + np.exp(-0.2 * waitlist))
    
    def _revenue_function(self, price_multiplier, base_price, elasticity, demand_index):
        """
        Calculate expected revenue impact at a given price point
        This is a utility function for optimization
        """
        # Convert demand index to a baseline quantity (arbitrary scale)
        baseline_quantity = demand_index
        
        # Calculate expected quantity at the given price multiplier using constant elasticity model
        # Q2 = Q1 * (P2/P1)^e where e is the price elasticity
        expected_quantity = baseline_quantity * (price_multiplier ** elasticity)
        
        # Calculate expected revenue
        revenue = base_price * price_multiplier * expected_quantity
        
        # We want to maximize revenue, but scipy.optimize minimizes
        return -revenue
    
    def calculate_optimal_price(self, query_data, base_fare):
        """
        Calculate optimal price based on demand dynamics
        
        Parameters:
        -----------
        query_data : pandas DataFrame
            Contains 'train_id', 'class', 'days_remaining', 'day_of_week'
        base_fare : float
            Base fare for the journey
            
        Returns:
        --------
        optimal_price : float
            The optimal price after applying the multiplier
        diagnostics : dict
            Diagnostic information about the calculation
        """
        if not self.fitted:
            raise ValueError("Model hasn't been fitted yet")
        
        # Predict waitlist
        predicted_waitlist = self.predict_waitlist(query_data)[0]
        
        # Convert waitlist to demand index (0-100)
        demand_index = self.calculate_demand_index(predicted_waitlist)
        
        # Estimate price elasticity
        elasticity = self.estimate_price_elasticity(query_data)[0]
        
        # Define constraint: price must be between 0.7x and 3.0x base fare
        bounds = [(0.7, 3.0)]
        
        # Find price multiplier that maximizes revenue
        result = minimize(
            lambda x: self._revenue_function(x[0], base_fare, elasticity, demand_index),
            x0=[1.0],  # Start with base fare
            bounds=bounds,
            method='L-BFGS-B'
        )
        
        optimal_multiplier = result.x[0]
        optimal_price = base_fare * optimal_multiplier
        
        # Calculate expected revenue at optimal price (for diagnostics)
        expected_revenue = -self._revenue_function(
            optimal_multiplier, base_fare, elasticity, demand_index
        )
        
        # Additional business rules
        days_remaining = query_data['days_remaining'].values[0]
        day_of_week = query_data['day_of_week'].values[0]
        
        # Implement tatkal pricing rules if close to departure
        if days_remaining <= 1:
            # Tatkal rules (premium pricing for last-minute bookings)
            optimal_price = max(optimal_price, base_fare * 1.5)
        
        # Apply day of week adjustments
        if day_of_week >= 5:  # Weekend
            if demand_index > 60:  # High demand
                optimal_price = min(optimal_price * 1.1, base_fare * 3.0)
        
        # Diagnostic information
        diagnostics = {
            'predicted_waitlist': predicted_waitlist,
            'demand_index': demand_index,
            'price_elasticity': elasticity,
            'optimal_multiplier': optimal_multiplier,
            'expected_revenue': expected_revenue,
            'optimization_success': result.success,
            'days_remaining': days_remaining
        }
        
        return optimal_price, diagnostics
    
    def get_price_curve(self, train_id, train_class, base_fare, days_range=60):
        """
        Generate a price curve over time for a specific train
        
        Parameters:
        -----------
        train_id : str
            Train identifier
        train_class : str
            Class identifier (e.g., '2A', '3A')
        base_fare : float
            Base fare for the journey
        days_range : int
            Number of days before departure to simulate
            
        Returns:
        --------
        price_curve : pandas DataFrame
            DataFrame with optimal prices for each day
        """
        results = []
        
        for days_remaining in range(days_range, 0, -1):
            # Average across all days of week
            prices_by_dow = []
            
            for day_of_week in range(1, 8):
                query_data = pd.DataFrame({
                    'train_id': [train_id],
                    'class': [train_class],
                    'days_remaining': [days_remaining],
                    'day_of_week': [day_of_week],
                })
                
                optimal_price, _ = self.calculate_optimal_price(query_data, base_fare)
                prices_by_dow.append(optimal_price)
            
            # Average price across days of week
            avg_price = np.mean(prices_by_dow)
            
            # Get single-day prediction for detailed diagnostics
            query_data = pd.DataFrame({
                'train_id': [train_id],
                'class': [train_class],
                'days_remaining': [days_remaining],
                'day_of_week': [4],  # Thursday as an average day
            })
            
            _, diagnostics = self.calculate_optimal_price(query_data, base_fare)
            
            results.append({
                'days_remaining': days_remaining,
                'predicted_waitlist': diagnostics['predicted_waitlist'],
                'demand_index': diagnostics['demand_index'],
                'price_elasticity': diagnostics['price_elasticity'],
                'optimal_price': avg_price,
                'price_multiplier': avg_price / base_fare,
            })
        
        return pd.DataFrame(results)


# Example usage
def demonstrate_capacity_independent_model():
    # Create synthetic historical data based on the chart pattern from the image
    days_remaining = list(range(60, 0, -1))
    
    # Create synthetic waitlist data based on the chart pattern
    waitlist_values = []
    for day in days_remaining:
        if day < 10:
            # High demand period (peak around day 8)
            waitlist = 19.5 - abs(day - 8) * 2.5
        elif 10 <= day < 15:
            # Medium-high demand
            waitlist = 8 - (day - 10) * 1.0
        elif 15 <= day < 30:
            # Medium demand period
            waitlist = 3 - (day - 15) * 0.15
        elif 30 <= day < 40:
            # Low demand period
            waitlist = 0 - (day - 30) * 0.4
        else:
            # Negative waitlist period (empty seats)
            waitlist = -4 + np.random.normal(0, 0.5)
            
        waitlist_values.append(waitlist)
    
    # Create DataFrame
    historical_data = pd.DataFrame({
        'train_id': ['1027'] * len(days_remaining),
        'class': ['2A'] * len(days_remaining),
        'days_remaining': days_remaining,
        'day_of_week': [day % 7 + 1 for day in range(len(days_remaining))],  # Rotating through days
        'avg_waitlist': waitlist_values
    })
    
    # Initialize and fit model
    pricing_system = CapacityIndependentPricingSystem()
    pricing_system.fit_demand_model(historical_data)
    
    # Generate pricing curve
    base_fare = 2000  # Base fare in INR
    price_curve = pricing_system.get_price_curve('1027', '2A', base_fare)
    
    return historical_data, price_curve

# Execute demo
historical_data, price_curve = demonstrate_capacity_independent_model()
print(f"Dynamic pricing curve for Train 1027, Class 2A:")
print(price_curve[['days_remaining', 'predicted_waitlist', 'demand_index', 'price_multiplier', 'optimal_price']].head(10))

KeyError: 'daysRemaining'

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from scipy.optimize import minimize