In [145]:
# model.py
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from datetime import datetime

In [147]:
class CancellationPredictor:
    """Hold the model, scaler and feature metadata for cancellation forecasts.

    ``model``, ``feature_columns`` and ``last_data`` start as ``None`` and are
    assigned by the ``train`` routine defined later in this file.
    """

    # Month number -> season code, as used for the SEASON feature:
    # 1 = Dec-Feb, 2 = Mar-May, 3 = Jun-Aug, 4 = Sep-Nov.
    _SEASON_BY_MONTH = {
        12: 1, 1: 1, 2: 1,
        3: 2, 4: 2, 5: 2,
        6: 3, 7: 3, 8: 3,
        9: 4, 10: 4, 11: 4,
    }

    def __init__(self):
        self.model = None
        self.scaler = StandardScaler()
        self.feature_columns = None
        # Assigned by train(); declared here so the attribute always exists.
        self.last_data = None

    def prepare_data(self, df, lags=(1, 3, 6), windows=(3, 6)):
        """Return a feature-engineered copy of ``df``.

        The input must provide the columns ``YEAR``, ``MONTH`` (full month
        name, parsed with ``%B``), ``CANCEL_COUNT``, ``TRIPS`` and ``LATES``.
        The caller's frame is left untouched.

        Args:
            df: Raw data, one row per period.
            lags: Row offsets for the ``CANCEL_COUNT_LAG_n`` and
                ``FAILURE_RATE_LAG_n`` columns.
            windows: Window sizes for the ``CANCEL_COUNT_ROLL_n`` rolling
                means (each window includes the current row).

        Returns:
            A new DataFrame ordered by ``DATE`` with the derived columns
            added. Every row containing a missing value in any column is
            dropped, which removes the leading rows that lack enough history
            for the longest lag or window.
        """
        # Work on a copy so the caller's DataFrame is not modified.
        df = df.copy()

        # Create datetime features
        df['DATE'] = pd.to_datetime(
            df['YEAR'].astype(str) + '-' + df['MONTH'], format='%Y-%B'
        )
        # shift() and rolling() below are positional, so the rows must be in
        # chronological order for "lag 1" to mean "previous period". A stable
        # sort keeps the original relative order of rows sharing a date.
        df = df.sort_values('DATE', kind='stable')
        df['YEAR_NUM'] = df['DATE'].dt.year
        df['MONTH_NUM'] = df['DATE'].dt.month

        # Create additional features
        df['SEASON'] = df['MONTH_NUM'].map(self._SEASON_BY_MONTH)
        df['FAILURE_RATE'] = df['CANCEL_COUNT'] / df['TRIPS']
        df['LATE_RATE'] = df['LATES'] / df['TRIPS']

        # Create lag features
        for lag in lags:
            df[f'CANCEL_COUNT_LAG_{lag}'] = df['CANCEL_COUNT'].shift(lag)
            df[f'FAILURE_RATE_LAG_{lag}'] = df['FAILURE_RATE'].shift(lag)

        # Create rolling means
        for window in windows:
            df[f'CANCEL_COUNT_ROLL_{window}'] = (
                df['CANCEL_COUNT'].rolling(window=window).mean()
            )

        return df.dropna()


In [148]:
def train(self, data_path):
        """Fit the scaler and an XGBoost regressor on the CSV at ``data_path``.

        NOTE(review): this is defined at module level with an explicit
        ``self`` parameter; presumably it is meant to be a method of
        ``CancellationPredictor`` -- confirm how it is attached.

        The CSV is read with pandas and passed through ``self.prepare_data``.
        Besides the columns that step derives, the prepared frame must
        contain ``TRIPS``, ``LATES``, ``ON_TIME_PERCENTAGE``,
        ``MEAN_DISTANCE_BEFORE_FAILURE`` and the target ``CANCEL_COUNT``.

        Side effects: sets ``self.feature_columns``, fits ``self.scaler``,
        assigns ``self.model`` and stores the most recent prepared row in
        ``self.last_data``.

        Args:
            data_path: Path (or anything ``pandas.read_csv`` accepts) of the
                training data.

        Raises:
            ValueError: If no rows remain after feature preparation.
        """
        # Read data
        df = self.prepare_data(pd.read_csv(data_path))
        if df.empty:
            raise ValueError(
                "No training rows remain after feature preparation; "
                "lag and rolling features need several periods of history."
            )

        # Define features.
        # FAILURE_RATE is deliberately NOT a feature: it is computed as
        # CANCEL_COUNT / TRIPS, so together with TRIPS it reveals the target
        # exactly (target leakage). Its lagged versions are still used.
        base_features = [
            'YEAR_NUM', 'MONTH_NUM', 'SEASON',
            'TRIPS', 'LATES', 'ON_TIME_PERCENTAGE', 'MEAN_DISTANCE_BEFORE_FAILURE',
            'LATE_RATE',
        ]
        # NOTE(review): the CANCEL_COUNT_ROLL_* columns produced by
        # prepare_data include the current row's CANCEL_COUNT, so they also
        # leak part of the target; consider shifting them by one row there.
        history_features = [
            col for col in df.columns if 'LAG' in col or 'ROLL' in col
        ]
        self.feature_columns = base_features + history_features

        X = df[self.feature_columns]
        y = df['CANCEL_COUNT']

        # Scale features
        X_scaled = self.scaler.fit_transform(X)

        # Train model
        self.model = xgb.XGBRegressor(
            n_estimators=200,
            learning_rate=0.1,
            max_depth=6,
            random_state=42
        )
        self.model.fit(X_scaled, y)

        # Store the most recent row for predictions. Select it by DATE rather
        # than by file position so an unsorted CSV still yields the latest row.
        self.last_data = df.sort_values('DATE', kind='stable').iloc[-1:].copy()

In [149]:
def predict(self, year, month):
        """Predict the cancellation count for the given year and month.

        NOTE(review): this is defined at module level with an explicit
        ``self`` parameter; presumably it is meant to be a method of
        ``CancellationPredictor`` -- confirm how it is attached.

        Only ``YEAR_NUM``, ``MONTH_NUM`` and ``SEASON`` are derived from the
        arguments. Every other feature in ``self.feature_columns`` is copied
        from ``self.last_data`` (the last known row), so lag and rolling
        values are not recomputed for the requested month.

        Args:
            year: Calendar year of the period to predict.
            month: Month number, 1-12.

        Returns:
            The prediction rounded to the nearest integer and clamped at 0,
            as a built-in ``int``.

        Raises:
            RuntimeError: If no model has been trained yet.
            ValueError: If ``month`` is outside 1-12.
        """
        if self.model is None:
            # RuntimeError is still caught by existing `except Exception`.
            raise RuntimeError("Model not trained. Call train() first.")
        if not 1 <= month <= 12:
            raise ValueError(f"month must be between 1 and 12, got {month!r}")

        # Same season coding as the training features:
        # Dec-Feb -> 1, Mar-May -> 2, Jun-Aug -> 3, Sep-Nov -> 4.
        season = (month % 12) // 3 + 1
        calendar_features = {
            'YEAR_NUM': year,
            'MONTH_NUM': month,
            'SEASON': season,
        }

        # Build a single-row frame in the exact column order used for
        # training; anything not derived from the date falls back to the
        # last known value.
        row = {}
        for col in self.feature_columns:
            if col in calendar_features:
                row[col] = [calendar_features[col]]
            else:
                row[col] = [self.last_data[col].values[0]]
        pred_data = pd.DataFrame(row)

        # Scale and predict
        pred_data_scaled = self.scaler.transform(pred_data[self.feature_columns])
        prediction = self.model.predict(pred_data_scaled)[0]

        # Convert through float so the result is a plain int regardless of
        # the numeric type the model returns.
        return max(0, int(round(float(prediction))))