In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

# Read the merged flight/holiday training table into a DataFrame
DATA_PATH = "merged_flight_train_with_holidays.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preprocess data: break the 'Month (YYYY-MM)' label into integer Year and
# Month columns. A single vectorized split replaces two row-wise apply()
# passes that each re-split every string.
year_month_parts = df['Month (YYYY-MM)'].str.split('-', expand=True)
df['Year'] = year_month_parts[0].astype('int64')
df['Month'] = year_month_parts[1].astype('int64')
# The raw label is no longer needed once its two parts are numeric columns.
df = df.drop(columns=['Month (YYYY-MM)'])

# Turn the two count columns into binary classification targets: a row is
# labelled 1 ("high") when its value is strictly above the column median and
# 0 ("low") otherwise.
# NOTE(review): both medians are taken over the full dataset, i.e. before the
# train/test split further down.
arrivals_median = df['Arrivals'].median()
departures_median = df['Departures'].median()

df['High_Arrivals'] = df['Arrivals'].gt(arrivals_median).astype(int)
df['High_Departures'] = df['Departures'].gt(departures_median).astype(int)

# Both models share the calendar/holiday columns and the one-hot encoded
# Country (first level dropped), so those pieces are prepared once.
shared_feature_cols = ['No of holidays', 'Year', 'Month']
country_dummies = pd.get_dummies(df['Country'], prefix='Country', drop_first=True)

# Model 1: predict whether Arrivals is high.
# Features: shared columns + 'Departures' + Country dummies
X_arrivals = pd.concat(
    [df[shared_feature_cols + ['Departures']], country_dummies], axis=1
)
y_arrivals = df['High_Arrivals']

# Model 2: predict whether Departures is high.
# Features: shared columns + 'Arrivals' + Country dummies
X_departures = pd.concat(
    [df[shared_feature_cols + ['Arrivals']], country_dummies], axis=1
)
y_departures = df['High_Departures']

# Hold out 30% of the rows for testing; both splits use the same fixed seed
# and test fraction.
split_options = {'test_size': 0.3, 'random_state': 42}

# Arrivals model split
X_train_arr, X_test_arr, y_train_arr, y_test_arr = train_test_split(
    X_arrivals, y_arrivals, **split_options
)

# Departures model split
X_train_dep, X_test_dep, y_train_dep, y_test_dep = train_test_split(
    X_departures, y_departures, **split_options
)

# Fit one logistic-regression classifier per target with identical settings.
logreg_options = {'max_iter': 1000, 'random_state': 42}

# Arrivals classifier (fit() returns the fitted estimator itself)
log_reg_arrivals = LogisticRegression(**logreg_options).fit(X_train_arr, y_train_arr)

# Departures classifier
log_reg_departures = LogisticRegression(**logreg_options).fit(X_train_dep, y_train_dep)

# Evaluate classification model
def evaluate_classification_model(model, X_test, y_test, model_name="Classification Model"):
    """
    Evaluate a fitted classifier on held-out data and print the results.

    Prints a classification report, the confusion matrix and a summary of
    accuracy, precision, recall, F1 and ROC AUC.

    Parameters:
    -----------
    model : estimator object
        The trained classification model; must provide ``predict`` and
        ``predict_proba``
    X_test : array-like
        Test features
    y_test : array-like
        True target values
    model_name : str
        Name of the model, shown in the printed header

    Returns:
    --------
    tuple
        ``(y_pred, y_pred_proba, metrics)``: the predicted labels, the second
        column of ``predict_proba`` (probability of the positive class), and
        a dict with keys 'accuracy', 'precision', 'recall', 'f1' and 'auc'.
        'auc' is the string "N/A" when the ROC AUC could not be computed.
    """
    # Make predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]  # Probability of positive class

    # Print header
    print(f"\n===== {model_name} EVALUATION =====\n")

    # Print classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Print confusion matrix
    print("Confusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

    # Calculate additional metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Only the "score is undefined" failure is tolerated here; a bare except
    # would also hide interrupts and genuine programming errors.
    # NOTE(review): scikit-learn documents ValueError for this case (e.g. a
    # single class in y_test) - confirm for the installed version.
    try:
        auc = roc_auc_score(y_test, y_pred_proba)
    except ValueError:
        auc = "N/A"

    # Format AUC like the other metrics when it is numeric
    auc_text = auc if isinstance(auc, str) else f"{auc:.4f}"

    # Print summary metrics
    print("\nSummary Metrics:")
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    print(f"AUC-ROC:   {auc_text}")

    # Return predictions and metrics for further analysis
    metrics = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc': auc
    }

    return y_pred, y_pred_proba, metrics

# Visualize classification results
def plot_classification_results(y_test, y_pred, y_pred_proba, title="Classification Results"):
    """
    Show a confusion-matrix heatmap and a ROC curve side by side.

    The left panel is the confusion matrix of ``y_test`` vs ``y_pred``. The
    right panel is the ROC curve with its AUC in the legend, or a
    "ROC curve unavailable" message when the ROC metrics cannot be computed.

    Parameters:
    -----------
    y_test : array-like
        True target values
    y_pred : array-like
        Predicted class values
    y_pred_proba : array-like
        Predicted probabilities for the positive class
    title : str
        Text appended in parentheses to each panel title

    Returns:
    --------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Create a figure with two side-by-side panels
    _, axes = plt.subplots(1, 2, figsize=(15, 6))

    # Plot confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0])
    axes[0].set_title(f'Confusion Matrix ({title})')
    axes[0].set_xlabel('Predicted Label')
    axes[0].set_ylabel('True Label')

    # Only the metric computation is guarded, and only against ValueError,
    # so mistakes in the drawing code below are no longer silently replaced
    # by the fallback message.
    # NOTE(review): scikit-learn documents ValueError when ROC AUC is
    # undefined (e.g. a single class in y_test) - confirm for the installed
    # version.
    try:
        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
        auc = roc_auc_score(y_test, y_pred_proba)
    except ValueError:
        axes[1].text(0.5, 0.5, 'ROC curve unavailable',
                     horizontalalignment='center', verticalalignment='center')
    else:
        axes[1].plot(fpr, tpr, label=f'AUC = {auc:.4f}')
        axes[1].plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
        axes[1].set_xlabel('False Positive Rate')
        axes[1].set_ylabel('True Positive Rate')
        axes[1].set_title(f'ROC Curve ({title})')
        axes[1].legend(loc='lower right')

    plt.tight_layout()
    plt.show()

# Score each fitted model on its own held-out split, keeping the predictions,
# probabilities and metric dicts for the plots below.
arrivals_outcome = evaluate_classification_model(
    log_reg_arrivals, X_test_arr, y_test_arr, "Arrivals Classification Model"
)
y_pred_arr, y_pred_proba_arr, metrics_arr = arrivals_outcome

departures_outcome = evaluate_classification_model(
    log_reg_departures, X_test_dep, y_test_dep, "Departures Classification Model"
)
y_pred_dep, y_pred_proba_dep, metrics_dep = departures_outcome

# Draw the confusion matrix / ROC figure for each model, arrivals first
for true_labels, predicted_labels, positive_scores, caption in (
    (y_test_arr, y_pred_arr, y_pred_proba_arr, "Arrivals Classification"),
    (y_test_dep, y_pred_dep, y_pred_proba_dep, "Departures Classification"),
):
    plot_classification_results(true_labels, predicted_labels, positive_scores, caption)