### Import Packages

In [25]:
import pandas as pd
import seaborn as sns
import xgboost as xgb
import optuna
import sklearn
import matplotlib as plt
import numpy as np
import category_encoders as ce
import joblib
import catboost as cb


#  version info dictionary
versions = {
    "Pandas": pd.__version__,
    "Seaborn": sns.__version__,
    "XGBoost": xgb.__version__,
    "Optuna": optuna.__version__,
    "Scikit-learn": sklearn.__version__,
    "NumPy": np.__version__,
    "Matplotlib": plt.__version__,
    "Category Encoders": ce.__version__,
    "Joblib": joblib.__version__,
    "CatBoost": cb.__version__,
}

# Print versions 
print("Package Versions:")
print("-" * 40)
for package, version in versions.items():
    print(f"{package:.<25} v{version}")
print("-" * 40)

Package Versions:
----------------------------------------
Pandas................... v2.2.2
Seaborn.................. v0.13.2
XGBoost.................. v2.1.1
Optuna................... v4.0.0
Scikit-learn............. v1.5.1
NumPy.................... v1.26.4
Matplotlib............... v3.9.2
Category Encoders........ v2.6.4
Joblib................... v1.4.2
CatBoost................. v1.2.7
----------------------------------------


In [11]:
pwd 

'/Users/sadmansakib/Downloads/AdvanceML/adv_mla_at3/notebooks'

In [19]:
import pandas as pd

train_data = pd.read_csv('../data/processed/train_data.csv')
val_data = pd.read_csv('../data/processed/val_data.csv')
test_data = pd.read_csv('../data/processed/test_data.csv')

In [15]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 295736 entries, 0 to 295735
Data columns (total 19 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   startingAirport      295736 non-null  object 
 1   destinationAirport   295736 non-null  object 
 2   travelDuration       295736 non-null  float64
 3   isBasicEconomy       295736 non-null  int64  
 4   isRefundable         295736 non-null  bool   
 5   isNonStop            295736 non-null  bool   
 6   totalFare            295736 non-null  float64
 7   totalTravelDistance  295736 non-null  float64
 8   month                295736 non-null  int64  
 9   day                  295736 non-null  int64  
 10  day_of_week          295736 non-null  int64  
 11  week_of_year         295736 non-null  int64  
 12  date_diff            295736 non-null  int64  
 13  hour                 295736 non-null  int64  
 14  minute               295736 non-null  int64  
 15  cabin_Leg1       

In [29]:
# Convert boolean columns to integers for all datasets
for dataset in [train_data, val_data, test_data]:
    dataset['isRefundable'] = dataset['isRefundable'].astype(int)
    dataset['isNonStop'] = dataset['isNonStop'].astype(int)

In [30]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 295736 entries, 0 to 295735
Data columns (total 19 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   startingAirport      295736 non-null  object 
 1   destinationAirport   295736 non-null  object 
 2   travelDuration       295736 non-null  float64
 3   isBasicEconomy       295736 non-null  int64  
 4   isRefundable         295736 non-null  int64  
 5   isNonStop            295736 non-null  int64  
 6   totalFare            295736 non-null  float64
 7   totalTravelDistance  295736 non-null  float64
 8   month                295736 non-null  int64  
 9   day                  295736 non-null  int64  
 10  day_of_week          295736 non-null  int64  
 11  week_of_year         295736 non-null  int64  
 12  date_diff            295736 non-null  int64  
 13  hour                 295736 non-null  int64  
 14  minute               295736 non-null  int64  
 15  cabin_Leg1       

In [31]:
# Define target and features
target_column = 'totalFare'

# Define categorical and numerical features
categorical_features = [
    'startingAirport', 'destinationAirport',
    'cabin_Leg1', 'cabin_Leg2', 'cabin_Leg3', 'cabin_Leg4'
]

numerical_features = [
    'travelDuration', 'isBasicEconomy', 'isRefundable', 'isNonStop',
    'totalTravelDistance', 'month', 'day', 'day_of_week', 'week_of_year',
    'date_diff', 'hour', 'minute'
]

# Combine features
feature_columns = numerical_features + categorical_features

# Prepare training data
X_train = train_data[feature_columns]
y_train = train_data[target_column]

# Prepare validation data
X_val = val_data[feature_columns]
y_val = val_data[target_column]

# Prepare test data
X_test = test_data[feature_columns]
y_test = test_data[target_column]


### 1.Liner Regression 

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Create preprocessing pipelines for both numeric and categorical data
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a pipeline with preprocessor and model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Fit the pipeline
model.fit(X_train, y_train)

# Make predictions on train and validation sets
train_predictions = model.predict(X_train)
val_predictions = model.predict(X_val)

# Calculate and display metrics
def display_metrics(y_true, y_pred, dataset_name):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    
    print(f"\n{dataset_name} Metrics:")
    print(f"RMSE: ${rmse:.2f}")
    print(f"MAE: ${mae:.2f}")

# Print metrics for both sets
display_metrics(y_train, train_predictions, "Training")
display_metrics(y_val, val_predictions, "Validation")


Training Metrics:
RMSE: $166.75
MAE: $111.97

Validation Metrics:
RMSE: $172.68
MAE: $113.01


#### Hyperparameter tuning with GridSearchCV

In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
import numpy as np


# Create pipelines for different models
ridge_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Ridge())
])

lasso_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Lasso())
])

elastic_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', ElasticNet())
])

# Define parameter grids for each model
ridge_params = {
    'regressor__alpha': [0.01, 0.1, 1.0, 10.0, 100.0],
    'regressor__solver': ['auto', 'svd', 'cholesky', 'lsqr']
}

lasso_params = {
    'regressor__alpha': [0.01, 0.1, 1.0, 10.0, 100.0],
    'regressor__selection': ['cyclic', 'random']
}

elastic_params = {
    'regressor__alpha': [0.01, 0.1, 1.0, 10.0, 100.0],
    'regressor__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
}

# Perform Grid Search for each model
def train_and_evaluate_model(pipeline, params, model_name):
    grid_search = GridSearchCV(
        pipeline,
        params,
        cv=5,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1
    )
    
    grid_search.fit(X_train, y_train)
    
    print(f"\n{model_name} Best Parameters:")
    print(grid_search.best_params_)
    print(f"{model_name} Best Score (RMSE): ${-grid_search.best_score_:.2f}")
    
    return grid_search.best_estimator_

# Train and evaluate all models
print("Training models with Grid Search CV...")
ridge_model = train_and_evaluate_model(ridge_pipeline, ridge_params, "Ridge")
lasso_model = train_and_evaluate_model(lasso_pipeline, lasso_params, "Lasso")
elastic_model = train_and_evaluate_model(elastic_pipeline, elastic_params, "ElasticNet")

# Function to evaluate model performance
def evaluate_model(model, X_train, y_train, X_val, y_val, model_name):
    # Make predictions
    train_pred = model.predict(X_train)
    val_pred = model.predict(X_val)
    
    # Calculate metrics
    train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
    train_mae = mean_absolute_error(y_train, train_pred)
    val_rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    val_mae = mean_absolute_error(y_val, val_pred)
    
    print(f"\n{model_name} Final Metrics:")
    print("Training Metrics:")
    print(f"RMSE: ${train_rmse:.2f}")
    print(f"MAE: ${train_mae:.2f}")
    print("\nValidation Metrics:")
    print(f"RMSE: ${val_rmse:.2f}")
    print(f"MAE: ${val_mae:.2f}")

# Evaluate all models
print("\nEvaluating final models...")
evaluate_model(ridge_model, X_train, y_train, X_val, y_val, "Ridge")
evaluate_model(lasso_model, X_train, y_train, X_val, y_val, "Lasso")
evaluate_model(elastic_model, X_train, y_train, X_val, y_val, "ElasticNet")

Training models with Grid Search CV...

Ridge Best Parameters:
{'regressor__alpha': 0.1, 'regressor__solver': 'auto'}
Ridge Best Score (RMSE): $166.89


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(



Lasso Best Parameters:
{'regressor__alpha': 0.01, 'regressor__selection': 'cyclic'}
Lasso Best Score (RMSE): $167.01


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(



ElasticNet Best Parameters:
{'regressor__alpha': 0.01, 'regressor__l1_ratio': 0.9}
ElasticNet Best Score (RMSE): $168.17

Evaluating final models...

Ridge Final Metrics:
Training Metrics:
RMSE: $166.75
MAE: $111.98

Validation Metrics:
RMSE: $172.68
MAE: $113.01

Lasso Final Metrics:
Training Metrics:
RMSE: $166.87
MAE: $111.98

Validation Metrics:
RMSE: $172.90
MAE: $113.06

ElasticNet Final Metrics:
Training Metrics:
RMSE: $168.09
MAE: $112.37

Validation Metrics:
RMSE: $174.26
MAE: $113.48


#### performing the best one(RIDGE) in the test set.

In [18]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Make predictions on test set
test_predictions = ridge_model.predict(X_test)

# Calculate metrics for test set
test_rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
test_mae = mean_absolute_error(y_test, test_predictions)

# Display test metrics
print("\nTest Set Metrics:")
print(f"RMSE: ${test_rmse:.2f}")
print(f"MAE: ${test_mae:.2f}")

# Optional: Calculate and display prediction intervals (assuming normal distribution)
residuals = y_train - ridge_model.predict(X_train)
residual_std = np.std(residuals)

confidence_interval = 1.96 * residual_std  # 95% confidence interval
print(f"\nPrediction Interval (95% confidence):")
print(f"±${confidence_interval:.2f}")

# Optional: Display some example predictions vs actual values
print("\nSample Predictions vs Actual Values:")
sample_size = min(5, len(y_test))
sample_indices = np.random.choice(len(y_test), sample_size, replace=False)

print("\nActual Price vs Predicted Price:")
for idx in sample_indices:
    print(f"Actual: ${y_test.iloc[idx]:.2f}, Predicted: ${test_predictions[idx]:.2f}, " 
          f"Difference: ${abs(y_test.iloc[idx] - test_predictions[idx]):.2f}")


Test Set Metrics:
RMSE: $170.04
MAE: $113.84

Prediction Interval (95% confidence):
±$326.82

Sample Predictions vs Actual Values:

Actual Price vs Predicted Price:
Actual: $438.70, Predicted: $529.87, Difference: $91.17
Actual: $294.60, Predicted: $423.39, Difference: $128.79
Actual: $554.01, Predicted: $369.77, Difference: $184.24
Actual: $465.60, Predicted: $454.93, Difference: $10.67
Actual: $576.60, Predicted: $488.60, Difference: $88.00


### 2.Decision Tree

In [19]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Create preprocessing pipelines with unique names
dt_numeric_transformer = Pipeline(steps=[
    ('dt_scaler', StandardScaler())
])

dt_categorical_transformer = Pipeline(steps=[
    ('dt_onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
])

# Combine preprocessing steps
dt_preprocessor = ColumnTransformer(
    transformers=[
        ('dt_num', dt_numeric_transformer, numerical_features),
        ('dt_cat', dt_categorical_transformer, categorical_features)
    ])

# Create pipeline with Decision Tree
dt_model = Pipeline(steps=[
    ('dt_preprocessor', dt_preprocessor),
    ('dt_regressor', DecisionTreeRegressor(random_state=42))
])

# Fit the pipeline
dt_model.fit(X_train, y_train)

# Make predictions
dt_train_predictions = dt_model.predict(X_train)
dt_val_predictions = dt_model.predict(X_val)

# Calculate and display metrics
def display_dt_metrics(y_true, y_pred, dataset_name):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    
    print(f"\n{dataset_name} Metrics:")
    print(f"RMSE: ${rmse:.2f}")
    print(f"MAE: ${mae:.2f}")

# Print metrics for both sets
display_dt_metrics(y_train, dt_train_predictions, "Training")
display_dt_metrics(y_val, dt_val_predictions, "Validation")


Training Metrics:
RMSE: $11.16
MAE: $0.49

Validation Metrics:
RMSE: $127.76
MAE: $66.24


**The model is clearly overfitted. In the training set, it performed very well with high accuracy, but in the unseen dataset (validation set), it's not performing well at all, showing poor generalization ability**

In [20]:
from sklearn.model_selection import GridSearchCV

# Create pipeline with Decision Tree
dt_pipeline_tuning = Pipeline([
    ('dt_preprocessor', dt_preprocessor),
    ('dt_regressor', DecisionTreeRegressor(random_state=42))
])

# Define parameter grid
dt_param_grid = {
    'dt_regressor__max_depth': [5, 10, 15, 20, None],
    'dt_regressor__min_samples_split': [2, 5, 10],
    'dt_regressor__min_samples_leaf': [1, 2, 4],
    'dt_regressor__max_features': ['sqrt', 'log2', None]
}

# Perform Grid Search
dt_grid_search = GridSearchCV(
    dt_pipeline_tuning,
    dt_param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1
)

print("Training Decision Tree model with Grid Search CV...")
dt_grid_search.fit(X_train, y_train)

# Print best parameters and score
print("\nBest Parameters:")
print(dt_grid_search.best_params_)
print(f"Best Cross-Validation RMSE: ${-dt_grid_search.best_score_:.2f}")

# Get best model
dt_best_model = dt_grid_search.best_estimator_

# Make predictions with best model
dt_best_train_predictions = dt_best_model.predict(X_train)
dt_best_val_predictions = dt_best_model.predict(X_val)

# Print metrics for both sets
print("\nFinal Metrics with Best Model:")
display_dt_metrics(y_train, dt_best_train_predictions, "Training")
display_dt_metrics(y_val, dt_best_val_predictions, "Validation")

Training Decision Tree model with Grid Search CV...

Best Parameters:
{'dt_regressor__max_depth': None, 'dt_regressor__max_features': None, 'dt_regressor__min_samples_leaf': 4, 'dt_regressor__min_samples_split': 10}
Best Cross-Validation RMSE: $115.21

Final Metrics with Best Model:

Training Metrics:
RMSE: $68.70
MAE: $39.84

Validation Metrics:
RMSE: $117.08
MAE: $66.15


**The model shows less overfitting than before, as the gap between training and validation metrics has decreased.still there are room for improvment.still model is overfitted**

In [23]:
# Make predictions on test set
dt_test_predictions = dt_best_model.predict(X_test)

# Calculate metrics for test set
dt_test_rmse = np.sqrt(mean_squared_error(y_test, dt_test_predictions))
dt_test_mae = mean_absolute_error(y_test, dt_test_predictions)

# Display test metrics
print("\nDecision Tree Test Set Metrics:")
print(f"RMSE: ${dt_test_rmse:.2f}")
print(f"MAE: ${dt_test_mae:.2f}")

# Optional: Show some example predictions
print("\nSample Predictions vs Actual Values:")
sample_size = min(5, len(y_test))
sample_indices = np.random.choice(len(y_test), sample_size, replace=False)

print("\nActual Price vs Predicted Price:")
for idx in sample_indices:
    print(f"Actual: ${y_test.iloc[idx]:.2f}, Predicted: ${dt_test_predictions[idx]:.2f}, "
          f"Difference: ${abs(y_test.iloc[idx] - dt_test_predictions[idx]):.2f}")

# Simple feature importance for numerical features only
dt_feature_importance = pd.DataFrame({
    'feature': numerical_features,
    'importance': dt_best_model.named_steps['dt_regressor'].feature_importances_[:len(numerical_features)]
})
dt_feature_importance = dt_feature_importance.sort_values('importance', ascending=False)

print("\nTop Numerical Features by Importance:")
print(dt_feature_importance)


Decision Tree Test Set Metrics:
RMSE: $110.75
MAE: $66.28

Sample Predictions vs Actual Values:

Actual Price vs Predicted Price:
Actual: $351.60, Predicted: $351.03, Difference: $0.57
Actual: $176.60, Predicted: $211.79, Difference: $35.19
Actual: $359.10, Predicted: $371.49, Difference: $12.39
Actual: $658.10, Predicted: $621.93, Difference: $36.17
Actual: $200.60, Predicted: $237.45, Difference: $36.85

Top Numerical Features by Importance:
                feature  importance
4   totalTravelDistance    0.339155
0        travelDuration    0.077766
9             date_diff    0.049508
7           day_of_week    0.037546
1        isBasicEconomy    0.032346
10                 hour    0.029176
6                   day    0.029101
8          week_of_year    0.028906
11               minute    0.024061
5                 month    0.003093
3             isNonStop    0.000180
2          isRefundable    0.000002


### 3.Random forest

In [32]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Create preprocessing pipelines with unique names
rf_numeric_transformer = Pipeline(steps=[
    ('rf_scaler', StandardScaler())
])

rf_categorical_transformer = Pipeline(steps=[
    ('rf_onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
])

# Combine preprocessing steps
rf_preprocessor = ColumnTransformer(
    transformers=[
        ('rf_num', rf_numeric_transformer, numerical_features),
        ('rf_cat', rf_categorical_transformer, categorical_features)
    ])

# Create pipeline with Random Forest
rf_model = Pipeline(steps=[
    ('rf_preprocessor', rf_preprocessor),
    ('rf_regressor', RandomForestRegressor(random_state=42))
])

# Fit the pipeline
rf_model.fit(X_train, y_train)

# Make predictions
rf_train_predictions = rf_model.predict(X_train)
rf_val_predictions = rf_model.predict(X_val)

# Calculate and display metrics
def display_rf_metrics(y_true, y_pred, dataset_name):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    
    print(f"\n{dataset_name} Metrics:")
    print(f"RMSE: ${rmse:.2f}")
    print(f"MAE: ${mae:.2f}")

# Print metrics for both sets
display_rf_metrics(y_train, rf_train_predictions, "Training")
display_rf_metrics(y_val, rf_val_predictions, "Validation")


Training Metrics:
RMSE: $34.20
MAE: $19.76

Validation Metrics:
RMSE: $94.22
MAE: $52.53


In [33]:
from sklearn.model_selection import GridSearchCV

# Create pipeline with Random Forest
rf_pipeline_tuning = Pipeline([
    ('rf_preprocessor', rf_preprocessor),
    ('rf_regressor', RandomForestRegressor(random_state=42))
])

# Define parameter grid
rf_param_grid = {
    'rf_regressor__n_estimators': [100, 200],
    'rf_regressor__max_depth': [10, 20, None],
    'rf_regressor__min_samples_split': [2, 5],
    'rf_regressor__min_samples_leaf': [1, 2],
    'rf_regressor__max_features': ['sqrt', 'log2']
}

# Perform Grid Search
rf_grid_search = GridSearchCV(
    rf_pipeline_tuning,
    rf_param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1
)

print("Training Random Forest model with Grid Search CV...")
rf_grid_search.fit(X_train, y_train)

# Print best parameters and score
print("\nBest Parameters:")
print(rf_grid_search.best_params_)
print(f"Best Cross-Validation RMSE: ${-rf_grid_search.best_score_:.2f}")

# Get best model
rf_best_model = rf_grid_search.best_estimator_

# Make predictions with best model
rf_best_train_predictions = rf_best_model.predict(X_train)
rf_best_val_predictions = rf_best_model.predict(X_val)

# Print metrics for both sets
print("\nFinal Metrics with Best Model:")
display_rf_metrics(y_train, rf_best_train_predictions, "Training")
display_rf_metrics(y_val, rf_best_val_predictions, "Validation")

Training Random Forest model with Grid Search CV...


  _data = np.array(data, dtype=dtype, copy=copy,



Best Parameters:
{'rf_regressor__max_depth': None, 'rf_regressor__max_features': 'sqrt', 'rf_regressor__min_samples_leaf': 1, 'rf_regressor__min_samples_split': 2, 'rf_regressor__n_estimators': 200}
Best Cross-Validation RMSE: $93.78

Final Metrics with Best Model:

Training Metrics:
RMSE: $34.85
MAE: $20.76

Validation Metrics:
RMSE: $97.22
MAE: $55.46


**A better result was achieved in the unseen dataset, but there is room for improvement**

In [34]:
# Make predictions on test set
rf_test_predictions = rf_best_model.predict(X_test)

# Calculate metrics for test set
rf_test_rmse = np.sqrt(mean_squared_error(y_test, rf_test_predictions))
rf_test_mae = mean_absolute_error(y_test, rf_test_predictions)

# Display test metrics
print("\nRandom Forest Test Set Metrics:")
print(f"RMSE: ${rf_test_rmse:.2f}")
print(f"MAE: ${rf_test_mae:.2f}")

# Show some example predictions
print("\nSample Predictions vs Actual Values:")
sample_size = min(5, len(y_test))
sample_indices = np.random.choice(len(y_test), sample_size, replace=False)

print("\nActual Price vs Predicted Price:")
for idx in sample_indices:
    print(f"Actual: ${y_test.iloc[idx]:.2f}, Predicted: ${rf_test_predictions[idx]:.2f}, "
          f"Difference: ${abs(y_test.iloc[idx] - rf_test_predictions[idx]):.2f}")

# Simple feature importance for numerical features
rf_feature_importance = pd.DataFrame({
    'feature': numerical_features,
    'importance': rf_best_model.named_steps['rf_regressor'].feature_importances_[:len(numerical_features)]
})
rf_feature_importance = rf_feature_importance.sort_values('importance', ascending=False)

print("\nTop Numerical Features by Importance:")
print(rf_feature_importance)


Random Forest Test Set Metrics:
RMSE: $91.79
MAE: $55.84

Sample Predictions vs Actual Values:

Actual Price vs Predicted Price:
Actual: $221.58, Predicted: $262.56, Difference: $40.98
Actual: $483.60, Predicted: $484.91, Difference: $1.31
Actual: $326.20, Predicted: $382.03, Difference: $55.83
Actual: $296.20, Predicted: $358.29, Difference: $62.09
Actual: $268.60, Predicted: $272.90, Difference: $4.30

Top Numerical Features by Importance:
                feature  importance
4   totalTravelDistance    0.188719
0        travelDuration    0.095137
9             date_diff    0.064717
11               minute    0.051243
10                 hour    0.048961
6                   day    0.042555
7           day_of_week    0.042107
8          week_of_year    0.035996
1        isBasicEconomy    0.024900
5                 month    0.013659
3             isNonStop    0.007210
2          isRefundable    0.000979
