In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

In [2]:
df = pd.read_csv('../../data/clean/clean_combined.csv')

print(df.dtypes)

Unnamed: 0                int64
airline_name             object
flight_code              object
departure_city           object
arrival_city             object
flight_duration          object
stops                     int64
price                     int64
class                    object
days_left               float64
departure_time_group     object
arrival_time_group       object
flight_date              object
departure_time           object
arrival_time             object
dtype: object


In [3]:
# Separate the dataset into economy and business
df_economy = df[df['class'] == 'Economy']
df_business = df[df['class'] == 'Business']


In [4]:
print(f"Rows in economy class: {df_economy.shape[0]}")
print(f"Rows in business class: {df_business.shape[0]}")

Rows in economy class: 206567
Rows in business class: 93487


In [5]:
# Define a function to preprocess the data
def preprocess_data(df):
    # Handle datetime columns
    df['flight_date'] = pd.to_datetime(df['flight_date'], errors='coerce')
    df['departure_time'] = pd.to_datetime(df['departure_time'], errors='coerce')
    df['arrival_time'] = pd.to_datetime(df['arrival_time'], errors='coerce')
    
    # Check for missing values in datetime columns and handle them
    df['flight_date'] = df['flight_date'].fillna(pd.Timestamp('2024-01-01'))
    df['departure_time'] = df['departure_time'].fillna(pd.Timestamp('2024-01-01 00:00:00'))
    df['arrival_time'] = df['arrival_time'].fillna(pd.Timestamp('2024-01-01 00:00:00'))
    
    # Extract useful features from the datetime columns (e.g., hour, day, etc.)
    df['departure_hour'] = df['departure_time'].dt.hour
    df['arrival_hour'] = df['arrival_time'].dt.hour
    df['day_of_week'] = df['flight_date'].dt.dayofweek
    
    # Drop original datetime columns to avoid redundancy
    df = df.drop(['departure_time', 'arrival_time', 'flight_date'], axis=1)
    
    # Drop 'Unnamed: 0' as it's just an index column (check if it's present)
    if 'Unnamed: 0' in df.columns:
        df = df.drop('Unnamed: 0', axis=1)
    
    return df

# Preprocess both datasets
df_economy = preprocess_data(df_economy)
df_business = preprocess_data(df_business)

# Now let's set up the feature columns (X) and target column (y)
X_economy = df_economy.drop('price', axis=1)
y_economy = df_economy['price']

X_business = df_business.drop('price', axis=1)
y_business = df_business['price']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['flight_date'] = pd.to_datetime(df['flight_date'], errors='coerce')
  df['departure_time'] = pd.to_datetime(df['departure_time'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['departure_time'] = pd.to_datetime(df['departure_time'], errors='coerce')
  df['arrival_time'] = pd.to_datetime(df['arrival_time'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https:

In [6]:
if df_economy.empty:
    print("No economy class data available.")
else:
    # Proceed with train/test split for economy class
    X_economy = df_economy.drop('price', axis=1)
    y_economy = df_economy['price']
    X_train_economy, X_test_economy, y_train_economy, y_test_economy = train_test_split(X_economy, y_economy, test_size=0.2, random_state=42)

if df_business.empty:
    print("No business class data available.")
else:
    # Proceed with train/test split for business class
    X_business = df_business.drop('price', axis=1)
    y_business = df_business['price']
    X_train_business, X_test_business, y_train_business, y_test_business = train_test_split(X_business, y_business, test_size=0.2, random_state=42)


In [7]:
# Make sure to define these features before using them in the transformer
numerical_features = X_economy.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_economy.select_dtypes(include=['object']).columns


# Create transformers for numerical and categorical columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values
    ('scaler', StandardScaler())  # Scale numerical features
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Handle missing values
    ('encoder', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
])

# Combine transformers in a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [8]:
n_estimators = 10

# Define the model pipeline (RandomForestRegressor)
model_economy = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=n_estimators, max_depth=10, random_state=42, n_jobs=-1))  # Parallelize with n_jobs
])

model_business = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=n_estimators, max_depth=10, random_state=42, n_jobs=-1))  # Parallelize with n_jobs
])

In [9]:
# Fit the models
model_economy.fit(X_train_economy, y_train_economy)

# Predict on the test data
y_pred_economy = model_economy.predict(X_test_economy)

# Evaluate the models (using Mean Absolute Error)
mae_economy = mean_absolute_error(y_test_economy, y_pred_economy)

print(f'Mean Absolute Error for Economy Class: {mae_economy}')

Mean Absolute Error for Economy Class: 1820.475922027355


In [10]:
# Fit the models
model_business.fit(X_train_business, y_train_business)

# Predict on the test data
y_pred_business = model_business.predict(X_test_business)

# Evaluate the models (using Mean Absolute Error)
mae_business = mean_absolute_error(y_test_business, y_pred_business)

print(f'Mean Absolute Error for Business Class: {mae_business}')


Mean Absolute Error for Business Class: 4665.124922956034


In [11]:
# Optionally: Cross-validation (5-fold) for both models
cv_scores_economy = cross_val_score(model_economy, X_economy, y_economy, cv=5)
cv_scores_business = cross_val_score(model_business, X_business, y_business, cv=5)

print(f'Cross-validation accuracy for Economy Class: {cv_scores_economy.mean()}')
print(f'Cross-validation accuracy for Business Class: {cv_scores_business.mean()}')

Cross-validation accuracy for Economy Class: 0.32788754266625136
Cross-validation accuracy for Business Class: 0.327964739688304
