In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

In [3]:
df = pd.read_csv('../../data/clean/clean_dataset_df.csv')

print(df.dtypes)

airline_name            object
flight_code             object
departure_city          object
arrival_city            object
flight_duration          int64
stops                    int64
price                    int64
class                   object
days_left                int64
departure_time_group    object
arrival_time_group      object
dtype: object


In [4]:
df_economy = df[df['class'] == 'Economy']
df_business = df[df['class'] == 'Business']

In [5]:
print(f'Rows in economy class: {df_economy.shape[0]}')
print(f'Rows in business class: {df_business.shape[0]}')

Rows in economy class: 206666
Rows in business class: 93487


In [None]:
# preprocess the data
def preprocess_data(df):
    # handle datetime columns
    # df['flight_date'] = pd.to_datetime(df['flight_date'], errors='coerce')
    df['departure_time_group'] = pd.to_datetime(df['departure_time_group'], errors='coerce')
    df['arrival_time_group'] = pd.to_datetime(df['arrival_time_group'], errors='coerce')
    
    # handle missing values
    # df['flight_date'] = df['flight_date'].fillna(pd.Timestamp('2024-01-01'))
    df['departure_time_group'] = df['departure_time_group'].fillna(pd.Timestamp('2024-01-01 00:00:00'))
    df['arrival_time_group'] = df['arrival_time_group'].fillna(pd.Timestamp('2024-01-01 00:00:00'))
    
    # extract datetime features
    df['departure_hour'] = df['departure_time_group'].dt.hour
    df['arrival_hour'] = df['arrival_time_group'].dt.hour
    # df['day_of_week'] = df['flight_date'].dt.dayofweek
    
    df = df.drop(['departure_time_group', 'arrival_time_group'], axis=1) # drop original datetime columns
    
    if 'Unnamed: 0' in df.columns:
        df = df.drop('Unnamed: 0', axis=1)
    
    return df

# preprocess
df_economy = preprocess_data(df_economy)
df_business = preprocess_data(df_business)

# feature and target columns
X_economy = df_economy.drop('price', axis=1)
y_economy = df_economy['price']

X_business = df_business.drop('price', axis=1)
y_business = df_business['price']


In [7]:
if df_economy.empty:
    print('No economy class data available.')
else:
    # split for economy class
    X_economy = df_economy.drop('price', axis=1)
    y_economy = df_economy['price']
    X_train_economy, X_test_economy, y_train_economy, y_test_economy = train_test_split(X_economy, y_economy, test_size=0.2, random_state=42)

if df_business.empty:
    print('No business class data available.')
else:
    # split for business class
    X_business = df_business.drop('price', axis=1)
    y_business = df_business['price']
    X_train_business, X_test_business, y_train_business, y_test_business = train_test_split(X_business, y_business, test_size=0.2, random_state=42)


In [8]:
# define features before transformer
numerical_features = X_economy.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_economy.select_dtypes(include=['object']).columns


# transformers for numerical and categorical columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # handle missing values
    ('scaler', StandardScaler())  # Scale numerical features
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # handle missing values
    ('encoder', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
])

# combine transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [None]:
n_estimators = 10

# model pipeline (Random Forest Regressor)
model_economy = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=n_estimators, max_depth=10, random_state=42, n_jobs=-1))
])

model_business = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=n_estimators, max_depth=10, random_state=42, n_jobs=-1))
])

In [None]:
# fit the models
model_economy.fit(X_train_economy, y_train_economy)

# predict
y_pred_economy = model_economy.predict(X_test_economy)

# evaluate
mae_economy = mean_absolute_error(y_test_economy, y_pred_economy)

print(f'Mean Absolute Error for Economy Class: {mae_economy}')

# calculate r2
r2_economy = r2_score(y_test_economy, y_pred_economy)

print(f'R-squared for Economy Class: {r2_economy}')

Mean Absolute Error for Economy Class: 1200.1764278931853
R-squared for Economy Class: 0.7469571170006852


In [None]:
# fit the models
model_business.fit(X_train_business, y_train_business)

# predict
y_pred_business = model_business.predict(X_test_business)

# evaluate
mae_business = mean_absolute_error(y_test_business, y_pred_business)

print(f'Mean Absolute Error for Business Class: {mae_business}')

# calculate r2
r2_business = r2_score(y_test_business, y_pred_business)

print(f'R-squared for Business Class: {r2_business}')


Mean Absolute Error for Business Class: 4649.520237863189
R-squared for Business Class: 0.7254690281023917


In [None]:
# corss validation (5 fold)
cv_scores_business = cross_val_score(model_business, X_business, y_business, cv=5)

print(f'Cross-validation accuracy for Business Class: {cv_scores_business.mean()}')

Cross-validation accuracy for Business Class: 0.4092693057087353


In [None]:
# cross validation (5 fold)
cv_scores_economy = cross_val_score(model_economy, X_economy, y_economy, cv=5)

print(f'Cross-validation accuracy for Economy Class: {cv_scores_economy.mean()}')

Cross-validation accuracy for Economy Class: 0.6473783970626904
