In [1]:
import pandas as pd
import numpy as np

## Import Dataset

In [2]:
# Load the interim flight-search dataset. The relative path is resolved against
# the working directory of the notebook kernel.
df_2 = pd.read_csv('../../data/interim/df_set_2.csv')

In [3]:
# Preview the raw frame (the notebook renders only the first and last rows).
df_2

Unnamed: 0,startingAirport,destinationAirport,flightDate,searchDate,segmentsDepartureTimeRaw,segmentsCabinCode,totalFare,totalTravelDistance
0,ATL,BOS,2022-04-17,2022-04-16,2022-04-17T12:57:00.000-04:00,coach,248.6,947.0
1,ATL,BOS,2022-04-17,2022-04-16,2022-04-17T06:30:00.000-04:00,coach,248.6,947.0
2,ATL,BOS,2022-04-17,2022-04-16,2022-04-17T11:35:00.000-04:00,coach,248.6,947.0
3,ATL,BOS,2022-04-17,2022-04-16,2022-04-17T13:59:00.000-04:00,coach,248.6,947.0
4,ATL,BOS,2022-04-17,2022-04-16,2022-04-17T09:59:00.000-04:00,coach,248.6,947.0
...,...,...,...,...,...,...,...,...
10464622,DEN,SFO,2022-06-04,2022-05-19,2022-06-04T09:45:00.000-06:00||2022-06-04T12:4...,coach||coach,506.6,1696.0
10464623,DEN,SFO,2022-06-04,2022-05-19,2022-06-04T12:25:00.000-06:00||2022-06-04T20:1...,coach||coach,562.2,1696.0
10464624,DEN,SFO,2022-06-04,2022-05-19,2022-06-04T12:25:00.000-06:00||2022-06-04T21:3...,coach||coach,562.2,1696.0
10464625,DEN,SFO,2022-06-04,2022-05-19,2022-06-04T12:25:00.000-06:00||2022-06-04T15:4...,coach||coach,586.6,1696.0


In [4]:
## Separate each segment

In [5]:
# Work on a copy so the transformations in the following cells leave df_2 as loaded.
df_copy = df_2.copy()

In [6]:
# A multi-leg itinerary stores one departure timestamp per leg, joined by '||'.
# Split at the first delimiter only and keep the leading piece, i.e. the
# departure time of the first leg.
segments = (
    df_copy['segmentsDepartureTimeRaw']
    .str.split(r'\|\|', n=1, expand=True)
)
df_copy['DepatureTime'] = segments.iloc[:, 0]

In [7]:
# Split segmentsCabinCode ('coach||coach||...') into one cabin column per leg.
# Splitting without a limit and then reindexing to exactly four columns keeps
# the assignment valid whatever the longest itinerary in the data is: legs that
# never occur become all-NaN columns and any leg beyond the fourth is dropped.
# (The former n=4 limit allowed up to five pieces, which cannot be assigned to
# four target columns.)
cabin_leg_columns = ['Cabin_Leg1', 'Cabin_Leg2', 'Cabin_Leg3', 'Cabin_Leg4']
segments = (
    df_copy['segmentsCabinCode']
    .str.split(r'\|\|', expand=True)
    .reindex(columns=range(len(cabin_leg_columns)))
)
df_copy[cabin_leg_columns] = segments

In [8]:
# Create Number of Stops feature: every '||' delimiter in segmentsCabinCode
# separates two legs, so the delimiter count equals the number of stops
# (0 for a single-leg itinerary).
stop_counts = df_copy['segmentsCabinCode'].str.count(r'\|\|')
df_copy['n_stops'] = stop_counts

In [9]:
# Legs an itinerary does not have come out of the split as missing values;
# label them 'no_stop' in all four cabin columns in a single pass.
cabin_leg_columns = ['Cabin_Leg1', 'Cabin_Leg2', 'Cabin_Leg3', 'Cabin_Leg4']
df_copy[cabin_leg_columns] = df_copy[cabin_leg_columns].fillna('no_stop')

In [10]:
# The raw '||'-joined columns have been unpacked into separate features above,
# so remove them from the working frame.
df_copy = df_copy.drop(columns=['segmentsDepartureTimeRaw', 'segmentsCabinCode'])

In [11]:
# Inspect the frame after unpacking the segment columns.
df_copy

Unnamed: 0,startingAirport,destinationAirport,flightDate,searchDate,totalFare,totalTravelDistance,DepatureTime,Cabin_Leg1,Cabin_Leg2,Cabin_Leg3,Cabin_Leg4,n_stops
0,ATL,BOS,2022-04-17,2022-04-16,248.6,947.0,2022-04-17T12:57:00.000-04:00,coach,no_stop,no_stop,no_stop,0
1,ATL,BOS,2022-04-17,2022-04-16,248.6,947.0,2022-04-17T06:30:00.000-04:00,coach,no_stop,no_stop,no_stop,0
2,ATL,BOS,2022-04-17,2022-04-16,248.6,947.0,2022-04-17T11:35:00.000-04:00,coach,no_stop,no_stop,no_stop,0
3,ATL,BOS,2022-04-17,2022-04-16,248.6,947.0,2022-04-17T13:59:00.000-04:00,coach,no_stop,no_stop,no_stop,0
4,ATL,BOS,2022-04-17,2022-04-16,248.6,947.0,2022-04-17T09:59:00.000-04:00,coach,no_stop,no_stop,no_stop,0
...,...,...,...,...,...,...,...,...,...,...,...,...
10464622,DEN,SFO,2022-06-04,2022-05-19,506.6,1696.0,2022-06-04T09:45:00.000-06:00,coach,coach,no_stop,no_stop,1
10464623,DEN,SFO,2022-06-04,2022-05-19,562.2,1696.0,2022-06-04T12:25:00.000-06:00,coach,coach,no_stop,no_stop,1
10464624,DEN,SFO,2022-06-04,2022-05-19,562.2,1696.0,2022-06-04T12:25:00.000-06:00,coach,coach,no_stop,no_stop,1
10464625,DEN,SFO,2022-06-04,2022-05-19,586.6,1696.0,2022-06-04T12:25:00.000-06:00,coach,coach,no_stop,no_stop,1


## Feature Engineering

In [12]:
# Parse the first-leg departure strings into datetimes. utc=True converts every
# value to UTC, so the offset in the raw string is applied rather than kept
# (e.g. 12:57 at -04:00 becomes 16:57+00:00).
# NOTE(review): every calendar/clock feature derived from this column is
# therefore UTC-based, not airport-local — confirm that this is what the model
# and the app expect.
df_copy['DepatureTime'] = pd.to_datetime(df_copy['DepatureTime'], utc = True)

In [13]:
# Extract calendar and clock features from the 'DepatureTime' column.
# The column was parsed with utc=True, so every component below is the UTC
# value of the first-leg departure.
df_copy['month'] = df_copy['DepatureTime'].dt.month
df_copy['day'] = df_copy['DepatureTime'].dt.day
df_copy['hour'] = df_copy['DepatureTime'].dt.hour
df_copy['minute'] = df_copy['DepatureTime'].dt.minute
df_copy['day_of_week'] = df_copy['DepatureTime'].dt.dayofweek  # Monday=0, Sunday=6
df_copy['week_of_year'] = df_copy['DepatureTime'].dt.isocalendar().week  # ISO week number of the year

In [14]:
# Convert both date columns from strings to datetimes (in place), then compute
# date_diff: the whole number of days between the search date and the flight date.
df_copy['flightDate'] = pd.to_datetime(df_copy['flightDate'])
df_copy['searchDate'] = pd.to_datetime(df_copy['searchDate'])
df_copy['date_diff'] = (df_copy['flightDate'] - df_copy['searchDate']).dt.days

In [15]:
# Median totalTravelDistance for each route, i.e. each
# (startingAirport, destinationAirport) pair. Only these two columns are
# grouping keys; n_stops is not part of the grouping.
median_travel_distance = (
    df_copy
    .groupby(['startingAirport', 'destinationAirport'], as_index=False)['totalTravelDistance']
    .median()
    .rename(columns={'totalTravelDistance': 'medianTravelDistance'})
)

In [16]:
# Inspect the per-route median distance lookup table.
median_travel_distance

Unnamed: 0,startingAirport,destinationAirport,medianTravelDistance
0,ATL,BOS,947.0
1,ATL,CLT,228.0
2,ATL,DEN,1375.0
3,ATL,DFW,725.0
4,ATL,DTW,932.0
5,ATL,EWR,762.0
6,ATL,IAD,541.0
7,ATL,JFK,762.0
8,ATL,LAX,2034.0
9,ATL,LGA,762.0


In [17]:
# Get a CSV file for APP
# NOTE(review): index=False is not passed here (unlike the train/val/test
# exports at the end of the notebook), so the row index is written as an extra
# unnamed first column — confirm the app's loader accounts for it.
median_travel_distance.to_csv('../../data/external/median_travel_distance.csv')

In [18]:
# Merge the median travel distance back into the original DataFrame.
# Left join on the two route keys: every row of df_copy is kept and gains a
# medianTravelDistance column.
# Re-running this cell merges a second time and yields suffixed duplicate columns.
df_copy = df_copy.merge(median_travel_distance, on=['startingAirport', 'destinationAirport'], how='left')


In [19]:
# Inspect the frame with all engineered features attached.
df_copy

Unnamed: 0,startingAirport,destinationAirport,flightDate,searchDate,totalFare,totalTravelDistance,DepatureTime,Cabin_Leg1,Cabin_Leg2,Cabin_Leg3,Cabin_Leg4,n_stops,month,day,hour,minute,day_of_week,week_of_year,date_diff,medianTravelDistance
0,ATL,BOS,2022-04-17,2022-04-16,248.6,947.0,2022-04-17 16:57:00+00:00,coach,no_stop,no_stop,no_stop,0,4,17,16,57,6,15,1,947.0
1,ATL,BOS,2022-04-17,2022-04-16,248.6,947.0,2022-04-17 10:30:00+00:00,coach,no_stop,no_stop,no_stop,0,4,17,10,30,6,15,1,947.0
2,ATL,BOS,2022-04-17,2022-04-16,248.6,947.0,2022-04-17 15:35:00+00:00,coach,no_stop,no_stop,no_stop,0,4,17,15,35,6,15,1,947.0
3,ATL,BOS,2022-04-17,2022-04-16,248.6,947.0,2022-04-17 17:59:00+00:00,coach,no_stop,no_stop,no_stop,0,4,17,17,59,6,15,1,947.0
4,ATL,BOS,2022-04-17,2022-04-16,248.6,947.0,2022-04-17 13:59:00+00:00,coach,no_stop,no_stop,no_stop,0,4,17,13,59,6,15,1,947.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10464622,DEN,SFO,2022-06-04,2022-05-19,506.6,1696.0,2022-06-04 15:45:00+00:00,coach,coach,no_stop,no_stop,1,6,4,15,45,5,22,16,1033.0
10464623,DEN,SFO,2022-06-04,2022-05-19,562.2,1696.0,2022-06-04 18:25:00+00:00,coach,coach,no_stop,no_stop,1,6,4,18,25,5,22,16,1033.0
10464624,DEN,SFO,2022-06-04,2022-05-19,562.2,1696.0,2022-06-04 18:25:00+00:00,coach,coach,no_stop,no_stop,1,6,4,18,25,5,22,16,1033.0
10464625,DEN,SFO,2022-06-04,2022-05-19,586.6,1696.0,2022-06-04 18:25:00+00:00,coach,coach,no_stop,no_stop,1,6,4,18,25,5,22,16,1033.0


In [20]:
# Feature matrix: drop the target (totalFare), the raw date/time columns that
# were already expanded into features, and the per-row totalTravelDistance
# (medianTravelDistance is kept instead).
X = df_copy.drop(columns=['totalFare', 'flightDate', 'searchDate', 'DepatureTime', 'totalTravelDistance'])

In [21]:
# Target vector: the totalFare column, row-aligned with X.
y = df_copy['totalFare']

In [22]:
# Inspect the final feature matrix.
X

Unnamed: 0,startingAirport,destinationAirport,Cabin_Leg1,Cabin_Leg2,Cabin_Leg3,Cabin_Leg4,n_stops,month,day,hour,minute,day_of_week,week_of_year,date_diff,medianTravelDistance
0,ATL,BOS,coach,no_stop,no_stop,no_stop,0,4,17,16,57,6,15,1,947.0
1,ATL,BOS,coach,no_stop,no_stop,no_stop,0,4,17,10,30,6,15,1,947.0
2,ATL,BOS,coach,no_stop,no_stop,no_stop,0,4,17,15,35,6,15,1,947.0
3,ATL,BOS,coach,no_stop,no_stop,no_stop,0,4,17,17,59,6,15,1,947.0
4,ATL,BOS,coach,no_stop,no_stop,no_stop,0,4,17,13,59,6,15,1,947.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10464622,DEN,SFO,coach,coach,no_stop,no_stop,1,6,4,15,45,5,22,16,1033.0
10464623,DEN,SFO,coach,coach,no_stop,no_stop,1,6,4,18,25,5,22,16,1033.0
10464624,DEN,SFO,coach,coach,no_stop,no_stop,1,6,4,18,25,5,22,16,1033.0
10464625,DEN,SFO,coach,coach,no_stop,no_stop,1,6,4,18,25,5,22,16,1033.0


In [26]:
# Split data into train, validation and test sets
from sklearn.model_selection import train_test_split

In [27]:
# First split: create training and testing sets (20% of all rows are held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Second split: create training and validation sets from the training set.
# 25% of the remaining 80% is 20% of the full data, so the final proportions
# are 60/20/20 train/validation/test. X_train and y_train are rebound to the
# smaller training portion here.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [28]:
# Persist every split to the processed-data folder. Each entry pairs a split
# with its destination file; all are written without the row index, in the
# order features (train, val, test) then targets (train, val, test).
processed_outputs = [
    (X_train, '../../data/processed/X_train.csv'),
    (X_val, '../../data/processed/X_val.csv'),
    (X_test, '../../data/processed/X_test.csv'),
    (y_train, '../../data/processed/y_train.csv'),
    (y_val, '../../data/processed/y_val.csv'),
    (y_test, '../../data/processed/y_test.csv'),
]
for split_data, csv_path in processed_outputs:
    split_data.to_csv(csv_path, index=False)