## 1. import dataset

In [1]:
import zipfile
import pandas as pd
import os
import re

# Directory where the ZIP files are located (assuming it's relative)
base_directory = '../data/raw/itineraries_csv'  

# Create an empty DataFrame to store the results
result_df = pd.DataFrame()

# Define a regular expression pattern to match filenames ending with two lowercase letters
file_pattern = re.compile(r'.*[a-z][a-z]\.zip')

# List all airport folders in the base directory
airport_names = [name for name in os.listdir(base_directory) if os.path.isdir(os.path.join(base_directory, name))]

dfs = []  # Create a list to store DataFrames

for airport_name in airport_names:
    # Directory path for the current airport
    zip_directory = os.path.join(base_directory, airport_name)
    
    # List all files in the airport's folder
    file_list = os.listdir(zip_directory)

    for filename in file_list:
        if file_pattern.match(filename):
            zip_file_path = os.path.join(zip_directory, filename)
            csv_file_path_inside_zip = filename.replace('.zip', '.feather')
            
            # Create a ZipFile object and read the CSV file
            with zipfile.ZipFile(zip_file_path, 'r') as zf:
                df = pd.read_feather(zf.open(csv_file_path_inside_zip))

            # Append the DataFrame for this filename to the list of DataFrames
            dfs.append(df)

# Use pandas.concat to concatenate the list of DataFrames into a single DataFrame
# df = pd.concat(dfs, ignore_index=True)

In [2]:
df = pd.concat(dfs, ignore_index=True)

In [3]:

# legId                                      0
# searchDate                                0
# flightDate                                0
# startingAirport                           0
# destinationAirport                        0
# travelDuration                            0
# isBasicEconomy                            0
# isRefundable                              0
# totalFare                                 0 (y)
# totalTravelDistance                  959619
# segmentsDepartureTimeEpochSeconds         0
# segmentsArrivalTimeEpochSeconds           0
# segmentsArrivalAirportCode                0
# segmentsDepartureAirportCode              0
# segmentsAirlineCode                       0
# segmentsEquipmentDescription         262676
# segmentsDurationInSeconds                 0 -> sum
# segmentsDistance                          0 -> sum
# segmentsCabinCode                         0
# -------
# travelLayover (travelDuration - segmentsDurationInSeconds)
# datediff (flightDate - searchDate)
# transitAirportCode (list) -> check arrival departure 
# numberOfTransit -> count (transitAirportCode)

## 2. Data preprocessing

### Create `travelLayover` column

**Convert `travelDuration` into second**

In [4]:
# Function to convert a duration string to seconds
def convert_duration_to_seconds(duration):
    match = re.match(r'PT(\d+)H(\d+)M', duration)
    
    if match:
        hours = int(match.group(1))
        minutes = int(match.group(2))
        total_seconds = hours * 3600 + minutes * 60
        return total_seconds
    else:
        return None

# Apply the function to the 'travelDuration' column and create a new column 'travelDurationInSeconds'
df['travelDurationInSeconds'] = df['travelDuration'].apply(convert_duration_to_seconds)


In [5]:
df['travelDurationInSeconds'].head()

0    28320.0
1    22500.0
2    32760.0
3    22620.0
4    51120.0
Name: travelDurationInSeconds, dtype: float64

**Convert `durationinsecond`**`

In [6]:
# Function to split and sum the values
def split_and_sum(segment_duration):
    segments = segment_duration.split('||')
    return sum(map(int, segments))

# Apply the function to the 'segmentsDurationInSeconds' column and create a new column 'totalDurationInSeconds'
df['totalDurationInSeconds'] = df['segmentsDurationInSeconds'].apply(split_and_sum)


In [7]:
df['totalDurationInSeconds'].head()

0    19800
1    20520
2    20520
3    19560
4    25080
Name: totalDurationInSeconds, dtype: int64

**Calculate `travelLayover`**

In [8]:
df['travelLayover'] = df['travelDurationInSeconds'] - df['totalDurationInSeconds']
df['travelLayover'].head()

0     8520.0
1     1980.0
2    12240.0
3     3060.0
4    26040.0
Name: travelLayover, dtype: float64

In [9]:
segment_cols = ['segmentsDepartureTimeEpochSeconds',
 'segmentsArrivalTimeEpochSeconds',
 'segmentsArrivalAirportCode',
 'segmentsDepartureAirportCode',
 'segmentsAirlineName',
 'segmentsAirlineCode',
 'segmentsEquipmentDescription',
 'segmentsDurationInSeconds',
 'segmentsDistance',
 'segmentsCabinCode']

In [10]:
df['segmentsDepartureTimeEpochSeconds'] = df['segmentsDepartureTimeEpochSeconds'].apply(lambda x: re.split(r'\|\|', x))
df['segmentsArrivalTimeEpochSeconds'] = df['segmentsArrivalTimeEpochSeconds'].apply(lambda x: re.split(r'\|\|', x))

In [11]:
df['segmentsArrivalAirportCode'] = df['segmentsArrivalAirportCode'].apply(lambda x: re.split(r'\|\|', x))
df['segmentsDepartureAirportCode'] = df['segmentsDepartureAirportCode'].apply(lambda x: re.split(r'\|\|', x))

In [12]:
df['segmentsAirlineName'] = df['segmentsAirlineName'].apply(lambda x: re.split(r'\|\|', x))
df['segmentsAirlineCode'] = df['segmentsAirlineCode'].apply(lambda x: re.split(r'\|\|', x))

In [13]:
print(df['segmentsEquipmentDescription'].apply(type))

0           <class 'str'>
1           <class 'str'>
2           <class 'str'>
3           <class 'str'>
4           <class 'str'>
                ...      
13519994    <class 'str'>
13519995    <class 'str'>
13519996    <class 'str'>
13519997    <class 'str'>
13519998    <class 'str'>
Name: segmentsEquipmentDescription, Length: 13519999, dtype: object


In [14]:
# Define a function to split the string and create a list
def split_duration(segment):
    return [int(value) for value in re.split(r'\|\|', segment)]

# Apply the function to the 'segmentsDurationInSeconds' column
df['segmentsDurationInSeconds'] = df['segmentsDurationInSeconds'].apply(split_duration)


In [15]:
def split_description(segment):
    if segment and isinstance(segment, str):
        return [description.strip() for description in re.split(r'\|\|', segment) if description]
    else:
        return []

# Apply the function to the 'segmentsEquipmentDescription' column
df['segmentsEquipmentDescription'] = df['segmentsEquipmentDescription'].apply(split_description)


In [16]:
df['segmentsDistance'] = df['segmentsDistance'].apply(lambda x: re.split(r'\|\|', x))
df['segmentsCabinCode'] = df['segmentsCabinCode'].apply(lambda x: re.split(r'\|\|', x))

In [17]:
### Find Sum of duration
df['SumsegmentsDurationInSeconds'] = df['segmentsDurationInSeconds'].apply(lambda x: [pd.to_numeric(value, errors='coerce') for value in x])
df['SumsegmentsDurationInSeconds'] = df['SumsegmentsDurationInSeconds'].apply(sum)

In [18]:
def get_first_element(x):
    return x[0] if len(x) > 0 else None

def get_last_element(x):
    return x[-1] if len(x) > 0 else None

df['Departure'] = df['segmentsDepartureAirportCode'].map(get_first_element)
df['Arrival'] = df['segmentsArrivalAirportCode'].map(get_last_element)

In [19]:
df['Departure'] = df['segmentsDepartureAirportCode'].apply(lambda x: [x[0]])
df['Arrival'] = df['segmentsArrivalAirportCode'].apply(lambda x: [x[-1]])

In [20]:
#create new col to collect only transit airport
import pandas as pd

# Assuming df is your DataFrame

def process_code_list(code_list):
    if len(code_list) != 1:
        code_list = code_list[1:] 
    return code_list

df['transitAirportCode'] = df['segmentsDepartureAirportCode'].apply(process_code_list)

In [21]:
### Find Sum of duration
df['SumsegmentsDistance'] = df['segmentsDistance'].apply(lambda x: [pd.to_numeric(value, errors='coerce') for value in x])
df['SumsegmentsDistance'] = df['SumsegmentsDistance'].apply(sum)

In [22]:
df['AllAirport'] = df['transitAirportCode'] + df['Departure'] + df['Arrival']


In [24]:
dfreset_index().to_feather('../data/df.feather')

## 3. Feature Engineering


In [1]:
import zipfile
import pandas as pd
import os
import re
import numpy as np

In [2]:
df_cleaned = pd.read_feather('../../data/df.feather')

In [3]:
df_cleaned = df_cleaned.drop(columns=['Departure', 'Arrival', 'segmentsDistance', 'segmentsDepartureTimeRaw', 'segmentsArrivalTimeRaw', 'segmentsDurationInSeconds', 'segmentsDistance', 'travelDuration', 'SumsegmentsDurationInSeconds'])

In [4]:
df_cleaned.rename(columns={'totalDurationInSeconds':'segment_totalDurationInSeconds', 'SumsegmentsDistance':'segment_totalDistance'}, inplace=True)

In [5]:
#encoding -> label encoder, standard encoder -> save -> corr -> save result diff notebook -> split clean notebook

In [6]:
#transform date column: searchDate
df_cleaned['searchDate'] = pd.to_datetime(df_cleaned['searchDate'])
df_cleaned['searchDate_day'] = df_cleaned['searchDate'].dt.day
df_cleaned['searchDate_month'] = df_cleaned['searchDate'].dt.month
df_cleaned['searchDate_year'] = df_cleaned['searchDate'].dt.year

#transform date column: flightDate
df_cleaned['flightDate'] = pd.to_datetime(df_cleaned['flightDate'])
df_cleaned['flightDate_day'] = df_cleaned['flightDate'].dt.day
df_cleaned['flightDate_month'] = df_cleaned['flightDate'].dt.month
df_cleaned['flightDate_year'] = df_cleaned['flightDate'].dt.year

#drop date cols
df_cleaned = df_cleaned.drop(columns=['searchDate', 'flightDate'])

In [7]:
df_cleaned['segmentsArrivalAirportCode'] = df_cleaned['segmentsArrivalAirportCode'].apply(lambda x: pd.factorize(x)[0])
df_cleaned['segmentsDepartureAirportCode'] = df_cleaned['segmentsDepartureAirportCode'].apply(lambda x: pd.factorize(x)[0])

In [8]:
df_cleaned['segmentsAirlineName'] = df_cleaned['segmentsAirlineName'].apply(lambda x: pd.factorize(x)[0])
df_cleaned['segmentsAirlineCode'] = df_cleaned['segmentsAirlineCode'].apply(lambda x: pd.factorize(x)[0])

In [9]:
df_cleaned['segmentsEquipmentDescription'] = df_cleaned['segmentsEquipmentDescription'].apply(lambda x: pd.factorize(x)[0])

In [10]:
df_cleaned['segmentsCabinCode'] = df_cleaned['segmentsCabinCode'].apply(lambda x: pd.factorize(x)[0])

In [11]:
df_cleaned['AllAirport'] = df_cleaned['AllAirport'].apply(lambda x: pd.factorize(x)[0])

In [12]:
df_cleaned['transitAirportCode'] = df_cleaned['transitAirportCode'].apply(lambda x: pd.factorize(x)[0])

In [13]:
df_cleaned['segmentsDepartureTimeEpochSeconds']

0           [1653098280, 1653115980]
1           [1653062160, 1653069060]
2           [1653051900, 1653069060]
3           [1653105360, 1653114000]
4           [1653108060, 1653141600]
                      ...           
13519994    [1655294700, 1655327100]
13519995    [1655321160, 1655338500]
13519996    [1655306100, 1655326500]
13519997    [1655292000, 1655312400]
13519998    [1655292000, 1655326500]
Name: segmentsDepartureTimeEpochSeconds, Length: 13519999, dtype: object

In [14]:
df_cleaned['DepartTime'] = df_cleaned['segmentsDepartureTimeEpochSeconds'].apply(lambda x: x[0])

In [15]:
df_cleaned['DepartTime'] = pd.to_datetime(df_cleaned['DepartTime'], unit='s')

In [16]:

# Extract and create new columns for hours, minutes, and seconds
df_cleaned['DepartTime_hour'] = df_cleaned['DepartTime'].dt.hour
df_cleaned['DepartTime_minute'] = df_cleaned['DepartTime'].dt.minute
df_cleaned['DepartTime_second'] = df_cleaned['DepartTime'].dt.second


In [17]:
df_cleaned = df_cleaned.drop(columns=['DepartTime'])

In [18]:
df_cleaned = df_cleaned[['totalTravelDistance', 'isNonStop', 'isBasicEconomy', 'startingAirport', 'destinationAirport', 'segmentsCabinCode','flightDate_day', 'flightDate_month', 'flightDate_year',
                         'DepartTime_hour', 'DepartTime_minute', 'DepartTime_second','totalFare']]

In [19]:
#defined num_cols and cat_cols
import numpy as np
cols = df_cleaned.columns.to_list()
num_cols = df_cleaned.select_dtypes(np.number).columns.to_list()
segment_cols = ['segmentsDepartureTimeEpochSeconds', 'segmentsArrivalTimeEpochSeconds', 'segmentsArrivalAirportCode', 'segmentsDepartureAirportCode', 'segmentsAirlineName', 'segmentsAirlineCode',
                'segmentsEquipmentDescription', 'segmentsCabinCode', 'transitAirportCode', 'AllAirport']
cat_cols = list(set(cols) - set(num_cols) - set(segment_cols))

In [20]:
df_cleaned['totalTravelDistance']= df_cleaned['totalTravelDistance'].fillna(df_cleaned['totalTravelDistance'].mean())

In [21]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df_cleaned[cat_cols] = df_cleaned[cat_cols].apply(le.fit_transform)



In [22]:
#scale numeric column
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

df_cleaned[num_cols] = scaler.fit_transform(df_cleaned[num_cols])

In [23]:
df_cleanedreset_index().to_feather('../../data/processed/df_cleaned2_select_cols.feather')

## 4. Split dataset

In [50]:
df_cleaned = pd.read_feather("../../data/processed/df_cleaned2_select_cols.feather")

In [68]:
import sys
print(sys.path)

['/Users/baiporthn/Projects/Adv_ml/flight-streamlit-at3/flight-prediction/notebooks/TP_notebooks', '/Users/baiporthn/opt/anaconda3/lib/python39.zip', '/Users/baiporthn/opt/anaconda3/lib/python3.9', '/Users/baiporthn/opt/anaconda3/lib/python3.9/lib-dynload', '', '/Users/baiporthn/.local/lib/python3.9/site-packages', '/Users/baiporthn/opt/anaconda3/lib/python3.9/site-packages', '/Users/baiporthn/opt/anaconda3/lib/python3.9/site-packages/aeosa', '/Users/baiporthn/opt/anaconda3/lib/python3.9/site-packages/IPython/extensions', '/Users/baiporthn/.ipython', '../../src/', '../../src/', '../../src/', '../../src/', '../../src/', '../../src/']


In [52]:
from data.make_dataset import *
features, target = pop_target(df_cleaned, 'totalFare')

In [53]:
X_train, y_train, X_val, y_val, X_test, y_test = split_sets_random(features, target, test_ratio=0.2)

In [65]:
def save_sets(X_train=None, y_train=None, X_val=None, y_val=None, X_test=None, y_test=None, path='../data/processed/'):

    import pandas as pd
    import os.path

    if X_train is not None:
        X_train.reset_index().to_feather(f'{path}X_train.feather')
    if X_val is not None:
        X_val.reset_index().to_feather(f'{path}X_val.feather')
    if X_test is not None:
        X_test.reset_index().to_feather(f'{path}X_test.feather')
    if y_train is not None:
        y_train = y_train.to_frame()
        y_train.reset_index().to_feather(f'{path}y_train.feather')
    if y_val is not None:
        y_val = y_val.to_frame()
        y_val.reset_index().to_feather(f'{path}y_val.feather')
    if y_test is not None:
        y_test = y_test.to_frame()
        y_test.reset_index().to_feather(f'{path}y_test.feather')


In [None]:
save_sets(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, X_test=X_test, y_test=y_test, path='../../data/processed/')

In [71]:
def load_sets(path='../data/processed/'):
    """Load the different locally save sets

    Parameters
    ----------
    path : str
        Path to the folder where the sets are saved (default: '../data/processed/')

    Returns
    -------
    Numpy Array
        Features for the training set
    Numpy Array
        Target for the training set
    Numpy Array
        Features for the validation set
    Numpy Array
        Target for the validation set
    Numpy Array
        Features for the testing set
    Numpy Array
        Target for the testing set
    """
    import numpy as np
    import os.path

    X_train = pd.read_feather(f'{path}X_train.feather') if os.path.isfile(f'{path}X_train.feather') else None
    X_val   = pd.read_feather(f'{path}X_val.feather') if os.path.isfile(f'{path}X_val.feather')   else None
    X_test  = pd.read_feather(f'{path}X_test.feather') if os.path.isfile(f'{path}X_test.feather')  else None
    y_train = pd.read_feather(f'{path}y_train.feather') if os.path.isfile(f'{path}y_train.feather') else None
    y_val   = pd.read_feather(f'{path}y_val.feather') if os.path.isfile(f'{path}y_val.feather')   else None
    y_test  = pd.read_feather(f'{path}y_test.feather') if os.path.isfile(f'{path}y_test.feather')  else None

    return X_train, y_train, X_val, y_val, X_test, y_test

In [72]:
#split ratio 80:20 -> train val test
# from data.make_dataset import *
X_train, y_train, X_val, y_val, X_test, y_test = load_sets(path='../../data/processed/')