## 1. import dataset

In [101]:
import zipfile
import pandas as pd
import os
import re

# Directory where the ZIP files are located (assuming it's relative)
base_directory = '../data/raw/itineraries_csv'  

# Create an empty DataFrame to store the results
result_df = pd.DataFrame()

# Define a regular expression pattern to match filenames ending with two lowercase letters
file_pattern = re.compile(r'.*[a-z][a-z]\.zip')

# List all airport folders in the base directory
airport_names = [name for name in os.listdir(base_directory) if os.path.isdir(os.path.join(base_directory, name))]

dfs = []  # Create a list to store DataFrames

for airport_name in airport_names:
    # Directory path for the current airport
    zip_directory = os.path.join(base_directory, airport_name)
    
    # List all files in the airport's folder
    file_list = os.listdir(zip_directory)

    for filename in file_list:
        if file_pattern.match(filename):
            zip_file_path = os.path.join(zip_directory, filename)
            csv_file_path_inside_zip = filename.replace('.zip', '.csv')
            
            # Create a ZipFile object and read the CSV file
            with zipfile.ZipFile(zip_file_path, 'r') as zf:
                df = pd.read_csv(zf.open(csv_file_path_inside_zip))

            # Append the DataFrame for this filename to the list of DataFrames
            dfs.append(df)

# Use pandas.concat to concatenate the list of DataFrames into a single DataFrame
all_airport = pd.concat(dfs, ignore_index=True)

In [26]:

# legId                                      0
# searchDate                                0
# flightDate                                0
# startingAirport                           0
# destinationAirport                        0
# travelDuration                            0
# isBasicEconomy                            0
# isRefundable                              0
# totalFare                                 0 (y)
# totalTravelDistance                  959619
# segmentsDepartureTimeEpochSeconds         0
# segmentsArrivalTimeEpochSeconds           0
# segmentsArrivalAirportCode                0
# segmentsDepartureAirportCode              0
# segmentsAirlineCode                       0
# segmentsEquipmentDescription         262676
# segmentsDurationInSeconds                 0 -> sum
# segmentsDistance                          0 -> sum
# segmentsCabinCode                         0
# -------
# travelLayover (travelDuration - segmentsDurationInSeconds)
# datediff (flightDate - searchDate)
# transitAirportCode (list) -> check arrival departure 
# numberOfTransit -> count (transitAirportCode)

## 2. Data preprocessing

### Create `travelLayover` column

**Convert `travelDuration` into second**

In [102]:
# Function to convert a duration string to seconds
def convert_duration_to_seconds(duration):
    match = re.match(r'PT(\d+)H(\d+)M', duration)
    
    if match:
        hours = int(match.group(1))
        minutes = int(match.group(2))
        total_seconds = hours * 3600 + minutes * 60
        return total_seconds
    else:
        return None

# Apply the function to the 'travelDuration' column and create a new column 'travelDurationInSeconds'
all_airport['travelDurationInSeconds'] = all_airport['travelDuration'].apply(convert_duration_to_seconds)


In [103]:
all_airport['travelDurationInSeconds'].head()

0    28320.0
1    22500.0
2    32760.0
3    22620.0
4    51120.0
Name: travelDurationInSeconds, dtype: float64

**Convert `durationinsecond`**`

In [104]:
# Function to split and sum the values
def split_and_sum(segment_duration):
    segments = segment_duration.split('||')
    return sum(map(int, segments))

# Apply the function to the 'segmentsDurationInSeconds' column and create a new column 'totalDurationInSeconds'
all_airport['totalDurationInSeconds'] = all_airport['segmentsDurationInSeconds'].apply(split_and_sum)


In [105]:
all_airport['totalDurationInSeconds'].head()

0    19800
1    20520
2    20520
3    19560
4    25080
Name: totalDurationInSeconds, dtype: int64

**Calculate `travelLayover`**

In [106]:
all_airport['travelLayover'] = all_airport['travelDurationInSeconds'] - all_airport['totalDurationInSeconds']
all_airport['travelLayover'].head()

0     8520.0
1     1980.0
2    12240.0
3     3060.0
4    26040.0
Name: travelLayover, dtype: float64

In [None]:
#sepearte col by ||
segment_cols = [col for col in all_airport.columns if col.startswith('segments')]
for col in segment_cols:
    all_airport[col] = all_airport[col].apply(lambda x: x.split(r'\|\|'))

In [32]:
### Find Sum of duration
all_airport['SumsegmentsDurationInSeconds'] = all_airport['segmentsDurationInSeconds'].apply(lambda x: [pd.to_numeric(value, errors='coerce') for value in x])
all_airport['SumsegmentsDurationInSeconds'] = all_airport['SumsegmentsDurationInSeconds'].apply(sum)

In [33]:
#find departure and arrival
all_airport['Departure'] = all_airport['segmentsDepartureAirportCode'].apply(lambda x: x[0])
all_airport['Arrival'] = all_airport['segmentsArrivalAirportCode'].apply(lambda x: x[-1])

In [34]:
#create new col to collect only transit airport
import pandas as pd

# Assuming all_airport is your DataFrame

def process_code_list(code_list):
    if len(code_list) != 1:
        code_list = code_list[1:] 
    return code_list

all_airport['transitAirportCode'] = all_airport['segmentsDepartureAirportCode'].apply(process_code_list)

In [37]:
### Find Sum of duration
all_airport['SumsegmentsDistance'] = all_airport['segmentsDistance'].apply(lambda x: [pd.to_numeric(value, errors='coerce') for value in x])
all_airport['SumsegmentsDistance'] = all_airport['SumsegmentsDistance'].apply(sum)

In [38]:
#create a new col to collect all airport
all_airport['AllAirport'] = all_airport['transitAirportCode']
all_airport['AllAirport'] = all_airport.apply(lambda row: [row['Departure']] + row['AllAirport'], axis=1)
all_airport['AllAirport'] = all_airport.apply(lambda row: row['AllAirport'] + [row['Arrival']], axis=1)

In [None]:
#drop column

In [68]:
all_airport_cleaned = all_airport.drop(columns=['Departure', 'Arrival', 'segmentsDistance', 'segmentsDepartureTimeRaw', 'segmentsArrivalTimeRaw', 'segmentsDurationInSeconds', 'segmentsDistance', 'travelDuration', 'SumsegmentsDurationInSeconds'])

In [69]:
all_airport_cleaned.rename(columns={'totalDurationInSeconds':'segment_totalDurationInSeconds', 'SumsegmentsDistance':'segment_totalDistance'}, inplace=True)

In [48]:
all_airport_cleaned.to_feather('../data/all_airport_cleaned_allcols.feather')

In [None]:
#encoding -> label encoder, standard encoder -> save -> corr -> save result diff notebook -> split clean notebook

In [71]:
#transform date column: searchDate
all_airport_cleaned['searchDate'] = pd.to_datetime(all_airport_cleaned['searchDate'])
all_airport_cleaned['searchDate_day'] = all_airport_cleaned['searchDate'].dt.day
all_airport_cleaned['searchDate_month'] = all_airport_cleaned['searchDate'].dt.month
all_airport_cleaned['searchDate_year'] = all_airport_cleaned['searchDate'].dt.year

#transform date column: flightDate
all_airport_cleaned['flightDate'] = pd.to_datetime(all_airport_cleaned['flightDate'])
all_airport_cleaned['flightDate_day'] = all_airport_cleaned['flightDate'].dt.day
all_airport_cleaned['flightDate_month'] = all_airport_cleaned['flightDate'].dt.month
all_airport_cleaned['flightDate_year'] = all_airport_cleaned['flightDate'].dt.year

#drop date cols
all_airport_cleaned = all_airport_cleaned.drop(columns=['searchDate', 'flightDate'])

In [73]:
#defined num_cols and cat_cols
import numpy as np
cols = all_airport_cleaned.columns.to_list()
num_cols = all_airport_cleaned.select_dtypes(np.number).columns.to_list()
cat_cols = list(set(cols) - set(num_cols) - set(segment_cols))

In [98]:
for col in segment_cols:
    all_airport_cleaned[col] = all_airport_cleaned[col].apply(lambda x: pd.factorize(x)[0])


KeyboardInterrupt: 

In [76]:
all_airport_cleaned[cat_cols] = all_airport_cleaned[cat_cols].apply(le.fit_transform)



TypeError: Encoders require their input to be uniformly strings or numbers. Got ['list']

In [None]:
#scale numeric column
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
all_airport_cleaned[num_cols] = scaler.fit_transform(all_airport_cleaned[num_cols])

In [None]:
all_airport_cleaned.to_feather('../data/clean_before_split.feature')

In [114]:
import sys
sys.path.append('../src/')
print(sys.path)

['/Users/baiporthn/Projects/Adv_ml/flight-streamlit-at3/flight-prediction/notebooks', '/Users/baiporthn/opt/anaconda3/lib/python39.zip', '/Users/baiporthn/opt/anaconda3/lib/python3.9', '/Users/baiporthn/opt/anaconda3/lib/python3.9/lib-dynload', '', '/Users/baiporthn/.local/lib/python3.9/site-packages', '/Users/baiporthn/opt/anaconda3/lib/python3.9/site-packages', '/Users/baiporthn/opt/anaconda3/lib/python3.9/site-packages/aeosa', '/Users/baiporthn/opt/anaconda3/lib/python3.9/site-packages/IPython/extensions', '/Users/baiporthn/.ipython', '../src/']


In [None]:
#corr

In [None]:
from data.make_dataset
feature, target = pop_target(all_airport_cleaned, 'totalFare')

X_train, y_train, X_val, y_val, X_test, y_test = split_sets_random(features, target, test_ratio=0.2)
save_sets(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, X_test=X_test, y_test=y_test, path='../data/processed/')

In [None]:
#split ratio 80:20 -> train val test