## 1. import dataset

In [1]:
import zipfile
import pandas as pd
import os
import re

# Directory where the ZIP files are located (assuming it's relative)
base_directory = '../data/raw/itineraries_csv'  

# Create an empty DataFrame to store the results
result_df = pd.DataFrame()

# Define a regular expression pattern to match filenames ending with two lowercase letters
file_pattern = re.compile(r'.*[a-z][a-z]\.zip')

# List all airport folders in the base directory
airport_names = [name for name in os.listdir(base_directory) if os.path.isdir(os.path.join(base_directory, name))]

dfs = []  # Create a list to store DataFrames

for airport_name in airport_names:
    # Directory path for the current airport
    zip_directory = os.path.join(base_directory, airport_name)
    
    # List all files in the airport's folder
    file_list = os.listdir(zip_directory)

    for filename in file_list:
        if file_pattern.match(filename):
            zip_file_path = os.path.join(zip_directory, filename)
            csv_file_path_inside_zip = filename.replace('.zip', '.csv')
            
            # Create a ZipFile object and read the CSV file
            with zipfile.ZipFile(zip_file_path, 'r') as zf:
                df = pd.read_csv(zf.open(csv_file_path_inside_zip))

            # Append the DataFrame for this filename to the list of DataFrames
            dfs.append(df)

# Use pandas.concat to concatenate the list of DataFrames into a single DataFrame
all_airport = pd.concat(dfs, ignore_index=True)

In [2]:

# legId                                      0
# searchDate                                0
# flightDate                                0
# startingAirport                           0
# destinationAirport                        0
# travelDuration                            0
# isBasicEconomy                            0
# isRefundable                              0
# totalFare                                 0 (y)
# totalTravelDistance                  959619
# segmentsDepartureTimeEpochSeconds         0
# segmentsArrivalTimeEpochSeconds           0
# segmentsArrivalAirportCode                0
# segmentsDepartureAirportCode              0
# segmentsAirlineCode                       0
# segmentsEquipmentDescription         262676
# segmentsDurationInSeconds                 0 -> sum
# segmentsDistance                          0 -> sum
# segmentsCabinCode                         0
# -------
# travelLayover (travelDuration - segmentsDurationInSeconds)
# datediff (flightDate - searchDate)
# transitAirportCode (list) -> check arrival departure 
# numberOfTransit -> count (transitAirportCode)

## 2. Data preprocessing

In [3]:
import sys
sys.path.insert(0, '../src')

### Create `travelLayover` column

**Convert `travelDuration` into second**

In [4]:
from features.build_features import convert_duration_to_seconds

In [5]:
# Apply the function to the 'travelDuration' column and create a new column 'travelDurationInSeconds'
all_airport['travelDurationInSeconds'] = all_airport['travelDuration'].apply(convert_duration_to_seconds)


In [6]:
all_airport['travelDurationInSeconds'].head()

0    28320.0
1    22500.0
2    32760.0
3    22620.0
4    51120.0
Name: travelDurationInSeconds, dtype: float64

**Convert `durationinsecond`**`

In [7]:
from features.build_features import split_and_sum
# Apply the function to the 'segmentsDurationInSeconds' column and create a new column 'totalDurationInSeconds'
all_airport['totalDurationInSeconds'] = all_airport['segmentsDurationInSeconds'].apply(split_and_sum)


In [8]:
all_airport['totalDurationInSeconds'].head()

0    19800
1    20520
2    20520
3    19560
4    25080
Name: totalDurationInSeconds, dtype: int64

**Calculate `travelLayover`**

In [9]:
all_airport['travelLayover'] = all_airport['travelDurationInSeconds'] - all_airport['totalDurationInSeconds']
all_airport['travelLayover'].head()

0     8520.0
1     1980.0
2    12240.0
3     3060.0
4    26040.0
Name: travelLayover, dtype: float64

Calculate **`SumDurationinseconds`**

In [10]:
### Find Sum of duration
all_airport['SumsegmentsDurationInSeconds'] = all_airport['segmentsDurationInSeconds'].str.split(r'\|\|').apply(lambda x: [pd.to_numeric(value, errors='coerce') for value in x])
all_airport['SumsegmentsDurationInSeconds'] = all_airport['SumsegmentsDurationInSeconds'].apply(sum)

In [11]:
#sepearte col by ||
all_airport['segmentsArrivalAirportCode'] = all_airport['segmentsArrivalAirportCode'].str.split(r'\|\|')
all_airport['segmentsDepartureAirportCode'] = all_airport['segmentsDepartureAirportCode'].str.split(r'\|\|')
#find departure and arrival
all_airport['Departure'] = all_airport['segmentsDepartureAirportCode'].apply(lambda x: x[0])
all_airport['Arrival'] = all_airport['segmentsArrivalAirportCode'].apply(lambda x: x[-1])

calculate `DateDiff`
`days = flightDate- searchDate` show how many day before book the flight

For streamlit we might can use the get current date and do flightDate - currentDate

In [12]:
# Convert 'flightDate' and 'searchDate' to datetime objects
all_airport['flightDate'] = pd.to_datetime(all_airport['flightDate'])
all_airport['searchDate'] = pd.to_datetime(all_airport['searchDate'])

# Calculate the time difference in days and store it in a new column 'datediff'
all_airport['datediff'] = (all_airport['flightDate'] - all_airport['searchDate']).dt.days

In [13]:
all_airport['segmentsDistance']

0            943||1207
1           None||None
2           None||None
3           None||None
4            672||2178
               ...    
13519994     596||2135
13519995    1104||2566
13519996    1104||2566
13519997    1104||2566
13519998    1104||2566
Name: segmentsDistance, Length: 13519999, dtype: object

In [14]:
### Find Sum of duration
all_airport['SumsegmentsDistance'] = all_airport['segmentsDistance'].str.split(r'\|\|').apply(lambda x: [pd.to_numeric(value, errors='coerce') for value in x])
all_airport['SumsegmentsDistance'] = all_airport['SumsegmentsDistance'].apply(sum)

In [15]:
#create a new col to collect all airport
all_airport['AllAirport'] = all_airport['transitAirportCode']
all_airport['AllAirport'] = all_airport.apply(lambda row: [row['Departure']] + row['AllAirport'], axis=1)
all_airport['AllAirport'] = all_airport.apply(lambda row: row['AllAirport'] + [row['Arrival']], axis=1)

KeyError: 'transitAirportCode'

In [None]:
all_airport['segmentsEquipmentDescription']= all_airport['segmentsEquipmentDescription'].str.split(r'\|\|')

In [None]:
all_airport['segmentsEquipmentDescription'].head()

0                                      [, Airbus A320]
1                  [, AIRBUS INDUSTRIE A320 SHARKLETS]
2    [AIRBUS INDUSTRIE A320 SHARKLETS, AIRBUS INDUS...
3       [AIRBUS INDUSTRIE A320 SHARKLETS, Airbus A319]
4                     [Boeing 737-900, Boeing 737-900]
Name: segmentsEquipmentDescription, dtype: object

In [None]:
all_airport['segmentsCabinCode']= all_airport['segmentsCabinCode'].str.split(r'\|\|')

Create column `numberofTransit`

In [None]:
all_airport['numberOfTransit'] = all_airport['transitAirportCode'].apply(lambda x: len(x))

In [None]:
#drop column

In [None]:
all_airport2 = all_airport.drop(columns=['Departure', 'Arrival', 'segmentsDistance', 'segmentsDepartureTimeRaw', 'segmentsArrivalTimeRaw', 'segmentsDurationInSeconds', 'segmentsDistance', 'travelDuration', 'SumsegmentsDurationInSeconds'])

In [None]:
all_airport2.rename(columns={'totalDurationInSeconds':'segment_totalDurationInSeconds', 'SumsegmentsDistance':'segment_totalDistance'}, inplace=True)

In [None]:
all_airport2.to_feather('../data/processed/all_airport_cleaned.feather')

In [None]:
#encoding -> label encoder, standard encoder -> save -> corr -> save result diff notebook -> split clean notebook

In [None]:
from data.make_dataset import pop_target, split_sets_random, save_sets, load_sets, label_encode_columns
from sklearn.preprocessing import StandardScaler

In [None]:
all_airport2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13519999 entries, 0 to 13519998
Data columns (total 26 columns):
 #   Column                             Dtype         
---  ------                             -----         
 0   legId                              object        
 1   searchDate                         datetime64[ns]
 2   flightDate                         datetime64[ns]
 3   startingAirport                    object        
 4   destinationAirport                 object        
 5   isBasicEconomy                     bool          
 6   isRefundable                       bool          
 7   isNonStop                          bool          
 8   totalFare                          float64       
 9   totalTravelDistance                float64       
 10  segmentsDepartureTimeEpochSeconds  object        
 11  segmentsArrivalTimeEpochSeconds    object        
 12  segmentsArrivalAirportCode         object        
 13  segmentsDepartureAirportCode       object        
 14  

In [None]:
all_airport2 = all_airport2.drop(columns=['segmentsAirlineName'],axis=1)

In [None]:
# cat_cols = ['legId','startingAirport', 'destinationAirport', 'isBasicEconomy', 'isRefundable','isNonStop','segmentsDepartureTimeEpochSeconds','segmentsArrivalTimeEpochSeconds', 'segmentsArrivalAirportCode', 'segmentsDepartureAirportCode', 'segmentsAirlineCode', 'segmentsEquipmentDescription', 'segmentsCabinCode', 'AllAirport']

In [None]:
all_airport2.columns

Index(['legId', 'searchDate', 'flightDate', 'startingAirport',
       'destinationAirport', 'isBasicEconomy', 'isRefundable', 'isNonStop',
       'totalFare', 'totalTravelDistance', 'segmentsDepartureTimeEpochSeconds',
       'segmentsArrivalTimeEpochSeconds', 'segmentsArrivalAirportCode',
       'segmentsDepartureAirportCode', 'segmentsAirlineCode',
       'segmentsEquipmentDescription', 'segmentsCabinCode',
       'travelDurationInSeconds', 'segment_totalDurationInSeconds',
       'travelLayover', 'transitAirportCode', 'segment_totalDistance',
       'AllAirport', 'numberOfTransit', 'dateDifference'],
      dtype='object')

In [None]:
num_cols = list(all_airport2.select_dtypes('number').columns)
cat_cols = list(set(all_airport2.columns) - set(num_cols))

In [None]:
list_to_str = lambda x: ', '.join(map(str, x))

In [None]:
all_airport2.head()

Unnamed: 0,legId,searchDate,flightDate,startingAirport,destinationAirport,isBasicEconomy,isRefundable,isNonStop,totalFare,totalTravelDistance,...,segmentsEquipmentDescription,segmentsCabinCode,travelDurationInSeconds,segment_totalDurationInSeconds,travelLayover,transitAirportCode,segment_totalDistance,AllAirport,numberOfTransit,dateDifference
0,1517251,2022-04-19,2022-05-20,12,ATL,False,0,False,103.98,2150.0,...,||Airbus A320,coach||coach,28320.0,19800,8520.0,[DEN],2150.0,"[OAK, DEN, ATL]",1,31
1,1452356,2022-04-19,2022-05-20,12,ATL,False,0,False,216.58,,...,||AIRBUS INDUSTRIE A320 SHARKLETS,coach||coach,22500.0,20520,1980.0,[LAX],,"[OAK, LAX, ATL]",1,31
2,1566096,2022-04-19,2022-05-20,12,ATL,False,0,False,216.58,,...,AIRBUS INDUSTRIE A320 SHARKLETS||AIRBUS INDUST...,coach||coach,32760.0,20520,12240.0,[LAX],,"[OAK, LAX, ATL]",1,31
3,1290376,2022-04-19,2022-05-20,12,ATL,False,0,False,237.58,,...,AIRBUS INDUSTRIE A320 SHARKLETS||Airbus A319,coach||coach,22620.0,19560,3060.0,[LAS],,"[OAK, LAS, ATL]",1,31
4,499252,2022-04-19,2022-05-20,12,ATL,False,0,False,307.21,2850.0,...,Boeing 737-900||Boeing 737-900,coach||coach,51120.0,25080,26040.0,[SEA],2850.0,"[OAK, SEA, ATL]",1,31


In [None]:
scaler = StandardScaler()
features = label_encode_columns(all_airport2, cat_cols)
features[num_cols] = scaler.fit_transform(all_airport2[num_cols])

TypeError: Encoders require their input to be uniformly strings or numbers. Got ['list']

In [None]:
features = label_encode_columns(df, cat_cols)
features[num_cols] = scaler.fit_transform(df[num_cols])

In [None]:
#corr

In [None]:
#split ratio 80:20 -> train val test