In [2]:
import zipfile
import pandas as pd
import os
import re

# Directory where the ZIP files are located (assuming it's relative)
base_directory = '../../data/raw/itineraries_csv/'

# Create an empty DataFrame to store the results
# NOTE(review): nothing in this cell reads result_df; it is kept only in case
# another cell still refers to it.
result_df = pd.DataFrame()

# Define a regular expression pattern to match filenames ending with two lowercase letters
file_pattern = re.compile(r'.*[a-z][a-z]\.zip')

# List all airport folders in the base directory.
# os.listdir returns entries in arbitrary order, so the names are sorted to make
# the row order of the concatenated frame reproducible from run to run.
airport_names = sorted(
    name
    for name in os.listdir(base_directory)
    if os.path.isdir(os.path.join(base_directory, name))
)

dfs = []  # One DataFrame per matching ZIP archive

for airport_name in airport_names:
    # Directory path for the current airport
    zip_directory = os.path.join(base_directory, airport_name)

    # List all files in the airport's folder (sorted for a deterministic order)
    file_list = sorted(os.listdir(zip_directory))

    for filename in file_list:
        # fullmatch (not match) so the name must END in '<two lowercase letters>.zip';
        # match() only anchors at the start and would also accept e.g. 'xx.zip.bak'.
        if not file_pattern.fullmatch(filename):
            continue

        zip_file_path = os.path.join(zip_directory, filename)
        # Swap only the final extension; str.replace would rewrite every '.zip'
        # occurring anywhere in the name.
        csv_file_path_inside_zip = os.path.splitext(filename)[0] + '.csv'

        # Read the CSV member and close both the archive and the member handle.
        with zipfile.ZipFile(zip_file_path, 'r') as zf:
            with zf.open(csv_file_path_inside_zip) as csv_file:
                df = pd.read_csv(csv_file)

        # Append the DataFrame for this filename to the list of DataFrames
        dfs.append(df)

# pd.concat raises a generic "No objects to concatenate" on an empty list;
# raise the same exception type with a message that points at the real cause.
if not dfs:
    raise ValueError(
        'No ZIP files matching {!r} were found under {!r}'.format(
            file_pattern.pattern, base_directory
        )
    )

# Use pandas.concat to concatenate the list of DataFrames into a single DataFrame
all_airport = pd.concat(dfs, ignore_index=True)

legId                                      0
searchDate                                0
flightDate                                0
startingAirport                           0
destinationAirport                        0
travelDuration                            0
isBasicEconomy                            0
isRefundable                              0
totalFare                                 0 (y)
totalTravelDistance                  959619
segmentsDepartureTimeEpochSeconds         0
segmentsArrivalTimeEpochSeconds           0
segmentsArrivalAirportCode                0
segmentsDepartureAirportCode              0
segmentsAirlineCode                       0
segmentsEquipmentDescription         262676
segmentsDurationInSeconds                 0 -> sum
segmentsDistance                          0 -> sum
segmentsCabinCode                         0
-------
travelLayover = travelDuration - sum(segmentsDurationInSeconds)
datediff = flightDate - searchDate
transitAirportCode (list) -> check against the segment arrival/departure airport codes
numberOfTransit -> count(transitAirportCode)

In [3]:
# Split each duration string on the pattern r'\|\|' and convert every piece to a
# number; pieces that cannot be parsed become NaN (errors='coerce').
# After this statement the column holds one list of per-segment values per row.
all_airport['SumsegmentsDurationInSeconds'] = (
    all_airport['segmentsDurationInSeconds']
    .str.split(r'\|\|')
    .apply(lambda parts: [pd.to_numeric(part, errors='coerce') for part in parts])
)

In [4]:
# Collapse each per-segment list into its total.
# The isinstance guard keeps the cell safe to re-run: once a row has been
# summed it holds a number, and calling sum() on it again would raise TypeError.
all_airport['SumsegmentsDurationInSeconds'] = all_airport['SumsegmentsDurationInSeconds'].apply(
    lambda durations: sum(durations) if isinstance(durations, list) else durations
)

In [5]:
# Split the '||'-delimited airport-code strings into one list of codes per row.
# Only plain strings are split; values that are already lists (the cell was run
# before) or missing are left untouched. The previous `.str.split` overwrote the
# columns in place, so a second run would hit lists instead of strings and
# replace them with NaN.
for code_column in ('segmentsArrivalAirportCode', 'segmentsDepartureAirportCode'):
    all_airport[code_column] = all_airport[code_column].apply(
        lambda codes: codes.split('||') if isinstance(codes, str) else codes
    )

In [6]:
# Origin of the itinerary: the first entry of its departure-code list.
all_airport['Departure'] = all_airport['segmentsDepartureAirportCode'].apply(
    lambda departure_codes: departure_codes[0]
)

In [7]:
# Final destination of the itinerary: the last entry of its arrival-code list.
all_airport['Arrival'] = all_airport['segmentsArrivalAirportCode'].apply(
    lambda arrival_codes: arrival_codes[-1]
)

In [35]:
import pandas as pd

# Assuming all_airport is your DataFrame

def process_code_list(code_list):
    """Return the transit (intermediate-stop) airports of one itinerary.

    Args:
        code_list: list of departure airport codes, one per flight segment.
            The first entry is the origin; every later entry is an airport
            where the traveller changes planes.

    Returns:
        A new list containing every code after the first. A non-stop
        itinerary (one segment) therefore yields an empty list; previously
        the origin itself was returned in that case, which counted it as a
        transit stop. The input list is not modified.
    """
    # Slicing is safe for lists of any length, including empty and single-element.
    return code_list[1:]

# Run process_code_list over every row's departure-code list and store the
# result in the 'transitAirportCode' column.
all_airport['transitAirportCode'] = (
    all_airport['segmentsDepartureAirportCode']
    .apply(process_code_list)
)

In [38]:
# Seed 'AllAirport' with the values of 'transitAirportCode'.
# NOTE(review): this is a plain column assignment, so the list objects in both
# columns are presumably shared rather than copied element by element - avoid
# mutating them in place.
all_airport['AllAirport'] = all_airport['transitAirportCode']

In [42]:
# Build the full route of every itinerary: origin, transit stops, destination.
# The route is rebuilt from 'transitAirportCode' rather than by extending
# 'AllAirport' in place, so re-running this cell cannot add the endpoints a
# second time. A single zip pass also replaces two row-wise DataFrame.apply calls.
# NOTE(review): assumes 'AllAirport' was a plain copy of 'transitAirportCode'
# before this cell, as the preceding assignment sets it up.
all_airport['AllAirport'] = [
    [departure] + transit_codes + [arrival]
    for departure, transit_codes, arrival in zip(
        all_airport['Departure'],
        all_airport['transitAirportCode'],
        all_airport['Arrival'],
    )
]

In [43]:
# Spot-check the assembled route of one row, selected by position.
spot_check_position = 8164713
all_airport['AllAirport'].iloc[spot_check_position]

['JFK', 'IAD', 'IAH', 'DFW']

In [44]:
# Spot-check the per-segment departure codes of one row, selected by position.
spot_check_position = 8164713
all_airport['segmentsDepartureAirportCode'].iloc[spot_check_position]

['JFK', 'IAD', 'IAH']