In [14]:
import zipfile
import pandas as pd
import os
import re

# Directory where the ZIP files are located
base_directory = '../data/raw/itineraries_csv'

# Create an empty DataFrame to store the results
# NOTE(review): nothing in this cell reads result_df; kept in case another cell does.
result_df = pd.DataFrame()

# Define a regular expression pattern to match filenames ending with two lowercase letters
file_pattern = re.compile(r'.*[a-z][a-z]\.zip')

# List all airport folders in the base directory.
# Sorted because os.listdir returns entries in arbitrary order, which would
# otherwise make the row order of all_airport vary between runs and machines.
airport_names = sorted(
    name for name in os.listdir(base_directory)
    if os.path.isdir(os.path.join(base_directory, name))
)

dfs = []  # One DataFrame per matching ZIP archive

for airport_name in airport_names:
    # Directory path for the current airport
    zip_directory = os.path.join(base_directory, airport_name)

    # List all files in the airport's folder (sorted for a stable load order)
    file_list = sorted(os.listdir(zip_directory))

    for filename in file_list:
        # fullmatch() requires the whole name to match, i.e. the name must END in
        # '.zip'; match() only anchors the start and would accept 'xx.zip.bak'.
        if not file_pattern.fullmatch(filename):
            continue

        zip_file_path = os.path.join(zip_directory, filename)
        # Swap only the trailing extension; str.replace would rewrite every
        # '.zip' occurrence inside the name.
        csv_file_path_inside_zip = os.path.splitext(filename)[0] + '.csv'

        # Read the CSV member, closing both the archive and the member handle
        with zipfile.ZipFile(zip_file_path, 'r') as zf:
            with zf.open(csv_file_path_inside_zip) as csv_file:
                df = pd.read_csv(csv_file)

        # Append the DataFrame for this filename to the list of DataFrames
        dfs.append(df)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that points at the actual cause instead.
if not dfs:
    raise ValueError(
        f"No ZIP files matching {file_pattern.pattern!r} were found under {base_directory!r}"
    )

# Use pandas.concat to concatenate the list of DataFrames into a single DataFrame
all_airport = pd.concat(dfs, ignore_index=True)

## Feature Engineering

In [15]:
# Parse the flightDate column with pd.to_datetime and store the result back in place.
all_airport['flightDate'] = (
    all_airport['flightDate']
    .pipe(pd.to_datetime)
)

In [16]:
# Parse the searchDate column with pd.to_datetime and store the result back in place.
all_airport['searchDate'] = (
    all_airport['searchDate']
    .pipe(pd.to_datetime)
)

## 1. dateDifference

`days = flightDate - searchDate` shows how many days before the flight the search (booking) was made.

For the Streamlit app, we could take the current date and compute `flightDate - currentDate` instead.

In [None]:
# New column 'dateDifference': flightDate minus searchDate, element by element.
all_airport['dateDifference'] = all_airport['flightDate'].sub(all_airport['searchDate'])

In [None]:
# Store the gap between flightDate and searchDate as a whole number of days.
# Recomputed from the two date columns rather than from 'dateDifference' itself:
# once that column holds integers it has no .dt accessor, so the original
# self-referential form raised AttributeError when the cell was run a second time.
all_airport['dateDifference'] = (
    all_airport['flightDate'] - all_airport['searchDate']
).dt.days

In [None]:
# List the distinct values currently stored in 'dateDifference'.
all_airport['dateDifference'].unique()

In [None]:
# Spot-check 10 random 'dateDifference' values; random_state is fixed so the
# same rows are shown on every run.
all_airport['dateDifference'].sample(10, random_state=42)

## List for legs

In [None]:
# Turn each '||'-delimited string of airline codes into a list of codes.
# Values that are already lists are passed through unchanged, so re-running the
# cell is harmless (re.split raised TypeError on a list). A plain str.split is
# enough because the delimiter is a literal, not a pattern.
all_airport['segmentsAirlineCode'] = all_airport['segmentsAirlineCode'].apply(
    lambda codes: codes if isinstance(codes, list) else codes.split('||')
)

In [None]:
# Spot-check 10 random 'segmentsAirlineCode' values; random_state is fixed so
# the same rows are shown on every run.
all_airport['segmentsAirlineCode'].sample(10, random_state=42)

In [None]:
# Distinct lengths of the per-row 'segmentsAirlineCode' values.
all_airport['segmentsAirlineCode'].map(len).unique()

In [None]:
from datetime import datetime

In [None]:
from datetime import datetime, timezone


def _to_utc_datetimes(raw):
    """Convert one 'segmentsDepartureTimeEpochSeconds' cell to a list of datetimes.

    Accepts either the raw '||'-delimited string of epoch seconds or a list whose
    items are epoch seconds or datetime objects (what the column holds once it
    has already been converted), so re-running the cell does not fail.

    Epoch seconds are converted with an explicit UTC timezone: without ``tz``,
    datetime.fromtimestamp uses the local timezone of the machine running the
    notebook, which makes the displayed times differ from machine to machine.
    Items that are already datetime objects are returned unchanged.
    """
    parts = raw if isinstance(raw, list) else str(raw).split('||')
    return [
        part if isinstance(part, datetime)
        else datetime.fromtimestamp(int(part), tz=timezone.utc)
        for part in parts
    ]


# Split each value on '||' and convert every epoch-seconds entry to a datetime
all_airport['segmentsDepartureTimeEpochSeconds'] = (
    all_airport['segmentsDepartureTimeEpochSeconds'].apply(_to_utc_datetimes)
)


In [None]:
# Display the column to inspect its current values.
all_airport['segmentsDepartureTimeEpochSeconds']

In [None]:
# Map every datetime in each row's list back to whole Unix seconds
# (int() of datetime.timestamp()).
all_airport['segmentsDepartureTimeEpochSeconds'] = all_airport['segmentsDepartureTimeEpochSeconds'].apply(
    lambda moments: list(map(lambda moment: int(moment.timestamp()), moments))
)


In [None]:
# Display the column to inspect its current values.
all_airport['segmentsDepartureTimeEpochSeconds']

In [None]:
# Names of the per-segment columns, one per line.
columns = [
    'segmentsDepartureTimeEpochSeconds',
    'segmentsDepartureTimeRaw',
    'segmentsArrivalTimeEpochSeconds',
    'segmentsArrivalTimeRaw',
    'segmentsArrivalAirportCode',
    'segmentsDepartureAirportCode',
    'segmentsAirlineName',
    'segmentsAirlineCode',
    'segmentsEquipmentDescription',
    'segmentsDurationInSeconds',
    'segmentsDistance',
    'segmentsCabinCode',
]

In [None]:
# Keep every row, but only the columns named in `columns`.
selected_columns = all_airport.loc[:, columns]

In [None]:
# Display the selected segment columns.
selected_columns

In [None]:
# Print pandas' concise summary of the all_airport frame (columns and dtypes).
all_airport.info()

## Split with regular expression

In [None]:
# Rebinds `columns`: the per-segment column names, this time without
# 'segmentsAirlineName'.
columns = [
    'segmentsDepartureTimeEpochSeconds',
    'segmentsDepartureTimeRaw',
    'segmentsArrivalTimeEpochSeconds',
    'segmentsArrivalTimeRaw',
    'segmentsArrivalAirportCode',
    'segmentsDepartureAirportCode',
    'segmentsAirlineCode',
    'segmentsEquipmentDescription',
    'segmentsDurationInSeconds',
    'segmentsDistance',
    'segmentsCabinCode',
]

In [None]:
# NOTE(review): disabled code. If re-enabled as written it would call re.split on
# every column in `columns`, including 'segmentsAirlineCode' and
# 'segmentsDepartureTimeEpochSeconds', which earlier cells have already turned
# into lists; re.split raises TypeError on a list.
# # Apply the split operation to each column using a lambda function
# for column in columns:
#     all_airport[column] = all_airport[column].apply(lambda x: re.split(r'\|\|', x))

In [None]:
# NOTE(review): disabled call; uncomment to show summary statistics for all_airport.
# all_airport.describe()

- legId                                      0
- searchDate                                0
- flightDate                                0
- startingAirport                           0
- destinationAirport                        0
- travelDuration                            0
- isBasicEconomy                            0
- isRefundable                              0
- totalFare                                 0 (y)
- totalTravelDistance                  959619
- segmentsDepartureTimeEpochSeconds         0
- segmentsArrivalTimeEpochSeconds           0
- segmentsArrivalAirportCode                0
- segmentsDepartureAirportCode              0
- segmentsAirlineCode                       0
- segmentsEquipmentDescription         262676
- segmentsDurationInSeconds                 0 -> sum
- segmentsDistance                          0 -> sum
- segmentsCabinCode                         0

---

- travelLayover (travelDuration - segmentsDurationInSeconds)
- datediff (flightDate - searchDate)
- transitAirportCode (list) -> check arrival departure
- numberOfTransit -> count (transitAirportCode)

## Visualisation

In [None]:
import matplotlib.pyplot as plt

# Only airline codes that occur more often than this are plotted.
min_airline_count = 10000

# Calculate the value counts for 'segmentsAirlineCode'.
# After the "List for legs" split each row holds a LIST of codes; lists are
# unhashable, so value_counts() on the raw column raises TypeError. explode()
# yields one row per list element, so every segment's airline code is counted
# individually (scalar values pass through explode() unchanged).
value_counts = all_airport['segmentsAirlineCode'].explode().value_counts()

# Filter for values with more than min_airline_count occurrences
filtered_value_counts = value_counts[value_counts > min_airline_count]

# Create a bar plot for the filtered value counts
filtered_value_counts.plot(kind='bar')

# Set labels and title (the title reports the threshold actually applied)
plt.xlabel('Airline Code')
plt.ylabel('Count')
plt.title(f'Airline Codes with More Than {min_airline_count:,} Occurrences')

# Show the plot
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Only cabin-code values that occur more often than this are plotted.
min_cabin_count = 100

# Calculate the value counts for 'segmentsCabinCode'
value_counts = all_airport['segmentsCabinCode'].value_counts()

# Filter for values with more than min_cabin_count occurrences
filtered_value_counts = value_counts[value_counts > min_cabin_count]

# Create a bar plot for the filtered value counts
filtered_value_counts.plot(kind='bar')

# Set labels and title. The original labels were copied from the airline plot
# and described this cabin-code chart as "Airline Code" with a threshold of 10.
plt.xlabel('Cabin Code')
plt.ylabel('Count')
plt.title(f'Cabin Codes with More Than {min_cabin_count:,} Occurrences')

# Show the plot
plt.show()