In [1]:
import pandas as pd
import numpy as np
import random
import time

# Dictionary of airlines and their abbreviations for random selection
airlines = {
    "United Airlines": "UA",
    "American Airlines": "AA",
    "Southwest Airlines": "SW",
    "Delta Air Lines": "DL"
}

airports = ["SFO", "ORD", "MKE", "LAX"]

# List of specific airplane numbers
airplane_numbers = [920, 360, 481, 720, 526, 755, 192, 214, 388, 823, 109]

# Mapping of airports to delay spike intervals (in seconds)
airport_delay_intervals = {
    "SFO": 3 * 3600,
    "ORD": 6 * 3600,
    "MKE": 9 * 3600,
}

# Generate a set of unique flight names
flight_names = set(f"{airline_code}{airplane_number}" for airline_name, airline_code in airlines.items() for airplane_number in airplane_numbers)

# Start and end timestamps (24 hours from now)
start_time = int(time.time()) - 24*3600*30*3
end_time = int(time.time()) + 24*3600*30*3

flights = []

for flight_name in flight_names:
    airline_name = next((name for name, code in airlines.items() if flight_name.startswith(code)), None)
    
    # Check if it's first flight for the given flight name
    if len([flight for flight in flights if flight["Flight Name"] == flight_name]) == 0:
        departure_airport = random.choice(airports)
    else:
        # For the subsequent flights, the departure airport becomes the arrival airport of the previous flight
        departure_airport = flights[-1]["Arrival Airport"]
        if departure_airport == "MKE":
            departure_airport = "ORD"

    expected_takeoff_time = start_time
    
    while True:
        # Get arrival airport, different from departure airport
        if departure_airport == "ORD":
            arrival_airport = "MKE"
        else:
            arrival_airport = random.choice([airport for airport in airports if airport != departure_airport and airport != "MKE"])

        # Generate flight time
        flight_time = np.random.randint(3600, 6*3600)  # Flight time

        # Calculate delay
        if arrival_airport == "ORD" or airline_name == "United Airlines":
            delay_time = np.random.randint(1800, 10800)  # Delay between 30 minutes and 3 hours
        else:
            delay_time = np.random.randint(0, 1800)
        
        
        # if departure_airport in airport_delay_intervals:
        #     delay_interval = airport_delay_intervals[departure_airport]
        #     if expected_takeoff_time % delay_interval < 3600:  # If within a delay spike period
        #         delay_time = np.random.randint(1800, 10800)  # Delay between 30 minutes and 3 hours
        #     else:
        #         delay_time = np.random.randint(0, 1800)  # Delay between 0 and 30 minutes
        # else:
        #     delay_time = np.random.randint(0, 1800)  # Delay between 0 and 30 minutes

        actual_takeoff_time = expected_takeoff_time + delay_time
        # If the flight goes beyond the 24-hour window, break the loop
        if actual_takeoff_time + flight_time > end_time:
            break

        # Add flight to the flights list
        flights.append({
            "Flight Name": flight_name,
            "Expected Takeoff Time": expected_takeoff_time,
            "Actual Takeoff Time": actual_takeoff_time,
            "Delay Time": delay_time,
            "Flight Time": flight_time,
            "Airline": airline_name,
            "Departure Airport": departure_airport,
            "Arrival Airport": arrival_airport
        })
        if len(flights) >= 10000:
            break
        
        # Move to the next flight
        departure_airport = arrival_airport
        # Wait time between flights is randomly selected between 0 and 3 hours
        wait_time = np.random.randint(0, 3*3600)
        expected_takeoff_time = actual_takeoff_time + flight_time + wait_time
        
    if len(flights) >= 10000:  # Stop if we've generated enough flights
        break

# Create DataFrame
df = pd.DataFrame(flights)

# Save to CSV
df.to_csv("flight_takeoff_data.csv", index=False)

In [2]:
# Sort DataFrame
df = df.sort_values(by=['Airline', 'Departure Airport', 'Arrival Airport', 'Expected Takeoff Time'])

# Group the DataFrame
grouped = df.groupby(['Airline', 'Departure Airport', 'Arrival Airport'])

# Create 'Next Flight' and 'Next Flight Expected Takeoff' columns
df['Next Flight'] = grouped['Flight Name'].shift(-1).fillna('None')
df['Next Flight Expected Takeoff'] = grouped['Expected Takeoff Time'].shift(-1).fillna('None')
df['Next Flight Info'] = df.apply(lambda row: f"{row['Next Flight']}, {row['Next Flight Expected Takeoff']}", axis=1)

df.drop(columns=['Next Flight', 'Next Flight Expected Takeoff'], inplace=True)

df = df.sort_values(by="Actual Takeoff Time")
df.reset_index(drop=True, inplace=True)

# Save the DataFrame to a CSV
df.to_csv("flight_takeoff_data_next.csv", index=False)

In [3]:
print(len(df))
df

10000


Unnamed: 0,Flight Name,Expected Takeoff Time,Actual Takeoff Time,Delay Time,Flight Time,Airline,Departure Airport,Arrival Airport,Next Flight Info
0,AA214,1682959659,1682960097,438,9658,American Airlines,LAX,SFO,"AA920, 1683127612.0"
1,DL823,1682959659,1682960352,693,16638,Delta Air Lines,MKE,SFO,"DL526, 1683011211.0"
2,SW109,1682959659,1682961024,1365,15734,Southwest Airlines,ORD,MKE,"SW109, 1682999739.0"
3,SW360,1682959659,1682961133,1474,9610,Southwest Airlines,MKE,LAX,"SW360, 1683034737.0"
4,UA481,1682959659,1682961502,1843,20136,United Airlines,ORD,MKE,"UA109, 1682976006.0"
...,...,...,...,...,...,...,...,...,...
9995,UA481,1698485335,1698496048,10713,10812,United Airlines,MKE,LAX,"None, None"
9996,AA920,1698495518,1698496517,999,14918,American Airlines,LAX,SFO,"None, None"
9997,AA214,1698496646,1698497771,1125,10130,American Airlines,ORD,MKE,"None, None"
9998,DL192,1698490433,1698500435,10002,7275,Delta Air Lines,SFO,ORD,"None, None"
