# Bayesian Spatiotemporal Graph Transformer Network (B-STAR) for Multi-Aircraft Trajectory Prediction

## Part 1: IFF ASDE-X Flight Track Data Processing

Author: Yutian Pang, Arizona State University

Email: yutian.pang@asu.edu

### Environment Requirements

This notebook has been tested with:
- Ubuntu 20.04 LTS
- Python 3.8.5 with Anaconda
- Spark 3.1.1 with Hadoop3.2

The required packages are:
- PySpark
- Apache Sedona (formerly GeoSpark)

In [1]:
# Set spark environments
# All four variables are written in one mapping update so the Spark home and
# the interpreter paths handed to PySpark are visible side by side.
# NOTE(review): PYTHONPATH normally lists module search directories, not an
# interpreter binary; confirm this value is intended.
import glob
import os

os.environ.update({
    "SPARK_HOME": '/home/ypang6/spark-3.1.1-bin-hadoop3.2',
    "PYTHONPATH": '/home/ypang6/anaconda3/bin/python3.8',
    'PYSPARK_PYTHON': '/home/ypang6/anaconda3/bin/python3.8',
    'PYSPARK_DRIVER_PYTHON': '/home/ypang6/anaconda3/bin/python3.8',
})

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from sedona.register import SedonaRegistrator
from sedona.utils import SedonaKryoRegistrator, KryoSerializer

In [3]:
def load_schema():
    """Build the Spark schema describing the 40 columns of an IFF CSV record.

    Returns:
        StructType: one nullable StructField per column, in file order.
    """
    # (column name, Spark type) pairs in file order. The trailing comment on
    # each entry gives the 1-based column number and, where known, its meaning.
    columns = [
        ("recType", ShortType()),  # 1  //track point record type number
        ("recTime", StringType()),  # 2  //seconds since midnight 1/1/70 UTC
        ("fltKey", LongType()),  # 3  //flight key
        ("bcnCode", IntegerType()),  # 4  //digit range from 0 to 7
        ("cid", IntegerType()),  # 5  //computer flight id
        ("Source", StringType()),  # 6  //source of the record
        ("msgType", StringType()),  # 7
        ("acId", StringType()),  # 8  //call sign
        ("recTypeCat", IntegerType()),  # 9
        ("lat", DoubleType()),  # 10
        ("lon", DoubleType()),  # 11
        ("alt", DoubleType()),  # 12  //in 100s of feet
        ("significance", ShortType()),  # 13 //digit range from 1 to 10
        ("latAcc", DoubleType()),  # 14
        ("lonAcc", DoubleType()),  # 15
        ("altAcc", DoubleType()),  # 16
        ("groundSpeed", IntegerType()),  # 17 //in knots
        ("course", DoubleType()),  # 18  //in degrees from true north
        ("rateOfClimb", DoubleType()),  # 19  //in feet per minute
        ("altQualifier", StringType()),  # 20  //Altitude qualifier (the “B4 character”)
        ("altIndicator", StringType()),  # 21  //Altitude indicator (the “C4 character”)
        ("trackPtStatus", StringType()),  # 22  //Track point status (e.g., ‘C’ for coast)
        ("leaderDir", IntegerType()),  # 23  //int 0-8 representing the direction of the leader line
        ("scratchPad", StringType()),  # 24
        ("msawInhibitInd", ShortType()),  # 25 // MSAW Inhibit Indicator (0=not inhibited, 1=inhibited)
        ("assignedAltString", StringType()),  # 26
        ("controllingFac", StringType()),  # 27
        ("controllingSec", StringType()),  # 28
        ("receivingFac", StringType()),  # 29
        ("receivingSec", StringType()),  # 30
        ("activeContr", IntegerType()),  # 31  // the active control number
        ("primaryContr", IntegerType()),  # 32  //The primary(previous, controlling, or possible next)controller number
        ("kybrdSubset", StringType()),  # 33  //identifies a subset of controller keyboards
        ("kybrdSymbol", StringType()),  # 34  //identifies a keyboard within the keyboard subsets
        ("adsCode", IntegerType()),  # 35  //arrival departure status code
        ("opsType", StringType()),  # 36  //Operations type (O/E/A/D/I/U)from ARTS and ARTS 3A data
        ("airportCode", StringType()),  # 37
        ("trackNumber", IntegerType()),  # 38
        ("tptReturnType", StringType()),  # 39
        ("modeSCode", StringType()),  # 40
    ]
    # Every field is declared nullable (third StructField argument is True).
    return StructType([StructField(name, dtype, True) for name, dtype in columns])

In [4]:
def load_data(date, data_dir="/media/ypang6/paralab/Research/data/ATL"):
    """Read one IFF ATL+ASDE-X CSV file into a Spark DataFrame.

    Args:
        date: Value interpolated into the file-name pattern
            ``IFF_ATL+ASDEX_<date>*.csv`` (the notebook passes a YYYYMMDD
            integer such as 20190807).
        data_dir: Directory searched for the CSV file. Defaults to the
            location that was previously hard-coded.

    Returns:
        The DataFrame read with the notebook-level ``iff_schema``.

    Raises:
        FileNotFoundError: If no file matches the pattern. This replaces the
            uninformative IndexError raised by indexing an empty glob result.

    Note:
        Uses the notebook-level globals ``spark`` and ``iff_schema``; both
        must exist before this function is called.
    """
    pattern = os.path.join(data_dir, "IFF_ATL+ASDEX_{}*.csv".format(date))
    # glob returns matches in arbitrary order; sort so the chosen file is
    # deterministic when more than one file matches the date prefix.
    matches = sorted(glob.glob(pattern))
    if not matches:
        raise FileNotFoundError("No IFF file matches pattern: {}".format(pattern))
    file_path = matches[0]
    df = spark.read.csv(file_path, header=False, sep=",", schema=iff_schema)
    # df.count() triggers a full read of the file; kept for the progress message.
    print("Date: {} Number of Lines: {}".format(date, df.count()))
    return df

In [5]:
# Create (or reuse) a local Spark session that uses Kryo serialization with
# the Sedona registrator and pulls the Sedona / geotools jars as packages.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Sector_IFF_Parser")
    .config("spark.serializer", KryoSerializer.getName)
    .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
    .config("spark.jars.packages", "org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.0-incubating,org.datasyslab:geotools-wrapper:geotools-24.0")
    .getOrCreate()
)

# Register the Sedona SQL functions (ST_Point, ST_Contains, ...) on the session.
SedonaRegistrator.registerAll(spark)
sc = spark.sparkContext
# Schema used by load_data() when reading the IFF CSV files.
iff_schema = load_schema()

In [6]:
# load data into Sedona
# `date` is a YYYYMMDD integer: it selects the input file in load_data() and is
# reused later to derive the query time window and the output file names.
date = 20190807
df = load_data(date)

Date: 20190807 Number of Lines: 1699663


In [7]:
# select columns
# Keep only the fields used downstream, restrict to rows with recType == 3,
# and convert recTime from its string form to an integer.
cols = ['recType', 'recTime', 'acId', 'lat', 'lon', 'alt']
rec_type_is_3 = df['recType'] == 3
rec_time_as_int = df['recTime'].cast(IntegerType())
df = (
    df.select(*cols)
    .filter(rec_type_is_3)
    .withColumn("recTime", rec_time_as_int)
)
df.show(5)

+-------+----------+----+--------+---------+-----+
|recType|   recTime|acId|     lat|      lon|  alt|
+-------+----------+----+--------+---------+-----+
|      3|1565149748|UNKN|33.63186|-84.44462|10.06|
|      3|1565149749|UNKN|33.63186|-84.44455|10.06|
|      3|1565149752|UNKN|33.63183|-84.44448|10.06|
|      3|1565149753|UNKN|33.63183|-84.44443|10.06|
|      3|1565149754|UNKN|33.63183|-84.44438|10.06|
+-------+----------+----+--------+---------+-----+
only showing top 5 rows



### Build Spatial Dataframe with Sedona

In [8]:
# time window to query
from datetime import datetime, timezone

duration = 4 # hours
# The window opens at 14:00 UTC on `date` (a YYYYMMDD integer). The epoch is
# derived from the calendar date rather than from (date - 20190801) * 86400,
# which is only correct for dates inside August 2019. For 20190801 this yields
# 1564668000 (August 1st 2019, 2pm UTC), the constant used previously.
t_start = int(
    datetime.strptime(str(date), "%Y%m%d")
    .replace(hour=14, tzinfo=timezone.utc)
    .timestamp()
)
t_end = t_start + 3600*duration

In [9]:
# register pyspark df in SQL
# createOrReplaceTempView replaces the deprecated registerTempTable and is the
# call already used for every other temporary view in this notebook.
df.createOrReplaceTempView("pointtable")

# create shape column in geospark
# NOTE: the point is built as ST_Point(lat, lon), i.e. X = latitude and
# Y = longitude. The envelope queries and the ST_X / ST_Y extraction later in
# the notebook follow the same convention, so it must be changed everywhere or
# nowhere.
# Only records whose recTime lies in [t_start, t_end] (inclusive) are kept.
spatialdf = spark.sql(
  """
  SELECT ST_Point(CAST(lat AS Decimal(24, 20)), CAST(lon AS Decimal(24, 20))) AS geom, recTime, acId, alt
  FROM pointtable
  WHERE recTime>={} AND recTime<={}
  """.format(t_start, t_end))

spatialdf.createOrReplaceTempView("spatialdf")
spatialdf.show(5, truncate=False)

+--------------------------+----------+-------+-----+
|geom                      |recTime   |acId   |alt  |
+--------------------------+----------+-------+-----+
|POINT (33.63759 -84.43789)|1565186555|DAL1350|10.06|
|POINT (33.63753 -84.43788)|1565186556|DAL1350|10.06|
|POINT (33.63747 -84.43786)|1565186557|DAL1350|10.06|
|POINT (33.63737 -84.43784)|1565186558|DAL1350|10.06|
|POINT (33.63733 -84.43785)|1565186560|DAL1350|10.06|
+--------------------------+----------+-------+-----+
only showing top 5 rows



In [10]:
# Number of records that fall inside the query time window.
spatialdf.count()

368875

### Range Rectangular Query around KATL

In [11]:
# Query centre as [latitude, longitude, altitude threshold]; the third entry is
# used as the lower bound of the `alt>` filter in the range queries.
katl = [33.6366996, -84.4278640, 10.06]  #https://www.airnav.com/airport/katl
# Half-width of the query box: added to and subtracted from both katl[0] and
# katl[1], so it is expressed in the same units as lat/lon.
r = 0.5  # rectangular query

In [12]:
# ST_PolygonFromEnvelope (MinX:decimal, MinY:decimal, MaxX:decimal, MaxY:decimal, UUID1, UUID2, ...)
# Distinct call signs with at least one point inside the box
# [katl[0]-r, katl[0]+r] x [katl[1]-r, katl[1]+r] and with alt above katl[2].
range_query_result = spark.sql(
  """
    SELECT DISTINCT acId
    FROM spatialdf
    WHERE ST_Contains(ST_PolygonFromEnvelope({}, {}, {}, {}), geom) AND alt>{}
  """.format(katl[0]-r, katl[1]-r, katl[0]+r, katl[1]+r, katl[2]))

In [13]:
# Number of distinct call signs matched by the range query.
range_query_result.count()

581

In [14]:
# Preview a few of the matched call signs.
range_query_result.show(5)

+-------+
|   acId|
+-------+
|DAL2041|
| UAL241|
|GJS4507|
|DAL1154|
|SKW3742|
+-------+
only showing top 5 rows



In [15]:
# Same box and altitude predicate as the DISTINCT acId query, but keeping
# every matching row and all columns of spatialdf.
df_result = spark.sql(
  """
    SELECT *
    FROM spatialdf
    WHERE ST_Contains(ST_PolygonFromEnvelope({}, {}, {}, {}), geom) AND alt>{}
  """.format(katl[0]-r, katl[1]-r, katl[0]+r, katl[1]+r, katl[2]))

# Number of points retained by the range query.
df_result.count()

156122

### Organize the format of DF

In [16]:
# create relative timestamp column: t = recTime - t_start
# NOTE: this rebinds the "spatialdf" view to the range-filtered rows and `df`
# to the query result, so earlier cells that read "spatialdf" or `df` see
# different data if they are re-run after this one.
df_result.createOrReplaceTempView("spatialdf")
df = spark.sql(
"""
    SELECT acId, recTime-{} AS t, geom, alt
    FROM spatialdf
""".format(t_start)
)
df.show(5, False)

+-------+---+--------------------------+-----+
|acId   |t  |geom                      |alt  |
+-------+---+--------------------------+-----+
|DAL1350|448|POINT (33.63582 -84.41124)|10.38|
|DAL1350|449|POINT (33.63583 -84.41117)|10.5 |
|DAL1350|450|POINT (33.63576 -84.41111)|10.25|
|DAL1350|451|POINT (33.63577 -84.41103)|10.13|
|DAL1350|665|POINT (33.63472 -84.41763)|10.75|
+-------+---+--------------------------+-----+
only showing top 5 rows



In [17]:
# change acId Into integers
# from pyspark.sql.window import Window
# from pyspark.sql.functions import dense_rank

# #windowSpec  = Window.partitionBy("acId").orderBy("t")
# windowSpec = Window.partitionBy("t").orderBy("acId")

# df.withColumn("dense_rank",dense_rank().over(windowSpec)).show(100, False)

In [18]:
# Replace the acId call-sign strings with numeric identifiers.
# The pyspark.ml StringIndexer is fitted on df and then applied to it; the
# resulting FacId column is a double (e.g. 57.0) and the original acId column
# is dropped.
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="acId", outputCol="FacId")
acid_index_model = indexer.fit(df)
df_new = acid_index_model.transform(df).drop('acId')
df_new.show(5, False)

+---+--------------------------+-----+-----+
|t  |geom                      |alt  |FacId|
+---+--------------------------+-----+-----+
|448|POINT (33.63582 -84.41124)|10.38|57.0 |
|449|POINT (33.63583 -84.41117)|10.5 |57.0 |
|450|POINT (33.63576 -84.41111)|10.25|57.0 |
|451|POINT (33.63577 -84.41103)|10.13|57.0 |
|665|POINT (33.63472 -84.41763)|10.75|57.0 |
+---+--------------------------+-----+-----+
only showing top 5 rows



In [19]:
# Flatten the geometry back into numeric columns and cast FacId to an integer.
# ST_X is read as lat and ST_Y as lon because the points were built as
# ST_Point(lat, lon). This rebinds both the "spatialdf" view and `df`.
df_new.createOrReplaceTempView("spatialdf")

df = spark.sql(
"""
    SELECT t, CAST(FacId AS Integer), ST_X(geom) as lat, ST_Y(geom) as lon, alt
    FROM spatialdf
"""
)

### Mask Columns in Data
- Unix Timestamp
- Flight Callsign

In [20]:
# Preview the masked table: relative time t and integer FacId stand in for the
# Unix timestamp and the call sign.
df.show(5, False)

+---+-----+--------+---------+-----+
|t  |FacId|lat     |lon      |alt  |
+---+-----+--------+---------+-----+
|448|57   |33.63582|-84.41124|10.38|
|449|57   |33.63583|-84.41117|10.5 |
|450|57   |33.63576|-84.41111|10.25|
|451|57   |33.63577|-84.41103|10.13|
|665|57   |33.63472|-84.41763|10.75|
+---+-----+--------+---------+-----+
only showing top 5 rows



### Modification 1: 
#### Resample the time series with an interval $dt$

In [21]:
dt = 10  # sampling interval, in units of the relative time column t

# Bounds of the relative time axis. They get their own names so that the
# epoch-based t_start / t_end defined for the time-window query are not
# overwritten; clobbering them made a re-run of the earlier cells (window
# query, relative timestamp) silently use 0 and 3600 * duration instead.
rel_start = 0
rel_end = 3600 * duration

# Keep only the rows whose t falls exactly on the dt grid; rows at any other
# t are dropped, nothing is interpolated.
t_interval = list(range(rel_start, rel_end, dt))
df = df[df.t.isin(t_interval)]

In [22]:
# Inspect the resampled series (only rows whose t is in t_interval remain).
df.show(15)

+---+-----+--------+---------+-----+
|  t|FacId|     lat|      lon|  alt|
+---+-----+--------+---------+-----+
|450|   57|33.63576|-84.41111|10.25|
|670|   57|33.63468|  -84.421|10.38|
|690|   57|33.63457|-84.43889|14.06|
|700|   57| 33.6344|-84.44845|19.13|
|710|   57| 33.6344|-84.45818| 23.5|
|720|   57|33.63442|-84.46875|25.31|
|730|   57|33.63246|   -84.48| 28.5|
|740|   57|33.62812|-84.49124|31.06|
|750|   57| 33.6228|-84.50281|34.63|
|760|   57|33.61735|-84.51479|39.31|
|770|   57|33.61187|-84.52725|44.13|
|780|   57|33.60656|-84.54127|48.63|
|790|   57|33.60111|-84.55442|54.25|
|800|   57|33.59564|-84.56774|59.44|
|810|   57| 33.5874|-84.57992| 64.0|
+---+-----+--------+---------+-----+
only showing top 15 rows



### Modification 2:
#### Change the origin of the coordinate system to the airport center
#### Not used in the real case (the saved CSV files keep the original coordinates)

In [23]:
# Shift lat/lon so that the reference point (katl[0], katl[1]) becomes the
# origin. The result is bound to df2; df itself is left unchanged.
df.createOrReplaceTempView("spatialdf")

df2 = spark.sql(
"""
    SELECT t, FacId, lat-{} AS Lat, lon-{} AS Lon, alt
    FROM spatialdf
""".format(katl[0], katl[1])
)

In [24]:
# Preview the origin-shifted coordinates.
df2.show(5, False)

+---+-----+----------------------+---------+
|t  |FacId|Lat                   |lon      |
+---+-----+----------------------+---------+
|450|57   |-9.396000000023719E-4 |-84.41111|
|670|57   |-0.0020195999999970127|-84.421  |
|690|57   |-0.0021296000000035065|-84.43889|
|700|57   |-0.0022996000000006234|-84.44845|
|710|57   |-0.0022996000000006234|-84.45818|
+---+-----+----------------------+---------+
only showing top 5 rows



### Save into csv

In [25]:
# Make sure the type is correct
# df.dtypes only reads the schema, so there is no need to collect the whole
# DataFrame to the driver with toPandas() just to inspect the column types.
df.dtypes

[('t', 'int'),
 ('FacId', 'int'),
 ('lat', 'double'),
 ('lon', 'double'),
 ('alt', 'double')]

In [26]:
# Collect the Spark DataFrame once. Both files are then written from the same
# rows in the same order, and the Spark job is not executed twice.
pdf = df.toPandas()

# save data with altitude
# The frame is transposed before writing, so each CSV line holds one column of
# df (t, FacId, lat, lon, alt), with no header and no index.
csv_name = 'KATL_r_{}_date_{}_range_{}_wAltitude.csv'.format(r, date, duration)
pdf.T.to_csv(csv_name, sep=',', index=False, header=False)
# df.coalesce(1).write.csv(csv_name, sep=',')

# save data without altitude dimension
csv_name = 'KATL_r_{}_date_{}_range_{}.csv'.format(r, date, duration)
pdf.drop(columns='alt').T.to_csv(csv_name, sep=',', index=False, header=False)
# df.coalesce(1).write.csv(csv_name, sep=',')