In [1]:
# ------------------------------------------------------------------------------------------
# IMPORTANT!!! --> Before running this code, make sure to run - "01_flights_each_day_week_14_data.ipynb"
# ------------------------------------------------------------------------------------------

In [6]:
from pyspark.sql import SparkSession
from datetime import datetime, timedelta
from pyspark.sql.functions import *
import os

In [7]:
# Working directory; used below as the base of the input data path.
cwd = os.getcwd()

# Get the active Spark session, or create one named 'SourceTargetCount'.
spark = (
    SparkSession.builder
    .appName('SourceTargetCount')
    .getOrCreate()
)

In [12]:
# Step 1 - Week 14 Flights data (OpenSky data)
#
# Read every staged CSV (header row supplies the column names; no schema is
# given, so all columns are strings), tag each row with the file it came from,
# and expose the result to Spark SQL as the temp view FDF.

file = f'{cwd}/data/stg/flights_each_day_week_14_data/*.csv'
fdf = (
    spark.read
    .csv(path=file, header=True)
    .withColumn("filename", input_file_name())
)
fdf.createOrReplaceTempView('FDF')
# fdf.printSchema()

# Schema:
# root
#  |-- flight_date: string (nullable = true)
#  |-- flight_dayofweek: string (nullable = true)
#  |-- airplane_code: string (nullable = true)
#  |-- airport_source_code: string (nullable = true)
#  |-- airport_target_code: string (nullable = true)
#  |-- flights_count: string (nullable = true)
#  |-- flight_year: string (nullable = true)
#  |-- filename: string (nullable = false)


In [13]:
# Step 2 - Check Null percentage for each attribute

# spark.sql("""

# select  count(*)            as row_count,
#         cast(sum( case when flight_date         is not null then 1 else 0 end) as double) / count(*) * 100 as NN_flight_date     ,
#         cast(sum( case when flight_dayofweek    is not null then 1 else 0 end) as double) / count(*) * 100 as NN_flight_dayofweek,
#         cast(sum( case when airplane_code       is not null then 1 else 0 end) as double) / count(*) * 100 as NN_airplane_code   ,
#         cast(sum( case when airport_source_code is not null then 1 else 0 end) as double) / count(*) * 100 as NN_airport_source_code,
#         cast(sum( case when airport_target_code is not null then 1 else 0 end) as double) / count(*) * 100 as NN_airport_target_code,
#         cast(sum( case when flights_count       is not null then 1 else 0 end) as double) / count(*) * 100 as NN_flights_count   ,
#         cast(sum( case when year(flight_date)   is not null then 1 else 0 end) as double) / count(*) * 100 as NN_flight_year
# from    FDF

# """).show(10,False)


# +---------+--------------+-------------------+-----------------+---------+---------+----------------+--------------+
# |row_count|NN_flight_date|NN_flight_dayofweek|NN_airplane_code |NN_Source|NN_Target|NN_flights_count|NN_flight_year|
# +---------+--------------+-------------------+-----------------+---------+---------+----------------+--------------+
# |502819   |100.0         |100.0              |63.76867222598987|100.0    |100.0    |100.0           |100.0         |
# +---------+--------------+-------------------+-----------------+---------+---------+----------------+--------------+

# As we can see, most of our attributes are at 100% non-null, except airplane_code at ~63%

In [14]:
# Step 3 - Aggregate data for entire week 14 per year 

# source_target_count_week_14_per_year = spark.sql("""

# select  airplane_code               as airplane_code,
#         airport_source_code         as Source,
#         airport_target_code         as Target,
#         sum( flights_count )        as Weight,
#         year(flight_date) || '_W14' as week_14_year
# from    FDF
# group by 1,2,3,5

# """).show(10,False)

# Aggregate the Week-14 flights per (source airport, target airport, year).
#
# - FDF was read without a schema, so `flights_count` is a string column.
#   Summing it directly makes Spark coerce the values to DOUBLE, and the
#   exported CSV would then carry counts such as "12.0". The total is cast
#   to BIGINT so the counts stay integral.
# - The year is projected twice on purpose: `year` is the column handed to
#   partitionBy('year') when the result is written, while `flight_year`
#   keeps the year available as a regular data column.
source_target_count_week_14_per_year = spark.sql("""

select  airport_source_code                  as Source,
        airport_target_code                  as Target,
        cast(sum( flights_count ) as bigint) as flights_count,
        year(flight_date)                    as flight_year,
        year(flight_date)                    as year
from    FDF
group by 1,2,4,5

""")

# Number of distinct (Source, Target, year) combinations.
source_target_count_week_14_per_year.count()


187989

In [15]:
# Persist the aggregate as quoted CSV with a header row, one sub-folder per
# `year` value. repartition(1) collapses the data into a single partition
# first, so each year folder is expected to end up with a single part file.
(
    source_target_count_week_14_per_year
    .repartition(1)
    .write
    .partitionBy('year')
    .mode('overwrite')
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .option("quoteAll", "true")
    .save("./data/stg/source_target_count_week_14_per_year")
)

In [17]:
# Copy the CSV written under the 2019 partition folder to a stable file name
# (flights_2019.csv) directly inside the output folder
!cp ./data/stg/source_target_count_week_14_per_year/*2019*/*.csv ./data/stg/source_target_count_week_14_per_year/flights_2019.csv 
# Delete the Spark partition folder for 2019 now that its CSV has been copied
!rm -rf ./data/stg/source_target_count_week_14_per_year/year=2019*
# Check that the folder was deleted: the glob should no longer match anything,
# so the shell reporting "no matches found" is the expected result here
!ls ./data/stg/source_target_count_week_14_per_year/year=2019*/*.csv

zsh:1: no matches found: ./data/stg/source_target_count_week_14_per_year/year=2019*/*.csv


In [18]:
# Copy the CSV written under the 2020 partition folder to a stable file name
# (flights_2020.csv) directly inside the output folder
!cp ./data/stg/source_target_count_week_14_per_year/*2020*/*.csv ./data/stg/source_target_count_week_14_per_year/flights_2020.csv 
# Delete the Spark partition folder for 2020 now that its CSV has been copied
!rm -rf ./data/stg/source_target_count_week_14_per_year/year=2020*
# Check that the folder was deleted: the glob should no longer match anything,
# so the shell reporting "no matches found" is the expected result here
!ls ./data/stg/source_target_count_week_14_per_year/year=2020*/*.csv

zsh:1: no matches found: ./data/stg/source_target_count_week_14_per_year/year=2020*/*.csv


In [19]:
# Copy the CSV written under the 2021 partition folder to a stable file name
# (flights_2021.csv) directly inside the output folder
!cp ./data/stg/source_target_count_week_14_per_year/*2021*/*.csv ./data/stg/source_target_count_week_14_per_year/flights_2021.csv 
# Delete the Spark partition folder for 2021 now that its CSV has been copied
!rm -rf ./data/stg/source_target_count_week_14_per_year/year=2021*
# Check that the folder was deleted: the glob should no longer match anything,
# so the shell reporting "no matches found" is the expected result here
!ls ./data/stg/source_target_count_week_14_per_year/year=2021*/*.csv

zsh:1: no matches found: ./data/stg/source_target_count_week_14_per_year/year=2021*/*.csv
