In [0]:
# import libraries
import pyspark.sql.functions as F
from pyspark.sql.types import *
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pyspark.ml.linalg import DenseVector, SparseVector, Vectors
from pyspark.ml.feature import VectorAssembler, StandardScaler

from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier, LogisticRegression, GBTClassifier 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from sparkdl.xgboost import XgboostClassifier

In [0]:
# Azure Blob Storage access; the credential is pulled from a Databricks secret scope.
blob_container = "container1"  # container created in https://portal.azure.com
storage_account = "w261sec6group3"  # storage account created in https://portal.azure.com
secret_scope = "w261sec6group3_scope"  # secret scope created on your local machine with the Databricks CLI
secret_key = "w261sec6group3_key"  # secret key created on your local machine with the Databricks CLI

mount_path = "/mnt/mids-w261"
blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"

# Register the stored secret under the container's SAS config key in the Spark session.
sas_conf_key = f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net"
spark.conf.set(sas_conf_key, dbutils.secrets.get(scope=secret_scope, key=secret_key))

### Load Data

In [0]:
# Load the preprocessed dataset (parquet) from blob storage.
final_vectorized_path = f"{blob_url}/final_vectorized_new"
df_final = spark.read.parquet(final_vectorized_path)

In [0]:
# Preview the loaded dataset with the Databricks notebook display helper.
display(df_final)

quarter,month,day_of_month,day_of_week,fl_date,op_unique_carrier,op_carrier_airline_id,op_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin_airport_seq_id,origin_city_market_id,origin,origin_city_name,origin_state_abr,origin_state_fips,origin_state_nm,origin_wac,dest_airport_id,dest_airport_seq_id,dest_city_market_id,dest,dest_city_name,dest_state_abr,dest_state_fips,dest_state_nm,dest_wac,crs_dep_time,dep_time,dep_delay,dep_delay_new,dep_del15,dep_delay_group,dep_time_blk,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,arr_time,arr_delay,arr_delay_new,arr_del15,arr_delay_group,arr_time_blk,cancelled,diverted,crs_elapsed_time,actual_elapsed_time,air_time,flights,distance,distance_group,div_airport_landings,year,IATA,station_id,name,icao,lat,lon,altitude,station_lon,station_lat,timezone,type,crs_dep_hour,flight_hour,flight_time,flight_time_utc,flight_time_utc_hour,flight_time_utc_year,flight_time_utc_month,flight_time_utc_date,is_holiday,weather_station,weather_station_name,metar_hour,avg_hourly_dew_point_temp,avg_hourly_dry_bulb_temp,avg_hourly_relative_humidity,avg_hourly_station_pressure,avg_hourly_visibility,avg_hourly_wind_direction,avg_hourly_wind_speed,avg_hourly_precipitation_ordinal,pagerank,avg_hourly_dew_point_temp_lag6,avg_hourly_dry_bulb_temp_lag6,avg_hourly_relative_humidity_lag6,avg_hourly_station_pressure_lag6,avg_hourly_visibility_lag6,avg_hourly_wind_direction_lag6,avg_hourly_wind_speed_lag6,avg_hourly_precipitation_ordinal_lag6,avg_hourly_dew_point_temp_lag12,avg_hourly_dry_bulb_temp_lag12,avg_hourly_relative_humidity_lag12,avg_hourly_station_pressure_lag12,avg_hourly_visibility_lag12,avg_hourly_wind_direction_lag12,avg_hourly_wind_speed_lag12,avg_hourly_precipitation_ordinal_lag12,arr_del15_lag1,origin_lag1,frequent_delay,total_flights,quarterIndex,quarterclassVec,day_of_weekIndex,day_of_weekclassVec,op_unique_carrierIndex,op_unique_carrierclassVec,tail_numIndex,tail_numclassVec,originIndex,originclassVec,origin_state_abrIndex,origin_
state_abrclassVec,destIndex,destclassVec,dest_state_abrIndex,dest_state_abrclassVec,crs_dep_hourIndex,crs_dep_hourclassVec,flight_time_utc_hourIndex,flight_time_utc_hourclassVec,flight_time_utc_yearIndex,flight_time_utc_yearclassVec,flight_time_utc_monthIndex,flight_time_utc_monthclassVec,avg_hourly_wind_directionIndex,avg_hourly_wind_directionclassVec,avg_hourly_wind_direction_lag6Index,avg_hourly_wind_direction_lag6classVec,avg_hourly_wind_direction_lag12Index,avg_hourly_wind_direction_lag12classVec,arr_del15_lag1Index,arr_del15_lag1classVec,origin_lag1Index,origin_lag1classVec,is_holidayIndex,is_holidayclassVec,frequent_delayIndex,frequent_delayclassVec,features
1,1,30,5,2015-01-30,WN,19393,WN,N218WN,2428,10423,1042302,30423,AUS,"Austin, TX",TX,48,Texas,74,11259,1125903,30194,DAL,"Dallas, TX",TX,48,Texas,74,2145,2141,-4.0,0.0,0.0,-1,2100-2159,15.0,2156,2231,4.0,2240,2235,-5.0,0.0,0.0,-1,2200-2259,0.0,0.0,55.0,54.0,35.0,1.0,189.0,1,0,2015,AUS,72254013904,Austin Bergstrom International Airport,KAUS,30.194499969482425,-97.6698989868164,542,-97.68,30.183,America/Chicago,large_airport,21.0,2145,2015-01-30T21:45:00.000+0000,2015-01-31T03:45:00.000+0000,3,2015,1,31,0,72254013904,"AUSTIN BERGSTROM INTERNATIONAL AIRPORT, TX US",2015-01-31T00:00:00.000+0000,40.0,50.0,68.0,29.709999084472656,10.0,70.0,5.0,0.0,5.0,39.0,51.0,64.0,29.75,10.0,40.0,6.0,0.0,40.0,57.0,53.0,29.76000022888184,10.0,40.0,9.0,0.0,0.0,ATL,2,446585,1.0,"Map(vectorType -> sparse, length -> 3, indices -> List(1), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 6, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 19, indices -> List(0), values -> List(1.0))",120.0,"Map(vectorType -> sparse, length -> 8399, indices -> List(120), values -> List(1.0))",32.0,"Map(vectorType -> sparse, length -> 375, indices -> List(32), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 50, indices -> List(1), values -> List(1.0))",27.0,"Map(vectorType -> sparse, length -> 383, indices -> List(27), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 52, indices -> List(1), values -> List(1.0))",15.0,"Map(vectorType -> sparse, length -> 23, indices -> List(15), values -> List(1.0))",17.0,"Map(vectorType -> sparse, length -> 23, indices -> List(17), values -> List(1.0))",3.0,"Map(vectorType -> sparse, length -> 7, indices -> List(3), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 11, indices -> List(2), values -> List(1.0))",36.0,"Map(vectorType -> sparse, length -> 1231, indices -> List(36), values -> List(1.0))",33.0,"Map(vectorType -> sparse, length -> 1287, indices -> List(33), values -> 
List(1.0))",33.0,"Map(vectorType -> sparse, length -> 1260, indices -> List(33), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 375, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 13532, indices -> List(1, 3, 9, 148, 8459, 8803, 8879, 9236, 9302, 9327, 9336, 9342, 9387, 10615, 11902, 13129, 13130, 13505, 13508, 13509, 13510, 13511, 13512, 13513, 13514, 13515, 13516, 13518, 13519, 13520, 13521, 13522, 13523, 13525, 13526, 13527, 13528, 13529, 13530), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 55.0, 5.0, 446585.0, 40.0, 50.0, 68.0, 29.709999084472656, 10.0, 5.0, 39.0, 51.0, 64.0, 29.75, 10.0, 6.0, 40.0, 57.0, 53.0, 29.760000228881836, 10.0, 9.0))"
1,1,20,2,2015-01-20,OO,20304,OO,N746SK,6198,11292,1129202,30325,DEN,"Denver, CO",CO,8,Colorado,82,11603,1160302,31603,EUG,"Eugene, OR",OR,41,Oregon,92,1850,1850,0.0,0.0,0.0,0,1800-1859,13.0,1903,2022,4.0,2043,2026,-17.0,0.0,0.0,-2,2000-2059,0.0,0.0,173.0,156.0,139.0,1.0,996.0,4,0,2015,DEN,72565003017,Denver International Airport,KDEN,39.861698150635,-104.672996521,5431,-104.658,39.833,America/Denver,large_airport,18.0,1850,2015-01-20T18:50:00.000+0000,2015-01-21T01:50:00.000+0000,1,2015,1,21,1,72565003017,"DENVER INTERNATIONAL AIRPORT, CO US",2015-01-20T22:00:00.000+0000,22.0,29.0,75.0,24.709999084472656,10.0,30.0,5.0,0.0,5.0,23.0,28.0,81.0,24.700000762939453,10.0,110.0,6.0,0.0,15.0,42.0,33.0,24.649999618530277,10.0,0.0,0.0,0.0,0.0,PSP,2,446585,1.0,"Map(vectorType -> sparse, length -> 3, indices -> List(1), values -> List(1.0))",5.0,"Map(vectorType -> sparse, length -> 6, indices -> List(5), values -> List(1.0))",3.0,"Map(vectorType -> sparse, length -> 19, indices -> List(3), values -> List(1.0))",837.0,"Map(vectorType -> sparse, length -> 8399, indices -> List(837), values -> List(1.0))",3.0,"Map(vectorType -> sparse, length -> 375, indices -> List(3), values -> List(1.0))",6.0,"Map(vectorType -> sparse, length -> 50, indices -> List(6), values -> List(1.0))",124.0,"Map(vectorType -> sparse, length -> 383, indices -> List(124), values -> List(1.0))",23.0,"Map(vectorType -> sparse, length -> 52, indices -> List(23), values -> List(1.0))",9.0,"Map(vectorType -> sparse, length -> 23, indices -> List(9), values -> List(1.0))",14.0,"Map(vectorType -> sparse, length -> 23, indices -> List(14), values -> List(1.0))",3.0,"Map(vectorType -> sparse, length -> 7, indices -> List(3), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 11, indices -> List(2), values -> List(1.0))",32.0,"Map(vectorType -> sparse, length -> 1231, indices -> List(32), values -> List(1.0))",27.0,"Map(vectorType -> sparse, length -> 1287, indices -> List(27), values -> 
List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1260, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",78.0,"Map(vectorType -> sparse, length -> 375, indices -> List(78), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",2.0,"Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 13532, indices -> List(1, 8, 12, 865, 8430, 8808, 8976, 9258, 9296, 9324, 9336, 9342, 9383, 10609, 11869, 13129, 13208, 13508, 13509, 13510, 13511, 13512, 13513, 13514, 13515, 13516, 13518, 13519, 13520, 13521, 13522, 13523, 13525, 13526, 13527, 13528, 13529), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 173.0, 5.0, 446585.0, 22.0, 29.0, 75.0, 24.709999084472656, 10.0, 5.0, 23.0, 28.0, 81.0, 24.700000762939453, 10.0, 6.0, 15.0, 42.0, 33.0, 24.649999618530273, 10.0))"
1,1,3,6,2015-01-03,WN,19393,WN,N289CT,2775,12889,1288903,32211,LAS,"Las Vegas, NV",NV,32,Nevada,85,11884,1188402,31884,GEG,"Spokane, WA",WA,53,Washington,93,1410,1415,5.0,5.0,0.0,0,1400-1459,13.0,1428,1632,3.0,1635,1635,0.0,0.0,0.0,0,1600-1659,0.0,0.0,145.0,140.0,124.0,1.0,806.0,4,0,2015,LAS,72386023169,McCarran International Airport,KLAS,36.08010101,-115.1520004,2181,-115.163,36.072,America/Los_Angeles,large_airport,14.0,1410,2015-01-03T14:10:00.000+0000,2015-01-03T22:10:00.000+0000,22,2015,1,3,1,72386023169,"MCCARRAN INTERNATIONAL AIRPORT, NV US",2015-01-03T19:00:00.000+0000,16.0,44.0,32.0,27.93000030517578,10.0,210.0,7.0,0.0,5.0,15.0,48.0,27.0,27.86000061035156,10.0,150.0,3.0,0.0,13.0,47.0,25.0,27.8700008392334,10.0,160.0,3.0,0.0,1.0,MAF,2,446585,1.0,"Map(vectorType -> sparse, length -> 3, indices -> List(1), values -> List(1.0))",6.0,"Map(vectorType -> sparse, length -> 6, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 19, indices -> List(0), values -> List(1.0))",174.0,"Map(vectorType -> sparse, length -> 8399, indices -> List(174), values -> List(1.0))",11.0,"Map(vectorType -> sparse, length -> 375, indices -> List(11), values -> List(1.0))",12.0,"Map(vectorType -> sparse, length -> 50, indices -> List(12), values -> List(1.0))",75.0,"Map(vectorType -> sparse, length -> 383, indices -> List(75), values -> List(1.0))",12.0,"Map(vectorType -> sparse, length -> 52, indices -> List(12), values -> List(1.0))",12.0,"Map(vectorType -> sparse, length -> 23, indices -> List(12), values -> List(1.0))",7.0,"Map(vectorType -> sparse, length -> 23, indices -> List(7), values -> List(1.0))",3.0,"Map(vectorType -> sparse, length -> 7, indices -> List(3), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 11, indices -> List(2), values -> List(1.0))",7.0,"Map(vectorType -> sparse, length -> 1231, indices -> List(7), values -> List(1.0))",18.0,"Map(vectorType -> sparse, length -> 1287, indices -> List(18), values -> 
List(1.0))",7.0,"Map(vectorType -> sparse, length -> 1260, indices -> List(7), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",95.0,"Map(vectorType -> sparse, length -> 375, indices -> List(95), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",2.0,"Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 13532, indices -> List(1, 9, 202, 8438, 8814, 8927, 9247, 9299, 9317, 9336, 9342, 9358, 10600, 11876, 13225, 13508, 13509, 13510, 13511, 13512, 13513, 13514, 13515, 13516, 13518, 13519, 13520, 13521, 13522, 13523, 13525, 13526, 13527, 13528, 13529, 13530), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 145.0, 5.0, 446585.0, 16.0, 44.0, 32.0, 27.93000030517578, 10.0, 7.0, 15.0, 48.0, 27.0, 27.860000610351562, 10.0, 3.0, 13.0, 47.0, 25.0, 27.8700008392334, 10.0, 3.0))"
1,1,30,5,2015-01-30,UA,19977,UA,N490UA,382,14524,1452401,34524,RIC,"Richmond, VA",VA,51,Virginia,38,13930,1393003,30977,ORD,"Chicago, IL",IL,17,Illinois,41,945,947,2.0,2.0,0.0,0,0900-0959,11.0,958,1038,3.0,1059,1041,-18.0,0.0,0.0,-2,1000-1059,0.0,0.0,134.0,114.0,100.0,1.0,642.0,3,0,2015,RIC,72401013740,Richmond International Airport,KRIC,37.50519943237305,-77.3197021484375,167,-77.323,37.512,America/New_York,large_airport,9.0,945,2015-01-30T09:45:00.000+0000,2015-01-30T14:45:00.000+0000,14,2015,1,30,0,72401013740,"RICHMOND INTERNATIONAL AIRPORT, VA US",2015-01-30T11:00:00.000+0000,20.0,44.0,38.0,29.850000381469727,10.0,310.0,15.0,0.0,5.0,27.0,43.0,53.0,29.82999992370605,10.0,340.0,14.0,0.0,25.0,41.0,53.0,29.739999771118164,10.0,220.0,5.0,0.0,0.0,ORD,2,446585,1.0,"Map(vectorType -> sparse, length -> 3, indices -> List(1), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 6, indices -> List(0), values -> List(1.0))",4.0,"Map(vectorType -> sparse, length -> 19, indices -> List(4), values -> List(1.0))",1664.0,"Map(vectorType -> sparse, length -> 8399, indices -> List(1664), values -> List(1.0))",57.0,"Map(vectorType -> sparse, length -> 375, indices -> List(57), values -> List(1.0))",8.0,"Map(vectorType -> sparse, length -> 50, indices -> List(8), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 383, indices -> List(1), values -> List(1.0))",4.0,"Map(vectorType -> sparse, length -> 52, indices -> List(4), values -> List(1.0))",8.0,"Map(vectorType -> sparse, length -> 23, indices -> List(8), values -> List(1.0))",5.0,"Map(vectorType -> sparse, length -> 23, indices -> List(5), values -> List(1.0))",3.0,"Map(vectorType -> sparse, length -> 7, indices -> List(3), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 11, indices -> List(2), values -> List(1.0))",12.0,"Map(vectorType -> sparse, length -> 1231, indices -> List(12), values -> List(1.0))",22.0,"Map(vectorType -> sparse, length -> 1287, indices -> List(22), values -> 
List(1.0))",6.0,"Map(vectorType -> sparse, length -> 1260, indices -> List(6), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 375, indices -> List(1), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 13532, indices -> List(1, 3, 13, 1692, 8484, 8810, 8853, 9239, 9295, 9315, 9336, 9342, 9363, 10604, 11875, 13129, 13131, 13505, 13508, 13509, 13510, 13511, 13512, 13513, 13514, 13515, 13516, 13518, 13519, 13520, 13521, 13522, 13523, 13525, 13526, 13527, 13528, 13529, 13530), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 134.0, 5.0, 446585.0, 20.0, 44.0, 38.0, 29.850000381469727, 10.0, 15.0, 27.0, 43.0, 53.0, 29.829999923706055, 10.0, 14.0, 25.0, 41.0, 53.0, 29.739999771118164, 10.0, 5.0))"
1,1,20,2,2015-01-20,WN,19393,WN,N8309C,181,14570,1457002,34570,RNO,"Reno, NV",NV,32,Nevada,85,12889,1288903,32211,LAS,"Las Vegas, NV",NV,32,Nevada,85,700,657,-3.0,0.0,0.0,-1,0700-0759,16.0,713,806,8.0,825,814,-11.0,0.0,0.0,-1,0800-0859,0.0,0.0,85.0,77.0,53.0,1.0,345.0,2,0,2015,RNO,72488023185,Reno Tahoe International Airport,KRNO,39.49909973144531,-119.76799774169922,4415,-119.771,39.484,America/Los_Angeles,large_airport,7.0,700,2015-01-20T07:00:00.000+0000,2015-01-20T15:00:00.000+0000,15,2015,1,20,1,72488023185,"RENO AIRPORT, NV US",2015-01-20T12:00:00.000+0000,25.0,49.0,39.0,25.68000030517578,10.0,0.0,0.0,0.0,5.0,28.0,38.0,68.0,25.75,10.0,0.0,0.0,0.0,26.0,31.0,82.0,25.71999931335449,10.0,0.0,0.0,0.0,0.0,SAN,2,446585,1.0,"Map(vectorType -> sparse, length -> 3, indices -> List(1), values -> List(1.0))",5.0,"Map(vectorType -> sparse, length -> 6, indices -> List(5), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 19, indices -> List(0), values -> List(1.0))",1349.0,"Map(vectorType -> sparse, length -> 8399, indices -> List(1349), values -> List(1.0))",69.0,"Map(vectorType -> sparse, length -> 375, indices -> List(69), values -> List(1.0))",12.0,"Map(vectorType -> sparse, length -> 50, indices -> List(12), values -> List(1.0))",7.0,"Map(vectorType -> sparse, length -> 383, indices -> List(7), values -> List(1.0))",11.0,"Map(vectorType -> sparse, length -> 52, indices -> List(11), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 23, indices -> List(1), values -> List(1.0))",3.0,"Map(vectorType -> sparse, length -> 23, indices -> List(3), values -> List(1.0))",3.0,"Map(vectorType -> sparse, length -> 7, indices -> List(3), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 11, indices -> List(2), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1231, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1287, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, 
length -> 1260, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",25.0,"Map(vectorType -> sparse, length -> 375, indices -> List(25), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",2.0,"Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 13532, indices -> List(1, 8, 9, 1377, 8496, 8814, 8859, 9246, 9288, 9313, 9336, 9342, 9351, 10582, 11869, 13129, 13155, 13508, 13509, 13510, 13511, 13512, 13513, 13514, 13515, 13518, 13519, 13520, 13521, 13522, 13525, 13526, 13527, 13528, 13529), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 85.0, 5.0, 446585.0, 25.0, 49.0, 39.0, 25.68000030517578, 10.0, 28.0, 38.0, 68.0, 25.75, 10.0, 26.0, 31.0, 82.0, 25.719999313354492, 10.0))"
1,1,15,4,2015-01-15,WN,19393,WN,N557WN,677,14570,1457002,34570,RNO,"Reno, NV",NV,32,Nevada,85,12889,1288903,32211,LAS,"Las Vegas, NV",NV,32,Nevada,85,1920,1917,-3.0,0.0,0.0,-1,1900-1959,6.0,1923,2017,6.0,2030,2023,-7.0,0.0,0.0,-1,2000-2059,0.0,0.0,70.0,66.0,54.0,1.0,345.0,2,0,2015,RNO,72488023185,Reno Tahoe International Airport,KRNO,39.49909973144531,-119.76799774169922,4415,-119.771,39.484,America/Los_Angeles,large_airport,19.0,1920,2015-01-15T19:20:00.000+0000,2015-01-16T03:20:00.000+0000,3,2015,1,16,1,72488023185,"RENO AIRPORT, NV US",2015-01-16T00:00:00.000+0000,27.0,37.0,67.0,25.71999931335449,10.0,0.0,0.0,0.0,5.0,27.0,40.0,60.0,25.76000022888184,10.0,360.0,3.0,0.0,25.0,48.0,41.0,25.770000457763672,10.0,150.0,3.0,0.0,0.0,LAS,2,446585,1.0,"Map(vectorType -> sparse, length -> 3, indices -> List(1), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 6, indices -> List(1), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 19, indices -> List(0), values -> List(1.0))",502.0,"Map(vectorType -> sparse, length -> 8399, indices -> List(502), values -> List(1.0))",69.0,"Map(vectorType -> sparse, length -> 375, indices -> List(69), values -> List(1.0))",12.0,"Map(vectorType -> sparse, length -> 50, indices -> List(12), values -> List(1.0))",7.0,"Map(vectorType -> sparse, length -> 383, indices -> List(7), values -> List(1.0))",11.0,"Map(vectorType -> sparse, length -> 52, indices -> List(11), values -> List(1.0))",13.0,"Map(vectorType -> sparse, length -> 23, indices -> List(13), values -> List(1.0))",17.0,"Map(vectorType -> sparse, length -> 23, indices -> List(17), values -> List(1.0))",3.0,"Map(vectorType -> sparse, length -> 7, indices -> List(3), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 11, indices -> List(2), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1231, indices -> List(0), values -> List(1.0))",28.0,"Map(vectorType -> sparse, length -> 1287, indices -> List(28), values -> 
List(1.0))",19.0,"Map(vectorType -> sparse, length -> 1260, indices -> List(19), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",7.0,"Map(vectorType -> sparse, length -> 375, indices -> List(7), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",2.0,"Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 13532, indices -> List(1, 4, 9, 530, 8496, 8814, 8859, 9246, 9300, 9327, 9336, 9342, 9351, 10610, 11888, 13129, 13137, 13508, 13509, 13510, 13511, 13512, 13513, 13514, 13515, 13518, 13519, 13520, 13521, 13522, 13523, 13525, 13526, 13527, 13528, 13529, 13530), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 70.0, 5.0, 446585.0, 27.0, 37.0, 67.0, 25.719999313354492, 10.0, 27.0, 40.0, 60.0, 25.760000228881836, 10.0, 3.0, 25.0, 48.0, 41.0, 25.770000457763672, 10.0, 3.0))"
1,1,30,5,2015-01-30,WN,19393,WN,N231WN,346,14679,1467903,33570,SAN,"San Diego, CA",CA,6,California,91,13796,1379603,32457,OAK,"Oakland, CA",CA,6,California,91,1300,1259,-1.0,0.0,0.0,-1,1300-1359,9.0,1308,1412,8.0,1430,1420,-10.0,0.0,0.0,-1,1400-1459,0.0,0.0,90.0,81.0,64.0,1.0,446.0,2,0,2015,SAN,72290023188,San Diego International Airport,KSAN,32.7336006165,-117.190002441,17,-117.183,32.734,America/Los_Angeles,large_airport,13.0,1300,2015-01-30T13:00:00.000+0000,2015-01-30T21:00:00.000+0000,21,2015,1,30,0,72290023188,"SAN DIEGO INTERNATIONAL AIRPORT, CA US",2015-01-30T18:00:00.000+0000,52.0,62.0,70.0,29.8700008392334,10.0,0.0,0.0,0.0,5.0,49.0,63.0,60.0,29.850000381469727,10.0,300.0,5.0,0.0,55.0,62.0,78.0,29.93000030517578,10.0,0.0,0.0,0.0,0.0,SAT,2,446585,1.0,"Map(vectorType -> sparse, length -> 3, indices -> List(1), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 6, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 19, indices -> List(0), values -> List(1.0))",185.0,"Map(vectorType -> sparse, length -> 8399, indices -> List(185), values -> List(1.0))",24.0,"Map(vectorType -> sparse, length -> 375, indices -> List(24), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 50, indices -> List(0), values -> List(1.0))",38.0,"Map(vectorType -> sparse, length -> 383, indices -> List(38), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 52, indices -> List(0), values -> List(1.0))",10.0,"Map(vectorType -> sparse, length -> 23, indices -> List(10), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 23, indices -> List(0), values -> List(1.0))",3.0,"Map(vectorType -> sparse, length -> 7, indices -> List(3), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 11, indices -> List(2), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1231, indices -> List(0), values -> List(1.0))",7.0,"Map(vectorType -> sparse, length -> 1287, indices -> List(7), values -> 
List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1260, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",46.0,"Map(vectorType -> sparse, length -> 375, indices -> List(46), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 13532, indices -> List(1, 3, 9, 213, 8451, 8802, 8890, 9235, 9297, 9310, 9336, 9342, 9351, 10589, 11869, 13129, 13176, 13505, 13508, 13509, 13510, 13511, 13512, 13513, 13514, 13515, 13518, 13519, 13520, 13521, 13522, 13523, 13525, 13526, 13527, 13528, 13529), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 90.0, 5.0, 446585.0, 52.0, 62.0, 70.0, 29.8700008392334, 10.0, 49.0, 63.0, 60.0, 29.850000381469727, 10.0, 5.0, 55.0, 62.0, 78.0, 29.93000030517578, 10.0))"
1,1,11,7,2015-01-11,WN,19393,WN,N963WN,4831,14679,1467903,33570,SAN,"San Diego, CA",CA,6,California,91,13796,1379603,32457,OAK,"Oakland, CA",CA,6,California,91,2110,2246,96.0,96.0,1.0,6,2100-2159,10.0,2256,4,6.0,2240,10,90.0,90.0,1.0,6,2200-2259,0.0,0.0,90.0,84.0,68.0,1.0,446.0,2,0,2015,SAN,72290023188,San Diego International Airport,KSAN,32.7336006165,-117.190002441,17,-117.183,32.734,America/Los_Angeles,large_airport,21.0,2110,2015-01-11T21:10:00.000+0000,2015-01-12T05:10:00.000+0000,5,2015,1,12,0,72290023188,"SAN DIEGO INTERNATIONAL AIRPORT, CA US",2015-01-12T02:00:00.000+0000,57.0,60.333333333333336,89.66666666666667,30.11000061035156,3.1666666666666665,0.0,0.0,1.0,5.0,55.25,61.0,83.25,30.09000015258789,10.0,0.0,0.0,0.0,56.0,61.0,85.0,30.020000457763672,10.0,326.6666666666667,4.333333333333333,0.0,1.0,LAS,2,446585,1.0,"Map(vectorType -> sparse, length -> 3, indices -> List(1), values -> List(1.0))",4.0,"Map(vectorType -> sparse, length -> 6, indices -> List(4), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 19, indices -> List(0), values -> List(1.0))",23.0,"Map(vectorType -> sparse, length -> 8399, indices -> List(23), values -> List(1.0))",24.0,"Map(vectorType -> sparse, length -> 375, indices -> List(24), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 50, indices -> List(0), values -> List(1.0))",38.0,"Map(vectorType -> sparse, length -> 383, indices -> List(38), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 52, indices -> List(0), values -> List(1.0))",15.0,"Map(vectorType -> sparse, length -> 23, indices -> List(15), values -> List(1.0))",19.0,"Map(vectorType -> sparse, length -> 23, indices -> List(19), values -> List(1.0))",3.0,"Map(vectorType -> sparse, length -> 7, indices -> List(3), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 11, indices -> List(2), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1231, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> 
sparse, length -> 1287, indices -> List(0), values -> List(1.0))",127.0,"Map(vectorType -> sparse, length -> 1260, indices -> List(127), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",7.0,"Map(vectorType -> sparse, length -> 375, indices -> List(7), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 13532, indices -> List(1, 7, 9, 51, 8451, 8802, 8890, 9235, 9302, 9329, 9336, 9342, 9351, 10582, 11996, 13137, 13505, 13508, 13509, 13510, 13511, 13512, 13513, 13514, 13515, 13517, 13518, 13519, 13520, 13521, 13522, 13525, 13526, 13527, 13528, 13529, 13530), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 90.0, 5.0, 446585.0, 57.0, 60.333333333333336, 89.66666666666667, 30.110000610351562, 3.1666666666666665, 1.0, 55.25, 61.0, 83.25, 30.09000015258789, 10.0, 56.0, 61.0, 85.0, 30.020000457763672, 10.0, 4.333333333333333))"
1,1,12,1,2015-01-12,WN,19393,WN,N345SA,4329,14679,1467903,33570,SAN,"San Diego, CA",CA,6,California,91,13796,1379603,32457,OAK,"Oakland, CA",CA,6,California,91,1445,1450,5.0,5.0,0.0,0,1400-1459,18.0,1508,1619,5.0,1610,1624,14.0,14.0,0.0,0,1600-1659,0.0,0.0,85.0,94.0,71.0,1.0,446.0,2,0,2015,SAN,72290023188,San Diego International Airport,KSAN,32.7336006165,-117.190002441,17,-117.183,32.734,America/Los_Angeles,large_airport,14.0,1445,2015-01-12T14:45:00.000+0000,2015-01-12T22:45:00.000+0000,22,2015,1,12,0,72290023188,"SAN DIEGO INTERNATIONAL AIRPORT, CA US",2015-01-12T19:00:00.000+0000,54.0,61.0,78.0,30.100000381469727,10.0,330.0,3.0,0.0,5.0,55.0,62.0,78.0,30.09000015258789,10.0,310.0,8.0,0.0,55.5,64.0,75.0,30.15999984741211,10.0,0.0,0.0,0.0,0.0,LAS,2,446585,1.0,"Map(vectorType -> sparse, length -> 3, indices -> List(1), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 6, indices -> List(2), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 19, indices -> List(0), values -> List(1.0))",4985.0,"Map(vectorType -> sparse, length -> 8399, indices -> List(4985), values -> List(1.0))",24.0,"Map(vectorType -> sparse, length -> 375, indices -> List(24), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 50, indices -> List(0), values -> List(1.0))",38.0,"Map(vectorType -> sparse, length -> 383, indices -> List(38), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 52, indices -> List(0), values -> List(1.0))",12.0,"Map(vectorType -> sparse, length -> 23, indices -> List(12), values -> List(1.0))",7.0,"Map(vectorType -> sparse, length -> 23, indices -> List(7), values -> List(1.0))",3.0,"Map(vectorType -> sparse, length -> 7, indices -> List(3), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 11, indices -> List(2), values -> List(1.0))",19.0,"Map(vectorType -> sparse, length -> 1231, indices -> List(19), values -> List(1.0))",12.0,"Map(vectorType -> sparse, length -> 1287, indices -> List(12), values -> 
List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1260, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",7.0,"Map(vectorType -> sparse, length -> 375, indices -> List(7), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 13532, indices -> List(1, 5, 9, 5013, 8451, 8802, 8890, 9235, 9299, 9317, 9336, 9342, 9370, 10594, 11869, 13129, 13137, 13505, 13508, 13509, 13510, 13511, 13512, 13513, 13514, 13515, 13516, 13518, 13519, 13520, 13521, 13522, 13523, 13525, 13526, 13527, 13528, 13529), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 85.0, 5.0, 446585.0, 54.0, 61.0, 78.0, 30.100000381469727, 10.0, 3.0, 55.0, 62.0, 78.0, 30.09000015258789, 10.0, 8.0, 55.5, 64.0, 75.0, 30.15999984741211, 10.0))"
1,1,29,4,2015-01-29,WN,19393,WN,N432WN,107,14679,1467903,33570,SAN,"San Diego, CA",CA,6,California,91,13796,1379603,32457,OAK,"Oakland, CA",CA,6,California,91,1655,1705,10.0,10.0,0.0,0,1600-1659,7.0,1712,1815,6.0,1825,1821,-4.0,0.0,0.0,-1,1800-1859,0.0,0.0,90.0,76.0,63.0,1.0,446.0,2,0,2015,SAN,72290023188,San Diego International Airport,KSAN,32.7336006165,-117.190002441,17,-117.183,32.734,America/Los_Angeles,large_airport,16.0,1655,2015-01-29T16:55:00.000+0000,2015-01-30T00:55:00.000+0000,0,2015,1,30,0,72290023188,"SAN DIEGO INTERNATIONAL AIRPORT, CA US",2015-01-29T21:00:00.000+0000,54.0,63.0,73.0,29.989999771118164,10.0,160.0,3.0,0.0,5.0,54.0,64.0,70.0,30.0,10.0,0.0,0.0,0.0,49.0,68.0,51.0,29.989999771118164,10.0,200.0,6.0,0.0,0.0,SMF,2,446585,1.0,"Map(vectorType -> sparse, length -> 3, indices -> List(1), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 6, indices -> List(1), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 19, indices -> List(0), values -> List(1.0))",161.0,"Map(vectorType -> sparse, length -> 8399, indices -> List(161), values -> List(1.0))",24.0,"Map(vectorType -> sparse, length -> 375, indices -> List(24), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 50, indices -> List(0), values -> List(1.0))",38.0,"Map(vectorType -> sparse, length -> 383, indices -> List(38), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 52, indices -> List(0), values -> List(1.0))",11.0,"Map(vectorType -> sparse, length -> 23, indices -> List(11), values -> List(1.0))",12.0,"Map(vectorType -> sparse, length -> 23, indices -> List(12), values -> List(1.0))",3.0,"Map(vectorType -> sparse, length -> 7, indices -> List(3), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 11, indices -> List(2), values -> List(1.0))",5.0,"Map(vectorType -> sparse, length -> 1231, indices -> List(5), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1287, indices -> List(0), values -> 
List(1.0))",4.0,"Map(vectorType -> sparse, length -> 1260, indices -> List(4), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",39.0,"Map(vectorType -> sparse, length -> 375, indices -> List(39), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 13532, indices -> List(1, 4, 9, 189, 8451, 8802, 8890, 9235, 9298, 9322, 9336, 9342, 9356, 10582, 11873, 13129, 13169, 13505, 13508, 13509, 13510, 13511, 13512, 13513, 13514, 13515, 13516, 13518, 13519, 13520, 13521, 13522, 13525, 13526, 13527, 13528, 13529, 13530), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 90.0, 5.0, 446585.0, 54.0, 63.0, 73.0, 29.989999771118164, 10.0, 3.0, 54.0, 64.0, 70.0, 30.0, 10.0, 49.0, 68.0, 51.0, 29.989999771118164, 10.0, 6.0))"


In [0]:
# Narrow the frame to what modelling needs: the assembled feature vector, the
# departure-delay flag (exposed as "label"), and the UTC flight year plus quarter
# ("year" / "quarter"), which the cross-validation folds are built from.
df_model = (
    df_final
    .select('features', 'dep_del15', 'flight_time_utc_year', 'quarter')
    .withColumnRenamed("dep_del15", "label")
    .withColumnRenamed("flight_time_utc_year", "year")
)

# Time-based split: every year before 2021 trains, 2021 is the hold-out test set.
df_train = df_model.where(df_model["year"] < 2021)
df_test = df_model.where(df_model["year"] == 2021)

In [0]:
# Preview the training split (columns: features, label, year, quarter).
df_train.display()

features,label,year,quarter
"Map(vectorType -> sparse, length -> 13532, indices -> List(1, 3, 9, 148, 8459, 8803, 8879, 9236, 9302, 9327, 9336, 9342, 9387, 10615, 11902, 13129, 13130, 13505, 13508, 13509, 13510, 13511, 13512, 13513, 13514, 13515, 13516, 13518, 13519, 13520, 13521, 13522, 13523, 13525, 13526, 13527, 13528, 13529, 13530), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 55.0, 5.0, 446585.0, 40.0, 50.0, 68.0, 29.709999084472656, 10.0, 5.0, 39.0, 51.0, 64.0, 29.75, 10.0, 6.0, 40.0, 57.0, 53.0, 29.760000228881836, 10.0, 9.0))",0.0,2015,1
"Map(vectorType -> sparse, length -> 13532, indices -> List(1, 8, 12, 865, 8430, 8808, 8976, 9258, 9296, 9324, 9336, 9342, 9383, 10609, 11869, 13129, 13208, 13508, 13509, 13510, 13511, 13512, 13513, 13514, 13515, 13516, 13518, 13519, 13520, 13521, 13522, 13523, 13525, 13526, 13527, 13528, 13529), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 173.0, 5.0, 446585.0, 22.0, 29.0, 75.0, 24.709999084472656, 10.0, 5.0, 23.0, 28.0, 81.0, 24.700000762939453, 10.0, 6.0, 15.0, 42.0, 33.0, 24.649999618530273, 10.0))",0.0,2015,1
"Map(vectorType -> sparse, length -> 13532, indices -> List(1, 9, 202, 8438, 8814, 8927, 9247, 9299, 9317, 9336, 9342, 9358, 10600, 11876, 13225, 13508, 13509, 13510, 13511, 13512, 13513, 13514, 13515, 13516, 13518, 13519, 13520, 13521, 13522, 13523, 13525, 13526, 13527, 13528, 13529, 13530), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 145.0, 5.0, 446585.0, 16.0, 44.0, 32.0, 27.93000030517578, 10.0, 7.0, 15.0, 48.0, 27.0, 27.860000610351562, 10.0, 3.0, 13.0, 47.0, 25.0, 27.8700008392334, 10.0, 3.0))",0.0,2015,1
"Map(vectorType -> sparse, length -> 13532, indices -> List(1, 3, 13, 1692, 8484, 8810, 8853, 9239, 9295, 9315, 9336, 9342, 9363, 10604, 11875, 13129, 13131, 13505, 13508, 13509, 13510, 13511, 13512, 13513, 13514, 13515, 13516, 13518, 13519, 13520, 13521, 13522, 13523, 13525, 13526, 13527, 13528, 13529, 13530), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 134.0, 5.0, 446585.0, 20.0, 44.0, 38.0, 29.850000381469727, 10.0, 15.0, 27.0, 43.0, 53.0, 29.829999923706055, 10.0, 14.0, 25.0, 41.0, 53.0, 29.739999771118164, 10.0, 5.0))",0.0,2015,1
"Map(vectorType -> sparse, length -> 13532, indices -> List(1, 8, 9, 1377, 8496, 8814, 8859, 9246, 9288, 9313, 9336, 9342, 9351, 10582, 11869, 13129, 13155, 13508, 13509, 13510, 13511, 13512, 13513, 13514, 13515, 13518, 13519, 13520, 13521, 13522, 13525, 13526, 13527, 13528, 13529), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 85.0, 5.0, 446585.0, 25.0, 49.0, 39.0, 25.68000030517578, 10.0, 28.0, 38.0, 68.0, 25.75, 10.0, 26.0, 31.0, 82.0, 25.719999313354492, 10.0))",0.0,2015,1
"Map(vectorType -> sparse, length -> 13532, indices -> List(1, 4, 9, 530, 8496, 8814, 8859, 9246, 9300, 9327, 9336, 9342, 9351, 10610, 11888, 13129, 13137, 13508, 13509, 13510, 13511, 13512, 13513, 13514, 13515, 13518, 13519, 13520, 13521, 13522, 13523, 13525, 13526, 13527, 13528, 13529, 13530), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 70.0, 5.0, 446585.0, 27.0, 37.0, 67.0, 25.719999313354492, 10.0, 27.0, 40.0, 60.0, 25.760000228881836, 10.0, 3.0, 25.0, 48.0, 41.0, 25.770000457763672, 10.0, 3.0))",0.0,2015,1
"Map(vectorType -> sparse, length -> 13532, indices -> List(1, 3, 9, 213, 8451, 8802, 8890, 9235, 9297, 9310, 9336, 9342, 9351, 10589, 11869, 13129, 13176, 13505, 13508, 13509, 13510, 13511, 13512, 13513, 13514, 13515, 13518, 13519, 13520, 13521, 13522, 13523, 13525, 13526, 13527, 13528, 13529), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 90.0, 5.0, 446585.0, 52.0, 62.0, 70.0, 29.8700008392334, 10.0, 49.0, 63.0, 60.0, 29.850000381469727, 10.0, 5.0, 55.0, 62.0, 78.0, 29.93000030517578, 10.0))",0.0,2015,1
"Map(vectorType -> sparse, length -> 13532, indices -> List(1, 7, 9, 51, 8451, 8802, 8890, 9235, 9302, 9329, 9336, 9342, 9351, 10582, 11996, 13137, 13505, 13508, 13509, 13510, 13511, 13512, 13513, 13514, 13515, 13517, 13518, 13519, 13520, 13521, 13522, 13525, 13526, 13527, 13528, 13529, 13530), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 90.0, 5.0, 446585.0, 57.0, 60.333333333333336, 89.66666666666667, 30.110000610351562, 3.1666666666666665, 1.0, 55.25, 61.0, 83.25, 30.09000015258789, 10.0, 56.0, 61.0, 85.0, 30.020000457763672, 10.0, 4.333333333333333))",1.0,2015,1
"Map(vectorType -> sparse, length -> 13532, indices -> List(1, 5, 9, 5013, 8451, 8802, 8890, 9235, 9299, 9317, 9336, 9342, 9370, 10594, 11869, 13129, 13137, 13505, 13508, 13509, 13510, 13511, 13512, 13513, 13514, 13515, 13516, 13518, 13519, 13520, 13521, 13522, 13523, 13525, 13526, 13527, 13528, 13529), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 85.0, 5.0, 446585.0, 54.0, 61.0, 78.0, 30.100000381469727, 10.0, 3.0, 55.0, 62.0, 78.0, 30.09000015258789, 10.0, 8.0, 55.5, 64.0, 75.0, 30.15999984741211, 10.0))",0.0,2015,1
"Map(vectorType -> sparse, length -> 13532, indices -> List(1, 4, 9, 189, 8451, 8802, 8890, 9235, 9298, 9322, 9336, 9342, 9356, 10582, 11873, 13129, 13169, 13505, 13508, 13509, 13510, 13511, 13512, 13513, 13514, 13515, 13516, 13518, 13519, 13520, 13521, 13522, 13525, 13526, 13527, 13528, 13529, 13530), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 90.0, 5.0, 446585.0, 54.0, 63.0, 73.0, 29.989999771118164, 10.0, 3.0, 54.0, 64.0, 70.0, 30.0, 10.0, 49.0, 68.0, 51.0, 29.989999771118164, 10.0, 6.0))",0.0,2015,1


### Undersampling

In [0]:
# Undersample the majority class (label 0) so that it roughly matches the number
# of delayed flights (label 1) in the training set.
# A single aggregation yields both class sizes instead of two full count() scans.
label_counts = {row['label']: row['count'] for row in df_train.groupBy('label').count().collect()}
n_delayed = label_counts.get(1, 0)
n_on_time = label_counts.get(0, 0)
if n_on_time == 0:
  raise ValueError("df_train has no rows with label == 0 to undersample")

# sample() without replacement only accepts fractions in [0, 1]; cap the ratio in
# case label 1 turns out not to be the smaller class.
sample_ratio = min(1.0, n_delayed / n_on_time)

# Keep every delayed flight and add a seeded sample of on-time flights, year by year.
sample_df_train = df_train.filter(df_train.label == 1)
for train_year in [2015, 2016, 2017, 2018, 2019, 2020]:
  sample_0_df = df_train.filter((df_train.label == 0) & (df_train.year == train_year)).sample(False, sample_ratio, seed=2022)
  sample_df_train = sample_df_train.unionAll(sample_0_df)

In [0]:
# Persist the undersampled training set to blob storage, replacing any earlier copy.
sample_df_train.write.mode('overwrite').parquet(f"{blob_url}/final_train_dataset")

In [0]:
# Persist the 2021 hold-out set to blob storage, replacing any earlier copy.
df_test.write.mode('overwrite').parquet(f"{blob_url}/final_test_2021_dataset")

In [0]:
# Reload the persisted undersampled training set (rebinds sample_df_train) and preview it.
sample_df_train = spark.read.parquet(f"{blob_url}/final_train_dataset")
sample_df_train.display()

In [0]:
# Reload the persisted 2021 hold-out set (rebinds df_test) and preview it.
df_test = spark.read.parquet(f"{blob_url}/final_test_2021_dataset")
df_test.display()

### Fit in Custom CV

In [0]:
# Register the project's cross-validation helper with the Spark context, then import it.
spark.sparkContext.addPyFile("dbfs:/custom_cv.py")
from custom_cv import CustomCrossValidator

# Candidate estimators; each reads the assembled `features` vector and the `label` column.
lr = LogisticRegression(featuresCol='features', labelCol='label', maxIter=10)
rf = RandomForestClassifier(labelCol="label", featuresCol="features")
xgb = XgboostClassifier(
    num_workers=3,
    featuresCol="features",
    labelCol="label",
    missing=0.0,
    max_depth=2,
    n_estimators=100,
    numEarlyStoppingRounds=10,
)

# Hyperparameter grid for the random forest.
grid_rf = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [5])
    .addGrid(rf.numTrees, [20])
    .build()
)

# Hyperparameter grid for XGBoost.
grid_xgb = (
    ParamGridBuilder()
    .addGrid(xgb.max_depth, [5])
    .addGrid(xgb.n_estimators, [100])
    .build()
)

# Hyperparameter grid for logistic regression: with and without feature standardization.
grid_lr = (
    ParamGridBuilder()
    .addGrid(lr.standardization, [True, False])
    .build()
)

# Shared evaluator for every grid search; left on its default settings.
evaluator = BinaryClassificationEvaluator()

In [0]:
def cal_metrics(df):
  """Print F0.5, precision, recall and accuracy for a frame of binary predictions.

  Args:
    df: DataFrame with a `label` and a `prediction` column, both encoded as 0/1.

  Returns:
    None; the four scores are written to stdout.

  A score whose denominator is zero (for example precision when no row was
  predicted positive) is reported as 0.0 instead of raising ZeroDivisionError.
  """
  def _ratio(numerator, denominator):
    # Safe division: an empty denominator means the score is undefined; report 0.0.
    return numerator / denominator if denominator else 0.0

  # Confusion-matrix cells.
  tp = df[(df.label == 1) & (df.prediction == 1)].count()
  tn = df[(df.label == 0) & (df.prediction == 0)].count()
  fp = df[(df.label == 0) & (df.prediction == 1)].count()
  fn = df[(df.label == 1) & (df.prediction == 0)].count()

  precision = _ratio(tp, tp + fp)
  recall = _ratio(tp, tp + fn)

  accuracy = _ratio(tp + tn, tp + tn + fp + fn)

  # F-beta with beta = 0.5 (beta^2 = 0.25), which weighs precision more than recall.
  F_beta = _ratio(1.25 * precision * recall, 0.25 * precision + recall)
  print("Test F0.5 Score: ", F_beta)
  print("Test Precision Score: ", precision)
  print("Test Recall Score: ", recall)
  print("Test Accuracy Score: ", accuracy)

In [0]:
# Build five rolling folds keyed 'df1'..'df5'. Fold n trains on one calendar year
# (2015..2019) and tests on the first quarter of the following year; the `cv`
# column tags each row as 'train' or 'test'.
d = {}

for fold_no, train_year in enumerate(range(2015, 2020), start=1):
  in_train_year = sample_df_train.year == train_year
  in_next_q1 = (sample_df_train.year == train_year + 1) & (sample_df_train.quarter == 1)
  d[f'df{fold_no}'] = (
      sample_df_train
      .filter(in_train_year | in_next_q1)
      .withColumn('cv', F.when(in_train_year, 'train').otherwise('test'))
  )

In [0]:
# Add a boolean twin of the `cv` column to every fold: False for 'train' rows,
# True for 'test' rows (null for any other value).
for fold_name, fold_df in list(d.items()):
  is_test = F.when(fold_df.cv == "train", False).when(fold_df.cv == "test", True)
  d[fold_name] = fold_df.withColumn("cv_flag", is_test)

In [0]:
# Check balance: row counts per (year, cv) for each of the five folds.
for fold_name in ('df1', 'df2', 'df3', 'df4', 'df5'):
  d[fold_name].groupby('year', 'cv').count().orderBy('year').show()


#### Logistic Regression and Decision Tree

https://adb-731998097721284.4.azuredatabricks.net/?o=731998097721284#notebook/849916349911411/command/849916349917740

#### Random Forest

In [0]:
# run cross validation
cv_rf = CustomCrossValidator(estimator=rf, evaluator=evaluator, estimatorParamMaps=grid_rf, splitWord = ('train', 'test'), cvCol = 'cv', parallelism=10)

cvModel_rf = cv_rf.fit(d)

In [0]:
fold 1 start...
fold 1 end
fold 2 start...
fold 2 end
fold 3 start...
fold 3 end
fold 4 start...
fold 4 end
fold 5 start...
fold 5 end
Best Model:  {Param(parent='RandomForestClassifier_0f3d09ee9160', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 10, Param(parent='RandomForestClassifier_0f3d09ee9160', name='numTrees', doc='Number of trees to train (>= 1).'): 15} Detailed Score [0.6546401388505421, 0.7393021933110175, 0.6811848878077641, 0.7594602698603996, 0.7572991909482532] Avg Score 0.7183773361555954

In [0]:
# Refit a random forest on the whole undersampled training set with fixed
# hyperparameters (maxDepth=10, numTrees=15), then score the 2021 hold-out set.
rf3 = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    maxDepth=10,
    numTrees=15,
)
cvModel_rf3 = rf3.fit(sample_df_train)
prediction3 = cvModel_rf3.transform(df_test)
cal_metrics(prediction3.select('label', 'prediction'))

#### XGBoost

https://adb-731998097721284.4.azuredatabricks.net/?o=731998097721284#notebook/849916349916003/command/849916349916004

### SMOTE

In [0]:
import random
import numpy as np
from pyspark.sql import Row
from sklearn import neighbors
from pyspark.ml.feature import VectorAssembler


def SmoteSampling(vectorized, k = 5, minorityClass = 1, majorityClass = 0, percentageOver = 200, percentageUnder = 100, seed = None):
    """Rebalance a binary-labelled frame with SMOTE over-sampling plus majority under-sampling.

    For every minority row, ``percentageOver // 100`` synthetic rows are created by
    interpolating between that row and one of its ``k`` nearest minority
    neighbours. The majority class is down-sampled to ``percentageUnder`` percent.

    Args:
        vectorized: Spark DataFrame with a ``features`` vector column and a ``label``
            column. Any other columns are dropped from the result.
        k: number of nearest minority neighbours a row may be interpolated towards.
        minorityClass: label value of the class to over-sample.
        majorityClass: label value of the class to under-sample.
        percentageOver: over-sampling amount in percent (200 -> 2 synthetic rows
            per minority row).
        percentageUnder: percentage of majority rows to keep (at most 100).
        seed: optional seed for neighbour choice, interpolation and majority sampling.

    Returns:
        DataFrame with columns ``features`` and ``label`` holding the sampled
        majority rows, the original minority rows and the synthetic minority rows.

    Raises:
        ValueError: if ``k`` is smaller than 1.

    NOTE: every minority feature vector is collected to the driver and densified,
    so this only suits inputs whose minority class fits in driver memory.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # Work on the two columns the result carries, so the synthetic rows can be
    # unioned with the original ones even when the input has extra columns.
    labelled = vectorized.select('features', 'label')
    dataInput_min = labelled[labelled['label'] == minorityClass]
    dataInput_maj = labelled[labelled['label'] == majorityClass]
    new_data_major = dataInput_maj.sample(False, float(percentageUnder) / 100.0, seed=seed)

    # range() needs an int; true division would hand it a float in Python 3.
    nexs = int(percentageOver) // 100

    # Densify each vector before collecting: sparse vectors do not support the
    # element-wise arithmetic used for interpolation below.
    feature = np.asarray(
        dataInput_min.select('features').rdd.map(lambda row: row[0].toArray()).collect(),
        dtype=float,
    )
    nt = feature.shape[0]

    # Ask for one extra neighbour: when the fitted points are queried against
    # themselves, each point normally comes back as its own nearest neighbour.
    n_neighbors = min(k + 1, nt)
    if nexs < 1 or n_neighbors < 2:
        # Nothing to synthesise: no over-sampling requested, or fewer than two minority rows.
        return new_data_major.unionAll(dataInput_min)

    nbrs = neighbors.NearestNeighbors(n_neighbors=n_neighbors, algorithm='auto').fit(feature)
    neighbour_idx = nbrs.kneighbors(feature, return_distance=False)

    rng = random.Random(seed)
    newRows = []
    for i in range(nt):
        for _ in range(nexs):
            # Skip column 0 (the point itself) and pick one of row i's own neighbours.
            neigh = neighbour_idx[i][rng.randint(1, n_neighbors - 1)]
            difs = feature[neigh] - feature[i]
            newRec = feature[i] + rng.random() * difs
            newRows.append((Vectors.dense(newRec), minorityClass))

    # Give the synthetic labels the same type as the real ones so the union lines up.
    label_type = dataInput_min.schema['label'].dataType
    new_data = spark.createDataFrame(newRows, ['features', 'label'])
    new_data = new_data.select('features', F.col('label').cast(label_type).alias('label'))

    new_data_minor = dataInput_min.unionAll(new_data)
    return new_data_major.unionAll(new_data_minor)

# Build a SMOTE-balanced training set from df_train using the function's default settings.
# NOTE(review): df_train looks like the full pre-2021 split rather than the undersampled one — confirm this is intended.
balanced_train_df = SmoteSampling(df_train)

In [0]:
# Persist the SMOTE-balanced training set. Overwrite mode keeps a notebook re-run
# from failing on the already existing path and matches the other dataset writes.
balanced_train_df.write.mode('overwrite').parquet(f"{blob_url}/balanced_train_df")