## Airline dataset:
Downloaded from https://www.kaggle.com/giovamata/airlinedelaycauses.
It was used for experiments in the following two articles:

 https://arxiv.org/pdf/1309.6835.pdf
 
 https://proceedings.neurips.cc/paper/2017/file/8208974663db80265e9bfe7b222dcb18-Paper.pdf

In [1]:
import sys 
import numpy as np # linear algebra
from scipy.stats import randint
import matplotlib.pyplot as plt # this is used for the plot the graph 
%matplotlib inline
import re
from tqdm import notebook
import seaborn as sns
import tensorflow as tf
from scipy import stats
import pandas as pd
import json
from pandas.io.json import json_normalize
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

### Read in the first 800,000 observations, as suggested in the paper

In [3]:
# Number of leading rows to load. The section heading calls for the first
# 800,000 observations; set to None to read the whole file.
nRowsRead = 800000
# index_col=0 uses the first CSV column as the row index instead of a data column.
df1 = pd.read_csv('DelayedFlights.csv', delimiter=',', nrows=nRowsRead, index_col=0)
nRow, nCol = df1.shape
print(f'There are {nRow} rows and {nCol} columns')

There are 1936758 rows and 29 columns


In [20]:
# Preview the first rows of the raw table to check that the columns loaded as expected.
df1.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2008,1,3,4,2003.0,1955,2211.0,2225,WN,335,...,4.0,8.0,0,N,0,,,,,
1,2008,1,3,4,754.0,735,1002.0,1000,WN,3231,...,5.0,10.0,0,N,0,,,,,
2,2008,1,3,4,628.0,620,804.0,750,WN,448,...,3.0,17.0,0,N,0,,,,,
4,2008,1,3,4,1829.0,1755,1959.0,1925,WN,3920,...,3.0,10.0,0,N,0,2.0,0.0,0.0,0.0,32.0
5,2008,1,3,4,1940.0,1915,2121.0,2110,WN,378,...,4.0,10.0,0,N,0,,,,,


In [21]:
# Keep only the calendar fields, the arrival delay and the flight
# timing/distance columns; the order here fixes the column order of df2.
selected_columns = [
    "Month",
    "DayofMonth",
    "DayOfWeek",
    "ArrDelay",
    "DepTime",
    "ArrTime",
    "AirTime",
    "Distance",
]
df2 = df1.filter(items=selected_columns)

In [22]:
# Confirm which columns were kept and in what order.
df2.columns

Index(['Month', 'DayofMonth', 'DayOfWeek', 'ArrDelay', 'DepTime', 'ArrTime',
       'AirTime', 'Distance'],
      dtype='object')

In [23]:
# Count missing values per column before deciding how to handle them.
df2.isna().sum()

Month            0
DayofMonth       0
DayOfWeek        0
ArrDelay      2874
DepTime          0
ArrTime       2874
AirTime       2874
Distance         0
dtype: int64

In [24]:
# Discard every row that has at least one missing value in the selected columns.
df2 = df2.dropna()

### One-hot encoding

In [25]:
# One-hot encode the calendar fields; drop_first=True omits the first level of
# each field so the remaining indicators are not redundant.
# dtype is pinned to uint8 so the indicators are numeric 0/1 on every pandas
# version. Newer pandas releases default to bool, which would turn the later
# conversion of the whole frame to a NumPy array into an object-dtype array.
df2 = pd.get_dummies(
    df2,
    columns=["Month", "DayofMonth", "DayOfWeek"],
    drop_first=True,
    dtype=np.uint8,
)

In [26]:
# Preview the encoded table: the continuous columns come first, followed by the indicator columns.
df2.head()

Unnamed: 0,ArrDelay,DepTime,ArrTime,AirTime,Distance,Month_2,Month_3,Month_4,Month_5,DayofMonth_2,...,DayofMonth_28,DayofMonth_29,DayofMonth_30,DayofMonth_31,DayOfWeek_2,DayOfWeek_3,DayOfWeek_4,DayOfWeek_5,DayOfWeek_6,DayOfWeek_7
0,-14.0,2003.0,2211.0,116.0,810,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,2.0,754.0,1002.0,113.0,810,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,14.0,628.0,804.0,76.0,515,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,34.0,1829.0,1959.0,77.0,515,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,11.0,1940.0,2121.0,87.0,688,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


### Standardize the continuous columns

In [27]:
# Convert the frame to a single float64 matrix. Requesting the dtype explicitly
# avoids an object-dtype array when the indicator columns are boolean (the
# default in newer pandas releases); an object array would be pickled by
# np.save and then rejected by a plain np.load.
air = df2.to_numpy(dtype=np.float64)

In [28]:
# The first five columns hold the continuous variables
# (ArrDelay, DepTime, ArrTime, AirTime, Distance). Work on a copy so that
# `air` stays untouched until the scaled values are written back.
continuous_cols = slice(0, 5)
aircontinuous = air[:, continuous_cols].copy()

In [29]:
# Fit the scaler on the continuous columns (fit returns the fitted scaler),
# then replace those columns with their standardized values.
scaler = StandardScaler().fit(aircontinuous)
aircontinuous = scaler.transform(aircontinuous)

In [30]:
# Write the scaled values back into the first five columns of the full array.
air[:,:5]=aircontinuous

### Shuffle the rows (in preparation for splitting into training and testing)

In [31]:
# Fix the seed so the row shuffle is reproducible.
np.random.seed(2020)
# Drawing every row index exactly once, without replacement, gives a random
# ordering of the rows.
n_rows = len(air)
shuffled_index = np.random.choice(n_rows, n_rows, replace=False)
air = air[shuffled_index, :]

In [32]:
# Reverse the column order so that the outcome (ArrDelay, originally the first
# column) ends up as the last column.
air = np.flip(air, axis=1)

### Output data

In [33]:
# Persist the prepared array to disk; the file is read back below as 'airdt.npy'.
np.save("airdt",air)

In [4]:
# Reload the saved file and show its shape as a sanity check.
np.load('airdt.npy').shape

(797126, 45)