In [2]:
import pandas as pd
from pandas_profiling import ProfileReport as pr
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [3]:
df_19 = pd.read_csv('Jan_2019_ontime.csv')
df_20 = pd.read_csv('Jan_2020_ontime.csv')

In [4]:
df_19.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN,...,DEST,DEP_TIME,DEP_DEL15,DEP_TIME_BLK,ARR_TIME,ARR_DEL15,CANCELLED,DIVERTED,DISTANCE,Unnamed: 21
0,1,2,9E,20363,9E,N8688C,3280,11953,1195302,GNV,...,ATL,601.0,0.0,0600-0659,722.0,0.0,0.0,0.0,300.0,
1,1,2,9E,20363,9E,N348PQ,3281,13487,1348702,MSP,...,CVG,1359.0,0.0,1400-1459,1633.0,0.0,0.0,0.0,596.0,
2,1,2,9E,20363,9E,N8896A,3282,11433,1143302,DTW,...,CVG,1215.0,0.0,1200-1259,1329.0,0.0,0.0,0.0,229.0,
3,1,2,9E,20363,9E,N8886A,3283,15249,1524906,TLH,...,ATL,1521.0,0.0,1500-1559,1625.0,0.0,0.0,0.0,223.0,
4,1,2,9E,20363,9E,N8974C,3284,10397,1039707,ATL,...,FSM,1847.0,0.0,1900-1959,1940.0,0.0,0.0,0.0,579.0,


In [5]:
df_20.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN,...,DEST,DEP_TIME,DEP_DEL15,DEP_TIME_BLK,ARR_TIME,ARR_DEL15,CANCELLED,DIVERTED,DISTANCE,Unnamed: 21
0,1,3,EV,20366,EV,N48901,4397,13930,1393007,ORD,...,GRB,1003.0,0.0,1000-1059,1117.0,0.0,0.0,0.0,174.0,
1,1,3,EV,20366,EV,N16976,4401,15370,1537002,TUL,...,ORD,1027.0,0.0,1000-1059,1216.0,0.0,0.0,0.0,585.0,
2,1,3,EV,20366,EV,N12167,4404,11618,1161802,EWR,...,TYS,1848.0,0.0,1800-1859,2120.0,0.0,0.0,0.0,631.0,
3,1,3,EV,20366,EV,N14902,4405,10781,1078105,BTR,...,IAH,1846.0,0.0,1800-1859,2004.0,0.0,0.0,0.0,253.0,
4,1,3,EV,20366,EV,N606UX,4407,14524,1452401,RIC,...,IAH,1038.0,0.0,1000-1059,1330.0,0.0,0.0,0.0,1157.0,


In [6]:
# make one shared dataset
df = pd.concat([df_19,df_20])

# make some improvments of the data
df.drop(['Unnamed: 21'], inplace=True, axis=1)
df = df[ (df.ARR_DEL15.isna() == False)]

In [7]:
df.info() # initial info for understanding dataset

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1165231 entries, 0 to 607345
Data columns (total 21 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   DAY_OF_MONTH           1165231 non-null  int64  
 1   DAY_OF_WEEK            1165231 non-null  int64  
 2   OP_UNIQUE_CARRIER      1165231 non-null  object 
 3   OP_CARRIER_AIRLINE_ID  1165231 non-null  int64  
 4   OP_CARRIER             1165231 non-null  object 
 5   TAIL_NUM               1165231 non-null  object 
 6   OP_CARRIER_FL_NUM      1165231 non-null  int64  
 7   ORIGIN_AIRPORT_ID      1165231 non-null  int64  
 8   ORIGIN_AIRPORT_SEQ_ID  1165231 non-null  int64  
 9   ORIGIN                 1165231 non-null  object 
 10  DEST_AIRPORT_ID        1165231 non-null  int64  
 11  DEST_AIRPORT_SEQ_ID    1165231 non-null  int64  
 12  DEST                   1165231 non-null  object 
 13  DEP_TIME               1165231 non-null  float64
 14  DEP_DEL15          

In [8]:
#report = pr(df) # more automatic visualisations
#report
#report.to_file('pandas_profiling.html')


In [9]:
plot1 = df_19.groupby('DAY_OF_MONTH')['CANCELLED'].count()
fig = go.Figure()

fig.add_trace(go.Bar(x=plot1.index, y=plot1.values, name='Cancellations',opacity=0.9,marker_color='#7759e3'))
fig.add_trace(go.Scatter(x=plot1.index, y=plot1.values, line=dict(color='red'), name='Trend'))

fig.update_layout(
    title="Cancelled flights on every day of January 2019",
    xaxis_title="Day",
    yaxis_title="Cancellation count",
)

fig.show()

In [10]:
plot1 = df_20.groupby('DAY_OF_MONTH')['CANCELLED'].count()
fig = go.Figure()

fig.add_trace(go.Bar(x=plot1.index, y=plot1.values, name='Cancellations',opacity=0.9,marker_color='#7759e3'))
fig.add_trace(go.Scatter(x=plot1.index, y=plot1.values, line=dict(color='red'), name='Trend'))

fig.update_layout(
    title="Cancelled flights on every day of January 2020",
    xaxis_title="Day",
    yaxis_title="Cancellation count",
)

fig.show()

In [11]:
plot3 = df.groupby('OP_CARRIER')['DEP_DEL15'].sum().sort_values()
fig = px.pie(names=plot3.index, values=list(map(int,plot3.values)),
              color_discrete_sequence =px.colors.qualitative.D3, hole=0.2, title='Airlines with the highest number of delayed flights')
fig.show()

In [12]:
#classify a weekend or a working day
df.DAY_OF_WEEK = df.DAY_OF_WEEK.apply(lambda x: 0 if x <= 5 else 1)

In [13]:
# drop features, that we dont use
features_to_be_dropped = ['DAY_OF_MONTH','ORIGIN_AIRPORT_SEQ_ID','ORIGIN','DEST', 'DEST_AIRPORT_SEQ_ID', 'OP_UNIQUE_CARRIER','OP_CARRIER_AIRLINE_ID', 'OP_CARRIER', 'TAIL_NUM', 'OP_CARRIER_FL_NUM','DEP_DEL15','DEP_TIME','DIVERTED','CANCELLED']
df.drop(features_to_be_dropped, inplace=True, axis=1)

In [14]:
# mistake 0500-0559 fixed
df.loc[df['DEP_TIME_BLK'] == '0001-0559', 'DEP_TIME_BLK'] = '0500-0559'

In [15]:
blocks = []
for hour in range(0,24):
    hour_part = ('%02d' %(hour))
    blocks.append(hour_part + '00-' + hour_part + '59')
    
# label encoding
le = LabelEncoder()
le.fit(blocks)
le.classes_
df['DEP_TIME_BLK'] = le.transform(df.DEP_TIME_BLK.values)

In [16]:
# divide the data into training and test data
Y = df['ARR_DEL15'].values
X = df.drop(['ARR_DEL15'], axis=1).values

X_train, X_test, Y_train, Y_test =  train_test_split(X,Y, test_size=0.3, random_state=1)

In [17]:
dtc = DecisionTreeClassifier(random_state=0, max_depth=2)
dtc.fit(X_train,Y_train)

Y_train_pred = dtc.predict(X_train)
Y_test_pred = dtc.predict(X_test)

In [18]:
print('DecisionTreeClassifier"s accuracy for train: %.5f and test: %.5f' %(accuracy_score(Y_train,Y_train_pred), accuracy_score(Y_test,Y_test_pred)))

DecisionTreeClassifier"s accuracy for train: 0.84244 and test: 0.84195


In [19]:
rfc = RandomForestClassifier(n_estimators=20)
rfc.fit(X_train,Y_train)

Y_train_pred = rfc.predict(X_train)
Y_test_pred = rfc.predict(X_test)

In [20]:
print('RandomForestClassifier"s accuracy for train: %.5f and test: %.5f' %(accuracy_score(Y_train,Y_train_pred), accuracy_score(Y_test,Y_test_pred)))

RandomForestClassifier"s accuracy for train: 0.99289 and test: 0.93069


In [21]:
sgdc = SGDClassifier(max_iter=2000)
sgdc.fit(X_train,Y_train)

Y_train_pred = sgdc.predict(X_train)
Y_test_pred = sgdc.predict(X_test)

In [22]:
print('SGDC"s accuracy for train: %.5f and test: %.5f' %(accuracy_score(Y_train,Y_train_pred), accuracy_score(Y_test,Y_test_pred)))

SGDC"s accuracy for train: 0.83916 and test: 0.83890


As a result, we can say that we have obtained three prediction models using the DecisionTreeClassifier, the RandomForestClassifier, SDGClassifier.

The RandomForestClassifier shows better accuracy, but the DecisionTreeClassifier is faster.

In general, we can now predict delays in January for next year. We believe that the information can help you better organize your time for flights, since January is the month of vacations.