**Goal:** Predict whether a flight will be delayed for more than 15 minutes.

In [2]:
import pandas as pd

In [3]:
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Practice/master/Data/flight_delays_2015.csv'
train = pd.read_csv(url)
train.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [4]:
train['dep_delayed_15min'] = train.dep_delayed_15min.apply(lambda x: 1 if x == 'Y' else 0)

## Time and Date

In [5]:
train['DayOfWeek'] = train.DayOfWeek.apply(lambda x: int(x.split('-')[1]))
train['Month'] = train.Month.apply(lambda x: int(x.split('-')[1]))
train['DayofMonth'] = train.DayofMonth.apply(lambda x: int(x.split('-')[1]))

In [6]:
train.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,8,21,7,1934,AA,ATL,DFW,732,0
1,4,20,3,1548,US,PIT,MCO,834,0
2,9,2,5,1422,XE,RDU,CLE,416,0
3,11,25,6,1015,OO,DEN,MEM,872,0
4,10,7,6,1828,WN,MDW,OMA,423,1


In [7]:
# hour
def extract_hours(time):
    time = str(time)
    if len(time)==3:
        return int(time[0])
    elif len(time)<=2:
        return 0
    else:
        hour = int(time[0:2])
        if hour==24:
            return 0
        elif hour==25:
            return 1
        else:
            return hour
    

In [8]:
train['DepHour'] = train.DepTime.apply(extract_hours)
train.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min,DepHour
0,8,21,7,1934,AA,ATL,DFW,732,0,19
1,4,20,3,1548,US,PIT,MCO,834,0,15
2,9,2,5,1422,XE,RDU,CLE,416,0,14
3,11,25,6,1015,OO,DEN,MEM,872,0,10
4,10,7,6,1828,WN,MDW,OMA,423,1,18


## Busiest airports 

In [9]:
# busiest airports
top_40_airports = list(train.Origin.value_counts().sort_values(ascending=False).head(40).index)
print(top_40_airports)

['ATL', 'ORD', 'DFW', 'LAX', 'IAH', 'DEN', 'PHX', 'LAS', 'CVG', 'EWR', 'SLC', 'DTW', 'MSP', 'SFO', 'LGA', 'BOS', 'MCO', 'PHL', 'CLT', 'SEA', 'IAD', 'BWI', 'JFK', 'MDW', 'DCA', 'SAN', 'CLE', 'TPA', 'OAK', 'FLL', 'MIA', 'STL', 'BNA', 'RDU', 'SJC', 'PDX', 'HOU', 'MCI', 'HNL', 'SMF']


In [10]:
train.Origin = train.Origin.apply(lambda x : x if x in top_40_airports else 'Other')
train.Dest = train.Dest.apply(lambda x : x if x in top_40_airports else 'Other')

In [11]:
train.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min,DepHour
0,8,21,7,1934,AA,ATL,DFW,732,0,19
1,4,20,3,1548,US,Other,MCO,834,0,15
2,9,2,5,1422,XE,RDU,CLE,416,0,14
3,11,25,6,1015,OO,DEN,Other,872,0,10
4,10,7,6,1828,WN,MDW,Other,423,1,18


## Machine Learning

In [12]:
X = train[['Month','DayofMonth','DayOfWeek','DepHour','UniqueCarrier','Distance','Origin','Dest']]
y = train.dep_delayed_15min

In [13]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y)

In [14]:
X_train.shape

(75000, 8)

In [15]:
X_train.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepHour,UniqueCarrier,Distance,Origin,Dest
1924,12,19,1,7,OH,645,Other,BOS
21527,9,11,1,18,UA,2586,JFK,SFO
91317,2,27,1,15,FL,396,LGA,Other
40325,11,1,2,9,CO,1825,IAH,PDX
76661,4,20,4,13,NW,1040,MSP,Other


## Functions

In [16]:
# Machine Learning models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Evaluation metrics
from sklearn.metrics import accuracy_score, recall_score,precision_score, confusion_matrix, roc_auc_score
# Grid search
from sklearn.model_selection import GridSearchCV

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [16]:
X_train.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepHour,UniqueCarrier,Distance,Origin,Dest
35781,6,18,6,12,WN,405,MDW,MCI
34640,10,2,1,13,AA,929,DFW,TPA
35601,12,2,5,10,NW,453,PHL,DTW
1360,5,13,5,6,UA,2704,BOS,SFO
72019,12,8,4,12,UA,629,LAS,DEN


# Logistic Regression

**pipeline**

In [17]:
# preprocessing transformer

# categorical features: can only take on a limited, and usually fixed, number of possible values
categorical_features = ['Month','DayofMonth','DayOfWeek','UniqueCarrier','DepHour','Origin','Dest']
numerical_features = ['Distance']
preprocessor = ColumnTransformer(transformers=[
    ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('scaler', StandardScaler(), numerical_features)])

In [18]:
# pipeline
pipe = Pipeline(steps=[
    ('transformer', preprocessor),
    ('clf', LogisticRegression(max_iter=1000))
])
pipe

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('ohe',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Month', 'DayofMonth',
                                                   'DayOfWeek', 'UniqueCarrier',
                                                   'DepHour', 'Origin',
                                                   'Dest']),
                                                 ('scaler', StandardScaler(),
                                                  ['Distance'])])),
                ('clf', LogisticRegression(max_iter=1000))])

In [19]:
pipe.fit(X_train,y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('ohe',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Month', 'DayofMonth',
                                                   'DayOfWeek', 'UniqueCarrier',
                                                   'DepHour', 'Origin',
                                                   'Dest']),
                                                 ('scaler', StandardScaler(),
                                                  ['Distance'])])),
                ('clf', LogisticRegression(max_iter=1000))])

In [20]:
y_test_pred = pipe.predict(X_test)

In [21]:
confusion_matrix(y_test, y_test_pred)

array([[19961,   255],
       [ 4459,   325]], dtype=int64)

In [22]:
(y_test==1).sum()

4784

**grid search**

In [51]:
# grid search
params = {'clf__C' : [0.01,0.1,0,1,10,100],
          'clf__class_weight' : [None,'balanced']} #  'balanced' places more emphasis on the minority class
grid = GridSearchCV(pipe, params, cv=5, scoring='balanced_accuracy', verbose=True, n_jobs=-1)

scoring functions:

- accuracy: percentage of correct classifications
- recall: When the actual value is 1 (delayed flight), how often is the prediction correct?
- precision: When a 1 (delayed flight) is predicted, how often is the prediction correct?

In [52]:
grid.fit(X_train,y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   24.3s finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('transformer',
                                        ColumnTransformer(transformers=[('ohe',
                                                                         OneHotEncoder(handle_unknown='ignore'),
                                                                         ['Month',
                                                                          'DayofMonth',
                                                                          'DayOfWeek',
                                                                          'UniqueCarrier',
                                                                          'DepHour',
                                                                          'Origin',
                                                                          'Dest']),
                                                                        ('scaler',
                                                

In [53]:
grid.best_score_

0.6613184238709031

In [54]:
grid.best_params_

{'clf__C': 1, 'clf__class_weight': 'balanced'}

In [55]:
best_pipe = grid.best_estimator_

**Model evaluation**

In [56]:
y_test_pred = best_pipe.predict(X_test)

In [57]:
confusion_matrix(y_test, y_test_pred)

array([[12462,  7754],
       [ 1478,  3306]], dtype=int64)

In [58]:
accuracy_score(y_test,y_test_pred)

0.63072

In [59]:
recall_score(y_test,y_test_pred)

0.6910535117056856

In [60]:
precision_score(y_test,y_test_pred)

0.2989150090415913

## Random Forest

In [None]:
X_train.head()

In [18]:
# preprocessing transformer: OneHotEncoder
categorical_features = ['Month','DayofMonth','DayOfWeek', 'DepHour','UniqueCarrier','Origin','Dest']
#numerical_features = ['DepTime','Distance']
preprocessor = ColumnTransformer(transformers=[
    ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('scaler', StandardScaler(), ['Distance'])])

In [62]:
# pipeline
pipe = Pipeline(steps=[
    ('transformer', preprocessor),
    ('clf', RandomForestClassifier(n_jobs=-1))
])
pipe

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('ohe',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Month', 'DayofMonth',
                                                   'DayOfWeek', 'DepHour',
                                                   'UniqueCarrier', 'Origin',
                                                   'Dest']),
                                                 ('scaler', StandardScaler(),
                                                  ['Distance'])])),
                ('clf', RandomForestClassifier(n_jobs=-1))])

In [None]:
pipe.fit(X_train,y_train)
y_test_pred = pipe.predict(X_test)
accuracy_score(y_test, y_test_pred)

In [None]:
confusion_matrix(y_test,y_test_pred)

In [None]:
recall_score(y_test,y_test_pred)

**grid search**

Random Forest/ Decision Trees hyper-parameters:
- max_depth: The maximum depth of the tree.
- min_samples_split: The minimum number of samples required to split an internal node
- min_sample_leaf : The minimum number of samples required to be at a leaf node.)
- max_leaf_nodes : the maximum number of leaf nodes)
- max_features : the maximum number of features that are evaluated for splitting at each node)

In [63]:
params = {"clf__class_weight" : ['balanced'],
          "clf__max_depth" : [10,25,50],
          "clf__max_leaf_nodes" : [20,50,100],
          "clf__min_samples_leaf" : [1,2,4],
         "clf__min_samples_split" : [2,4,8], 
         "clf__n_estimators" : [50,100,200]}
grid = GridSearchCV(pipe, params, cv=4, scoring='balanced_accuracy', verbose=True, n_jobs=-1)
grid.fit(X_train,y_train)

Fitting 4 folds for each of 18 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  2.9min finished


GridSearchCV(cv=4,
             estimator=Pipeline(steps=[('transformer',
                                        ColumnTransformer(transformers=[('ohe',
                                                                         OneHotEncoder(handle_unknown='ignore'),
                                                                         ['Month',
                                                                          'DayofMonth',
                                                                          'DayOfWeek',
                                                                          'DepHour',
                                                                          'UniqueCarrier',
                                                                          'Origin',
                                                                          'Dest']),
                                                                        ('scaler',
                                                

In [64]:
grid.best_score_

0.6586084388844273

In [67]:
grid.best_params_

{'clf__class_weight': 'balanced',
 'clf__max_depth': 50,
 'clf__max_leaf_nodes': 100,
 'clf__min_samples_leaf': 1,
 'clf__min_samples_split': 2,
 'clf__n_estimators': 200}

In [None]:
results = pd.DataFrame(grid.cv_results_)[['mean_test_score', 'params']]
results.plot(marker='o')

In [66]:
best_pipe = grid.best_estimator_
y_test_pred = best_pipe.predict(X_test)
confusion_matrix(y_test, y_test_pred)

array([[12036,  8180],
       [ 1410,  3374]], dtype=int64)