#  **TABLE OF CONTENTS**

---
## **1. Import librabries and data preprocessing**
## **2. Apply model with pipelines**
> ### 2.1 Hold out
> ### 2.2 Naive Bayes Pipeline
> ### 2.3 Ensemble Algorithms Pipeline
> ### 2.4 Support Vector Machines & K-Nearest Neighbors Pipeline





---


## **1. Import librabries and data preprocessing**

> Connect Google Drive to Google Colab and import necessary librabries

> Load the data in file csv from Google Drive

In [5]:
 !pip install pipelinehelper



In [35]:
import pandas as pd # Librabry for table data
import numpy as np # Librabry for algebra
# Library for visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Librabry sklearn model selection 
from sklearn.pipeline import Pipeline
from pipelinehelper import PipelineHelper
from sklearn.preprocessing import StandardScaler,MaxAbsScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
# Librabry for feature selection
from sklearn.decomposition import PCA
# Librabry for algorithm
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
%matplotlib inline

In [36]:
df = pd.read_csv('D:/Download/new_2007.csv') # Load data from Google Drive
df.head() # See some information at the top of dataset

Unnamed: 0.1,Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,FlightNum,TailNum,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Diverted,ArrDelay_categorical
0,0,1,1,1,1232.0,WN,2891,N351,1.0,7.0,SMF,ONT,389,4,11,0,0
1,1,1,1,1,1918.0,WN,462,N370,8.0,13.0,SMF,PDX,479,5,6,0,0
2,3,1,1,1,1230.0,WN,1355,N364,26.0,30.0,SMF,PDX,479,3,8,0,0
3,4,1,1,1,831.0,WN,2278,N480,-3.0,1.0,SMF,PDX,479,3,9,0,0
4,5,1,1,1,1430.0,WN,2386,N611SW,3.0,10.0,SMF,PDX,479,2,7,0,0


In [37]:
df = df.drop(columns=['TailNum', 'Origin', 'Dest','Unnamed: 0','ArrDelay','DepDelay']) # Drop categorical features which are analyzed at Analysis section 

In [38]:
X = df.drop(['ArrDelay_categorical', 'Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier'], axis=1) # Drop available features to convert to categorical feature
y = df["ArrDelay_categorical"] # Target column

In [39]:
Categorical_features = df[['Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier']]
one_hot_encoding = pd.get_dummies(data= Categorical_features, columns=['Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier'])
new_X = pd.concat([X, one_hot_encoding], axis=1, sort=False)


---
## **2. Apply model with pipelines**

> ### 2.1 Hold out

In [40]:
X_train, X_val, y_train, y_val = train_test_split(new_X, y, test_size = 0.2, random_state = 0) # Divide into 2 train test and test set

> ### 2.2 Naive Bayes Pipeline

In [51]:
# Define pipeline 1
pipe_1 = Pipeline([
    ('preprocessor', PipelineHelper([
        ('max', MinMaxScaler())
    ])),
    ('classifier', PipelineHelper([
            ('mnb', MultinomialNB()),  
            ('gnb', GaussianNB()),
    ])),
])

# Test configuration
parameters_1 = {
    'preprocessor__selected_model': pipe_1.named_steps['preprocessor'].generate({
        'max__copy': [True, False],
    }),
    'classifier__selected_model': pipe_1.named_steps['classifier'].generate({
        'mnb__fit_prior': [True, False],
        'mnb__alpha': [0.1, 0.2],
        'gnb__var_smoothing': [1e-9, 1e-5],
    })
}
# Select configuration
grid = GridSearchCV(pipe_1, parameters_1, cv=5, scoring='accuracy',verbose=1)
grid.fit(X_train, y_train)
grid_best = grid.best_estimator_ # Best estimator
print("="*100)
print("Best configuration: ", grid.best_params_)
print("="*100)
print("CV Results", grid.cv_results_)
print("="*100)
print("Accuracy: {:6.2f}".format(grid.score(X_val, y_val)))
print("="*100)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best configuration:  {'classifier__selected_model': ('mnb', {'alpha': 0.1, 'fit_prior': True}), 'preprocessor__selected_model': ('max', {'copy': True})}
CV Results {'mean_fit_time': array([5.21579504, 4.42087374, 5.08048782, 4.2728591 , 5.2715414 ,
       4.54142742, 4.90548787, 4.33260221, 8.07232203, 7.41083627,
       8.03066974, 7.65619335]), 'std_fit_time': array([0.25627491, 0.14506177, 0.27858311, 0.09129725, 0.25848562,
       0.25084361, 0.02654034, 0.09368405, 0.04266048, 0.01803334,
       0.02572283, 0.22190882]), 'mean_score_time': array([0.67568645, 0.48473558, 0.65259876, 0.48468151, 0.70237212,
       0.52169337, 0.64829984, 0.53249564, 1.71808677, 1.61334147,
       1.73997817, 1.51205454]), 'std_score_time': array([0.08217579, 0.02996149, 0.0330624 , 0.01404981, 0.06094017,
       0.05514283, 0.01171501, 0.01963766, 0.07830064, 0.02422142,
       0.04528042, 0.04574586]), 'param_classifier__selected_model': 

In [52]:
grid_best.fit(X_train, y_train) # Retrain
y_pred = grid_best.predict(X_val) # Predict with test set
print("="*100)
print("Classification Report")
print(classification_report(y_val, y_pred)) # Evaluate the model
print("="*100)

Classification Report
              precision    recall  f1-score   support

           0       0.98      1.00      0.99   1040663
           1       1.00      0.10      0.19     20650

    accuracy                           0.98   1061313
   macro avg       0.99      0.55      0.59   1061313
weighted avg       0.98      0.98      0.98   1061313



> ### 2.3 Ensemble Algorithms Pipeline

In [53]:
# Define pipeline 2
pipe_2 = Pipeline([
    ('preprocessor', PipelineHelper([
        ('std', StandardScaler()),
        ('max', MaxAbsScaler())
    ], optional=True)),
    ('classifier', PipelineHelper([
        ('dc', DecisionTreeClassifier()),
        ('rf', RandomForestClassifier()),
        ('ada', AdaBoostClassifier()),
        ('gb', GradientBoostingClassifier()),
    ])),
])

# Test configuration
parameters_2 = {
    'preprocessor__selected_model': pipe_2.named_steps['preprocessor'].generate({
        'std__with_mean': [True, False],
        'max__copy': [True, False],
    }),
    'classifier__selected_model': pipe_2.named_steps['classifier'].generate({
        'dc__min_samples_leaf': [5, 10],
        'dc__max_depth': [5, 10],
        'rf__n_estimators': [10, 20, 50],
        'rf__min_samples_split': [2, 5],
        'ada__n_estimators': [10, 20],
        'ada__algorithm': ['SAMME', 'SAMME.R'],
        'gb__n_estimators': [10, 50],
    })
}
# Select configuration
grid = GridSearchCV(pipe_2, parameters_2, cv=5, scoring='accuracy',verbose=1)
grid.fit(X_train, y_train)
print("-"*80)
print("Best configuration: ", grid.best_params_)
print("Accuracy: {:6.2f}".format(grid.score(X_val, y_val)))

Fitting 5 folds for each of 80 candidates, totalling 400 fits
--------------------------------------------------------------------------------
Best configuration:  {'classifier__selected_model': ('rf', {'min_samples_split': 5, 'n_estimators': 50}), 'preprocessor__selected_model': ('std', {'with_mean': False})}
Accuracy:   0.98


In [58]:
grid_best.fit(X_train, y_train) # Retrain
y_pred = grid_best.predict(X_val) # Predict with test set
print("="*100)
print("Classification Report")
print(classification_report(y_val, y_pred)) # Evaluate the model
print("="*100)

Classification Report
              precision    recall  f1-score   support

           0       0.98      1.00      0.99   1040663
           1       1.00      0.10      0.19     20650

    accuracy                           0.98   1061313
   macro avg       0.99      0.55      0.59   1061313
weighted avg       0.98      0.98      0.98   1061313



> ### 2.4 Support Vector Machines & K-Nearest Neighbors Pipeline

In [59]:
# Define pipeline 2
pipe_3 = Pipeline([
    ('preprocessor', PipelineHelper([
        ('std', StandardScaler()),
    ])),
    ('classifier', PipelineHelper([
        ('svm', SVC()),
        ('knn', KNeighborsClassifier()),
    ])),
])

# Test configuration
parameters_3 = {
    'preprocessor__selected_model': pipe_3.named_steps['preprocessor'].generate({
        'std__with_mean': [True, False],
    }),
    'classifier__selected_model': pipe_3.named_steps['classifier'].generate({
        'svm__C': [0.5, 1.0],
        'svm__kernel': ['linear', 'rbf'],
        'knn__n_neighbors': [2, 5],
    })
}
# Select configuration
grid = GridSearchCV(pipe_3, parameters_3, cv=5, scoring='accuracy',verbose=1)
grid.fit(X_train, y_train)
grid_best = grid.best_estimator_ # Best estimator
print("="*100)
print("Best configuration: ", grid.best_params_)
print("="*100)
print("CV Results", grid.cv_results_)
print("="*100)
print("Accuracy: {:6.2f}".format(grid.score(X_val, y_val)))
print("="*100)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [None]:
grid_best.fit(X_train, y_train) # Retrain
y_pred = grid_best.predict(X_val) # Predict with test set
print("="*100)
print("Classification Report")
print(classification_report(y_val, y_pred)) # Evaluate the model
print("="*100)