In [None]:
import os
from getpass import getpass
from urllib.parse import quote_plus

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import pyplot
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# install PostgreSQL in Colab
# Install the PostgreSQL server packages; apt's output is redirected to the file `log`
# so that it does not flood the cell output.
!apt install postgresql postgresql-contrib &>log
# Start the local PostgreSQL service.
!service postgresql start
# Running as the `postgres` OS user, create a superuser role named `root`.
!sudo -u postgres psql -c "CREATE USER root WITH SUPERUSER"
# NOTE(review): the %sql connection made later in this notebook targets a remote host,
# so this local server may not be needed at all - confirm before removing.

 * Starting PostgreSQL 10 database server
   ...done.
CREATE ROLE


In [None]:
# set connection
%load_ext sql
%config SqlMagic.feedback=False 
%config SqlMagic.autopandas=True
%sql postgresql+psycopg2://postgres:groupcgroupc@flight-delay-project.chgeeix9show.us-east-2.rds.amazonaws.com:5432/flight-delay-tables

  """)


'Connected: postgres@flight-delay-tables'

In [None]:
# Create a DataFrame from the DB table "sample_train_test" using %sql
# (with SqlMagic.autopandas=True the magic returns a pandas DataFrame).
# NOTE: The original table contains over 6 million data points, so the team used R to randomly sample 600,000 of them for testing

df = %sql SELECT * FROM public.sample_train_test
df.head()

 * postgresql+psycopg2://postgres:***@flight-delay-project.chgeeix9show.us-east-2.rds.amazonaws.com:5432/flight-delay-tables


Unnamed: 0,MONTH,DAY_OF_WEEK,DEP_DEL15,DISTANCE_GROUP,DEP_BLOCK,SEGMENT_NUMBER,CONCURRENT_FLIGHTS,NUMBER_OF_SEATS,CARRIER_NAME,AIRPORT_FLIGHTS_MONTH,AIRLINE_FLIGHTS_MONTH,AIRLINE_AIRPORT_FLIGHTS_MONTH,AVG_MONTHLY_PASS_AIRPORT,AVG_MONTHLY_PASS_AIRLINE,FLT_ATTENDANTS_PER_PASS,GROUND_SERV_PER_PASS,PLANE_AGE,DEPARTING_AIRPORT,LATITUDE,LONGITUDE,PREVIOUS_AIRPORT,PRCP,SNOW,SNWD,TMAX,AWND
0,11,1,False,4,AFTERNOON,3,18,70,SkyWest Airlines Inc.,2176,66502,228,197188,3472966,3.4e-05,9.9e-05,1,Charleston International,32.899,-80.039,Logan International,0.0,0.0,0.0,71.0,3.36
1,1,4,False,5,EVENING,4,24,230,Frontier Airlines Inc.,2327,9496,122,279230,1857122,0.000116,7e-06,2,General Mitchell Field,42.95,-87.897,San Diego International Lindbergh Fl,0.0,0.0,0.0,27.0,6.04
2,6,6,False,4,MORNING,2,50,50,American Eagle Airlines Inc.,7008,27159,1035,1413432,1204766,0.000348,0.000107,15,Miami International,25.792,-80.286,Greenville-Spartanburg,0.78,0.0,0.0,85.0,7.61
3,2,5,True,3,LATE_NIGHT,8,153,158,Delta Air Lines Inc.,28011,67273,17181,4365661,12460183,0.000144,0.000149,21,Atlanta Municipal,33.641,-84.427,Friendship International,0.0,0.0,0.0,62.0,2.91
4,4,7,False,3,MIDDAY,3,31,199,Delta Air Lines Inc.,3690,81803,638,404840,12460183,0.000144,0.000149,21,Southwest Florida International,26.536,-81.755,Atlanta Municipal,0.0,0.0,0.0,90.0,7.38


## Encoding Data

In [None]:
# Keep only the target column (DEP_DEL15) and the nine columns used as predictors.
# `df` is rebound to an independent copy of that subset.
model_columns = [
    'DEP_DEL15',
    'DEP_BLOCK',
    'SEGMENT_NUMBER',
    'CONCURRENT_FLIGHTS',
    'AIRLINE_AIRPORT_FLIGHTS_MONTH',
    'AIRPORT_FLIGHTS_MONTH',
    'PRCP',
    'AWND',
    'SNOW',
    'TMAX',
]
df = df[model_columns].copy()
df.head()

Unnamed: 0,DEP_DEL15,DEP_BLOCK,SEGMENT_NUMBER,CONCURRENT_FLIGHTS,AIRLINE_AIRPORT_FLIGHTS_MONTH,AIRPORT_FLIGHTS_MONTH,PRCP,AWND,SNOW,TMAX
0,False,AFTERNOON,3,18,228,2176,0.0,3.36,0.0,71.0
1,False,EVENING,4,24,122,2327,0.0,6.04,0.0,27.0
2,False,MORNING,2,50,1035,7008,0.78,7.61,0.0,85.0
3,True,LATE_NIGHT,8,153,17181,28011,0.0,2.91,0.0,62.0
4,False,MIDDAY,3,31,638,3690,0.0,7.38,0.0,90.0


In [None]:
# Integer codes for the "DEP_BLOCK" departure-time labels.
# NOTE(review): "AFTERNOON" (2) is coded before "MIDDAY" (3), which is not
# chronological. If these codes are meant to be ordinal by time of day the two
# values should probably be swapped - confirm, then re-run the cells below,
# because every downstream result depends on this mapping.
dep_block = {
    "EARLY_MORNING": 0, "MORNING": 1, "AFTERNOON": 2,
    "MIDDAY": 3, "EVENING": 4, "LATE_NIGHT": 5,
}

# Build the encoded frame as a new object so `df` keeps the original text labels.
# A label that is missing from dep_block raises KeyError.
encoded_df = df.assign(DEP_BLOCK=df['DEP_BLOCK'].apply(dep_block.__getitem__))

In [None]:
# Encode the target column "DEP_DEL15" ("DEPARTURE_DELAY(>15 MINUTES)") as integers
# False = 0
# True = 1
encoded_df['DEP_DEL15'] = encoded_df['DEP_DEL15'].astype(int)

## Split dataset

In [None]:
# y is the delay indicator; X is every other column of the encoded frame.
target_column = 'DEP_DEL15'
y = encoded_df[target_column]
X = encoded_df.drop(columns=[target_column])

In [None]:
# Hold out part of the rows for testing and train on the rest.
# stratify=y keeps the class proportions of the target the same in both partitions;
# random_state=1 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,  # scikit-learn's default when no size is given, spelled out here
    stratify=y,
    random_state=1,
)
X_train.shape

(450000, 9)

## Scale

In [None]:
# Fit the standard scaler on the training partition only, so the test partition
# does not influence the scaling statistics.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Rebind X_train / X_test to their scaled versions (both use the training-set fit).
X_train, X_test = X_scaler.transform(X_train), X_scaler.transform(X_test)

## KNN model

In [None]:
# k-nearest-neighbours classifier; each prediction is based on the k closest training rows
k_neighbors = 5
classifier = KNeighborsClassifier(n_neighbors=k_neighbors)

# Train on the scaled training partition (the fitted estimator is the cell's output)
classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

## Prediction

In [None]:
# Predict the delay class (0 or 1) for every row of the scaled test partition
predictions = classifier.predict(X_test)

In [None]:
# Put each prediction next to its true label, then replace the original row labels
# carried over from y_test with a fresh 0..n-1 index.
comparison = pd.DataFrame({'Prediction': predictions, 'Actual': y_test})
results_df = comparison.reset_index(drop=True)
results_df.head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,1
5,0,0
6,1,0
7,0,0
8,0,0
9,0,0


## Validation

In [None]:
# Labels for the confusion-matrix table: rows are true classes, columns are predicted classes
row_labels = ['Actual 0', 'Actual 1']
column_labels = ['Predicted 0', 'Predicted 1']

# Confusion matrix of true vs. predicted labels, wrapped in a labelled DataFrame
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(data=cm, index=row_labels, columns=column_labels)

# Fraction of test rows that were classified correctly
acc_score = accuracy_score(y_test, predictions)

In [None]:
# Show the confusion matrix, the accuracy and the per-class report, separated by rules
separator = '--------------------------------------------------------'

print("Confusion Matrix")
display(cm_df)
print(separator)
print(f"Accuracy Score : {acc_score}")
print(separator)
print("Classification Report")
print(classification_report(y_true=y_test, y_pred=predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,112852,7751
Actual 1,24406,4991


--------------------------------------------------------
Accuracy Score : 0.78562
--------------------------------------------------------
Classification Report
              precision    recall  f1-score   support

           0       0.82      0.94      0.88    120603
           1       0.39      0.17      0.24     29397

    accuracy                           0.79    150000
   macro avg       0.61      0.55      0.56    150000
weighted avg       0.74      0.79      0.75    150000



## Correlation matrix & Feature selection

In [None]:
# Pairwise correlations between all columns of the encoded data
# (DataFrame.corr uses the Pearson method by default)
corr_matrix = encoded_df.corr()

In [None]:
# Render the colour-coded correlation matrix. A cell only auto-displays its final
# expression, so the styled table must be passed to display() explicitly; as a bare
# statement in the middle of the cell it was never shown.
display(corr_matrix.style.background_gradient(cmap='coolwarm'))

target = 'DEP_DEL15'
threshold = 0.015

# Keep the columns whose absolute correlation with the target exceeds the threshold.
# The target itself is part of the result (its correlation with itself is 1.0).
correlation_scores = corr_matrix.loc[corr_matrix[target].abs() > threshold, target]

# Strongest positive correlation first
correlation_scores.sort_values(ascending=False)

DEP_DEL15                1.000000
DEP_BLOCK                0.138847
SEGMENT_NUMBER           0.115103
PRCP                     0.085239
SNOW                     0.050990
AWND                     0.048003
AIRPORT_FLIGHTS_MONTH    0.025291
CONCURRENT_FLIGHTS       0.015538
Name: DEP_DEL15, dtype: float64