In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# suppress future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [2]:
df = pd.read_csv(r'C:\data\mia_data2.csv')

In [3]:
df.keys()

Index(['Unnamed: 0.1', 'Unnamed: 0', 'YEAR', 'MONTH', 'DAY_OF_WEEK',
       'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST', 'CRS_DEP_TIME', 'DEP_TIME',
       'DEP_DELAY', 'DEP_DELAY_NEW', 'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY',
       'ARR_DELAY_NEW', 'CANCELLED', 'CANCELLATION_CODE', 'DIVERTED',
       'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 'CARRIER_DELAY',
       'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY',
       'WHY_DELAY', 'DAY', 'DATE'],
      dtype='object')

In [4]:
# Keeping all rows were the flight was not diverted
df = df[df.DIVERTED ==0]

In [5]:
# Keeping all rows where the flight was not cancelled
df = df[df.CANCELLED == 0]


In [6]:
df.WHY_DELAY.unique()

array(['0', 'nas', 'carrier', 'weather', 'late aircraft', 'security'],
      dtype=object)

In [7]:
# Keeping all rows that were either on time or delayed by weather.
keep = ['0', 'weather']
df1 = df[df.WHY_DELAY.isin(keep)]

In [8]:
# I want to create dummy values based on the why delay column
dummy = pd.get_dummies(df1['WHY_DELAY'])

In [9]:
dummy.head()

Unnamed: 0,0,weather
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0


In [10]:
df = df1

In [11]:
df = pd.concat([df, dummy], axis = 1)
df.keys()

Index(['Unnamed: 0.1', 'Unnamed: 0', 'YEAR', 'MONTH', 'DAY_OF_WEEK',
       'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST', 'CRS_DEP_TIME', 'DEP_TIME',
       'DEP_DELAY', 'DEP_DELAY_NEW', 'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY',
       'ARR_DELAY_NEW', 'CANCELLED', 'CANCELLATION_CODE', 'DIVERTED',
       'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 'CARRIER_DELAY',
       'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY',
       'WHY_DELAY', 'DAY', 'DATE', '0', 'weather'],
      dtype='object')

In [12]:
df.drop(columns = ['Unnamed: 0.1', 'Unnamed: 0', 'OP_CARRIER_FL_NUM', 'CANCELLATION_CODE', 'DIVERTED',  'CARRIER_DELAY','WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'DATE', 'WHY_DELAY','OP_CARRIER_FL_NUM', 'CANCELLED', '0']  , inplace = True)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 528022 entries, 0 to 736649
Data columns (total 17 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   YEAR                 528022 non-null  int64  
 1   MONTH                528022 non-null  int64  
 2   DAY_OF_WEEK          528022 non-null  float64
 3   ORIGIN               528022 non-null  object 
 4   DEST                 528022 non-null  object 
 5   CRS_DEP_TIME         528022 non-null  int64  
 6   DEP_TIME             528022 non-null  float64
 7   DEP_DELAY            528022 non-null  float64
 8   DEP_DELAY_NEW        528022 non-null  float64
 9   CRS_ARR_TIME         528022 non-null  int64  
 10  ARR_TIME             528022 non-null  float64
 11  ARR_DELAY            528022 non-null  float64
 12  ARR_DELAY_NEW        528022 non-null  float64
 13  CRS_ELAPSED_TIME     528022 non-null  float64
 14  ACTUAL_ELAPSED_TIME  528022 non-null  float64
 15  DAY              

In [14]:
df_features = df.drop(columns=['DAY_OF_WEEK', 'DEP_TIME', 'DEP_DELAY', 'DEP_DELAY_NEW', 'ARR_TIME', 'ARR_DELAY', 'ARR_DELAY_NEW', 'ACTUAL_ELAPSED_TIME', 'CRS_ELAPSED_TIME'])
                    

In [15]:
df_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 528022 entries, 0 to 736649
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   YEAR          528022 non-null  int64 
 1   MONTH         528022 non-null  int64 
 2   ORIGIN        528022 non-null  object
 3   DEST          528022 non-null  object
 4   CRS_DEP_TIME  528022 non-null  int64 
 5   CRS_ARR_TIME  528022 non-null  int64 
 6   DAY           528022 non-null  int64 
 7   weather       528022 non-null  uint8 
dtypes: int64(5), object(2), uint8(1)
memory usage: 32.7+ MB


In [None]:
'''
# Want to create a wider time are rather than the specific time of departure. Going to be three categories morning, 
# afternoon and night.

#List of conditions
conditions = [
    (df_features['CRS_DEP_TIME'] >= 600) & (df_features['CRS_DEP_TIME'] <1400),
    (df_features['CRS_DEP_TIME'] >= 1400) & (df_features['CRS_DEP_TIME'] <2200),
    (df_features['CRS_DEP_TIME'] >= 2200) | (df_features['CRS_DEP_TIME'] <600)   
    ]

# List of values
values = ['Morning', 'Afternoon', 'Night']
df_features['Time_of_Day'] = np.select(conditions, values)
'''

In [None]:
#df_features['Time_of_Day'].unique()

In [16]:
df_features.head()

Unnamed: 0,YEAR,MONTH,ORIGIN,DEST,CRS_DEP_TIME,CRS_ARR_TIME,DAY,weather
0,2010,5,STT,MIA,1535,1840,1,0
1,2010,5,STT,MIA,1535,1840,2,0
2,2010,5,STT,MIA,1535,1840,3,0
3,2010,5,STT,MIA,1535,1840,4,0
4,2010,5,STT,MIA,1535,1840,5,0


In [17]:
# I need to create two data frames to work on one that is about departures from MIA and the 
#other one that deals with arrivals.


df_arr = df_features[df_features.DEST == 'MIA']
df_dep = df_features[df_features.ORIGIN == 'MIA']

In [18]:
df_arr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 267544 entries, 0 to 736648
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   YEAR          267544 non-null  int64 
 1   MONTH         267544 non-null  int64 
 2   ORIGIN        267544 non-null  object
 3   DEST          267544 non-null  object
 4   CRS_DEP_TIME  267544 non-null  int64 
 5   CRS_ARR_TIME  267544 non-null  int64 
 6   DAY           267544 non-null  int64 
 7   weather       267544 non-null  uint8 
dtypes: int64(5), object(2), uint8(1)
memory usage: 16.6+ MB


In [19]:
# In df_ arr, the DEST value is the same for all flights, I will be eliminating that column as well as everything that has to 
# deal with departures.

df_arr.drop(columns = ['DEST', 'CRS_DEP_TIME'], inplace = True)




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_arr.drop(columns = ['DEST', 'CRS_DEP_TIME'], inplace = True)


In [20]:
# Want to create a wider time are rather than the specific time of departure. Going to be three categories morning, 
# afternoon and night.

#List of conditions
conditions = [
    (df_arr['CRS_ARR_TIME'] >= 600) & (df_arr['CRS_ARR_TIME'] <1400),
    (df_arr['CRS_ARR_TIME'] >= 1400) & (df_arr['CRS_ARR_TIME'] <2200),
    (df_arr['CRS_ARR_TIME'] >= 2200) | (df_arr['CRS_ARR_TIME'] <600)   
    ]

# List of values
values = ['Morning', 'Afternoon', 'Night']
df_arr['Time_of_Day'] = np.select(conditions, values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_arr['Time_of_Day'] = np.select(conditions, values)


In [21]:
df_arr['Time_of_Day'].unique()

array(['Afternoon', 'Morning', 'Night'], dtype=object)

In [22]:
df_arr.head()

Unnamed: 0,YEAR,MONTH,ORIGIN,CRS_ARR_TIME,DAY,weather,Time_of_Day
0,2010,5,STT,1840,1,0,Afternoon
1,2010,5,STT,1840,2,0,Afternoon
2,2010,5,STT,1840,3,0,Afternoon
3,2010,5,STT,1840,4,0,Afternoon
4,2010,5,STT,1840,5,0,Afternoon


In [None]:
# Converting Origin to numerical data by using get_dummies.
temp = pd.get_dummies(df_arr.ORIGIN, prefix = 'ORIGIN')



In [None]:
temp

In [None]:
#Join the two dataframes
df_arr = pd.concat([df_arr, temp], axis = 1)

In [None]:
# Get dummies for 'Time_of_Day column'

In [23]:
t_d = pd.get_dummies(df_arr.Time_of_Day)

In [24]:
df_arr = pd.concat([df_arr, t_d], axis = 1)

In [25]:
y_arr = df_arr['weather']

In [26]:
df_arr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 267544 entries, 0 to 736648
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   YEAR          267544 non-null  int64 
 1   MONTH         267544 non-null  int64 
 2   ORIGIN        267544 non-null  object
 3   CRS_ARR_TIME  267544 non-null  int64 
 4   DAY           267544 non-null  int64 
 5   weather       267544 non-null  uint8 
 6   Time_of_Day   267544 non-null  object
 7   Afternoon     267544 non-null  uint8 
 8   Morning       267544 non-null  uint8 
 9   Night         267544 non-null  uint8 
dtypes: int64(4), object(2), uint8(4)
memory usage: 15.3+ MB


In [28]:
df_arr.drop(columns = ['weather', 'ORIGIN', 'Time_of_Day'], inplace = True)

In [29]:
df_arr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 267544 entries, 0 to 736648
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   YEAR          267544 non-null  int64
 1   MONTH         267544 non-null  int64
 2   CRS_ARR_TIME  267544 non-null  int64
 3   DAY           267544 non-null  int64
 4   Afternoon     267544 non-null  uint8
 5   Morning       267544 non-null  uint8
 6   Night         267544 non-null  uint8
dtypes: int64(4), uint8(3)
memory usage: 11.0 MB


In [30]:
X_arr_train, X_arr_test, y_arr_train, y_arr_test = train_test_split(df_arr, y_arr, test_size = .33, random_state = 42)

In [1]:
df_dep.info()

<class 'NameError'>: name 'df_dep' is not defined

In [None]:
#Going to do the same things to df_dep that I just did with df_arr
df_dep.drop(columns = ['ORIGIN', 'CRS_ARR_TIME'], inplace = True)

In [None]:
# Converting DEST to numerical data by using get_dummies.
temp = pd.get_dummies(df_dep.DEST, prefix = 'ORIGIN')

In [None]:
temp

In [None]:
#Join the two dataframes
df_dep = pd.concat([df_dep, temp], axis = 1)

In [None]:
df_dep.to_csv('C:\data\departure_basic.csv')

In [None]:
df_arr.to_csv(r'C:\data\arrival_basic.csv')

In [None]:
y_dep = df_dep['weather']

In [None]:
df_dep.drop(columns = ['weather', 'DEST'], inplace = True)

In [None]:
df_dep.head()

In [None]:
X_dep_train, X_dep_test, y_dep_train, y_dep_test = train_test_split(df_dep, y_dep, test_size = .33, random_state = 42)

In [None]:
y_dep_test.value_counts()

In [None]:

from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 42, sampling_strategy = 1.0)
X_dep_train, y_dep_train = sm.fit_resample(X_dep_train, y_dep_train)


In [31]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 42, sampling_strategy = 1.0)
X_arr_train, y_arr_train = sm.fit_resample(X_arr_train, y_arr_train)

In [None]:
y_dep_train.value_counts() 

In [32]:

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report



In [None]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(max_depth = 5, n_estimators=100, random_state = 42)
rf.fit(X_arr_train, y_arr_train)
rf_y_pred = rf.predict(X_arr_test)




In [None]:
print(f'Accuracy score with Random Forest Classifier is {accuracy_score(y_arr_test, rf_y_pred)}.')

In [None]:
print(f'Classification report for Random Forest Classifier is {classification_report(y_arr_test, rf_y_pred )}.')

In [34]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
lgreg = LogisticRegression(class_weight = 'balanced')
lgreg.fit(X_arr_train, y_arr_train)
predlgreg = lgreg.predict(X_arr_test)


In [35]:
print(f'Accuracy score with Logistic Regression is {accuracy_score(y_arr_test, predlgreg)}.')
print(f'Classification report \n {classification_report(y_arr_test, predlgreg)}.')

Accuracy score with Logistic Regression is 0.617986181900555.
Classification report 
               precision    recall  f1-score   support

           0       0.99      0.62      0.76     86782
           1       0.03      0.74      0.06      1508

    accuracy                           0.62     88290
   macro avg       0.51      0.68      0.41     88290
weighted avg       0.98      0.62      0.75     88290
.


In [None]:
# KNeighbor Classifier
def knn(X_train,y_train, X_test, y_test):
    from sklearn.neighbors import KNeighborsClassifier
    score = {}
    for n in range (1,5):
        knn = KNeighborsClassifier(n_neighbors = (n))
        knn.fit(X_train,y_train)
        y_pred_class = knn.predict(X_test)
        score[n] = y_pred_class
    
    return score


In [None]:
knn_score = knn(X_arr_train, y_arr_train, X_arr_test, y_arr_test)


In [None]:
knn_score

In [None]:
trinket = {}
for item in knn_score.items():
    trinket[item[0]] = accuracy_score(y_arr_test, item[1])
max_score = max(trinket.values())
best_n = max(trinket, key=trinket.get)


In [None]:
print(f'{best_n} makes the best n value with a score of {max_score}.')
print(f'Classification report for KNeighbors Classifier \n {classification_report(y_arr_test, knn_score[best_n])}.')

In [None]:
# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
DTC = DecisionTreeClassifier(criterion = 'gini', random_state = 42, max_depth = 3, min_samples_leaf=5)
DTC.fit(X_arr_train, y_arr_train)
DTC_y_predict = DTC.predict(X_arr_test)


In [None]:
print(f'Accuracy score with Decision Tree Classifier is {accuracy_score(y_arr_test, DTC_y_predict)}.')
print(f' Classification Reoprt for Decision Tree Classifier \n{classification_report(y_arr_test, DTC_y_predict)}.')

In [None]:
# Support Vector Classifier
from sklearn.svm import SVC
svm = SVC(kernel = 'linear')
svm.fit(X_arr_train, y_arr_train)
svm_pred = svc.predict(y_arr_test)



In [None]:
svm_score = svc.score(X_arr_train, y_arr_train)
#cv_svc(svc, X_arr_train, y_arr_train, cv = 10)
cr_svm= classification_report(y_arr_test, svm_pred)
svc_accuracy_score= accuracy_score(y_arr_test,svm_pred)


In [None]:
# Naive Bayes Gaussian NB
from sklearn.naive_bayes import GaussianNB
gaussian = GaussianNB()
gaussian.fit(X_arr_train, y_arr_train)
gaus_pred = gaussian.predict(X_arr_test)


In [None]:
gaus_accuracy = accuracy_score(y_arr_test, gaus_pred)
gaus_class = classification_report(y_arr_test, gaus_pred)

In [None]:
print(f' Naiv Bayes Gaussian has an accuracy score of {gaus_accuracy}.')
print(f' Naiv Bayes Gausssian confidence report is \n {gaus_class}.')

In [None]:
# Naive Bayes Multinomial
from sklearn.naive_bayes import MultinomialNB
multinomial = MultinomialNB()
multinomial.fit(X_arr_train, y_arr_train)
multi_pred = multinomial.predict(X_arr_test)


In [None]:
multi_accuracy = accuracy_score(y_arr_test, multi_pred)
multi_class = classification_report(y_arr_test, multi_pred)

In [None]:
print(f' Naiv Bayes Multinomial has an accuracy score of {multi_accuracy}.')
print(f' Naiv Bayes Multinomial confidence report is \n {multi_class}.')