In [1]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, r2_score,mean_squared_error, mean_absolute_error
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, LassoCV, ElasticNetCV
from sklearn.preprocessing import PolynomialFeatures, scale
from sklearn.feature_selection import SelectKBest,f_classif,chi2,f_regression
from sklearn import preprocessing

import pandas as pd 
import os
import glob
import datetime, warnings, scipy 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
class DataPreprocess:
    def __init__(self, filename):
        self.filename = filename
        
    def read_in_data(self):
        return pd.read_csv(self.filename)

    def select_feature_manually(self):
        self.df = self.read_in_data()
        self.df_feature = self.df[["AIRLINE_ID", "ORIGIN_AIRPORT_ID", "DEST_AIRPORT_ID","LATITUDE", "LONGITUDE", "MONTH","DAY_OF_MONTH", "PLANE_AGE", "AWND", "PRCP", "SNOW", "TMAX", "TAVG",
             "CRS_DEP_TIME", "CRS_ARR_TIME", "DISTANCE"]]

    def convert_time(self, time):
        hour = np.floor(time/100)
        minutes = hour*60 + time - hour*100
        return minutes

    def make_dataset(self, airline):

        # airline_oh = pd.get_dummies(self.df_feature['AIRLINE_ID'])
        # name = 'AIRLINE_ID'
        # airline_oh_columns = []
        # for i in range(airline_oh.shape[1]):
        #     airline_oh_columns.append(name+" "+str(i))    
        # airline = pd.DataFrame(airline_oh.to_numpy(),columns=airline_oh_columns)
        # self.df_feature = self.df_feature.join(airline)

        arr_airport_oh = pd.get_dummies(self.df_feature['DEST_AIRPORT_ID'])
        name = 'DEST_AIRPORT_ID'
        arr_airport_oh_columns = []
        for i in range(arr_airport_oh.shape[1]):
            arr_airport_oh_columns.append(name+" "+str(i))
            
        arr_airport = pd.DataFrame(arr_airport_oh.to_numpy(),columns=arr_airport_oh_columns)
        self.df_feature = self.df_feature.join(arr_airport)
            
        dep_airport_oh = pd.get_dummies(self.df_feature['ORIGIN_AIRPORT_ID'])
        name = 'ORIGIN_AIRPORT_ID'
        dep_airport_oh_columns = []
        for i in range(dep_airport_oh.shape[1]):
            dep_airport_oh_columns.append(name+" "+str(i))
            
        dep_airport = pd.DataFrame(dep_airport_oh.to_numpy(),columns=dep_airport_oh_columns)
        self.df_feature = self.df_feature.join(dep_airport)

#         airline_oh = airline_oh_df.to_numpy()
#         arr_airport_oh = arr_airport_oh_df.to_numpy()
#         dep_airport_oh = dep_airport_oh_df.to_numpy()

        # self.df_feature = self.df_feature.drop(['AIRLINE_ID', 'DEST_AIRPORT_ID', 'ORIGIN_AIRPORT_ID'], axis=1)
        self.df_feature = self.df_feature.drop(['DEST_AIRPORT_ID', 'ORIGIN_AIRPORT_ID'], axis=1)

        self.df_feature = self.df_feature.fillna(0)

        arr_time = self.df_feature["CRS_ARR_TIME"].to_numpy().reshape(-1, 1)
        dep_time = self.df_feature["CRS_DEP_TIME"].to_numpy().reshape(-1, 1)
        arr_time_min = self.convert_time(arr_time)
        dep_time_min = self.convert_time(dep_time)
        time_min_df = pd.DataFrame(np.hstack([arr_time_min, dep_time_min]), columns=['CRS_ARR_TIME_MIN', 'CRS_DEP_TIME_MIN'])

        self.df_feature = self.df_feature.drop(["CRS_ARR_TIME", "CRS_DEP_TIME"], axis=1)
        self.df_feature = self.df_feature.join(time_min_df)

        features_without_id = self.df_feature.to_numpy()
        self.features = features_without_id # array
        self.df_truth = self.df[["CANCELLED", "DEP_DELAY_NEW", "ARR_DELAY_NEW"]]
    
    
    def generate(self, airline):

        self.select_feature_manually()
        self.make_dataset(airline)
    

        y = np.zeros((self.df_truth.shape[0],2))

        for i in range(6):
            if i==0:
                y[self.df_truth["DEP_DELAY_NEW"]==0, 0] = 0
                y[self.df_truth["ARR_DELAY_NEW"]==0, 1] = 0
            elif i==5:
                ub = (i-1)*15
                y[self.df_truth["DEP_DELAY_NEW"]>ub, 0] = ub
                y[self.df_truth["ARR_DELAY_NEW"]>ub, 1] = ub
            else: 
                lb = (i-1)*15
                ub = i*15
                y[np.logical_and(self.df_truth["DEP_DELAY_NEW"]>lb, self.df_truth["DEP_DELAY_NEW"]<=ub), 0] = lb
                y[np.logical_and(self.df_truth["ARR_DELAY_NEW"]>lb, self.df_truth["ARR_DELAY_NEW"]<=ub), 1] = lb       

        y = pd.DataFrame(y, columns=['dep','arr'])

        if airline != '':
            all_data = pd.merge(self.df_feature,self.df_truth,left_index=True,right_index=True)
            all_data = pd.merge(all_data, y, left_index=True, right_index=True)
            new_df = all_data[all_data['AIRLINE_ID'] == 19393]
            # new_df = all_data[all_data['DEP_DELAY_NEW'] <= 360]

            to_drop = ["CANCELLED", "DEP_DELAY_NEW", "ARR_DELAY_NEW","dep","arr"]
            to_keep = new_df.columns.difference(to_drop)

            self.df_feature = pd.DataFrame(new_df, columns=to_keep)
            self.df_truth = new_df[["CANCELLED", "DEP_DELAY_NEW", "ARR_DELAY_NEW"]]
            y = pd.DataFrame(new_df, columns=['dep','arr'])
        #     self.df_feature = new_df
        
        return self.df_feature, self.features, self.df_truth, y

def select_K_Best(X,y,method,num_features):
    if method == "clf":
        select = SelectKBest(f_classif,k=num_features)
        z = select.fit_transform(X,y)
        filter = select.get_support(1)
        X_new = X[X.columns[filter]]
        selected_fs = X_new.columns
        return selected_fs,z
    else:
        select = SelectKBest(f_regression,k=num_features)
        z = select.fit_transform(X,y)
        filter = select.get_support(1)
        X_new = X[X.columns[filter]]
        selected_fs = X_new.columns
        return selected_fs,z

In [3]:
filename = "train_2019.csv"
data = DataPreprocess(filename)
all_airlines = ''
# all_airlines = 'Southwest Airlines Co.'
X, _, y_truth, y = data.generate(all_airlines)
print(len(X))


2216748


In [5]:
FEATURE_NUM = 74
# FEATURE_NUM = 40
# features_reg,z_reg = select_K_Best(X, y_truth['DEP_DELAY_NEW'],'reg', FEATURE_NUM)
features_reg,z_reg = select_K_Best(X, y_truth['DEP_DELAY_NEW'],'reg',num_features=FEATURE_NUM)
# features_reg,z_reg = select_K_Best(X2, y_truth2['DEP_DELAY_NEW'],'reg', FEATURE_NUM)

In [6]:
X_input = X[features_reg]
# X_input = X

X_train2, X_test2, y_train2, y_test2 = train_test_split(X_input, y_truth['DEP_DELAY_NEW'], test_size=0.3, random_state=3)

prep_x_train2 = preprocessing.scale(X_train2)
prep_y_train2 = preprocessing.scale(y_train2)
prep_x_test2 = preprocessing.scale(X_test2)
prep_y_test2 = preprocessing.scale(y_test2)

In [14]:
mlr = LinearRegression()
mlr.fit(prep_x_train2, prep_y_train2)

pred = mlr.predict(prep_x_test2)

r2 = r2_score(prep_y_test2, pred)
mse = mean_squared_error(prep_y_test2, pred)
mae = mean_absolute_error(prep_y_test2, pred)

dist = pd.DataFrame()
dist["Prediction"] = pd.Series([float(s) for s in pred]) 
dist["Ground Truth"] = pd.Series([float(s) for s in prep_y_test2])
mlr_plot = sns.jointplot(x="Ground Truth", y="Prediction", data=dist,
              joint_kws={'line_kws':{'color':'k'}}, kind='reg')
plt.show()

-5.354733487064133e+19
5.354733487064131e+19
5482685576.766741


In [22]:
mlr_plot.figure.savefig("mlr_plot.png")

In [17]:
a = min(pred)
b = max(pred)
c = min(y_test2)
d = max(y_test2)
e = min(prep_y_test2)
f = max(prep_y_test2)
# g = min(y_train2)
# h = max(y_train2)

print("range of pred : {},   {}".format(a,b))
print("range of y_test2 : {},   {}".format(c,d))
print("range of prep_y_test2 : {},   {}".format(e,f))

print("avg delay: {}".format(y_truth['DEP_DELAY_NEW'].mean()))
print("min delay: {}".format(y_truth['DEP_DELAY_NEW'].min()))
print("max delay: {}".format(y_truth['DEP_DELAY_NEW'].max()))

range of pred : -40785117799.70468,   23275014688.54867
range of y_test2 : 0.0,   540.0
range of prep_y_test2 : -0.438909505998893,   15.986244467408715
avg delay: 14.450585208269736
min delay: 0.0
max delay: 804.0


In [24]:
# a = 10**np.linspace(-6,6,100)
a = [0.068]
ridgecv = RidgeCV(alphas=a, scoring='r2',cv=5).fit(prep_x_train2, prep_y_train2)
ridge_pred = ridgecv.predict(prep_x_test2)

r2_ridge = r2_score(prep_y_test2, ridge_pred)
mse_ridge = mean_squared_error(prep_y_test2, ridge_pred)
mae_ridge = mean_absolute_error(prep_y_test2, ridge_pred)

dist = pd.DataFrame()
dist["Prediction"] = pd.Series([float(s) for s in ridge_pred]) 
dist["Ground Truth"] = pd.Series([float(s) for s in prep_y_test2])
ridge_plot = sns.jointplot(x="Ground Truth", y="Prediction", data=dist,
              joint_kws={'line_kws':{'color':'k'}}, kind='reg')
plt.show()

r2 ridge:  0.035581659119470754
mse ridge:  0.9644183408805289
mae ridge:  0.44290589787950757
best score:  0.03633324356098877
range of pred : -0.6007777693208204,   2.5252487660395384


In [27]:
ridge_plot.figure.savefig("ridge_plot.png")

In [28]:
a = 10**np.linspace(-6,6,100)
num_lasso_a = 100
lassocv = LassoCV(eps=(min(a)/max(a)), n_alphas=num_lasso_a, cv=5).fit(prep_x_train2, prep_y_train2)
lasso_pred = lassocv.predict(prep_x_test2)

r2_lasso = r2_score(prep_y_test2, lasso_pred)
mse_lasso = mean_squared_error(prep_y_test2, lasso_pred)
mae_lasso = mean_absolute_error(prep_y_test2, lasso_pred)

dist = pd.DataFrame()
dist["Prediction"] = pd.Series([float(s) for s in lasso_pred]) 
dist["Ground Truth"] = pd.Series([float(s) for s in prep_y_test2])
lasso_plot = sns.jointplot(x="Ground Truth", y="Prediction", data=dist,
              joint_kws={'line_kws':{'color':'k'}}, kind='reg')
plt.show()

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(


r2 lasso:  0.0355807600429795
mse lasso:  0.9644192399570202
mae lasso:  0.4429003133326184
range of pred : -0.5998932301276076,   2.525028450487708


In [30]:
lasso_plot.figure.savefig("lasso_plot.png")

In [8]:
a = 10**np.linspace(-6,6,100)
elastic_num_a = 100
elasticcv = ElasticNetCV(eps=(min(a)/max(a)), n_alphas=elastic_num_a, cv=5).fit(prep_x_train2, prep_y_train2)
elastic_pred = elasticcv.predict(prep_x_test2)

r2_elastic = r2_score(prep_y_test2, elastic_pred)
mse_elastic = mean_squared_error(prep_y_test2, elastic_pred)
mae_elastic = mean_absolute_error(prep_y_test2, elastic_pred)

dist = pd.DataFrame()
dist["Prediction"] = pd.Series([float(s) for s in elastic_pred]) 
dist["Ground Truth"] = pd.Series([float(s) for s in prep_y_test2])
lasso_plot = sns.jointplot(x="Ground Truth", y="Prediction", data=dist,
              joint_kws={'line_kws':{'color':'k'}}, kind='reg')
plt.show()

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(


r2 elastic:  0.03558098716591507
mse elastic:  0.9644190128340846
range of pred : -0.6003517033563278,   2.5257927567992646


In [46]:
from sklearn.tree import DecisionTreeRegressor
best_depth = 10
dtr = DecisionTreeRegressor(max_depth=best_depth)
dtr.fit(prep_x_train2, prep_y_train2)
dt_pred = dtr.predict(prep_x_test2)

r2_dt = r2_score(prep_y_test2, dt_pred)
mse_dt = mean_squared_error(prep_y_test2, dt_pred)
mae_st = mean_absolute_error(prep_y_test2, dt_pred)

dist = pd.DataFrame()
dist["Prediction"] = pd.Series([float(s) for s in dt_pred]) 
dist["Ground Truth"] = pd.Series([float(s) for s in prep_y_test2])
dt_plot = sns.jointplot(x="Ground Truth", y="Prediction", data=dist,
              joint_kws={'line_kws':{'color':'k'}}, kind='reg')
plt.show()

r2 dt:  0.06237345261077709
mse dt:  0.9376265473892226
mae dt:  0.42686057349904416


In [48]:
dt_plot.figure.savefig("dt_plot.png")

In [9]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
num_est = 100
lr = 1
best_dt = DecisionTreeRegressor(max_depth=best_depth)
abr = AdaBoostRegressor(base_estimator= best_dt, n_estimators=num_est,learning_rate=lr)
model = abr.fit(X_train2, y_train2)
ada_pred = model.predict(X_test2)

r2_ada = r2_score(y_test2, ada_pred)
mse_ada = mean_squared_error(y_test2, ada_pred)
mae_ada = mean_absolute_error(y_test2, ada_pred)

dist = pd.DataFrame()
dist["Prediction"] = pd.Series([float(s) for s in ada_pred]) 
dist["Ground Truth"] = pd.Series([float(s) for s in y_test2])
ada_plot = sns.jointplot(x="Ground Truth", y="Prediction", data=dist,
              joint_kws={'line_kws':{'color':'k'}}, kind='reg')
plt.show()

ada r2:  -43.35029092846982
mse ada:  107896.58813157002
mae ada:  302.4821665626039


In [52]:
ada_plot.figure.savefig("ada_plot.png")