# IMPORT DATA AND DATA PROCESSING

In [1]:
import numpy as np
import pandas as pd
import time
from sklearn.metrics import mean_squared_error 
import math
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split



In [2]:
df = pd.read_csv("../Input/DelayedFlights.csv")

In [3]:
# because of too much missing values,which maybe influence the training result, we decide to remove the last 5 column
for column in df.columns[-5:]:
    print("The number of missing values of :%s"%str(column)+" : %d"%df[column].isnull().sum())
    df.drop(column, axis = 1, inplace = True)
# because there are only several rows missing some values, we choose remove it
df.dropna(axis=0, how='any', inplace=True)

The number of missing values of :CarrierDelay : 362841
The number of missing values of :WeatherDelay : 362841
The number of missing values of :NASDelay : 362841
The number of missing values of :SecurityDelay : 362841
The number of missing values of :LateAircraftDelay : 362841


In [4]:
droplist = ['Year','Month','DepTime','CRSDepTime','CRSArrTime','FlightNum',\
            'TailNum','ActualElapsedTime', 'CRSElapsedTime','Cancelled', \
            'CancellationCode', 'Diverted','Unnamed: 0']
for column in droplist:
    df.drop(column, axis = 1, inplace = True)
df.drop(["AirTime"], axis = 1, inplace = True)
df.drop(["ArrTime"], axis = 1, inplace = True)


In [5]:
df_filter = df[df['UniqueCarrier']=="WN"]

categories = ['Origin', 'Dest']
indicator_var = pd.DataFrame()
for i in categories:
    x = pd.get_dummies(df_filter[i],drop_first=True)
    indicator_var = pd.concat([indicator_var, x],axis=1)
df_filter.drop('UniqueCarrier', axis = 1, inplace = True)

for col in categories:
    df_filter.drop(col, axis = 1, inplace = True)
final_data = pd.concat([indicator_var, df_filter],axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [6]:
final_data.shape

(213885, 133)

In [7]:
final_data_filter_values = final_data.values
label_array = final_data_filter_values[:,-5]
features_array = np.delete(final_data_filter_values, -5, axis=1)


In [8]:
# Nomalization
scaler = StandardScaler() 
scaler.fit(features_array)  
features_array_norm = scaler.transform(features_array)
train_data,test_data,train_label,test_label = train_test_split(features_array_norm,label_array,train_size=0.8)



In [9]:
print("train dataset size : " + str(np.shape(train_data)))
print("test dataset size : " + str(np.shape(test_data)))

train dataset size : (171108, 132)
test dataset size : (42777, 132)


# BEST MODEL  -- MLPREGROSSOR

In [10]:
# the method to calculate Negative Log Likelihood for the predictions(NLL)
def n_log_less(y_true,y_pred):
    y_pred_mean = y_pred.mean()
    y_sqrt = np.sum((y_pred-y_true)**2)
    log_likelihood_elements = (0.5*np.log(2*math.pi*y_sqrt))+np.sum((y_true -y_pred_mean)**2)/(2*y_sqrt)
    return log_likelihood_elements

In [11]:
### traindataset 

In [13]:
filename='mlp-model.sav'
mlpmodel = pickle.load(open(filename, 'rb'))
pre_result =mlpmodel.predict(test_data) 
print("--------------------------MLP RESULT--------------------------")
print("MSE : %2f"%(mean_squared_error(test_label,pre_result)))
print("NLL : %2f"%(n_log_less(test_label,pre_result)))

--------------------------MLP RESULT--------------------------
MSE : 56.251165
NLL : 23.562480
