In [1]:
import pandas as pd
import numpy as np
import datetime, warnings, scipy
import seaborn as sns
import matplotlib as mpl

In [2]:
df1 = pd.read_csv(filepath_or_buffer = '../data/2008.csv')

### CRSDepTime is Scheduled Departure Time

In [3]:
df1.loc[:, "CRSDepTime"]

0          1955
1           735
2           620
3           930
4          1755
5          1915
6          1830
7          1040
8           615
9          1620
10          700
11         1510
12         1430
13          715
14         1700
15         1020
16         1425
17          745
18         1255
19         1325
20          705
21         1625
22         1840
23         1030
24          800
25         1455
26         1255
27         1925
28          635
29          730
           ... 
7009698    1635
7009699    1221
7009700    1845
7009701    1500
7009702    1522
7009703    1910
7009704    1445
7009705     830
7009706    1440
7009707    1755
7009708     710
7009709    1520
7009710    1220
7009711    1041
7009712     843
7009713     815
7009714     545
7009715     850
7009716     936
7009717     600
7009718     847
7009719     640
7009720     800
7009721     615
7009722     750
7009723     959
7009724     835
7009725     700
7009726    1240
7009727    1103
Name: CRSDepTime, Length

In [4]:
#Convert the HHMM format to HH:MM format
def format_hour(hm):
    if pd.isnull(hm):
        return np.nan
    else:
        if hm == 2400: hm = 0
        hm = "{0:04d}".format(int(hm))
        newHM = datetime.time(int(hm[0:2]), int(hm[2:4]))
        return newHM

In [5]:
df1['CRSDepTime_Formatted'] = df1['CRSDepTime'].apply(format_hour)

In [6]:
df1['CRSDepTime_Formatted']

0          19:55:00
1          07:35:00
2          06:20:00
3          09:30:00
4          17:55:00
5          19:15:00
6          18:30:00
7          10:40:00
8          06:15:00
9          16:20:00
10         07:00:00
11         15:10:00
12         14:30:00
13         07:15:00
14         17:00:00
15         10:20:00
16         14:25:00
17         07:45:00
18         12:55:00
19         13:25:00
20         07:05:00
21         16:25:00
22         18:40:00
23         10:30:00
24         08:00:00
25         14:55:00
26         12:55:00
27         19:25:00
28         06:35:00
29         07:30:00
             ...   
7009698    16:35:00
7009699    12:21:00
7009700    18:45:00
7009701    15:00:00
7009702    15:22:00
7009703    19:10:00
7009704    14:45:00
7009705    08:30:00
7009706    14:40:00
7009707    17:55:00
7009708    07:10:00
7009709    15:20:00
7009710    12:20:00
7009711    10:41:00
7009712    08:43:00
7009713    08:15:00
7009714    05:45:00
7009715    08:50:00
7009716    09:36:00


### Apply the same format for Scheduled Arrival Time, Departure Time, Arrival Time

In [7]:
df1['CRSArrTime_Formatted'] = df1['CRSArrTime'].apply(format_hour)
df1['CRSArrTime_Formatted']

0          22:25:00
1          10:00:00
2          07:50:00
3          11:00:00
4          19:25:00
5          21:10:00
6          19:40:00
7          11:50:00
8          06:50:00
9          16:55:00
10         09:15:00
11         17:25:00
12         14:25:00
13         07:10:00
14         16:55:00
15         10:10:00
16         16:25:00
17         09:55:00
18         15:10:00
19         14:35:00
20         08:10:00
21         17:35:00
22         19:50:00
23         11:40:00
24         09:10:00
25         16:05:00
26         16:10:00
27         22:35:00
28         09:45:00
29         10:20:00
             ...   
7009698    17:58:00
7009699    13:59:00
7009700    20:06:00
7009701    16:42:00
7009702    18:23:00
7009703    20:16:00
7009704    16:22:00
7009705    10:08:00
7009706    17:04:00
7009707    20:15:00
7009708    08:37:00
7009709    17:18:00
7009710    15:52:00
7009711    13:03:00
7009712    10:21:00
7009713    15:26:00
7009714    06:50:00
7009715    10:05:00
7009716    11:19:00


### DepTime and ArrTime is the real Departure Time and Arrival Time

In [8]:
df1['DepTime_Formatted'] = df1['DepTime'].apply(format_hour)

In [9]:
df1['ArrTime_Formatted'] = df1['ArrTime'].apply(format_hour)

## Putting Scheduled Departure Time, Scheduled Arrival Time, Departure Time and Arrival Time togerther

In [10]:
df1.loc[:, ['CRSDepTime_Formatted', 'CRSArrTime_Formatted', 'DepTime_Formatted', 'ArrTime_Formatted']]

Unnamed: 0,CRSDepTime_Formatted,CRSArrTime_Formatted,DepTime_Formatted,ArrTime_Formatted
0,19:55:00,22:25:00,20:03:00,22:11:00
1,07:35:00,10:00:00,07:54:00,10:02:00
2,06:20:00,07:50:00,06:28:00,08:04:00
3,09:30:00,11:00:00,09:26:00,10:54:00
4,17:55:00,19:25:00,18:29:00,19:59:00
5,19:15:00,21:10:00,19:40:00,21:21:00
6,18:30:00,19:40:00,19:37:00,20:37:00
7,10:40:00,11:50:00,10:39:00,11:32:00
8,06:15:00,06:50:00,06:17:00,06:52:00
9,16:20:00,16:55:00,16:20:00,16:39:00


### Combine DayOfMonth, Month, Year

In [11]:
df1['Day'] = df1['DayofMonth']

In [12]:
df1['DepDate'] = pd.to_datetime(df1[['Year','Month', 'Day']])

In [13]:
def date_time_combine(r):
    if pd.isnull(r[0]) or pd.isnull(r[1]):
        return np.nan
    else:
        return datetime.datetime.combine(r[0], r[1])
def create_scheduled_dep_time(df, col):
    dt = []
    for index, cols in df1[['DepDate', col]].iterrows():
        if pd.isnull(cols[1]):
            dt.append(np.nan)
        elif float(cols[1]) == 2400:
            cols[0] += datetime.timedelta(days = 1)
            cols[1] = datetime.time(0,0)
            dt.append(date_time_combine(cols))
        else:
            cols[1] = format_hour(cols[1])
            dt.append(date_time_combine(cols))
    return pd.Series(dt)

### Combine the scheduled departure time and departure date

In [14]:
df1['CRSDep_Date_Time'] = create_scheduled_dep_time(df1, 'CRSDepTime')

### CRSDep_Date_Time
This feature shows the **scheduled departure time** in the format of Date and Time 

In [15]:
df1['CRSDep_Date_Time']

0         2008-01-03 19:55:00
1         2008-01-03 07:35:00
2         2008-01-03 06:20:00
3         2008-01-03 09:30:00
4         2008-01-03 17:55:00
5         2008-01-03 19:15:00
6         2008-01-03 18:30:00
7         2008-01-03 10:40:00
8         2008-01-03 06:15:00
9         2008-01-03 16:20:00
10        2008-01-03 07:00:00
11        2008-01-03 15:10:00
12        2008-01-03 14:30:00
13        2008-01-03 07:15:00
14        2008-01-03 17:00:00
15        2008-01-03 10:20:00
16        2008-01-03 14:25:00
17        2008-01-03 07:45:00
18        2008-01-03 12:55:00
19        2008-01-03 13:25:00
20        2008-01-03 07:05:00
21        2008-01-03 16:25:00
22        2008-01-03 18:40:00
23        2008-01-03 10:30:00
24        2008-01-03 08:00:00
25        2008-01-03 14:55:00
26        2008-01-03 12:55:00
27        2008-01-03 19:25:00
28        2008-01-03 06:35:00
29        2008-01-03 07:30:00
                  ...        
7009698   2008-12-13 16:35:00
7009699   2008-12-13 12:21:00
7009700   

### Putting all the necessary features together
In this model, we will consider these following features: 
- UniqueCarrier
- Origin
- Dest
- DepTime_Formatted
- DepDelay
- CRSArrTime_Formatted
- ArrTime_Formatted
- ArrDelay
- ActualElapsedTime
- CRSElapsedTime

In [16]:
notUsedFeatures = ['TaxiIn', 'TaxiOut', 'Year', 'Month', 'DayofMonth', 'DayOfWeek', "DepDate", 'NASDelay', 'WeatherDelay', 'CarrierDelay', 'SecurityDelay', 'LateAircraftDelay', 'Cancelled', 'CancellationCode', 'Diverted', 'AirTime', 'FlightNum', 'TailNum']
df1.drop(notUsedFeatures, axis = 1, inplace=True)
df1 = df1[['UniqueCarrier', 'Origin', 'Dest', 'CRSDep_Date_Time', 'DepTime_Formatted', 'DepDelay', 'CRSArrTime_Formatted', 'ArrTime_Formatted', 'ArrDelay', 'CRSElapsedTime', 'ActualElapsedTime']]

In [17]:
df1

Unnamed: 0,UniqueCarrier,Origin,Dest,CRSDep_Date_Time,DepTime_Formatted,DepDelay,CRSArrTime_Formatted,ArrTime_Formatted,ArrDelay,CRSElapsedTime,ActualElapsedTime
0,WN,IAD,TPA,2008-01-03 19:55:00,20:03:00,8.0,22:25:00,22:11:00,-14.0,150.0,128.0
1,WN,IAD,TPA,2008-01-03 07:35:00,07:54:00,19.0,10:00:00,10:02:00,2.0,145.0,128.0
2,WN,IND,BWI,2008-01-03 06:20:00,06:28:00,8.0,07:50:00,08:04:00,14.0,90.0,96.0
3,WN,IND,BWI,2008-01-03 09:30:00,09:26:00,-4.0,11:00:00,10:54:00,-6.0,90.0,88.0
4,WN,IND,BWI,2008-01-03 17:55:00,18:29:00,34.0,19:25:00,19:59:00,34.0,90.0,90.0
5,WN,IND,JAX,2008-01-03 19:15:00,19:40:00,25.0,21:10:00,21:21:00,11.0,115.0,101.0
6,WN,IND,LAS,2008-01-03 18:30:00,19:37:00,67.0,19:40:00,20:37:00,57.0,250.0,240.0
7,WN,IND,LAS,2008-01-03 10:40:00,10:39:00,-1.0,11:50:00,11:32:00,-18.0,250.0,233.0
8,WN,IND,MCI,2008-01-03 06:15:00,06:17:00,2.0,06:50:00,06:52:00,2.0,95.0,95.0
9,WN,IND,MCI,2008-01-03 16:20:00,16:20:00,0.0,16:55:00,16:39:00,-16.0,95.0,79.0


### Examine the dataset
Drop the records with NaN values

In [18]:
df1.dropna(inplace=True)

### Save preprocessed dataset 
Write the preprocessed dataset to csv file

In [26]:
df1.to_csv(path_or_buf='../data/preprocessed_2008.csv', index=False)

In [2]:
#Read the preprocessed data
df1 = pd.read_csv('../data/preprocessed_2008.csv')

In [30]:
df1.dropna(inplace=True)

In [19]:
df1

Unnamed: 0,UniqueCarrier,Origin,Dest,CRSDep_Date_Time,DepTime_Formatted,DepDelay,CRSArrTime_Formatted,ArrTime_Formatted,ArrDelay,CRSElapsedTime,ActualElapsedTime
0,WN,IAD,TPA,2008-01-03 19:55:00,20:03:00,8.0,22:25:00,22:11:00,-14.0,150.0,128.0
1,WN,IAD,TPA,2008-01-03 07:35:00,07:54:00,19.0,10:00:00,10:02:00,2.0,145.0,128.0
2,WN,IND,BWI,2008-01-03 06:20:00,06:28:00,8.0,07:50:00,08:04:00,14.0,90.0,96.0
3,WN,IND,BWI,2008-01-03 09:30:00,09:26:00,-4.0,11:00:00,10:54:00,-6.0,90.0,88.0
4,WN,IND,BWI,2008-01-03 17:55:00,18:29:00,34.0,19:25:00,19:59:00,34.0,90.0,90.0
5,WN,IND,JAX,2008-01-03 19:15:00,19:40:00,25.0,21:10:00,21:21:00,11.0,115.0,101.0
6,WN,IND,LAS,2008-01-03 18:30:00,19:37:00,67.0,19:40:00,20:37:00,57.0,250.0,240.0
7,WN,IND,LAS,2008-01-03 10:40:00,10:39:00,-1.0,11:50:00,11:32:00,-18.0,250.0,233.0
8,WN,IND,MCI,2008-01-03 06:15:00,06:17:00,2.0,06:50:00,06:52:00,2.0,95.0,95.0
9,WN,IND,MCI,2008-01-03 16:20:00,16:20:00,0.0,16:55:00,16:39:00,-16.0,95.0,79.0


In [19]:
carriers = pd.read_csv('../data/carriers.csv')

In [26]:
carriers.loc[carriers['Code'] == 'WN']

Unnamed: 0,Code,Description
1388,WN,Southwest Airlines Co.


In [21]:
carriers_abbr = carriers.set_index('Code')['Description'].to_dict()

In [22]:
def get_stats(group):
    return {'min': group.min(), 'max': group.max(),
            'count': group.count(), 'mean': group.mean()}
#_______________________________________________________________
# Creation of a dataframe with statitical infos on each airline:
global_stats = df1['DepDelay'].groupby(df1['UniqueCarrier']).apply(get_stats).unstack()
global_stats = global_stats.sort_values('count')
global_stats

Unnamed: 0_level_0,count,max,mean,min
UniqueCarrier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AQ,7752.0,336.0,-1.482456,-61.0
HA,61212.0,963.0,0.439211,-534.0
F9,95384.0,817.0,5.903107,-25.0
AS,148492.0,947.0,6.717439,-79.0
OH,190695.0,960.0,11.51015,-70.0
B6,192114.0,846.0,12.572827,-70.0
YV,245131.0,607.0,11.952744,-92.0
9E,254322.0,1127.0,6.733861,-42.0
FL,258713.0,1206.0,9.229818,-62.0
EV,274867.0,965.0,11.922875,-61.0


In [23]:
global_stats.shape

(20, 4)

### Model 1: One Airline, One Airport

In [27]:
carrier = 'WN'
check_airports = df1[(df1['UniqueCarrier'] == carrier)]['DepDelay'].groupby(
                         df1['Origin']).apply(get_stats).unstack()
check_airports.sort_values('count', ascending = False, inplace = True)
check_airports[-5:]

Unnamed: 0_level_0,count,max,mean,min
Origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
IAD,3834.0,417.0,9.509911,-10.0
HRL,3827.0,395.0,9.866998,-9.0
RSW,3822.0,368.0,7.853741,-16.0
JAN,3023.0,560.0,8.877605,-10.0
CRP,1879.0,351.0,6.772219,-10.0


In [28]:
def get_flight_delays(df, carrier, id_airport, extrem_values = False):
    df2 = df[(df['UniqueCarrier'] == carrier) & (df['Origin'] == id_airport)]
    # remove extreme values before fitting
    if extrem_values:
        df2['DepDelay'] = df2['DepDelay'].apply(lambda x:x if x < 60 else np.nan)
        df2.dropna(how = 'any')
    # Conversion: date + hour -> hour
    df2.sort_values('CRSDep_Date_Time', inplace = True)
    df2['CRSDepTime'] =  df2['CRSDep_Date_Time'].apply(lambda x:x.time())
    
    test2 = df2['DepDelay'].groupby(df2['CRSDepTime']).apply(get_stats).unstack()
    test2.reset_index(inplace=True)
    
    htm = lambda x:x.hour*3600+x.minute*60+x.second
    test2.reset_index(inplace=True)
    test2['CRSDep_Time_In_Min'] = test2['CRSDepTime'].apply(htm)
    return test2

### Model 2: One Airline, Multilple Airports

In [29]:
train = df1[df1['CRSDep_Date_Time'].apply(lambda x:x.date()) < datetime.date(2008, 5, 23)]
test = df1[df1['CRSDep_Date_Time'].apply(lambda x:x.date()) > datetime.date(2008, 5, 23)]
df2 = train

In [35]:
def get_delays_info(df, carrier):
    airports = df[df['UniqueCarrier'] == carrier]['Origin'].unique()
    i = 0
    cols = ['Airports', 'CRSDep_Time_In_Min', 'mean']
    for airport in airports:
        test2 = get_flight_delays(df, carrier, airport, True)
        test2.loc[:, 'Airports'] = airport
        test2 = test2[cols]
        test2.dropna(how = 'any', inplace = True)
        if i == 0:
            merged_df = test2.copy()
        else:
            merged_df = pd.concat([merged_df, test2], ignore_index = True)
        i += 1    
    return merged_df

In [36]:
carrier = 'WN'
merged_df = get_delays_info(df2, carrier)
#merged_df.loc[:,:]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [37]:
##Label Encoding
from sklearn import metrics, linear_model
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from scipy.optimize import curve_fit
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(merged_df['Airports'])
#__________________________________________________________
# correspondance between the codes and tags of the airports
zipped = zip(integer_encoded, merged_df['Airports'])
label_airports = list(set(list(zipped)))
label_airports.sort(key = lambda x:x[0])
label_airports

[(0, 'ABQ'),
 (1, 'ALB'),
 (2, 'AMA'),
 (3, 'AUS'),
 (4, 'BDL'),
 (5, 'BHM'),
 (6, 'BNA'),
 (7, 'BOI'),
 (8, 'BUF'),
 (9, 'BUR'),
 (10, 'BWI'),
 (11, 'CLE'),
 (12, 'CMH'),
 (13, 'CRP'),
 (14, 'DAL'),
 (15, 'DEN'),
 (16, 'DTW'),
 (17, 'ELP'),
 (18, 'FLL'),
 (19, 'GEG'),
 (20, 'HOU'),
 (21, 'HRL'),
 (22, 'IAD'),
 (23, 'IND'),
 (24, 'ISP'),
 (25, 'JAN'),
 (26, 'JAX'),
 (27, 'LAS'),
 (28, 'LAX'),
 (29, 'LBB'),
 (30, 'LIT'),
 (31, 'MAF'),
 (32, 'MCI'),
 (33, 'MCO'),
 (34, 'MDW'),
 (35, 'MHT'),
 (36, 'MSY'),
 (37, 'OAK'),
 (38, 'OKC'),
 (39, 'OMA'),
 (40, 'ONT'),
 (41, 'ORF'),
 (42, 'PBI'),
 (43, 'PDX'),
 (44, 'PHL'),
 (45, 'PHX'),
 (46, 'PIT'),
 (47, 'PVD'),
 (48, 'RDU'),
 (49, 'RNO'),
 (50, 'RSW'),
 (51, 'SAN'),
 (52, 'SAT'),
 (53, 'SDF'),
 (54, 'SEA'),
 (55, 'SFO'),
 (56, 'SJC'),
 (57, 'SLC'),
 (58, 'SMF'),
 (59, 'SNA'),
 (60, 'STL'),
 (61, 'TPA'),
 (62, 'TUL'),
 (63, 'TUS')]

In [38]:
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
b = np.array(merged_df['CRSDep_Time_In_Min'])
b = b.reshape(len(b),1)
X = np.hstack((onehot_encoded, b))
Y = np.array(merged_df['mean'])
Y = Y.reshape(len(Y), 1)

In [39]:
Y.shape

(7979, 1)

In [40]:
lm = linear_model.LinearRegression()
model = lm.fit(X,Y)
predictions = lm.predict(X)
print("MSE =", metrics.mean_squared_error(predictions, Y))

('MSE =', 18.963500018146416)


In [41]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

In [42]:
from sklearn.metrics import mean_squared_error, r2_score
poly = PolynomialFeatures(degree = 2)
reg = linear_model.LinearRegression()
X_ = poly.fit_transform(X_train)
reg.fit(X_, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [43]:
X_0 = poly.fit_transform(X_test)
pred = reg.predict(X_0)
print("MSE = ", metrics.mean_squared_error(pred, Y_test))

('MSE = ', 18.703999066301677)


In [44]:
from sklearn.metrics import mean_squared_error, r2_score
poly = PolynomialFeatures(degree = 2)
reg = linear_model.Ridge(alpha=.5)
X_ = poly.fit_transform(X_train)
reg.fit(X_, Y_train)

Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 4.85171606175e-18


Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [45]:
X_0 = poly.fit_transform(X_test)
pred = reg.predict(X_0)
print("MSE = ", metrics.mean_squared_error(pred, Y_test))

('MSE = ', 18.681993856147951)


In [41]:
score_min = 10000
for pol_order in range(1, 3):
    for alpha in range(0, 20, 2):
        ridgereg = linear_model.Ridge(alpha = alpha/10, normalize=True)
        poly = PolynomialFeatures(degree = pol_order)
        #regr = linear_model.LinearRegression()
        X_ = poly.fit_transform(X_train)
        ridgereg.fit(X_, Y_train)        
        X_ = poly.fit_transform(X_test)
        result = ridgereg.predict(X_)
        score = metrics.mean_squared_error(result, Y_test)        
        if score < score_min:
            score_min = score
            parameters = [alpha/10, pol_order]
        print("n={} alpha={} , MSE = {:<0.5}".format(pol_order, alpha, score))

n=1 alpha=0 , MSE = 19.372
n=1 alpha=2 , MSE = 19.372
n=1 alpha=4 , MSE = 19.372
n=1 alpha=6 , MSE = 19.372
n=1 alpha=8 , MSE = 19.372
n=1 alpha=10 , MSE = 21.616
n=1 alpha=12 , MSE = 21.616
n=1 alpha=14 , MSE = 21.616
n=1 alpha=16 , MSE = 21.616
n=1 alpha=18 , MSE = 21.616
n=2 alpha=0 , MSE = 19.326
n=2 alpha=2 , MSE = 19.326
n=2 alpha=4 , MSE = 19.326
n=2 alpha=6 , MSE = 19.326
n=2 alpha=8 , MSE = 19.326
n=2 alpha=10 , MSE = 20.363
n=2 alpha=12 , MSE = 20.363
n=2 alpha=14 , MSE = 20.363
n=2 alpha=16 , MSE = 20.363
n=2 alpha=18 , MSE = 20.363


In [46]:
df3 = test
df3[:5]

Unnamed: 0,UniqueCarrier,Origin,Dest,CRSDep_Date_Time,DepTime_Formatted,DepDelay,CRSArrTime_Formatted,ArrTime_Formatted,ArrDelay,CRSElapsedTime,ActualElapsedTime
2458438,WN,ABQ,AMA,2008-05-24 18:30:00,18:27:00,-3.0,20:30:00,20:23:00,-7.0,60.0,56.0
2458439,WN,ABQ,AMA,2008-05-24 20:10:00,20:05:00,-5.0,22:10:00,21:55:00,-15.0,60.0,50.0
2458440,WN,ABQ,BWI,2008-05-24 14:25:00,14:58:00,33.0,20:10:00,20:15:00,5.0,225.0,197.0
2458441,WN,ABQ,BWI,2008-05-24 10:45:00,10:49:00,4.0,16:30:00,16:42:00,12.0,225.0,233.0
2458442,WN,ABQ,DAL,2008-05-24 15:00:00,16:12:00,72.0,17:45:00,18:45:00,60.0,105.0,93.0


### Testing the model with the test set (flights after May 28, 2008)

In [48]:
test

Unnamed: 0,UniqueCarrier,Origin,Dest,CRSDep_Date_Time,DepTime_Formatted,DepDelay,CRSArrTime_Formatted,ArrTime_Formatted,ArrDelay,CRSElapsedTime,ActualElapsedTime
2458438,WN,ABQ,AMA,2008-05-24 18:30:00,18:27:00,-3.0,20:30:00,20:23:00,-7.0,60.0,56.0
2458439,WN,ABQ,AMA,2008-05-24 20:10:00,20:05:00,-5.0,22:10:00,21:55:00,-15.0,60.0,50.0
2458440,WN,ABQ,BWI,2008-05-24 14:25:00,14:58:00,33.0,20:10:00,20:15:00,5.0,225.0,197.0
2458441,WN,ABQ,BWI,2008-05-24 10:45:00,10:49:00,4.0,16:30:00,16:42:00,12.0,225.0,233.0
2458442,WN,ABQ,DAL,2008-05-24 15:00:00,16:12:00,72.0,17:45:00,18:45:00,60.0,105.0,93.0
2458443,WN,ABQ,DAL,2008-05-24 07:00:00,07:04:00,4.0,09:40:00,09:37:00,-3.0,100.0,93.0
2458444,WN,ABQ,DAL,2008-05-24 18:00:00,18:13:00,13.0,20:40:00,20:49:00,9.0,100.0,96.0
2458445,WN,ABQ,DAL,2008-05-24 19:20:00,19:24:00,4.0,22:00:00,21:58:00,-2.0,100.0,94.0
2458446,WN,ABQ,DAL,2008-05-24 13:10:00,13:09:00,-1.0,15:50:00,15:38:00,-12.0,100.0,89.0
2458447,WN,ABQ,DAL,2008-05-24 10:05:00,10:05:00,0.0,12:45:00,12:38:00,-7.0,100.0,93.0


In [47]:
merged_df_test = get_delays_info(test, carrier)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [49]:
merged_df_test[:5]

Unnamed: 0,Airports,CRSDep_Time_In_Min,mean
0,ABQ,21600,-0.297297
1,ABQ,21900,-2.583333
2,ABQ,22200,-0.035971
3,ABQ,22500,-0.746667
4,ABQ,22800,-0.733333


In [51]:
label_conversion = dict()
for s in label_airports:
    label_conversion[s[1]] = s[0]
merged_df_test['Airports'].replace(label_conversion, inplace = True)
for index, label in label_airports:
    temp = merged_df_test['Airports'] == index
    temp = temp.apply(lambda x:1.0 if x else 0.0)
    if index == 0:
        matrix = np.array(temp)
    else:
        matrix = np.vstack((matrix, temp))
matrix = matrix.T
b = np.array(merged_df_test['CRSDep_Time_In_Min'])
b = b.reshape(len(b),1)
X_test = np.hstack((matrix, b))
Y_test = np.array(merged_df_test['mean'])
Y_test = Y_test.reshape(len(Y_test), 1)

In [57]:
X_ = poly.fit_transform(X_test)
result = reg.predict(X_)
mse = metrics.mean_squared_error(result, Y_test)
'MSE = {:.2f}'.format(score)

'MSE = 24.57'

In [58]:
'Average Delay = {:.2f} min'.format(np.sqrt(mse))

'Average Delay = 4.96 min'