In [59]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling
from datetime import datetime
import sys
sys.path.append('../scripts')
from feature_transform import dates_encoder, merge_path, interpolate_missing_values, scrap_weather, merge_weather_data, get_distance

from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
import haversine as hs
from haversine import Unit
%matplotlib inline

In [60]:
# Read the labelled training flights and the unlabelled test flights.
# `flight_date` is parsed into datetimes and each frame gets a fresh
# positional index (the old one is discarded, not kept as a column).
train = pd.read_csv(
    '../data/flights_train.csv',
    parse_dates=['flight_date'],
).reset_index(drop=True)
X_test = pd.read_csv(
    '../data/flights_Xtest.csv',
    parse_dates=['flight_date'],
).reset_index(drop=True)

In [61]:
# Run the project helper `get_distance` over both splits, in the same order
# as before (train first, then X_test).
# NOTE(review): presumably this is what adds the `distance` column visible in
# later outputs - confirm in scripts/feature_transform.py.
train, X_test = map(get_distance, (train, X_test))

In [62]:
# Date span of each split: (train min, train max, test min, test max).
# These bounds are what the weather-scraping cells below hard-code.
(
    train['flight_date'].min(),
    train['flight_date'].max(),
    X_test['flight_date'].min(),
    X_test['flight_date'].max(),
)

(Timestamp('2011-09-01 00:00:00'),
 Timestamp('2012-11-14 00:00:00'),
 Timestamp('2012-11-15 00:00:00'),
 Timestamp('2013-03-05 00:00:00'))

In [63]:
# Scrape weather for the training period via the project helper.
# The bounds are the min/max `flight_date` of `train` printed above.
train_weather = scrap_weather(
    train,
    start=datetime(2011, 9, 1),
    end=datetime(2012, 11, 14),
)
train_weather

Unnamed: 0,time,tavg,tmin,tmax,prcp,wspd,airport_code
0,2011-09-01,27.8,21.7,33.9,0.0,4.0,ATL
1,2011-09-02,27.1,19.4,35.0,0.0,3.5,ATL
2,2011-09-03,27.7,20.6,35.0,0.0,5.6,ATL
3,2011-09-04,23.7,20.6,26.7,6.1,4.4,ATL
4,2011-09-05,23.7,22.2,25.6,16.0,10.8,ATL
...,...,...,...,...,...,...,...
436,2012-11-10,11.6,8.9,15.0,0.0,16.9,SFO
437,2012-11-11,10.6,6.1,15.6,0.0,3.6,SFO
438,2012-11-12,11.5,6.1,17.2,0.0,5.0,SFO
439,2012-11-13,13.5,8.3,18.3,0.0,4.3,SFO


In [64]:
# Scrape weather for the test period via the project helper.
# The bounds are the min/max `flight_date` of `X_test` printed above.
test_weather = scrap_weather(
    X_test,
    start=datetime(2012, 11, 15),
    end=datetime(2013, 3, 5),
)
test_weather

Unnamed: 0,time,tavg,tmin,tmax,prcp,wspd,airport_code
0,2012-11-15,7.6,5.0,11.7,0.0,6.3,ATL
1,2012-11-16,7.8,1.0,16.1,0.0,0.9,ATL
2,2012-11-17,7.4,0.6,16.1,0.0,5.7,ATL
3,2012-11-18,10.9,5.0,17.8,0.0,7.4,ATL
4,2012-11-19,10.6,2.8,15.6,0.0,2.7,ATL
...,...,...,...,...,...,...,...
106,2013-03-01,11.8,7.8,21.7,0.0,7.6,SFO
107,2013-03-02,12.8,10.6,17.2,0.0,9.4,SFO
108,2013-03-03,11.4,9.4,15.0,0.0,20.2,SFO
109,2013-03-04,9.6,7.8,13.3,0.0,15.5,SFO


In [65]:
# Attach the scraped weather to the training flights. The displayed result
# carries tavg / prcp / wspd once for the `from` airport and once for `to`.
train = merge_weather_data(
    X=train,
    weather_data=train_weather,
)
train

Unnamed: 0,flight_date,from,to,avg_weeks,target,std_weeks,distance,tavg_from,prcp_from,wspd_from,tavg_to,prcp_to,wspd_to
0,2012-06-19,ORD,DFW,12.875000,12.331296,9.812647,1290.781744,30.0,0.0,24.8,27.5,0.0,29.2
1,2012-06-19,ORD,PHX,11.772727,10.502073,8.970490,2313.182187,30.0,0.0,24.8,35.4,0.0,12.2
2,2012-06-19,ORD,SFO,13.480000,12.160800,9.372477,2963.977918,30.0,0.0,24.8,15.7,0.0,16.9
3,2012-06-19,LAS,SFO,13.925926,10.377696,11.272587,664.841940,33.8,0.0,17.6,15.7,0.0,16.9
4,2012-06-19,ORD,ATL,11.173913,11.788080,8.819669,976.059253,30.0,0.0,24.8,24.6,0.0,3.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8891,2011-10-15,LGA,MIA,13.230769,10.267310,8.815022,1768.376935,15.8,0.0,24.8,26.8,0.5,15.1
8892,2012-01-19,LGA,MIA,12.695652,11.372761,9.373244,1768.376935,-2.3,0.0,13.3,20.7,0.0,9.7
8893,2011-09-28,LGA,MIA,11.545455,10.643622,7.738139,1768.376935,22.5,1.3,13.3,27.9,12.7,6.5
8894,2012-09-14,LGA,MIA,15.137931,11.330373,9.905713,1768.376935,21.5,0.0,10.8,28.0,0.0,13.0


In [45]:
# Attach the scraped weather to the test flights. The displayed result
# carries tavg / prcp / wspd once for the `from` airport and once for `to`.
X_test = merge_weather_data(
    X=X_test,
    weather_data=test_weather,
)
X_test

Unnamed: 0,flight_date,from,to,avg_weeks,std_weeks,distance,tavg_from,prcp_from,wspd_from,tavg_to,prcp_to,wspd_to
0,2013-01-22,ATL,MCO,10.363636,8.232025,650.799935,2.3,0.0,13.3,15.1,0.0,11.3
1,2013-02-07,ATL,MCO,10.000000,6.935796,650.799935,7.4,11.9,10.7,22.6,0.0,15.5
2,2013-02-07,ATL,ORD,10.428571,7.039683,976.059253,7.4,11.9,10.7,0.4,27.2,13.3
3,2013-02-07,SEA,ORD,7.857143,6.549893,2761.758289,6.7,1.3,14.8,0.4,27.2,13.3
4,2013-02-07,DTW,ORD,6.076923,4.030334,376.724299,0.1,0.0,12.0,0.4,27.2,13.3
...,...,...,...,...,...,...,...,...,...,...,...,...
2227,2012-12-08,MIA,JFK,13.000000,10.291406,1757.111034,24.4,0.5,12.2,8.9,8.6,9.4
2228,2013-02-20,LAS,JFK,12.370370,8.522135,3609.302981,8.6,0.0,12.2,-0.1,0.0,34.2
2229,2012-12-14,LAS,JFK,13.785714,10.307700,3609.302981,9.1,5.1,18.4,5.2,0.0,15.5
2230,2012-11-19,LAS,JFK,13.607143,9.964155,3609.302981,15.2,0.0,9.0,6.4,0.0,16.6


In [46]:
# Put both frames in chronological order (ascending is the pandas default).
# The original row labels are kept, so the index is no longer monotonic.
train = train.sort_values('flight_date')
X_test = X_test.sort_values('flight_date')

In [47]:
# Missing values per column in the training frame.
train.isna().sum()

flight_date     0
from            0
to              0
avg_weeks       0
target          0
std_weeks       0
distance        0
tavg_from      10
prcp_from       0
wspd_from       0
tavg_to        14
prcp_to         0
wspd_to         0
dtype: int64

In [48]:
# Missing values per column in the test frame.
X_test.isna().sum()

flight_date     0
from            0
to              0
avg_weeks       0
std_weeks       0
distance        0
tavg_from      11
prcp_from       0
wspd_from       0
tavg_to        10
prcp_to         0
wspd_to         0
dtype: int64

In [49]:
# Fill the gaps in the two average-temperature columns of the training
# frame; per the null counts above these are the only columns with NaNs.
# The helper is applied to `tavg_from` first and to `tavg_to` second.
train = (
    train
    .pipe(interpolate_missing_values, column='tavg_from', rename='tavg_from_filled')
    .pipe(interpolate_missing_values, column='tavg_to', rename='tavg_to_filled')
)

In [50]:
# Fill the gaps in the two average-temperature columns of the test frame;
# per the null counts above these are the only columns with NaNs.
# The helper is applied to `tavg_from` first and to `tavg_to` second.
X_test = (
    X_test
    .pipe(interpolate_missing_values, column='tavg_from', rename='tavg_from_filled')
    .pipe(interpolate_missing_values, column='tavg_to', rename='tavg_to_filled')
)

In [51]:
# The airport code columns are not used as model inputs; remove them from
# both frames.
train = train.drop(columns=['from', 'to'])
X_test = X_test.drop(columns=['from', 'to'])

# Process data for PyCaret evaluation

In [16]:
# Apply the project's `dates_encoder` to each split (train first, then
# X_test) to produce the frames exported in the next cell.
train_processed, X_test_processed = map(dates_encoder, (train, X_test))

In [18]:
# Persist the encoded frames for the external evaluation step.
# No `index=False` is passed, so the DataFrame index is written as the
# first (unnamed) CSV column.
train_processed.to_csv(
    path_or_buf='../data/flights_train_weather_distance_processed.csv',
)
X_test_processed.to_csv(
    path_or_buf='../data/flights_X_test_weather_distance_processed.csv',
)

# Model

In [52]:
# Only `flight_date` is routed through the date encoder; every other column
# is forwarded unchanged thanks to remainder='passthrough'.
date_cols = ["flight_date"]
date_encoder = FunctionTransformer(dates_encoder)

preprocessor = make_column_transformer(
    (date_encoder, date_cols),
    remainder='passthrough',
)

In [53]:
# Gradient-boosted alternative. It is built here but is NOT the estimator
# placed in `pipe_line` below.
# `silent='warn'` was removed from the argument list: it was deprecated in
# favour of `verbose` and is no longer a constructor argument in
# LightGBM 4.x. All remaining hyper-parameters are unchanged.
regressor_ = LGBMRegressor(
    boosting_type='gbdt',
    class_weight=None,
    colsample_bytree=1.0,
    importance_type='split',
    learning_rate=0.1,
    max_depth=-1,
    min_child_samples=20,
    min_child_weight=0.001,
    min_split_gain=0.0,
    n_estimators=100,
    n_jobs=-1,
    num_leaves=31,
    objective=None,
    random_state=8945,
    reg_alpha=0.0,
    reg_lambda=0.0,
    subsample=1.0,
    subsample_for_bin=200000,
    subsample_freq=0,
)

# Random forest actually used by the pipeline.
# The previous call spelled out every default, including three arguments
# that current scikit-learn rejects:
#   * criterion='mse'         (renamed 'squared_error'; 'mse' removed in 1.2)
#   * max_features='auto'     (removed in 1.3; for regressors it meant
#                              "all features", which is today's default)
#   * min_impurity_split=None (removed in 1.0)
# The fitted pipeline's repr shows only n_jobs and random_state differ from
# the defaults, so leaving the rest implicit builds the same model on both
# old and new scikit-learn versions.
regressor = RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,
    random_state=8945,
)

pipe_line = make_pipeline(preprocessor, regressor)

In [54]:
# Display the training frame at this stage: date-sorted, with the
# `from` / `to` columns already dropped.
train

Unnamed: 0,flight_date,avg_weeks,target,std_weeks,distance,prcp_from,wspd_from,prcp_to,wspd_to,tavg_from,tavg_to
7323,2011-09-01,12.250000,12.686301,8.103944,1174.582929,0.0,13.7,0.0,4.0,32.9,27.8
1998,2011-09-01,12.615385,11.155192,7.848959,4193.040474,0.0,11.9,0.0,12.2,17.6,19.5
1999,2011-09-01,9.058824,11.376947,6.950455,450.146709,0.0,4.0,0.0,12.2,22.6,19.5
314,2011-09-01,10.304348,12.548729,7.175937,1290.781744,0.0,9.4,0.0,13.7,28.1,32.9
1424,2011-09-01,12.200000,11.641595,7.648529,1093.058488,0.0,14.8,0.0,9.7,15.7,15.8
...,...,...,...,...,...,...,...,...,...,...,...
8032,2012-11-14,9.823529,11.210868,7.772028,804.756556,0.0,2.5,0.0,17.6,0.0,5.3
7590,2012-11-14,9.526316,10.951357,6.653056,1491.095313,0.0,15.5,0.0,8.6,2.3,12.6
7331,2012-11-14,15.366667,11.478661,10.283845,1388.871587,1.4,11.8,0.0,14.8,20.9,5.2
5239,2012-11-14,10.529412,11.215433,9.618457,2352.033037,0.0,5.0,0.0,5.8,10.9,14.6


In [55]:
# Separate the features from the label.
X = train.drop(columns='target')
y = train['target']

# Hold out 20% of the rows for validation (seeded for repeatability).
# NOTE(review): train_test_split shuffles by default, so this hold-out is
# not chronological, whereas X_test lies strictly after the training dates.
# Consider whether a time-based split would give a more representative score.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=30,
)

# Fit preprocessing + regressor on the 80% training portion.
pipe_line.fit(train_X, train_y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('functiontransformer',
                                                  FunctionTransformer(func=<function dates_encoder at 0x000001E02879CA60>),
                                                  ['flight_date'])])),
                ('randomforestregressor',
                 RandomForestRegressor(n_jobs=-1, random_state=8945))])

In [56]:
# Score the fitted pipeline on the 20% hold-out split.
y_pred = pipe_line.predict(test_X)

# RMSE = square root of the mean squared error between hold-out labels and predictions.
print(f'The RMSE score is: {np.sqrt(mean_squared_error(test_y, y_pred))}')

 The RMSE score is: 0.44059887455601604


In [58]:
# Display the test frame that is fed to the pipeline for the submission:
# date-sorted, with the `from` / `to` columns already dropped.
X_test

Unnamed: 0,flight_date,avg_weeks,std_weeks,distance,prcp_from,wspd_from,prcp_to,wspd_to,tavg_from,tavg_to
2097,2012-11-15,12.000000,7.956403,1925.940575,0.0,10.8,0.0,6.3,1.9,7.6
783,2012-11-15,9.900000,6.608607,1290.781744,0.0,7.6,0.0,8.6,10.7,2.8
781,2012-11-15,9.736842,8.312254,1536.909538,0.0,8.6,0.0,6.1,5.8,16.1
780,2012-11-15,10.909091,8.922465,1983.272852,0.0,7.6,0.0,6.1,10.7,16.1
2100,2012-11-15,10.150000,8.536454,1459.056790,0.0,12.6,0.0,6.3,5.6,7.6
...,...,...,...,...,...,...,...,...,...,...
459,2013-03-05,11.100000,8.589467,4193.040474,0.0,12.2,0.0,11.1,13.5,2.9
460,2013-03-05,10.111111,10.493073,664.841940,3.6,22.0,0.0,11.2,10.2,16.4
1821,2013-03-05,4.636364,4.249064,1088.308056,0.0,10.8,16.8,19.1,3.7,-0.3
1991,2013-03-05,5.916667,4.316108,2231.344813,0.0,12.6,0.0,32.8,4.3,10.5


In [57]:
# Predict on the competition test set and write one prediction per line.
#
# `X_test` was re-ordered by `flight_date` earlier in the notebook, so the raw
# prediction array is in date order. The CSV is written without index or
# header, so rows can only be matched by position. Re-attach the row labels
# and sort on them so the file follows the pre-sort row order again.
# NOTE(review): this assumes `merge_weather_data` kept the rows in the order
# of flights_Xtest.csv - confirm in scripts/feature_transform.py.
submission = pd.DataFrame(
    pipe_line.predict(X_test),
    index=X_test.index,
).sort_index()
submission.to_csv('../submissions/FPX_submission_random_forest_weather_distance.csv', index = False, header = False)