In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling
from datetime import datetime
import sys
sys.path.append('../scripts')
from feature_transform import dates_encoder, merge_path, interpolate_missing_values, scrap_weather, merge_weather_data, get_distance

from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
import haversine as hs
from haversine import Unit
%matplotlib inline

In [4]:
# Read the raw train / test flight tables, parsing `flight_date` as datetimes.
train = pd.read_csv(
    '../data/flights_train.csv',
    parse_dates=['flight_date'],
).reset_index(drop=True)
X_test = pd.read_csv(
    '../data/flights_Xtest.csv',
    parse_dates=['flight_date'],
).reset_index(drop=True)

In [5]:
# Apply the project's `get_distance` helper to both splits.
train, X_test = map(get_distance, (train, X_test))

In [3]:
# Earliest / latest flight date in each split.
(
    train['flight_date'].min(),
    train['flight_date'].max(),
    X_test['flight_date'].min(),
    X_test['flight_date'].max(),
)

(Timestamp('2011-09-01 00:00:00'),
 Timestamp('2012-11-14 00:00:00'),
 Timestamp('2012-11-15 00:00:00'),
 Timestamp('2013-03-05 00:00:00'))

In [7]:
# Fetch weather for each split; the date windows equal the min/max
# `flight_date` of the corresponding frame.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so it
# is presumably slow — consider caching the result on disk.
train_weather = scrap_weather(
    train,
    start=datetime(2011, 9, 1),
    end=datetime(2012, 11, 14),
)
test_weather = scrap_weather(
    X_test,
    start=datetime(2012, 11, 15),
    end=datetime(2013, 3, 5),
)



KeyboardInterrupt: 

In [33]:
# Attach the fetched weather data to each split.
train, X_test = [
    merge_weather_data(X=frame, weather_data=weather)
    for frame, weather in ((train, train_weather), (X_test, test_weather))
]

In [34]:
# Run the project's `merge_path` helper on both splits.
train, X_test = [merge_path(frame) for frame in (train, X_test)]

In [35]:
# Show the training frame after the weather / path merges as a sanity check.
train

Unnamed: 0,flight_date,avg_weeks,target,std_weeks,tavg_from,prcp_from,wspd_from,tavg_to,prcp_to,wspd_to,path
0,2012-06-19,12.875000,12.331296,9.812647,30.0,0.0,24.8,27.5,0.0,29.2,ORD_DFW
1,2012-06-19,11.772727,10.502073,8.970490,30.0,0.0,24.8,35.4,0.0,12.2,ORD_PHX
2,2012-06-19,13.480000,12.160800,9.372477,30.0,0.0,24.8,15.7,0.0,16.9,ORD_SFO
3,2012-06-19,13.925926,10.377696,11.272587,33.8,0.0,17.6,15.7,0.0,16.9,LAS_SFO
4,2012-06-19,11.173913,11.788080,8.819669,30.0,0.0,24.8,24.6,0.0,3.6,ORD_ATL
...,...,...,...,...,...,...,...,...,...,...,...
8891,2012-09-16,18.973684,11.018431,11.679242,19.0,0.0,13.0,29.8,0.0,13.7,JFK_LAS
8892,2011-09-08,12.571429,10.649808,8.230155,32.0,0.0,9.4,21.8,9.7,15.5,LAS_JFK
8893,2011-09-15,7.133333,11.923892,4.688385,21.2,0.8,16.9,19.7,4.6,16.2,BOS_PHL
8894,2012-08-28,13.826087,13.316772,11.491705,23.1,0.0,8.3,26.9,1.5,19.1,ORD_LGA


In [36]:
# Show the test frame after the weather / path merges as a sanity check.
X_test

Unnamed: 0,flight_date,avg_weeks,std_weeks,tavg_from,prcp_from,wspd_from,tavg_to,prcp_to,wspd_to,path
0,2013-01-22,10.363636,8.232025,2.3,0.0,13.3,15.1,0.0,11.3,ATL_MCO
1,2013-02-22,8.294118,5.542616,6.6,37.6,14.8,-6.8,0.0,16.6,ATL_DEN
2,2013-02-22,7.400000,4.910630,6.1,9.4,29.2,-6.8,0.0,16.6,SEA_DEN
3,2013-02-22,10.200000,6.902326,-4.5,6.1,10.8,-6.8,0.0,16.6,MSP_DEN
4,2013-02-22,9.166667,6.400827,4.1,0.0,17.6,-6.8,0.0,16.6,DFW_DEN
...,...,...,...,...,...,...,...,...,...,...
2227,2012-11-17,20.850000,13.463245,6.6,0.0,11.9,19.5,0.0,15.8,PHL_MCO
2228,2013-01-21,12.692308,7.928527,10.0,0.0,10.1,-1.3,0.8,13.7,LAS_JFK
2229,2012-12-01,9.500000,6.997899,17.9,0.0,7.2,9.5,0.0,15.5,PHX_DEN
2230,2012-12-08,13.000000,10.291406,24.4,0.5,12.2,8.9,8.6,9.4,MIA_JFK


In [37]:
# Put both frames in chronological order (original index labels are kept).
train = train.sort_values('flight_date')
X_test = X_test.sort_values('flight_date')

In [38]:
# Missing values per column in the training frame.
train.isna().sum()

flight_date     0
avg_weeks       0
target          0
std_weeks       0
tavg_from      10
prcp_from       0
wspd_from       0
tavg_to        11
prcp_to         0
wspd_to         0
path            0
dtype: int64

In [39]:
# Missing values per column in the test frame.
X_test.isna().sum()

flight_date     0
avg_weeks       0
std_weeks       0
tavg_from      10
prcp_from       0
wspd_from       0
tavg_to        10
prcp_to         0
wspd_to         0
path            0
dtype: int64

In [40]:
# Fill the gaps in both temperature columns of the training frame.
for temp_col in ('tavg_from', 'tavg_to'):
    train = interpolate_missing_values(
        train, column=temp_col, rename=f'{temp_col}_filled'
    )

In [41]:
# Fill the gaps in both temperature columns of the test frame.
for temp_col in ('tavg_from', 'tavg_to'):
    X_test = interpolate_missing_values(
        X_test, column=temp_col, rename=f'{temp_col}_filled'
    )

In [51]:
# Encode dates, one-hot encode the remaining categoricals, and persist the result.
# NOTE(review): the test frame is dummy-encoded separately, so the two CSVs may
# end up with different column sets — verify before using them together.
train_encoded = dates_encoder(train)
train_processed = pd.get_dummies(train_encoded)
train_processed.to_csv('../data/flights_train_weather_processed.csv')

In [52]:
# Encode dates, one-hot encode the remaining categoricals, and persist the result.
# NOTE(review): the training frame is dummy-encoded separately, so the two CSVs
# may end up with different column sets — verify before using them together.
X_test_encoded = dates_encoder(X_test)
X_test_processed = pd.get_dummies(X_test_encoded)
X_test_processed.to_csv('../data/flights_X_test_weather_processed.csv')

In [53]:
# Columns routed to each transformer; everything else is passed through as-is.
date_cols = ["flight_date"]
categorical_cols = ["path"]

# Date features come from the project's `dates_encoder` helper.
date_encoder = FunctionTransformer(dates_encoder)

# Dense one-hot encoding; categories unseen at fit time are ignored at predict time.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`;
# kept as-is for compatibility with the version this notebook was run on.
categorical_encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")

preprocessor = make_column_transformer(
    (date_encoder, date_cols),
    (categorical_encoder, categorical_cols),
    remainder='passthrough',
)

In [54]:
# Gradient-boosted regressor. Apart from `random_state`, every value below is
# the library default on the version this notebook was run with (the fitted
# pipeline's repr shows only `LGBMRegressor(random_state=3703)`).
# The former `silent='warn'` argument is intentionally not passed: it was only a
# deprecation sentinel and is no longer a constructor parameter in LightGBM 4.x.
regressor_ = LGBMRegressor(
    boosting_type='gbdt',
    class_weight=None,
    colsample_bytree=1.0,
    importance_type='split',
    learning_rate=0.1,
    max_depth=-1,
    min_child_samples=20,
    min_child_weight=0.001,
    min_split_gain=0.0,
    n_estimators=100,
    n_jobs=-1,
    num_leaves=31,
    objective=None,
    random_state=3703,
    reg_alpha=0.0,
    reg_lambda=0.0,
    subsample=1.0,
    subsample_for_bin=200000,
    subsample_freq=0,
)

# Full model: column preprocessing followed by the regressor.
pipe_line = make_pipeline(preprocessor, regressor_)

In [55]:
# Separate the features from the regression target.
y = train['target']
X = train.drop(columns='target')

# Hold out 20% of the rows for validation.
# NOTE(review): this is a shuffled split, while the real test period lies
# strictly after the training period — a chronological hold-out may give a
# more realistic score; confirm which is intended.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=30
)

pipe_line.fit(train_X, train_y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('functiontransformer',
                                                  FunctionTransformer(func=<function dates_encoder at 0x0000025B2E5A93A0>),
                                                  ['flight_date']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  ['path'])])),
                ('lgbmregressor', LGBMRegressor(random_state=3703))])

In [56]:
# Score the fitted pipeline on the held-out rows.
y_pred = pipe_line.predict(test_X)
rmse = np.sqrt(mean_squared_error(test_y, y_pred))

print(f' The RMSE score is: {rmse}')

 The RMSE score is: 0.4014281200866231


In [None]:
# Predict on the test frame and write one prediction per line (no header/index).
#
# X_test was sorted by flight_date earlier in the notebook, so the raw
# predictions come back in date order. Because the file carries no row
# identifier, key the predictions by X_test's index labels and sort on them to
# restore the row order the frame had before that sort.
# NOTE(review): assumes the interpolation helpers preserved X_test's index
# labels — if they reset the index this sort is a no-op; confirm.
test_predictions = pd.Series(pipe_line.predict(X_test), index=X_test.index)
submission = test_predictions.sort_index(kind='mergesort').to_frame()
submission.to_csv('../submissions/FPX_first_submission_lgbm_weather.csv', index = False, header = False)