In [None]:
# Notebook setup: draw matplotlib figures inline and enable the autoreload extension (mode 2).
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import lightgbm as lgb
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split

In [None]:
# Relative paths to the input CSV files: training data, test data and the
# sample submission (loaded with 'key' as its index and reused as the
# template for every submission file written during training).
TRAIN_CSV = 'data/nyc-taxi/train.csv'
TEST_CSV = 'data/nyc-taxi/test.csv'
SUBM_CSV = 'data/nyc-taxi/sample_submission.csv'

In [None]:
# Thresholds used by clean_feats to filter the training rows.
# Fare bounds are applied as exclusive limits on fare_amount.
MIN_FARE = 0
MAX_FARE = 200
# Passenger-count bounds are applied as inclusive limits.
MIN_PASSENGER = 1
MAX_PASSENGER = 9
# Inclusive longitude/latitude box that both the pickup and the dropoff
# coordinates must fall into.
MIN_LON = -80
MAX_LON = -70
MIN_LAT = 35
MAX_LAT = 45
# Maximum number of rows read from TRAIN_CSV.
NROWS = 6_000_000

In [None]:
# Set columns to most suitable type to optimize for memory usage
# (pickup_datetime is read as a plain string and converted to datetime after loading).
types = {'fare_amount': 'float32',
         'pickup_datetime': 'str', 
         'pickup_longitude': 'float32',
         'pickup_latitude': 'float32',
         'dropoff_longitude': 'float32',
         'dropoff_latitude': 'float32',
         'passenger_count': 'uint8'}

# Column names in the same order as `types`; passed as `usecols` when the training CSV is read.
cols = list(types.keys())

In [None]:
%%time
# Load at most NROWS training rows, restricted to the columns in `cols` and
# using the compact dtypes declared in `types`.
train_df = pd.read_csv(TRAIN_CSV, usecols=cols, dtype=types, nrows=NROWS)

# Keep only the first 16 characters of the timestamp ('YYYY-MM-DD HH:MM') and
# parse them with a format that describes exactly those 16 characters.
# The previous format stopped at '%H', which leaves ':MM' unparsed and is
# rejected by pandas versions that match the format strictly.
train_df['pickup_datetime'] = pd.to_datetime(
    train_df['pickup_datetime'].str.slice(0, 16),
    format='%Y-%m-%d %H:%M',
)

In [None]:
# Load the test set with the same dtypes as the training data and discard the
# 'key' column so that only model inputs remain.
test_df = pd.read_csv(TEST_CSV, dtype=types).drop('key', axis=1)

# Same timestamp handling as for the training data: keep the first 16
# characters ('YYYY-MM-DD HH:MM') and parse them with a format that covers
# all of them. The previous format stopped at '%H', which leaves ':MM'
# unparsed and is rejected by pandas versions that match the format strictly.
test_df['pickup_datetime'] = pd.to_datetime(
    test_df['pickup_datetime'].str.slice(0, 16),
    format='%Y-%m-%d %H:%M',
)

# Sample submission indexed by 'key'; its fare_amount column is overwritten
# with model predictions before each submission file is written.
subm_df = pd.read_csv(SUBM_CSV, index_col='key')

# Feature Engineering

In [None]:
def clean_feats(df):
    """Return the rows of ``df`` that pass the basic sanity filters.

    A row is kept when all of the following hold:

    * it has no missing value in any column,
    * ``MIN_FARE < fare_amount < MAX_FARE`` (both bounds exclusive),
    * ``MIN_PASSENGER <= passenger_count <= MAX_PASSENGER``,
    * pickup and dropoff longitude lie in ``[MIN_LON, MAX_LON]`` and
      pickup and dropoff latitude lie in ``[MIN_LAT, MAX_LAT]``.

    The input frame is not modified; a filtered frame is returned.
    """
    df = df.dropna(how='any', axis='rows')

    # Explicit strict comparisons instead of ``between(..., inclusive=False)``:
    # the boolean form of ``inclusive`` was deprecated and later removed from
    # pandas, while plain comparisons behave the same on every version.
    fare = df['fare_amount']
    fare_ok = (fare > MIN_FARE) & (fare < MAX_FARE)

    passengers_ok = df['passenger_count'].between(MIN_PASSENGER, MAX_PASSENGER)

    coords_ok = (
        df['pickup_longitude'].between(MIN_LON, MAX_LON)
        & df['pickup_latitude'].between(MIN_LAT, MAX_LAT)
        & df['dropoff_longitude'].between(MIN_LON, MAX_LON)
        & df['dropoff_latitude'].between(MIN_LAT, MAX_LAT)
    )

    return df[fare_ok & passengers_ok & coords_ok]

In [None]:
def add_distance_feats(df):
    """Add coordinate-difference features to ``df`` in place and return it.

    New columns:

    * ``longitude_distance`` - absolute pickup/dropoff longitude difference
    * ``latitude_distance``  - absolute pickup/dropoff latitude difference
    * ``manhattan_distance`` - sum of the two differences
    * ``distance_travelled`` - square root of the sum of their squares
    """
    lon_gap = (df['pickup_longitude'] - df['dropoff_longitude']).abs()
    lat_gap = (df['pickup_latitude'] - df['dropoff_latitude']).abs()

    df['longitude_distance'] = lon_gap
    df['latitude_distance'] = lat_gap
    df['manhattan_distance'] = lon_gap + lat_gap
    df['distance_travelled'] = (lon_gap ** 2 + lat_gap ** 2) ** .5

    return df


def add_datetime_feats(df):
    """Replace ``pickup_datetime`` with small integer calendar features.

    Adds the ``uint8`` columns ``year`` (stored as year minus 2000), ``month``,
    ``week`` (ISO week number), ``dayofweek`` and ``hour``, then drops
    ``pickup_datetime``. ``df`` is modified in place and also returned.

    ``df['pickup_datetime']`` must already be a datetime column.
    """
    pickup = df['pickup_datetime'].dt

    # ``Series.dt.week`` was deprecated and later removed from pandas;
    # ``isocalendar().week`` gives the same ISO week number. Older pandas
    # releases without ``isocalendar`` still go through ``.week``.
    if hasattr(pickup, 'isocalendar'):
        week = pickup.isocalendar().week
    else:
        week = pickup.week

    df['year'] = (pickup.year - 2000).astype('uint8')  # minus 2000 in order to use uint8
    df['month'] = pickup.month.astype('uint8')
    df['week'] = week.astype('uint8')
    df['dayofweek'] = pickup.dayofweek.astype('uint8')
    df['hour'] = pickup.hour.astype('uint8')
    df.drop('pickup_datetime', axis=1, inplace=True)

    return df


def dist(pickup_lat, pickup_long, dropoff_lat, dropoff_long):
    """Return |dropoff_lat - pickup_lat| + |dropoff_long - pickup_long| as float32."""
    lat_gap = np.abs(dropoff_lat - pickup_lat)
    long_gap = np.abs(dropoff_long - pickup_long)
    return (lat_gap + long_gap).astype('float32')


# Airport Features - By Albert van Breemen
# https://www.kaggle.com/breemen/nyc-taxi-fare-data-exploration
def add_airport_feats(df):
    """Add distances from each trip endpoint to four fixed landmarks.

    For every landmark tag (``nyc``, ``jfk``, ``ewr``, ``lgr``) two columns are
    written, ``pickup_distance_to_<tag>`` and ``dropoff_distance_to_<tag>``,
    each computed with ``dist`` between the landmark coordinates and the
    pickup/dropoff coordinates. ``df`` is modified in place and returned.
    """
    # (tag, (latitude, longitude)) - kept as a tuple so the column order is fixed.
    landmarks = (
        ('nyc', (40.712775, -74.005973)),  # New York City
        ('jfk', (40.641311, -73.778139)),  # John F. Kennedy International Airport
        ('ewr', (40.689531, -74.174462)),  # Newark Liberty International Airport
        ('lgr', (40.776927, -73.873966)),  # LaGuardia Airport
    )

    for tag, (lat, lon) in landmarks:
        for endpoint in ('pickup', 'dropoff'):
            df[f'{endpoint}_distance_to_{tag}'] = dist(
                lat, lon, df[f'{endpoint}_latitude'], df[f'{endpoint}_longitude']
            )

    return df

In [None]:
def add_feats(df):
    """Apply the distance, datetime and airport feature steps, in that order, and return the result."""
    for step in (add_distance_feats, add_datetime_feats, add_airport_feats):
        df = step(df)

    return df

In [None]:
%%time
print("Before:", train_df.shape)

# Row filtering is applied to the training data only; the test frame keeps
# every row and just receives the engineered features.
train_df = add_feats(clean_feats(train_df))
test_df = add_feats(test_df)

print("After:", train_df.shape)

In [None]:
# Display the shapes of both frames after cleaning and feature engineering.
train_df.shape, test_df.shape

# LGBM

In [None]:
# Pull the target out as its own Series, then keep only the columns that the
# test frame has (in the test frame's order) as model inputs.
y = train_df['fare_amount'].copy()
train_df = train_df[test_df.columns]

In [None]:
# Columns handed to LightGBM as `categorical_feature`.
# NOTE(review): 'dayofweek' is created by add_datetime_feats but is not listed here - confirm this is intentional.
categorical = ['passenger_count', 'year', 'month', 'week', 'hour']

In [None]:
# Parameter dict passed to lgb.train for every fold.
# NOTE(review): the trailing comments appear to record "<library default>  <what the
# setting mainly trades off>" - confirm against the LightGBM parameter documentation.
lgbm_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.025,      # 0.1
    'feature_fraction': 1.0,     # 1.0 speed
    'bagging_fraction': 1.0,     # 1.0 speed
    'bagging_freq': 0,           # 0   speed
    'max_bin': 400,              # 255 accuracy
    'max_depth': -1,             # -1  overfitting
    'num_leaves': 50,            # 31  overfitting
}

# Train with K-Fold Cross-Validation

In [None]:
# Create the output folder up front so that a missing directory cannot abort
# the run after a fold has already been trained.
SUBM_DIR = 'submissions/LGBM'
os.makedirs(SUBM_DIR, exist_ok=True)

folds = KFold(n_splits=5, shuffle=True, random_state=1)
# Mean of the per-fold test predictions, accumulated fold by fold.
fold_preds = np.zeros(test_df.shape[0])
feature_names = train_df.columns.tolist()

# Early stopping and periodic logging are configured through callbacks, which
# work on old and new LightGBM releases alike (the `early_stopping_rounds` and
# `verbose_eval` keyword arguments of lgb.train were removed in LightGBM 4).
# `log_evaluation` is the newer name of `print_evaluation`.
log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

for i, (trn_idx, val_idx) in enumerate(folds.split(train_df), start=1):
    print(f'Split {i}')

    # Slice once per fold and reuse the slices for the datasets and for the
    # training-error computation below.
    X_trn, y_trn = train_df.iloc[trn_idx], y.iloc[trn_idx]
    X_val, y_val = train_df.iloc[val_idx], y.iloc[val_idx]

    # Binning options such as `max_bin` take effect when a Dataset is built,
    # so the parameter dict is given to the Dataset itself and not only to
    # lgb.train.
    dtrain = lgb.Dataset(X_trn, label=y_trn,
                         feature_name=feature_names,
                         categorical_feature=categorical,
                         params=lgbm_params,
                         free_raw_data=False)

    # `reference=dtrain` makes the validation data use the same bin
    # boundaries as the training data.
    dval = lgb.Dataset(X_val, label=y_val,
                       reference=dtrain,
                       feature_name=feature_names,
                       categorical_feature=categorical,
                       params=lgbm_params,
                       free_raw_data=False)

    # Both datasets are constructed by lgb.train; the categorical columns are
    # already declared on them.
    gbm = lgb.train(
        params=lgbm_params,
        train_set=dtrain,
        valid_sets=[dval],
        num_boost_round=10_000,
        callbacks=[lgb.early_stopping(125), log_evaluation(500)],
    )

    # only calculate training RMSE since validation RMSE is already stored in gbm.best_score
    train_preds = gbm.predict(X_trn)
    train_rmse = mean_squared_error(y_trn, train_preds) ** .5
    print(f"Training RMSE: {train_rmse:.4f}")

    # Write this fold's test predictions as their own submission file.
    test_preds = gbm.predict(test_df)
    subm_df['fare_amount'] = test_preds
    subm_df.to_csv(f"{SUBM_DIR}/v02-Fold{i}.csv")

    # Each fold contributes 1 / n_splits of the averaged test prediction.
    fold_preds += test_preds / folds.n_splits