# Import Libraries

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import gc

import lightgbm as lgb

from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split

# Load Data

In [3]:
# Number of training rows to read; None reads the whole file
# (e.g. set to 2_000_000 for a quicker experiment).
NROWS = None
# Input CSV locations, relative to the notebook's working directory.
TRAIN_CSV = 'data/nyc-taxi/train.csv'
TEST_CSV = 'data/nyc-taxi/test.csv'

In [4]:
# Compact dtypes passed to pd.read_csv so the columns are not loaded with
# pandas' wider default numeric types.
types = {'fare_amount': 'float32',
         'passenger_count': 'uint8',
         'pickup_longitude': 'float32',
         'pickup_latitude': 'float32',
         'dropoff_longitude': 'float32',
         'dropoff_latitude': 'float32'}

# Columns read from the training CSV (passed as usecols); 'pickup_datetime'
# is not in `types` and is converted to datetime after loading.
cols = ['fare_amount', 'pickup_datetime', 'passenger_count',
        'pickup_longitude', 'pickup_latitude',
        'dropoff_longitude', 'dropoff_latitude']

In [5]:
%%time
train_df = pd.read_csv(TRAIN_CSV, usecols=cols, dtype=types, nrows=NROWS)
# Keep only the leading 'YYYY-MM-DD HH:MM' part of the timestamp string so it
# can be parsed with one fixed format instead of per-row inference.
train_df['pickup_datetime'] = train_df['pickup_datetime'].str.slice(0, 16)
# The format must describe the whole 16-character string. The previous
# '%Y-%m-%d %H' left ':MM' unparsed, which older pandas tolerated but strict
# format parsing rejects with "unconverted data remains".
train_df['pickup_datetime'] = pd.to_datetime(train_df['pickup_datetime'], format='%Y-%m-%d %H:%M')

Wall time: 1min 56s


In [6]:
test_df = pd.read_csv(TEST_CSV, dtype=types)
# 'key' is only an identifier, not a feature, so it is removed here.
test_df.drop('key', axis=1, inplace=True)
# Keep only the leading 'YYYY-MM-DD HH:MM' part of the timestamp string.
test_df['pickup_datetime'] = test_df['pickup_datetime'].str.slice(0, 16)
# The format must describe the whole 16-character string. The previous
# '%Y-%m-%d %H' left ':MM' unparsed, which older pandas tolerated but strict
# format parsing rejects with "unconverted data remains".
test_df['pickup_datetime'] = pd.to_datetime(test_df['pickup_datetime'], format='%Y-%m-%d %H:%M')

# Feature Engineering

In [7]:
def clean_feats(df):
    """Return the rows of ``df`` that pass basic sanity filters.

    Rows are kept only when they have no missing values, a fare in (0, 150],
    a passenger count in [1, 6], and both pickup and dropoff coordinates
    inside the longitude/latitude bounding box defined below (bounds
    inclusive). The input frame itself is not modified.
    """
    fare_lo, fare_hi = 0, 150
    riders_lo, riders_hi = 1, 6
    lon_lo, lon_hi = -74.3, -73.6
    lat_lo, lat_hi = 40.5, 40.9

    complete = df.dropna()

    # Fare: strictly above the lower bound, at most the upper bound.
    fare = complete["fare_amount"]
    priced = complete.loc[(fare > fare_lo) & (fare <= fare_hi)]

    # Passenger count: both bounds inclusive.
    riders = priced["passenger_count"]
    occupied = priced.loc[(riders >= riders_lo) & (riders <= riders_hi)]

    # Both trip endpoints must fall inside the bounding box.
    pickup_inside = (occupied.pickup_longitude.between(lon_lo, lon_hi)
                     & occupied.pickup_latitude.between(lat_lo, lat_hi))
    dropoff_inside = (occupied.dropoff_longitude.between(lon_lo, lon_hi)
                      & occupied.dropoff_latitude.between(lat_lo, lat_hi))

    return occupied[pickup_inside & dropoff_inside]


def add_distance_feats(df):
    """Add absolute coordinate deltas between pickup and dropoff.

    Adds three columns to ``df`` in place and returns the same frame:
    ``longitude_distance``, ``latitude_distance`` and their sum,
    ``manhattan_distance``. All values are in coordinate degrees.
    """
    lon_delta = (df['pickup_longitude'] - df['dropoff_longitude']).abs()
    lat_delta = (df['pickup_latitude'] - df['dropoff_latitude']).abs()

    df['longitude_distance'] = lon_delta
    df['latitude_distance'] = lat_delta
    df['manhattan_distance'] = lon_delta + lat_delta

    return df


def add_datetime_feats(df):
    """Expand ``pickup_datetime`` into uint8 calendar features.

    Adds ``year`` (stored as an offset from 2000 so it fits in uint8),
    ``month``, ``week`` (ISO week number), ``dayofweek``, ``day`` and
    ``hour``, then drops ``pickup_datetime``. The frame is modified in place
    and also returned, so calling this twice on the same frame fails because
    the source column is gone.
    """
    pickup = df['pickup_datetime'].dt

    # Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
    # isocalendar().week is its replacement. Fall back to the old accessor
    # on pandas versions that predate isocalendar().
    if hasattr(pickup, 'isocalendar'):
        week = pickup.isocalendar().week
    else:
        week = pickup.week

    df['year'] = (pickup.year - 2000).astype('uint8')  # minus 2000 in order to use uint8
    df['month'] = pickup.month.astype('uint8')
    df['week'] = week.astype('uint8')
    df['dayofweek'] = pickup.dayofweek.astype('uint8')
    df['day'] = pickup.day.astype('uint8')
    df['hour'] = pickup.hour.astype('uint8')

    df.drop('pickup_datetime', axis=1, inplace=True)

    return df


def dist(pickup_lat, pickup_long, dropoff_lat, dropoff_long):
    """Return |lat difference| + |long difference| between two points as float32.

    The result is in coordinate degrees. Arguments may be scalars or
    array-likes that support subtraction, as long as the sum exposes
    ``.astype``.
    """
    lat_gap = np.abs(dropoff_lat - pickup_lat)
    long_gap = np.abs(dropoff_long - pickup_long)
    total_gap = lat_gap + long_gap
    return total_gap.astype('float32')


def add_airport_feats(df):
    """Add distances from pickup and dropoff to four fixed landmarks.

    For each landmark (city center plus three airports) two columns are
    added in place, ``pickup_distance_to_<tag>`` and
    ``dropoff_distance_to_<tag>``, both computed with ``dist``. Distances to
    several fixed points let the model roughly triangulate other locations
    of interest. Returns the same frame.
    """
    # (tag, latitude, longitude); the tag becomes the column-name suffix.
    landmarks = (
        ('nyc', 40.712775, -74.005973),  # New York City
        ('jfk', 40.641311, -73.778139),  # John F. Kennedy International Airport
        ('ewr', 40.689531, -74.174462),  # Newark Liberty International Airport
        ('lgr', 40.776927, -73.873966),  # LaGuardia Airport
    )

    for tag, lat, lon in landmarks:
        for leg in ('pickup', 'dropoff'):
            df[f'{leg}_distance_to_{tag}'] = dist(
                lat, lon, df[f'{leg}_latitude'], df[f'{leg}_longitude'])

    return df

def add_fare_hike_flag(df):
    """Add a uint8 ``farehike`` column: 1 for rides from September 2012 onward, else 0.

    Expects ``year`` to be stored as an offset from 2000 and ``month`` as
    1-12. Modifies ``df`` in place and returns it.
    """
    hike_year = 2012 - 2000
    in_later_years = df.year > hike_year
    late_in_hike_year = (df.year == hike_year) & (df.month >= 9)

    df['farehike'] = (late_in_hike_year | in_later_years).astype('uint8')

    return df

In [8]:
print("Before:", train_df.shape)

# Only the training set is filtered; every test row is kept.
train_df = clean_feats(train_df)

# Run each feature builder on train, then test, in a fixed order:
# distance -> datetime -> landmark distances -> fare-hike flag.
# The fare-hike flag needs the year/month columns from the datetime step.
for build_feats in (add_distance_feats,
                    add_datetime_feats,
                    add_airport_feats,
                    add_fare_hike_flag):
    train_df = build_feats(train_df)
    test_df = build_feats(test_df)

print("After:", train_df.shape)

Before: (55423856, 7)
After: (53991244, 24)


In [9]:
# Sanity check: train should have exactly one more column than test (the fare_amount target).
train_df.shape, test_df.shape

((53991244, 24), (9914, 23))

# Save and Load

In [10]:
# train_df.reset_index(drop=True, inplace=True)

In [11]:
# train_df.to_feather('tmp/lgbm_train.feather')
# test_df.to_feather('tmp/lgbm_test.feather')

In [12]:
# train_df = pd.read_feather('tmp/lgbm_train.feather')
# test_df = pd.read_feather('tmp/lgbm_test.feather')

# LGBM Setup

In [13]:
def print_scores(model):
    """Print training and validation RMSE for ``model``.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_val`` and
    ``y_val``, so it can only be called after the train/validation split
    has been created.
    """
    def _rmse(features, target):
        # Root of the mean squared error between target and predictions.
        return mean_squared_error(target, model.predict(features)) ** .5

    train_rmse = _rmse(X_train, y_train)
    val_rmse = _rmse(X_val, y_val)

    print(f"Training RMSE: {train_rmse:.4f}\t Validation RMSE: {val_rmse:.4f}")

In [14]:
# Pull the target out first, then restrict the training frame to the test
# set's columns (same features, same order). Not re-runnable: once
# fare_amount has been dropped from train_df the first line fails.
y = train_df['fare_amount'].copy()
train_df = train_df.loc[:, test_df.columns]

In [15]:
# Columns handed to LightGBM as categorical features.
categorical = ['passenger_count', 'year', 'month', 'week', 'dayofweek', 'day', 'hour', 'farehike']
# Alternative subsets kept for experimentation:
# categorical = ['year', 'month', 'week', 'dayofweek', 'day', 'hour', 'farehike']
# categorical = ['passenger_count', 'year', 'month', 'week', 'dayofweek', 'day', 'hour']

In [16]:
# LightGBM training parameters shared by the K-fold and hold-out runs.
# The trailing comment on each tuned entry appears to record the library
# default it replaces.
lgbm_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.02,       # 0.1
    'feature_fraction': 0.9,     # 1.0
    'bagging_fraction': 0.9,     # 1.0
    'bagging_freq': 10,          # 0
    'max_bin': 300,              # 255
    'num_leaves': 50,            # 31
    'num_threads': 4,
}

# Train with K-Fold Cross-Validation

In [None]:
# 5-fold cross-validation. Each fold trains a model, stores its out-of-fold
# predictions, and adds an equal share to the averaged test prediction.
folds = KFold(n_splits=5, shuffle=True, random_state=555)
fold_preds = np.zeros(test_df.shape[0])    # test predictions averaged over folds
oof_preds = np.zeros(train_df.shape[0])    # out-of-fold predictions for train rows
val_rmse_list = []

for i, (trn_idx, val_idx) in enumerate(folds.split(train_df), start=1):

    print(f'Split {i}')

    dtrain = lgb.Dataset(train_df.iloc[trn_idx], label=y.iloc[trn_idx],
                         feature_name=train_df.columns.tolist(),
                         categorical_feature=categorical,
                         free_raw_data=False)

    # reference=dtrain makes the validation set reuse the training set's
    # binning. Without it the eager construct() below bins the validation
    # data independently and that work is redone when lgb.train attaches
    # the reference.
    dval = lgb.Dataset(train_df.iloc[val_idx], label=y.iloc[val_idx],
                       reference=dtrain,
                       feature_name=train_df.columns.tolist(),
                       categorical_feature=categorical,
                       free_raw_data=False)

    dtrain.construct()
    dval.construct()

    m = lgb.train(
        params=lgbm_params,
        train_set=dtrain,
        valid_sets=dval,
        num_boost_round=5000,
        early_stopping_rounds=125,
        verbose_eval=500,
        categorical_feature=categorical
    )

    train_preds = m.predict(train_df.iloc[trn_idx])
    oof_preds[val_idx] = m.predict(train_df.iloc[val_idx])
    fold_preds += m.predict(test_df) / folds.n_splits

    train_rmse = mean_squared_error(y.iloc[trn_idx], train_preds) ** .5
    val_rmse = mean_squared_error(y.iloc[val_idx], oof_preds[val_idx]) ** .5
    val_rmse_list.append(val_rmse)

    print(f"Training RMSE: {train_rmse:.4f}\t Validation RMSE: {val_rmse:.4f}")
    print()

    # Release this fold's Datasets (they keep their raw data because
    # free_raw_data=False) before the next fold allocates new ones.
    del dtrain, dval, train_preds
    gc.collect()

# Summarise the per-fold scores that were collected above.
print(f"Mean validation RMSE over {folds.n_splits} folds: {np.mean(val_rmse_list):.4f}")

Split 1
Training until validation scores don't improve for 125 rounds.
[500]	valid_0's rmse: 3.37224
[1000]	valid_0's rmse: 3.29727
[1500]	valid_0's rmse: 3.26453
[2000]	valid_0's rmse: 3.24704
[2500]	valid_0's rmse: 3.23397
[3000]	valid_0's rmse: 3.22392
[3500]	valid_0's rmse: 3.21469
[4000]	valid_0's rmse: 3.20774
[4500]	valid_0's rmse: 3.2027
[5000]	valid_0's rmse: 3.19747
Did not meet early stopping. Best iteration is:
[5000]	valid_0's rmse: 3.19747
Training RMSE: 3.1268	 Validation RMSE: 3.1975

Split 2




Training until validation scores don't improve for 125 rounds.
[500]	valid_0's rmse: 3.37096
[1000]	valid_0's rmse: 3.29622
[1500]	valid_0's rmse: 3.26364
[2000]	valid_0's rmse: 3.24428
[2500]	valid_0's rmse: 3.23171
[3000]	valid_0's rmse: 3.22073
[3500]	valid_0's rmse: 3.21238
[4000]	valid_0's rmse: 3.20568
[4500]	valid_0's rmse: 3.19997
[5000]	valid_0's rmse: 3.19538
Did not meet early stopping. Best iteration is:
[4999]	valid_0's rmse: 3.19538


In [None]:
# test_df no longer carries 'key' (dropped at load time), so re-read the raw
# file to label each averaged K-fold prediction with its row key.
test_file = pd.read_csv(TEST_CSV)
submission = pd.DataFrame({"fare_amount": fold_preds}, index=test_file['key'])
submission.to_csv("submissions/LGBM/v01.csv", header=True, index=True)

In [None]:
# Preview the first rows of the submission that was just written.
submission.head()

# Train with train-test-split

In [None]:
# Single 80/20 hold-out split; the fixed seed keeps it reproducible.
# print_scores() reads these four names.
X_train, X_val, y_train, y_val = train_test_split(
    train_df,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
dtrain = lgb.Dataset(X_train, label=y_train,
                     feature_name=train_df.columns.tolist(),
                     categorical_feature=categorical,
                     free_raw_data=False)

# reference=dtrain makes the validation set reuse the training set's binning.
# Without it the eager construct() below bins the validation data
# independently and that work is redone when lgb.train attaches the reference.
dval = lgb.Dataset(X_val, label=y_val,
                   reference=dtrain,
                   feature_name=train_df.columns.tolist(),
                   categorical_feature=categorical,
                   free_raw_data=False)

# Build both Datasets now so the cost is not hidden inside the timed training cell.
dtrain.construct()
dval.construct()

In [None]:
%%time
# Fit on the hold-out split: up to 5000 boosting rounds, stopping early once
# the validation RMSE has not improved for 125 rounds; progress is logged
# every 500 rounds.
m = lgb.train(
    params=lgbm_params,
    train_set=dtrain,
    num_boost_round=5000,
    valid_sets=dval,
    categorical_feature=categorical,
    early_stopping_rounds=125,
    verbose_eval=500,
)

print_scores(m)

## Predict

In [None]:
# Score the test set with the model trained on the hold-out split.
predictions = m.predict(test_df) 

In [None]:
# Only the 'key' column is needed to index the submission, so skip parsing the rest.
test_key = pd.read_csv(TEST_CSV, usecols=['key'])
submission = pd.DataFrame(predictions, columns=["fare_amount"], index=test_key.key)
# Written to its own file: the K-fold section saves to submissions/LGBM/v01.csv,
# and reusing that path here would silently overwrite it on a full run.
submission.to_csv("submissions/LGBM/v01_holdout.csv", index=True, header=True)

In [None]:
# Preview the first rows of the hold-out model's submission.
submission.head()