# Import Libraries

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# (Re)load the autoreload extension; mode 2 re-imports edited modules before each cell runs.
%reload_ext autoreload
%autoreload 2

In [2]:
from fastai.imports import *
from fastai.structured import *
from fastai.column_data import *

In [3]:
# Preprocessed train/test feature tables; both are read with pd.read_feather below.
TRAIN_DATA = 'tmp/taxi-train-v9-Airport-Flag'
TEST_DATA = 'tmp/taxi-test-v9-Airport-Flag'
# Raw test CSV; only its 'key' column is read, to label the submission rows.
TEST_CSV = 'data/nyc-taxi/test.csv'

# Load data

In [4]:
%%time
# The feature tables come from feather files. The raw CSV is loaded only for
# its 'key' column, which becomes the id column of the submission.
train_df = pd.read_feather(TRAIN_DATA)
test_df = pd.read_feather(TEST_DATA)
test_df_raw = pd.read_csv(TEST_CSV, usecols=['key'])

CPU times: user 1.03 s, sys: 1.95 s, total: 2.98 s
Wall time: 2.95 s


In [5]:
train_df.shape, test_df.shape, test_df_raw.shape

((53925796, 23), (9914, 22), (9914, 1))

In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53925796 entries, 0 to 53925795
Data columns (total 23 columns):
fare_amount                float32
pickup_longitude           float32
pickup_latitude            float32
dropoff_longitude          float32
dropoff_latitude           float32
passenger_count            uint8
year                       uint8
month                      uint8
week                       uint8
dayofweek                  uint8
day                        uint8
hour                       uint8
longitude_distance         float32
latitude_distance          float32
pickup_distance_to_nyc     float32
dropoff_distance_to_nyc    float32
pickup_distance_to_jfk     float32
dropoff_distance_to_jfk    float32
pickup_distance_to_ewr     float32
dropoff_distance_to_ewr    float32
pickup_distance_to_lgr     float32
dropoff_distance_to_lgr    float32
fare_increased             bool
dtypes: bool(1), float32(15), uint8(7)
memory usage: 3.4 GB


In [7]:
# Down-sample the training set to 20M rows. A fixed random_state makes the
# subset (and therefore every downstream number) reproducible across kernel
# restarts; without it each run trains on a different sample.
train_df = train_df.sample(20_000_000, random_state=42)

In [8]:
train_df.shape, test_df.shape, test_df_raw.shape

((20000000, 23), (9914, 22), (9914, 1))

In [9]:
train_df.columns

Index(['fare_amount', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'year',
       'month', 'week', 'dayofweek', 'day', 'hour', 'longitude_distance',
       'latitude_distance', 'pickup_distance_to_nyc',
       'dropoff_distance_to_nyc', 'pickup_distance_to_jfk',
       'dropoff_distance_to_jfk', 'pickup_distance_to_ewr',
       'dropoff_distance_to_ewr', 'pickup_distance_to_lgr',
       'dropoff_distance_to_lgr', 'fare_increased'],
      dtype='object')

In [10]:
# Columns handled as categorical variables (converted to pandas categoricals
# and given embeddings further down).
cat_vars = ['passenger_count', 'year', 'month', 'week', 'dayofweek', 'day', 'hour', 'fare_increased']

# NOTE(review): contin_vars is not referenced again in this notebook. The
# learner's continuous-input count is computed as
# len(X_train.columns) - len(cat_vars), which also covers the *_distance_to_*
# columns that are not listed here. Confirm which set is intended.
contin_vars = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
   'longitude_distance', 'latitude_distance']

# Name of the target column.
dep = 'fare_amount'
# Number of training rows; used to size the validation split.
n = len(train_df) 
n

20000000

In [11]:
# Turn every categorical feature of the training frame into an ordered
# pandas categorical.
for col in cat_vars:
    train_df[col] = (
        train_df[col]
        .astype('category')
        .cat.as_ordered()
    )

In [12]:
# The per-column conversion below was replaced by apply_cats, which (per the
# fastai docs) uses the training frame as the template for the category codes,
# so train and test share one encoding instead of each deriving its own.
# for v in cat_vars: test_df[v] = test_df[v].astype('category').cat.as_ordered()
apply_cats(test_df, train_df)

In [13]:
%%time
X_train, y_train, nas, mapper = proc_df(train_df, 'fare_amount', do_scale=True)

CPU times: user 1min 31s, sys: 28.3 s, total: 2min
Wall time: 2min


In [14]:
# Add a placeholder target column so test_df can go through proc_df with the
# same target-field argument as the training frame; the target array returned
# for the test frame is discarded.
test_df[dep] = 0

In [15]:
# Process the test frame with the NA dictionary and scaler mapper obtained
# from the training frame; the placeholder target (second return value) is
# thrown away.
X_test, _, nas, mapper = proc_df(
    test_df,
    dep,
    do_scale=True,
    mapper=mapper,
    na_dict=nas,
)

In [16]:
X_train.shape, y_train.shape, X_test.shape

((20000000, 22), (20000000,), (9914, 22))

In [17]:
# Hold out 1M rows for validation. The count gets its own name instead of
# `m`, because `m` is bound to the learner further down and reusing one name
# for an int and a model makes out-of-order re-runs error-prone.
n_valid = 1_000_000
val_idx = get_cv_idxs(n, val_pct=n_valid/n)

In [18]:
def rmse(y_pred, targ):
    """Return the root-mean-squared error between predictions and targets.

    Both arguments must support element-wise subtraction and expose `.mean()`
    on the squared result (e.g. numpy arrays of matching shape).
    """
    residual = targ - y_pred
    mean_squared_error = (residual ** 2).mean()
    return math.sqrt(mean_squared_error)

# Deep Learning

In [19]:
# Build the fastai columnar data object from the processed training frame:
# rows at val_idx form the validation set, cat_vars name the categorical
# fields, the target is cast to float32, and X_test is attached as the test set.
md = ColumnarModelData.from_data_frame(".", val_idx, X_train, y_train.astype(np.float32), 
                                       cat_flds=cat_vars, bs=64, test_df=X_test)

In [20]:
cat_vars

['passenger_count',
 'year',
 'month',
 'week',
 'dayofweek',
 'day',
 'hour',
 'fare_increased']

In [21]:
# Pair each categorical column with its number of categories plus one
# (presumably the extra slot is for missing/unknown values - confirm).
cat_sz = [
    (name, 1 + len(train_df[name].cat.categories))
    for name in cat_vars
]

In [22]:
# One (cardinality, embedding width) pair per categorical variable; the width
# is half the cardinality rounded up, capped at 50.
emb_szs = [
    (cardinality, min(50, (cardinality + 1) // 2))
    for _name, cardinality in cat_sz
]
emb_szs

[(7, 4), (8, 4), (13, 7), (54, 27), (8, 4), (32, 16), (25, 13), (3, 2)]

In [23]:
# Output range handed to the learner: from 0 up to 20% above the largest
# fare seen in the training target.
max_y = y_train.max()
y_range = (0, 1.2 * max_y)
y_range

(0, 180.0)

In [24]:
# Create the learner for the columnar model from the embedding sizes, the
# number of continuous inputs, dropout settings and two hidden layers.
m = md.get_learner(emb_szs, # (cardinality, embedding width) per categorical variable
                   len(X_train.columns)-len(cat_vars), # number of continuous variables
                   0.04, # embedding dropout
                   1, # number of outputs
                   [1000,500], # hidden units of the fully connected layers
                   [0.001,0.01], # dropout of the fully connected layers
                   y_range=y_range, 
                   use_bn=True)

In [None]:
# m.summary()

In [None]:
# m.lr_find()

In [None]:
# m.sched.plot(100)

In [None]:
# m.sched.plot_lr()

In [25]:
lr = 6e-4

In [26]:
# One cycle of three epochs at the current learning rate; rmse is reported
# on the validation set after every epoch.
m.fit(lr, n_cycle=1, cycle_len=3, metrics=[rmse])

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss   rmse                                
    0      14.497834  11.878623  3.190496  
    1      13.095847  11.266825  3.09198                           
    2      9.975994   10.985688  3.062977                          



[array([10.98569]), 3.0629770673886556]

In [27]:
m.save('three_epochs_lr_6e-4')

In [28]:
# Same schedule again, continuing from the weights left by the previous fit.
# NOTE(review): the recorded run of this cell was interrupted during its
# second epoch, so the in-memory weights afterwards differ from the
# checkpoint saved just before it.
m.fit(lr, n_cycle=1, cycle_len=3, metrics=[rmse])

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss   rmse                              
    0      10.604453  11.518482  3.129531  
  0%|          | 506/296875 [00:04<43:31, 113.47it/s, loss=13.9] 

KeyboardInterrupt: 

In [None]:
# NOTE(review): this stores whatever weights the learner currently holds.
# After the interrupted fit above that is more than three epochs of training,
# so confirm the checkpoint name still describes its contents.
m.save('three_epochs_20M')

In [None]:
m.load('three_epochs_20M')

In [None]:
lr = 1e-4

In [None]:
m.fit(lr, n_cycle=1, cycle_len=3, metrics=[rmse])

In [None]:
lr = 1e-4

In [None]:
m.fit(lr, n_cycle=1, cycle_len=3, metrics=[rmse])

In [None]:
# Three restart cycles starting at one epoch each, with cycle_mult=2
# (presumably the cycle length doubles each time: 1 + 2 + 4 = 7 epochs - confirm).
m.fit(lr, n_cycle=3, cycle_len=1, cycle_mult=2, metrics=[rmse])

In [None]:
lr = 1e-3

In [None]:
m.fit(lr, n_cycle=1, cycle_len=3, metrics=[rmse])

In [None]:
m.save('three_epochs_large_sgdr')

In [None]:
m.fit(lr, n_cycle=3, cycle_len=1, cycle_mult=2, metrics=[rmse])

In [None]:
# m.fit(lr, 1, cycle_len=3, best_save_name="my_best_model", metrics=[rmse])

In [None]:
# m.fit(lr, 1, cycle_len=3, best_save_name="my_best_model", metrics=[rmse])

In [None]:
# Predict on the test set attached to the model data rather than on the
# validation set.
predictions = m.predict(is_test=True)

In [None]:
# Flatten the predictions to one dimension so they can be used as a single
# DataFrame column.
predictions = predictions.reshape(-1)

In [None]:
# Assemble the submission: ids from the raw test CSV next to the predicted fares.
import os

submission = pd.DataFrame(
    {'key': test_df_raw.key, 'fare_amount': predictions},
    columns = ['key', 'fare_amount'])

# DataFrame.to_csv does not create missing directories; make sure the target
# folder exists so a long training run is not lost to a FileNotFoundError here.
os.makedirs('submissions', exist_ok=True)
submission.to_csv('submissions/fastai_v02.csv', index = False)

In [None]:
submission.head()