# Import Libraries

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import os

from fastai.imports import *
from fastai.structured import *
from fastai.column_data import *

In [3]:
# Feather files that are loaded as the train and test frames.
TRAIN_DATA = 'tmp/taxi-train-v9-Baseline'
TEST_DATA = 'tmp/taxi-test-v9-Baseline'
# Raw test CSV; only its `key` column is read, for building the submission.
TEST_CSV = 'data/nyc-taxi/test.csv'

# Load data

In [4]:
%%time
# Load the train/test frames from feather, plus the `key` column of the raw
# test CSV (needed later to label the rows of the submission file).
train_df = pd.read_feather(TRAIN_DATA)
test_df = pd.read_feather(TEST_DATA)
test_df_raw = pd.read_csv(TEST_CSV, usecols=['key'])

CPU times: user 579 ms, sys: 959 ms, total: 1.54 s
Wall time: 1.54 s


In [5]:
# Sanity check: row/column counts of the three frames just loaded.
train_df.shape, test_df.shape, test_df_raw.shape

((53925796, 14), (9914, 13), (9914, 1))

In [6]:
# Column dtypes and memory footprint of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53925796 entries, 0 to 53925795
Data columns (total 14 columns):
fare_amount           float32
pickup_longitude      float32
pickup_latitude       float32
dropoff_longitude     float32
dropoff_latitude      float32
passenger_count       uint8
year                  uint8
month                 uint8
week                  uint8
dayofweek             uint8
day                   uint8
hour                  uint8
longitude_distance    float32
latitude_distance     float32
dtypes: float32(7), uint8(7)
memory usage: 1.8 GB


In [7]:
# Shapes of the three frames.
# NOTE(review): this repeats the shape check made right after loading; nothing
# in between changes the frames, so this cell could be dropped.
train_df.shape, test_df.shape, test_df_raw.shape

((53925796, 14), (9914, 13), (9914, 1))

In [8]:
# Column names, used to pick the categorical / continuous variables below.
train_df.columns

Index(['fare_amount', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'year',
       'month', 'week', 'dayofweek', 'day', 'hour', 'longitude_distance',
       'latitude_distance'],
      dtype='object')

In [9]:
# Columns treated as categorical; each one gets its own embedding further down.
cat_vars = [
    'passenger_count',
    'year', 'month', 'week', 'dayofweek', 'day', 'hour',
]

# Columns treated as continuous.
contin_vars = [
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
    'longitude_distance', 'latitude_distance',
]

# Dependent variable (the prediction target) and the number of training rows.
dep = 'fare_amount'
n = len(train_df)
n

53925796

In [10]:
# Convert every categorical column of the training frame into an ordered
# pandas Categorical.
for v in cat_vars:
    train_df[v] = (
        train_df[v]
        .astype('category')
        .cat.as_ordered()
    )

In [11]:
# Earlier per-column conversion of the test frame, left disabled:
# for v in cat_vars: test_df[v] = test_df[v].astype('category').cat.as_ordered()
# NOTE(review): apply_cats presumably reuses train_df's category definitions so
# that train and test share the same category codes — confirm against fastai.
apply_cats(test_df, train_df)

In [12]:
%%time
# Split the target column (`dep`) off the training frame and scale the features
# (do_scale=True). `nas` and `mapper` are kept so the test frame can be
# processed with the same NA handling and scaling.
X_train, y_train, nas, mapper = proc_df(train_df, dep, do_scale=True)

CPU times: user 1min 29s, sys: 31.5 s, total: 2min
Wall time: 2min


In [13]:
# The test frame has no target column (13 columns vs. 14 in train). Add a dummy
# one so proc_df, which is told to split off the target, can be run on it too.
test_df[dep] = 0

In [14]:
# Process the test frame with the NA dictionary and scaling mapper obtained from
# the training frame so both sets are transformed the same way. The returned
# target (the dummy zeros) is discarded.
X_test, _, nas, mapper = proc_df(test_df, dep, do_scale=True, mapper=mapper, na_dict=nas)

In [15]:
# Check that the processed train and test feature matrices have the same width.
X_train.shape, y_train.shape, X_test.shape

((53925796, 13), (53925796,), (9914, 13))

In [16]:
# Number of training rows to hold out for validation. Named `n_val` rather than
# `m` because `m` is bound to the learner further down the notebook.
n_val = 100_000
val_idx = get_cv_idxs(n, val_pct=n_val/n)

In [17]:
def rmse(y_pred, targ):
    """Root-mean-squared error between predictions and targets.

    Both arguments must support elementwise subtraction, ``** 2`` and
    ``.mean()`` (e.g. NumPy arrays). Returns a Python float.
    """
    residual = targ - y_pred
    mean_squared_error = (residual ** 2).mean()
    return math.sqrt(mean_squared_error)

# Deep Learning

In [18]:
# Wrap the processed data in a fastai ColumnarModelData object: val_idx selects
# the validation rows, cat_vars are passed as the categorical fields, the batch
# size is 512 and X_test is attached as the test set. The target is cast to
# float32. NOTE(review): "." is presumably the working path for saved models.
md = ColumnarModelData.from_data_frame(".", val_idx, X_train, y_train.astype(np.float32), 
                                       cat_flds=cat_vars, bs=512, test_df=X_test)

In [19]:
# Reminder of the categorical columns; embedding sizes below follow this order.
cat_vars

['passenger_count', 'year', 'month', 'week', 'dayofweek', 'day', 'hour']

In [20]:
# (column name, number of categories + 1) for each categorical column.
# NOTE(review): the +1 presumably reserves a slot for missing/unknown values.
cat_sz = [
    (name, len(train_df[name].cat.categories) + 1)
    for name in cat_vars
]

In [21]:
# (cardinality, embedding width) per categorical column; the width is half the
# cardinality rounded up, capped at 50.
emb_szs = [
    (cardinality, min(50, (cardinality + 1) // 2))
    for _, cardinality in cat_sz
]
emb_szs

[(7, 4), (8, 4), (13, 7), (54, 27), (8, 4), (32, 16), (25, 13)]

In [22]:
# Output range handed to the learner as `y_range`: from 0 up to 20% above the
# largest fare seen in the training target.
max_y = np.max(y_train)
y_range = (0, max_y*1.2)
y_range

(0, 180.0)

In [23]:
# Build the learner from the model data: embeddings for the categorical columns
# followed by two fully connected layers.
m = md.get_learner(emb_szs, # embedding sizes, one (cardinality, width) pair per categorical column
                   len(X_train.columns)-len(cat_vars), # number of continuous variables
                   0.01, # embedding dropout
                   1, # number of outputs
                   [1000,500], # hidden units of the fully connected layers
                   [0.001,0.01], # dropout of the fully connected layers
                   y_range=y_range, 
                   use_bn=True)

In [24]:
# m.summary()

In [25]:
# m.lr_find()

In [26]:
# m.sched.plot(100)

In [None]:
# m.sched.plot_lr()

In [27]:
# Learning rate used by the fit calls below.
# NOTE(review): the lr_find cells are commented out, so how this value was
# chosen is not shown in the notebook.
lr = 3e-4

In [28]:
# One cycle of three epochs, reporting the custom rmse metric alongside the losses.
m.fit(lr, n_cycle=1, cycle_len=3, metrics=[rmse])

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss   rmse                                
    0      12.488438  11.307275  3.31947   
    1      11.838237  10.610112  3.214867                         
    2      11.54535   10.392708  3.181058                         



[array([10.39271]), 3.1810581963398756]

In [29]:
# Checkpoint the model after the first three epochs.
m.save('three_epochs')

In [None]:
# Restore the 'three_epochs' checkpoint (e.g. to restart from it after an
# interrupted or unsatisfactory later training run).
m.load('three_epochs')

In [30]:
# Three further cycles with cycle_len=1 and cycle_mult=2; the progress bar shows
# 7 epochs in total. The recorded run was interrupted by hand during the third epoch.
m.fit(lr, n_cycle=3, cycle_len=1, cycle_mult=2, metrics=[rmse])

HBox(children=(IntProgress(value=0, description='Epoch', max=7), HTML(value='')))

epoch      trn_loss   val_loss   rmse                             
    0      10.873532  10.278605  3.163057  
    1      11.56465   10.309098  3.169354                         
  0%|          | 121/105129 [00:05<1:19:24, 22.04it/s, loss=10.8]

KeyboardInterrupt: 

In [None]:
# Same call as the previous fit cell; not executed in the saved notebook.
# NOTE(review): running both cells trains for the 3-cycle schedule twice — keep
# only one if that is not intended.
m.fit(lr, n_cycle=3, cycle_len=1, cycle_mult=2, metrics=[rmse])

In [None]:
# m.fit(lr, 1, cycle_len=3, best_save_name="my_best_model", metrics=[rmse])

In [None]:
# m.fit(lr, 1, cycle_len=3, best_save_name="my_best_model", metrics=[rmse])

In [None]:
# NOTE(review): the positional True presumably selects the test set (is_test)
# rather than the validation set — confirm against fastai's Learner.predict.
predictions = m.predict(True)

In [None]:
# Flatten to 1-D so the predictions can be used as a single DataFrame column.
predictions = predictions.reshape(-1)

In [None]:
# Build the submission frame: each test key paired with its predicted fare,
# with the columns in the order key, fare_amount.
submission = pd.DataFrame(
    {'key': test_df_raw['key'], 'fare_amount': predictions},
    columns=['key', 'fare_amount'])

# to_csv does not create missing directories, so make sure the output folder
# exists before writing; otherwise a fresh checkout fails at the very last step.
os.makedirs('submissions', exist_ok=True)
submission.to_csv('submissions/fastai_v02.csv', index=False)

In [None]:
# Eyeball the first rows of the submission before uploading it.
submission.head()