In [5]:
import pandas as pd
from utils import Timer

def cutomizedCoordinationFix(df):
    df = df.assign(rev=df.dropoff_latitude<df.dropoff_longitude)
    idx = (df['rev'] == 1)
    df.loc[idx,['dropoff_longitude','dropoff_latitude']] = df.loc[idx,['dropoff_latitude','dropoff_longitude']].values
    df.loc[idx,['pickup_longitude','pickup_latitude']] = df.loc[idx,['pickup_latitude','pickup_longitude']].values
    df = df.drop(columns=['rev'])
    return df

def clean_df(df):    
    #reverse incorrectly assigned longitude/latitude values
    df = cutomizedCoordinationFix(df)
    df = df[(df.fare_amount > 0)  & (df.fare_amount <= 500) &
          (df.passenger_count >= 0) & (df.passenger_count <= 8)  &
           ((df.pickup_longitude != 0) & (df.pickup_latitude != 0) & (df.dropoff_longitude != 0) & (df.dropoff_latitude != 0) )]
    
    return df

cols = [
    'fare_amount', 'pickup_datetime','pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude', 'passenger_count'
]

#sampled_line = 100000
with Timer(f"Load train full"):
    train_data = pd.read_csv("nyc_taxi_fare_train.csv", usecols=cols, nrows=100000)

with Timer("Data Wrangling for train"):
    train_data = clean_df(train_data)


Load train full took 0.11234246799722314 sec
Data Wrangling for train took 0.013182478956878185 sec


In [6]:
train_data

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.841610,40.712278,1
1,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.761270,-73.991242,40.750562,2
3,7.7,2012-04-21 04:30:42 UTC,-73.987130,40.733143,-73.991567,40.758092,1
4,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1
...,...,...,...,...,...,...,...
99995,9.0,2013-09-24 07:39:00 UTC,-73.947977,40.784792,-73.964262,40.792347,5
99996,6.0,2014-05-15 12:15:45 UTC,-73.962918,40.799107,-73.974178,40.786487,1
99997,5.0,2015-02-19 17:40:43 UTC,-73.996773,40.723549,-73.991974,40.724724,1
99998,6.9,2009-10-10 23:35:00 UTC,-73.983652,40.756667,-73.982715,40.767067,4


In [7]:
from autogluon.tabular import TabularDataset, TabularPredictor

model = TabularPredictor(label="fare_amount")
predictor = model.fit(train_data)

No path specified. Models will be saved in: "AutogluonModels/ag-20221201_194723/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20221201_194723/"
AutoGluon Version:  0.6.0
Python Version:     3.8.10
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Nov 8 23:39:32 UTC 2018
Train Data Rows:    97983
Train Data Columns: 6
Label Column: fare_amount
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and label-values can't be converted to int).
	Label info (max, min, mean, stddev): (200.0, 0.01, 11.3466, 9.68786)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    357330.93 

[1000]	valid_set's rmse: 4.18268
[2000]	valid_set's rmse: 4.14276
[3000]	valid_set's rmse: 4.13572
[4000]	valid_set's rmse: 4.13964


	-4.1305	 = Validation score   (-root_mean_squared_error)
	17.56s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's rmse: 3.89882


	-3.8981	 = Validation score   (-root_mean_squared_error)
	4.98s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-4.367	 = Validation score   (-root_mean_squared_error)
	6.62s	 = Training   runtime
	0.18s	 = Validation runtime
Fitting model: CatBoost ...
	-3.9498	 = Validation score   (-root_mean_squared_error)
	36.74s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-4.382	 = Validation score   (-root_mean_squared_error)
	1.42s	 = Training   runtime
	0.16s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-7.6499	 = Validation score   (-root_mean_squared_error)
	144.04s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: XGBoost ...
	-3.9719	 = Validation score   (-root_mean_squared_error)
	1.76s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	-4.119	 = Validation score   (-root_mean_squared_error)
	581.09s	 = Training   runtime
	0.03s	 = Validation ru

[1000]	valid_set's rmse: 3.89101
[2000]	valid_set's rmse: 3.85401


	-3.8453	 = Validation score   (-root_mean_squared_error)
	33.38s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	-3.8103	 = Validation score   (-root_mean_squared_error)
	0.26s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 847.13s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20221201_194723/")


In [27]:
info = predictor.info()
info['model_info'][info['best_model']]

{'name': 'WeightedEnsemble_L2',
 'model_type': 'WeightedEnsembleModel',
 'problem_type': 'regression',
 'eval_metric': 'root_mean_squared_error',
 'stopping_metric': 'root_mean_squared_error',
 'fit_time': 0.2618563175201416,
 'num_classes': None,
 'quantile_levels': None,
 'predict_time': 0.00039458274841308594,
 'val_score': -3.810278397873824,
 'hyperparameters': {'use_orig_features': False,
  'max_base_models': 25,
  'max_base_models_per_type': 5,
  'save_bag_folds': True},
 'hyperparameters_fit': {},
 'hyperparameters_nondefault': ['save_bag_folds'],
 'ag_args_fit': {'max_memory_usage_ratio': 1.0,
  'max_time_limit_ratio': 1.0,
  'max_time_limit': None,
  'min_time_limit': 0,
  'valid_raw_types': None,
  'valid_special_types': None,
  'ignored_type_group_special': None,
  'ignored_type_group_raw': None,
  'get_features_kwargs': None,
  'get_features_kwargs_extra': None,
  'predict_1_batch_size': None,
  'temperature_scalar': None,
  'drop_unique': False},
 'num_features': 5,
 'fea

In [28]:
#sampled_line = 100000
with Timer(f"Load train full"):
    train_data = pd.read_csv("nyc_taxi_fare_train.csv", usecols=cols)

with Timer("Data Wrangling for train"):
    train_data = clean_df(train_data)

model_for_full_data = TabularPredictor(label="fare_amount")
predictor_from_full = model_for_full_data.fit(train_data)

Load train full took 50.208256038837135 sec


No path specified. Models will be saved in: "AutogluonModels/ag-20221201_210756/"


Data Wrangling for train took 7.748062551021576 sec


	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20221201_210756/"
AutoGluon Version:  0.6.0
Python Version:     3.8.10
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Nov 8 23:39:32 UTC 2018
Train Data Rows:    54315955
Train Data Columns: 6
Label Column: fare_amount
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (500.0, 0.01, 11.32425, 9.68662)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Gener

[1000]	valid_set's rmse: 4.25444
[2000]	valid_set's rmse: 4.14332
[3000]	valid_set's rmse: 4.09463
[4000]	valid_set's rmse: 4.06868
[5000]	valid_set's rmse: 4.04759
[6000]	valid_set's rmse: 4.03167
[7000]	valid_set's rmse: 4.01966
[8000]	valid_set's rmse: 4.00981
[9000]	valid_set's rmse: 4.00126
[10000]	valid_set's rmse: 3.99384


	-3.9938	 = Validation score   (-root_mean_squared_error)
	3826.13s	 = Training   runtime
	13.57s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's rmse: 3.98709
[2000]	valid_set's rmse: 3.92522
[3000]	valid_set's rmse: 3.9037
[4000]	valid_set's rmse: 3.88413
[5000]	valid_set's rmse: 3.87263
[6000]	valid_set's rmse: 3.86414
[7000]	valid_set's rmse: 3.85463
[8000]	valid_set's rmse: 3.85004
[9000]	valid_set's rmse: 3.84715
[10000]	valid_set's rmse: 3.84339


	-3.8434	 = Validation score   (-root_mean_squared_error)
	2751.42s	 = Training   runtime
	8.03s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-3.7168	 = Validation score   (-root_mean_squared_error)
	9028.68s	 = Training   runtime
	0.96s	 = Validation runtime
Fitting model: CatBoost ...
	-3.8859	 = Validation score   (-root_mean_squared_error)
	7173.86s	 = Training   runtime
	0.22s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-8.3763	 = Validation score   (-root_mean_squared_error)
	1727.4s	 = Training   runtime
	0.73s	 = Validation runtime
Fitting model: NeuralNetFastAI ...


KeyboardInterrupt: 