In [3]:
import pandas as pd
from datetime import datetime
from dateutil.relativedelta import relativedelta
from utils import Timer
import glob

data_path = "../green_tripdata/"
files = glob.glob(f"{data_path}/*.parquet")
with Timer("Load data"):    
    data = [pd.read_parquet(f) for f in files]
    green_taxi_df = pd.concat(data ,ignore_index=True)
    

Load data took 2.8350759772583842 sec


In [4]:
green_taxi_df

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2018-11-01 00:25:03,2018-11-01 00:25:03,N,1.0,193,193,1.0,0.00,2.5,0.5,0.5,0.00,0.00,,0.3,3.80,1.0,1.0,
1,1,2018-11-01 00:06:02,2018-11-01 00:19:28,N,1.0,18,167,1.0,3.30,13.0,0.5,0.5,0.00,0.00,,0.3,14.30,2.0,1.0,
2,2,2018-11-01 00:09:02,2018-11-01 00:15:21,N,1.0,256,80,1.0,1.16,6.5,0.5,0.5,0.00,0.00,,0.3,7.80,2.0,1.0,
3,2,2018-11-01 00:49:56,2018-11-01 01:04:55,N,1.0,112,164,1.0,3.69,14.5,0.5,0.5,0.00,5.76,,0.3,21.56,1.0,1.0,
4,2,2018-11-01 00:32:47,2018-11-01 00:37:17,N,1.0,255,256,2.0,0.75,5.0,0.5,0.5,1.26,0.00,,0.3,7.56,1.0,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8899713,2,2018-02-28 23:39:30,2018-02-28 23:43:54,N,1.0,41,42,1.0,0.86,5.0,0.5,0.5,0.00,0.00,,0.3,6.30,2.0,1.0,
8899714,2,2018-02-28 23:25:40,2018-02-28 23:45:02,N,1.0,256,17,1.0,2.22,14.0,0.5,0.5,2.00,0.00,,0.3,17.30,1.0,1.0,
8899715,2,2018-02-28 23:39:19,2018-02-28 23:53:08,N,1.0,42,229,1.0,5.67,18.0,0.5,0.5,1.00,0.00,,0.3,20.30,1.0,1.0,
8899716,2,2018-02-28 23:40:13,2018-02-28 23:43:34,N,1.0,42,74,1.0,0.59,4.5,0.5,0.5,2.00,0.00,,0.3,7.80,1.0,1.0,


In [5]:
with Timer("train data wrangling"):
    # data wrangler
    columns_to_remove = ["lpep_dropoff_datetime", "PULocationID", "DOLocationID", "extra", "mta_tax",
                         "improvement_surcharge", "tolls_amount", "ehail_fee", "trip_type", "RatecodeID", 
                         "store_and_fwd_flag", "payment_type", "fare_amount", "tip_amount", "congestion_surcharge"
                        ]
    for col in columns_to_remove:
        green_taxi_df.pop(col)

    final_df = green_taxi_df.query("trip_distance>=0.25 and trip_distance<31")
    final_df = final_df.query("passenger_count>0 and total_amount>0")

train data wrangling took 1.4777259239926934 sec


In [6]:
green_taxi_df.describe()

Unnamed: 0,VendorID,passenger_count,trip_distance,total_amount
count,8899718.0,8798641.0,8899718.0,8899718.0
mean,1.840284,1.350911,3.252301,16.09376
std,0.3663905,1.03082,4.582039,14.07958
min,1.0,0.0,0.0,-500.0
25%,2.0,1.0,1.07,8.3
50%,2.0,1.0,1.93,11.8
75%,2.0,1.0,3.9,19.1
max,5.0,9.0,8005.68,10528.75


In [8]:
from sklearn.model_selection import train_test_split

with Timer("Split data"):
    x_train, x_test = train_test_split(final_df, test_size=0.2, random_state=223)

Split data took 1.505374614149332 sec


In [10]:
import logging
from azureml.train.automl import AutoMLConfig

automl_settings = {
    "iteration_timeout_minutes": 10,
    "experiment_timeout_hours": 0.3,
    "enable_early_stopping": True,
    "primary_metric": 'spearman_correlation',
    "featurization": 'auto',
    "verbosity": logging.INFO,
    "n_cross_validations": 5
}

automl_config = AutoMLConfig(task='regression',
                             debug_log='automated_ml_errors.log',
                             training_data=x_train,
                             label_column_name="totalAmount",
                             **automl_settings)

In [None]:
from azureml.core.workspace import Workspace
ws = Workspace.from_config()

In [None]:
from azureml.core.experiment import Experiment

with Timer("train data with azureML"):
    experiment = Experiment(ws, "Tutorial-NYCTaxi")
    local_run = experiment.submit(automl_config, show_output=True)