In [32]:
import jarvis

# Experiment handle; every artifact and action below is created through it.
ex = jarvis.Experiment('taxi')
# NOTE(review): presumably attaches the experiment to a Ground metadata
# service identified by 'ground' -- confirm against the jarvis documentation.
ex.groundClient('ground')

# Source artifact for 'train_df.csv'. No later visible cell references tr_data.
tr_data = ex.artifact('train_df.csv')

# Source artifact for 'train.csv'; this is the input handed to dataframize below.
tr_data2 = ex.artifact('train.csv')

# Source artifact for 'test_df.csv'; this feeds the test-side pipeline further down.
te_data = ex.artifact('test_df.csv')

# Open question: which model should predict the duration of a taxi ride?
# Not decided yet -- the plan is to try several.

# First, look at the data that is available.

# Hypothesis: the duration of a cab ride depends on the distance and the time of day.

# Feature engineering step: compute the Manhattan distance between the
# pickup coordinates and the dropoff coordinates.

# But first, load the data into a format that is easy to work with.

# The notebook name was not set at the top, so it is set here, after the
# experiment and its source artifacts have already been created.
jarvis.setNotebookName('Taxi.ipynb')

@jarvis.func
def dataframize(csvpath):
    """Read a CSV into a pandas DataFrame.

    Args:
        csvpath: Location of the CSV; forwarded unchanged to
            ``pandas.read_csv``.

    Returns:
        The DataFrame that ``pandas.read_csv`` builds with its default options.
    """
    from pandas import read_csv

    frame = read_csv(csvpath)
    return frame


# Register dataframize as a pipeline action that takes the 'train.csv' artifact.
do_dfize = ex.action(dataframize, [tr_data2])

# Artifact produced by that action.
# NOTE(review): presumably jarvis serialises dataframize's return value to
# 'train_df.pkl' -- confirm against the jarvis documentation.
tr_data_df = ex.artifact('train_df.pkl', do_dfize)

@jarvis.func
def calculate_distance(data_df):
    """Add a Manhattan-distance column between pickup and dropoff points.

    The distance is ``|pickup_longitude - dropoff_longitude| +
    |pickup_latitude - dropoff_latitude|``, expressed in the same units as
    the coordinate columns (no projection is applied).

    Args:
        data_df: DataFrame with ``pickup_longitude``, ``pickup_latitude``,
            ``dropoff_longitude`` and ``dropoff_latitude`` columns.

    Returns:
        A new DataFrame with every column of ``data_df`` plus ``distance``.
        The frame passed in is not modified.

    Raises:
        KeyError: If one of the four coordinate columns is missing.
    """
    # Column-wise arithmetic replaces the per-row Python loop and also yields
    # a numeric column for an empty frame.
    lon_gap = (data_df['pickup_longitude'] - data_df['dropoff_longitude']).abs()
    lat_gap = (data_df['pickup_latitude'] - data_df['dropoff_latitude']).abs()
    # assign() works on a copy, so the caller's frame is left untouched.
    return data_df.assign(distance=lon_gap + lat_gap)

# Note: some notebook cells were trimmed here without losing the log.

# Another notebook caveat: variables continue to exist in the kernel even
# after the cell that defined them has been removed.

# Register calculate_distance as an action on the training DataFrame artifact
# and declare the artifact that holds its output.
do_calc_dist = ex.action(calculate_distance, [tr_data_df])
tr_data_dist_df = ex.artifact('train_dist_df.pkl', do_calc_dist)

@jarvis.func
def preproc(train_data):
    """Discard rows whose passenger count, coordinates or duration look invalid.

    Three filtering stages run in order:

    1. ``passenger_count`` strictly between 0 and 9.
    2. Pickup and dropoff coordinates inside the box
       longitude [-74.03, -73.75], latitude [40.63, 40.85] (bounds inclusive).
    3. ``trip_duration`` within two standard deviations of the mean, both
       computed on the rows that survived stages 1-2, and also within
       [30, 60*240].

    Args:
        train_data: DataFrame with ``passenger_count``, the four
            pickup/dropoff coordinate columns and ``trip_duration``.

    Returns:
        The filtered DataFrame; surviving rows keep their original index labels.
    """
    import numpy as np
    # Thresholds follow
    # https://www.kaggle.com/stephaniestallworth/nyc-taxi-eda-regression-fivethirtyeight-viz/notebook

    # Stage 1: plausible passenger counts.
    riders = train_data['passenger_count']
    train_data = train_data[(riders > 0) & (riders < 9)]

    # Stage 2: coordinate outliers, one column at a time.
    lon_range = (-74.03, -73.75)
    lat_range = (40.63, 40.85)
    box = (
        ('pickup_longitude', lon_range),
        ('pickup_latitude', lat_range),
        ('dropoff_longitude', lon_range),
        ('dropoff_latitude', lat_range),
    )
    for column, (low, high) in box:
        values = train_data[column]
        train_data = train_data[(values <= high) & (values >= low)]

    # Stage 3: trip_duration outliers. The statistics are taken once, before
    # any duration-based row is dropped.
    durations = train_data['trip_duration']
    centre = np.mean(durations)
    spread = np.std(durations)
    within_two_sigma = (durations <= centre + 2*spread) & (durations >= centre - 2*spread)
    within_hard_limits = (durations >= 30) & (durations <= 60*240)

    return train_data[within_two_sigma & within_hard_limits]

# Register preproc as an action on the distance-augmented training frame and
# declare the artifact that holds the filtered result.
do_preproc = ex.action(preproc, [tr_data_dist_df])
tr_ready = ex.artifact('train_ready.pkl', do_preproc)

@jarvis.func
def split(data_df):
    """Split the prepared frame into train/test features and targets.

    The target is ``trip_duration``. The features are the vendor, passenger,
    coordinate, store-and-forward flag, pickup time and distance columns.
    20% of the rows go to the test partition, using a fixed
    ``random_state`` of 0.

    Args:
        data_df: DataFrame holding the feature columns and ``trip_duration``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    from sklearn.model_selection import train_test_split

    feature_columns = [
        'vendor_id', 'passenger_count', 'pickup_longitude',
        'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
        'store_and_fwd_flag', 'pickup_datetime', 'distance',
    ]
    features = data_df[feature_columns]
    target = data_df['trip_duration']

    train_features, test_features, train_target, test_target = train_test_split(
        features, target, test_size = 0.2, random_state = 0)
    return train_features, test_features, train_target, test_target

# Register split as an action on the preprocessed training artifact.
do_split = ex.action(split, [tr_ready])
# Four artifacts are declared against the same action, in the same order as
# split's return tuple (X_train, X_test, y_train, y_test).
# NOTE(review): presumably jarvis binds them to the returned values
# positionally -- confirm against the jarvis documentation.
xTrain = ex.artifact('xTrain.pkl', do_split)
xTest = ex.artifact('xTest.pkl', do_split)
yTrain = ex.artifact('yTrain.pkl', do_split)
yTest = ex.artifact('yTest.pkl', do_split)

@jarvis.func
def train(data_df, trainingy):
    """Fit a random-forest regressor that predicts trip duration.

    Three time features are derived from ``pickup_datetime``
    (``'%Y-%m-%d %H:%M:%S'`` strings):

    * ``start_hr``: hour rounded to the nearest hour (minutes >= 30 round up,
      and 23:30 or later wraps to 0),
    * ``start_month``: calendar month,
    * ``start_weekday``: day of week, Monday = 0.

    The model is fitted on ``vendor_id``, those three features, ``distance``
    and the four pickup/dropoff coordinate columns.

    Args:
        data_df: Training features; must contain ``vendor_id``,
            ``pickup_datetime``, ``distance`` and the four coordinate
            columns. It is not modified.
        trainingy: Target durations, assigned to a ``duration`` column (a
            Series is aligned on ``data_df``'s index).

    Returns:
        Tuple ``(clf, scaler)``: the fitted ``RandomForestRegressor`` and a
        ``StandardScaler`` that is deliberately left unfitted (scaling did not
        help); it is returned so the function still yields two values.
    """
    import pandas as pd
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.preprocessing import StandardScaler

    # Parse the timestamps once, column-wise, instead of once per row per feature.
    pickup = pd.to_datetime(data_df['pickup_datetime'], format='%Y-%m-%d %H:%M:%S')

    # assign() builds a new frame, so the caller's DataFrame is not mutated.
    features = data_df.assign(
        duration=trainingy,
        # Round to the nearest hour; the modulo maps 23:30+ to hour 0.
        start_hr=(pickup.dt.hour + (pickup.dt.minute >= 30).astype(int)) % 24,
        start_month=pickup.dt.month,
        start_weekday=pickup.dt.weekday,
    )

    # random_state makes the fitted forest reproducible between runs.
    clf = RandomForestRegressor(n_estimators=20, n_jobs=3, random_state=0)
    # Scaler does not help, so it is created but never fitted or applied.
    scaler = StandardScaler()
    clf.fit(features[['vendor_id', 'start_hr', 'start_month', 'start_weekday', 'distance',
                      'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
                      'dropoff_latitude']].values, features['duration'].values)
    return clf, scaler

# Register train as an action on the training features and targets.
do_train = ex.action(train, [xTrain, yTrain])
# Two artifacts are declared against the action, in the same order as
# train's return tuple (clf, scaler).
# NOTE(review): presumably bound positionally by jarvis -- confirm.
model = ex.artifact('model.pkl', do_train)
scaler = ex.artifact('scaler.pkl', do_train)

# Test-side pipeline: reuse dataframize to load the 'test_df.csv' artifact.
do_te_dfize = ex.action(dataframize, [te_data])
te_data_df = ex.artifact('test_df.pkl', do_te_dfize)

# Reuse calculate_distance to add the distance column to the test frame.
do_te_calcdist = ex.action(calculate_distance, [te_data_df])
te_data_dist_df = ex.artifact('test_dist_df.pkl', do_te_calcdist)

@jarvis.func
def calculate_duration(data_df):
    """Add a ``duration`` column: seconds between pickup and dropoff.

    Both timestamp columns are parsed with the ``'%Y-%m-%d %H:%M:%S'``
    format. The difference is taken as an absolute value, so a row whose
    dropoff precedes its pickup still yields a non-negative duration.

    Args:
        data_df: DataFrame with ``pickup_datetime`` and ``dropoff_datetime``
            columns.

    Returns:
        A new DataFrame with every column of ``data_df`` plus an integer
        ``duration`` column. The frame passed in is not modified.

    Raises:
        KeyError: If either timestamp column is missing.
        ValueError: If a timestamp does not match the expected format.
    """
    import pandas as pd

    fmt = '%Y-%m-%d %H:%M:%S'
    # Parse each column once rather than calling strptime twice per row.
    pickup = pd.to_datetime(data_df['pickup_datetime'], format=fmt)
    dropoff = pd.to_datetime(data_df['dropoff_datetime'], format=fmt)

    # abs() keeps the order-independent behaviour; the format has whole-second
    # resolution, so the integer cast does not lose information.
    elapsed = (dropoff - pickup).abs().dt.total_seconds().astype('int64')
    # assign() works on a copy, so the caller's frame is left untouched.
    return data_df.assign(duration=elapsed)



# Register calculate_duration as an action on the distance-augmented test
# frame and declare the artifact that holds its output.
do_te_calcdur = ex.action(calculate_duration, [te_data_dist_df])
te_data_full_df = ex.artifact('test_dist_dur_df.pkl', do_te_calcdur)

@jarvis.func
def test(model, data_df, testingy, scaler):
    import numpy as np
    data_df['duration'] = testingy
    def roundtime(tstring):
        hours, mins, secs = tstring.split(':')
        if int(mins) >= 30:
            if hours == '23':
                return '00'
            else:
                return str(int(hours) + 1)
        else:
            return hours
    def weekday(start):
        from datetime import datetime
        fmt = '%Y-%m-%d %H:%M:%S'
        tstamp = datetime.strptime(start, fmt)
        return int(tstamp.weekday())
    
    data_df['start_hr'] = data_df['pickup_datetime'].apply(lambda x: int(roundtime(x.split(' ')[1])))
    data_df['start_month'] = data_df['pickup_datetime'].apply(lambda x: int(x.split(' ')[0].split('-')[1]))
    data_df['start_weekday']= data_df['pickup_datetime'].apply(lambda x: weekday(x))
    
    
    preds = model.predict(data_df[['vendor_id', 'start_hr', 'start_month', 'start_weekday', 'distance', 
                     'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
                    'dropoff_latitude']].values)
    from sklearn import metrics
    import math
    score = metrics.explained_variance_score(data_df['duration'].values, preds)
    rmse = np.sqrt(metrics.mean_squared_error(data_df['duration'].values, preds))
    return str(score) + '\n' + str(rmse)


# Register test as an action on the fitted model, the held-out split and the
# scaler artifact; its string result (score and RMSE) backs 'score.txt'.
do_test = ex.action(test, [model, xTest, yTest, scaler])
score = ex.artifact('score.txt', do_test)

@jarvis.func
def comp_predictions(model, data_df, scaler):
    """Predict durations for a frame that already carries true durations.

    Time features are derived from the pickup timestamp
    (``'%Y-%m-%d %H:%M:%S'`` strings): ``start_hr`` (hour rounded to nearest,
    23:30 or later wraps to 0), ``start_month`` and ``start_weekday``
    (Monday = 0). The model is then fed the same nine feature columns that
    ``train`` fits on, in the same order.

    Args:
        model: Object with a ``predict(X)`` method accepting a 2-D array with
            nine feature columns.
        data_df: DataFrame with ``vendor_id``, ``distance``, the four
            pickup/dropoff coordinate columns, ``duration`` and a pickup
            timestamp column. ``pickup_datetime`` is preferred, with
            ``start_timestamp`` used as a fallback. It is not modified.
        scaler: Accepted for interface compatibility; not used.

    Returns:
        Tuple ``(predicted, actual)``: the array returned by
        ``model.predict`` and the ``duration`` column of ``data_df``.

    Raises:
        KeyError: If a required column is missing.
    """
    import pandas as pd

    # The upstream steps in this notebook use 'pickup_datetime'; the older
    # 'start_timestamp' name is kept as a fallback for frames that only have it.
    time_column = 'pickup_datetime' if 'pickup_datetime' in data_df.columns else 'start_timestamp'
    pickup = pd.to_datetime(data_df[time_column], format='%Y-%m-%d %H:%M:%S')

    # assign() builds a new frame, so the caller's DataFrame is not mutated.
    features = data_df.assign(
        # Round to the nearest hour; the modulo maps 23:30+ to hour 0.
        start_hr=(pickup.dt.hour + (pickup.dt.minute >= 30).astype(int)) % 24,
        start_month=pickup.dt.month,
        start_weekday=pickup.dt.weekday,
    )

    # Column list and order must match what train() passes to fit(); a model
    # fitted on nine features rejects the four-column input used previously.
    predictions = (model.predict(features[['vendor_id', 'start_hr', 'start_month', 'start_weekday', 'distance',
                                           'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
                                           'dropoff_latitude']].values), data_df['duration'])
    return predictions

# Register comp_predictions as an action on the fitted model, the fully
# augmented test frame (distance + duration) and the scaler artifact.
do_comp = ex.action(comp_predictions, [model, te_data_full_df, scaler])
preds = ex.artifact('predictions.pkl', do_comp)
    

In [33]:
# NOTE(review): presumably executes the pipeline steps needed to materialise
# the 'score.txt' artifact -- confirm against the jarvis documentation.
score.pull()

In [8]:
# Inspect the training-features artifact; the bare name on the last line
# makes the notebook display the value returned by peek().
cache = xTrain.peek()
cache

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,pickup_datetime,distance


In [18]:
# Fetch the distance-augmented training frame for ad-hoc inspection in the
# cells that follow.
data_df = tr_data_dist_df.peek()

In [19]:
# Show trips whose duration is at least 60*240 = 14400 seconds (four hours),
# i.e. at or above the upper hard limit that preproc applies.
data_df[data_df['trip_duration'] >= 60*240]

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,distance
531,id3307903,2,2016-02-20 04:03:06,2016-02-21 03:33:00,3,-74.008102,40.741489,-74.009956,40.714611,N,84594,0.028732
1134,id1091477,2,2016-05-07 18:36:22,2016-05-08 18:32:11,1,-73.990242,40.750919,-73.976280,40.750889,N,86149,0.013992
1865,id3431345,2,2016-06-07 12:58:48,2016-06-08 12:58:00,6,-73.954956,40.777649,-73.981033,40.743713,N,86352,0.060013
3442,id1487069,2,2016-02-13 00:21:49,2016-02-14 00:19:05,1,-73.968590,40.799217,-73.979584,40.784714,N,86236,0.025497
4172,id3674870,2,2016-03-18 11:54:20,2016-03-19 11:34:17,1,-73.989090,40.736992,-73.972336,40.751511,N,85197,0.031273
4336,id3632390,2,2016-06-08 08:54:33,2016-06-09 07:58:09,5,-73.980560,40.742466,-73.984718,40.748611,N,83016,0.010303
5104,id3354426,2,2016-05-05 15:18:41,2016-05-06 15:11:07,1,-73.989037,40.773514,-73.980682,40.781521,N,85946,0.016361
5778,id0773526,2,2016-04-02 14:58:45,2016-04-03 14:19:55,6,-73.987991,40.761341,-74.002922,40.756279,N,84070,0.019993
6132,id3617210,2,2016-03-15 17:51:32,2016-03-16 17:18:04,1,-73.965607,40.765781,-73.972649,40.753418,N,84392,0.019405
6513,id0067152,2,2016-02-27 21:04:05,2016-02-28 21:03:22,5,-73.993744,40.727444,-74.001335,40.729244,N,86357,0.009392


In [36]:
# Imported in this cell so it runs on a fresh kernel executed top to bottom.
import math

# Natural log of every trip duration, returned as a NumPy array.
data_df['trip_duration'].apply(math.log).values

array([ 6.12029742,  6.49677499,  7.66105638, ...,  6.63856779,
        5.92157842,  5.28826703])

In [34]:
import math