In [3]:
import jarvis

# Create the 'taxi' experiment; every artifact and action below is declared on it.
ex = jarvis.Experiment('taxi')
# NOTE(review): presumably selects the Ground metadata client -- confirm against the jarvis docs.
ex.groundClient('ground')

# Raw training data, declared as an artifact with no producing action.
tr_data = ex.artifact('train_df.csv')

# Raw test data, declared the same way.
te_data = ex.artifact('test_df.csv')

# Open question: which model should predict the duration of a car ride? Plan: try several.

# First, look at the data that is available.

# Working hypothesis: the duration of a cab ride depends on the distance and the time of day.

# Feature engineering step: compute the Manhattan distance from the start coordinates to the end coordinates.

# But first, get the data into a format that is easy to work with.

# The notebook name was not set earlier, so set it here before any function is declared.
jarvis.setNotebookName('Taxi.ipynb')

@jarvis.func
def dataframize(csvpath):
    """Load CSV data into a pandas DataFrame.

    Args:
        csvpath: Location of the CSV data; handed unchanged to
            ``pandas.read_csv``.

    Returns:
        The DataFrame produced by ``pandas.read_csv`` with default options.
    """
    # Function-scope import, as in the other pipeline functions of this notebook.
    from pandas import read_csv

    frame = read_csv(csvpath)
    return frame


# Declare an action for dataframize with the raw training CSV artifact as its only input.
do_dfize = ex.action(dataframize, [tr_data])

# Artifact declared as the output of that action.
# NOTE(review): presumably jarvis pickles the returned DataFrame to 'train_df.pkl' -- confirm.
tr_data_df = ex.artifact('train_df.pkl', do_dfize)

@jarvis.func
def calculate_distance(data_df):
    """Add a Manhattan-distance column between trip start and end points.

    The distance is ``|start_lng - end_lng| + |start_lat - end_lat|``,
    computed on the raw coordinate values (no unit conversion is applied).

    Args:
        data_df: DataFrame with numeric ``start_lng``, ``start_lat``,
            ``end_lng`` and ``end_lat`` columns.

    Returns:
        The same DataFrame object, modified in place, with a new
        ``distance`` column.
    """
    # Column-wise arithmetic replaces the per-row Python-level map(); the
    # operands share data_df's index, so the result lines up row for row.
    lng_gap = (data_df['start_lng'] - data_df['end_lng']).abs()
    lat_gap = (data_df['start_lat'] - data_df['end_lat']).abs()
    data_df['distance'] = lng_gap + lat_gap
    return data_df

# Notebook note: some notebook cells were trimmed without losing the log.

# Notebook note: variables continue to exist in the kernel even after the cell that defined them has been removed.

# Declare the distance step on the training DataFrame artifact and name its output artifact.
do_calc_dist = ex.action(calculate_distance, [tr_data_df])
tr_data_dist_df = ex.artifact('train_dist_df.pkl', do_calc_dist)

@jarvis.func
def preproc(train_data):
    """Return the rows of ``train_data`` that pass all outlier filters.

    The filters follow
    https://www.kaggle.com/stephaniestallworth/nyc-taxi-eda-regression-fivethirtyeight-viz/notebook
    and are applied in two stages:

    1. Keep rows with ``0 < passenger_count < 9`` whose start and end
       coordinates lie inside the box longitude ``[-74.03, -73.75]``,
       latitude ``[40.63, 40.85]`` (bounds inclusive).
    2. Of those rows, keep the ones whose ``duration`` is within two
       standard deviations of the mean, both computed on the stage-1 rows.

    Args:
        train_data: DataFrame with ``passenger_count``, ``start_lng``,
            ``start_lat``, ``end_lng``, ``end_lat`` and ``duration`` columns.

    Returns:
        A filtered DataFrame; ``train_data`` itself is not modified.
    """
    import numpy as np

    lng_low, lng_high = -74.03, -73.75
    lat_low, lat_high = 40.63, 40.85
    coordinate_bounds = (
        ('start_lng', lng_low, lng_high),
        ('start_lat', lat_low, lat_high),
        ('end_lng', lng_low, lng_high),
        ('end_lat', lat_low, lat_high),
    )

    # Stage 1: passenger-count and coordinate outliers, as one row mask.
    riders = train_data['passenger_count']
    keep = (riders > 0) & (riders < 9)
    for column, low, high in coordinate_bounds:
        values = train_data[column]
        keep = keep & (values <= high) & (values >= low)
    in_bounds = train_data[keep]

    # Stage 2: trip-duration outliers, measured on the surviving rows only.
    durations = in_bounds['duration']
    center = np.mean(durations)
    spread = np.std(durations)
    typical = (durations <= center + 2*spread) & (durations >= center - 2*spread)
    return in_bounds[typical]

# Declare the outlier-filtering step on the training data that carries the distance column,
# and name the artifact it produces.
do_preproc = ex.action(preproc, [tr_data_dist_df])
tr_ready = ex.artifact('train_ready.pkl', do_preproc)

@jarvis.func
def train(data_df):
    """Fit a linear-regression duration model on time and distance features.

    Derives ``start_hr``, ``start_month`` and ``start_weekday`` from the
    ``start_timestamp`` strings (parsed as ``YYYY-MM-DD HH:MM:SS``), stores
    them on ``data_df`` in place, and regresses ``duration`` on those three
    columns plus ``distance``.

    Args:
        data_df: DataFrame with ``start_timestamp``, ``distance`` and
            ``duration`` columns.

    Returns:
        A ``(model, scaler)`` tuple: the fitted ``LinearRegression`` and a
        ``StandardScaler`` fitted on the same feature matrix.
    """
    from datetime import datetime

    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import StandardScaler

    feature_cols = ['start_hr', 'start_month', 'start_weekday', 'distance']

    def roundtime(tstring):
        """Round an ``HH:MM:SS`` string to the nearest hour (as a string)."""
        hours, mins, _secs = tstring.split(':')
        if int(mins) < 30:
            return hours
        # Rounding up from hour 23 wraps around to midnight.
        return '00' if hours == '23' else str(int(hours) + 1)

    def weekday(start):
        """Return the day of the week (Monday == 0) of a timestamp string."""
        return int(datetime.strptime(start, '%Y-%m-%d %H:%M:%S').weekday())

    stamps = data_df['start_timestamp']
    data_df['start_hr'] = stamps.apply(lambda ts: int(roundtime(ts.split(' ')[1])))
    data_df['start_month'] = stamps.apply(lambda ts: int(ts.split(' ')[0].split('-')[1]))
    data_df['start_weekday'] = stamps.apply(weekday)

    # Build the feature matrix once and share it between scaler and model.
    features = data_df[feature_cols].values

    # To try a random forest instead, swap in
    # sklearn.ensemble.RandomForestRegressor(n_estimators=20) here.
    clf = LinearRegression()
    scaler = StandardScaler()
    # NOTE: the scaler is only fitted, never applied -- the model is trained
    # on the raw features, which is also what the prediction code in this
    # notebook feeds it. Do not start scaling here without scaling there too.
    scaler.fit(features)
    clf.fit(features, data_df['duration'].values)
    return clf, scaler

# Declare the training step on the cleaned training data.
do_train = ex.action(train, [tr_ready])
# Two artifacts are declared on the same action; train returns (clf, scaler).
# NOTE(review): presumably jarvis matches the returned values to these artifacts
# in declaration order -- confirm against the jarvis docs.
model = ex.artifact('model.pkl', do_train)
scaler = ex.artifact('scaler.pkl', do_train)

# Test-side pipeline: load the raw test CSV with the same dataframize function...
do_te_dfize = ex.action(dataframize, [te_data])
te_data_df = ex.artifact('test_df.pkl', do_te_dfize)

# ...and add the distance column with the same calculate_distance function.
do_te_calcdist = ex.action(calculate_distance, [te_data_df])
te_data_dist_df = ex.artifact('test_dist_df.pkl', do_te_calcdist)

@jarvis.func
def calculate_duration(data_df):
    """Add a ``duration`` column: whole seconds between trip start and end.

    Both timestamp columns are parsed with the format
    ``%Y-%m-%d %H:%M:%S``. The absolute difference is used, so the result
    is non-negative even when start and end are swapped.

    Args:
        data_df: DataFrame with string ``start_timestamp`` and
            ``end_timestamp`` columns.

    Returns:
        The same DataFrame object, modified in place, with a new integer
        ``duration`` column.

    Raises:
        ValueError: If a timestamp does not match the expected format.
    """
    import pandas as pd

    fmt = '%Y-%m-%d %H:%M:%S'
    # Parse each column in one vectorised call instead of one strptime per row.
    started = pd.to_datetime(data_df['start_timestamp'], format=fmt)
    ended = pd.to_datetime(data_df['end_timestamp'], format=fmt)

    elapsed = (ended - started).abs()
    data_df['duration'] = elapsed.dt.total_seconds().astype('int64')
    return data_df



# Declare the duration step on the test data that already carries the distance column;
# the resulting artifact is what the evaluation steps below take as input.
do_te_calcdur = ex.action(calculate_duration, [te_data_dist_df])
te_data_full_df = ex.artifact('test_dist_dur_df.pkl', do_te_calcdur)

@jarvis.func
def test(model, data_df, scaler):
    """Score ``model`` on ``data_df`` with the explained-variance metric.

    The ``start_hr``, ``start_month`` and ``start_weekday`` columns are
    derived from ``start_timestamp`` (``YYYY-MM-DD HH:MM:SS`` strings) and
    stored on ``data_df`` in place before predicting.

    Args:
        model: Fitted estimator exposing ``predict``.
        data_df: DataFrame with ``start_timestamp``, ``distance`` and
            ``duration`` columns.
        scaler: Accepted but not used; predictions are made on the raw
            feature values.

    Returns:
        ``sklearn.metrics.explained_variance_score`` of the true durations
        against the model's predictions.
    """
    def roundtime(tstring):
        """Round an ``HH:MM:SS`` string to the nearest hour (as a string)."""
        hours, mins, _secs = tstring.split(':')
        if int(mins) < 30:
            return hours
        # Rounding up from hour 23 wraps around to midnight.
        return '00' if hours == '23' else str(int(hours) + 1)

    def weekday(start):
        """Return the day of the week (Monday == 0) of a timestamp string."""
        from datetime import datetime
        return int(datetime.strptime(start, '%Y-%m-%d %H:%M:%S').weekday())

    stamps = data_df['start_timestamp']
    data_df['start_hr'] = stamps.apply(lambda ts: int(roundtime(ts.split(' ')[1])))
    data_df['start_month'] = stamps.apply(lambda ts: int(ts.split(' ')[0].split('-')[1]))
    data_df['start_weekday'] = stamps.apply(weekday)

    feature_cols = ['start_hr', 'start_month', 'start_weekday', 'distance']
    features = data_df[feature_cols].values
    preds = model.predict(features)

    from sklearn import metrics
    actual = data_df['duration'].values
    return metrics.explained_variance_score(actual, preds)


# Declare the scoring step; the input list follows test's parameter order (model, data_df, scaler).
do_test = ex.action(test, [model, te_data_full_df, scaler])
# Artifact declared for the value returned by test.
score = ex.artifact('score.txt', do_test)

@jarvis.func
def comp_predictions(model, data_df, scaler):
    """Return the model's predictions alongside the true durations.

    The ``start_hr``, ``start_month`` and ``start_weekday`` columns are
    derived from ``start_timestamp`` (``YYYY-MM-DD HH:MM:SS`` strings) and
    stored on ``data_df`` in place before predicting.

    Args:
        model: Fitted estimator exposing ``predict``.
        data_df: DataFrame with ``start_timestamp``, ``distance`` and
            ``duration`` columns.
        scaler: Accepted but not used; predictions are made on the raw
            feature values.

    Returns:
        A ``(predicted, actual)`` tuple: the output of ``model.predict`` and
        the ``duration`` column of ``data_df``.
    """
    def roundtime(tstring):
        """Round an ``HH:MM:SS`` string to the nearest hour (as a string)."""
        hours, mins, _secs = tstring.split(':')
        if int(mins) < 30:
            return hours
        # Rounding up from hour 23 wraps around to midnight.
        return '00' if hours == '23' else str(int(hours) + 1)

    def weekday(start):
        """Return the day of the week (Monday == 0) of a timestamp string."""
        from datetime import datetime
        return int(datetime.strptime(start, '%Y-%m-%d %H:%M:%S').weekday())

    stamps = data_df['start_timestamp']
    data_df['start_hr'] = stamps.apply(lambda ts: int(roundtime(ts.split(' ')[1])))
    data_df['start_month'] = stamps.apply(lambda ts: int(ts.split(' ')[0].split('-')[1]))
    data_df['start_weekday'] = stamps.apply(weekday)

    feature_cols = ['start_hr', 'start_month', 'start_weekday', 'distance']
    predicted = model.predict(data_df[feature_cols].values)
    actual = data_df['duration']
    return (predicted, actual)

# Declare the prediction-comparison step on the same inputs as the scoring step;
# comp_predictions returns a (predicted, actual) tuple.
do_comp = ex.action(comp_predictions, [model, te_data_full_df, scaler])
preds = ex.artifact('predictions.pkl', do_comp)
    

In [4]:
# Inspect the score artifact; the cell output below shows its first entry as '0.00627427905862\n'.
# NOTE(review): presumably peek() runs whatever pipeline steps are needed to produce the artifact -- confirm.
score.peek()

['0.00627427905862\n',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '']