In [1]:
import jarvis

# Create the jarvis experiment, named 'taxi', that every action and artifact
# below is attached to.
ex = jarvis.Experiment('taxi')

# NOTE(review): presumably selects the Ground client/backend used by this
# experiment -- confirm against the jarvis documentation.
ex.groundClient('ground')

# Input artifacts: the CSV files 'train_df.csv' and 'test_df.csv'. They are
# declared without a producing action, unlike the derived artifacts below.
tr_data = ex.artifact('train_df.csv')

te_data = ex.artifact('test_df.csv')

# Which model should I use to predict the duration of a car ride? I don't know yet, so try several of them.

# look at the data that I have

# The duration of the cab ride depends on the distance and time of day.

# Feature engineering step: get manhattan distance from coordinates start to coordinates end

# But first, let's get the data into a format that's easy to work with

# The notebook name was not set at the top of the cell, so set it here.
# NOTE(review): the experiment and input artifacts above are created before
# this call -- confirm jarvis does not need the name set first.
jarvis.setNotebookName('Taxi.ipynb')

@jarvis.func
def dataframize(csvpath):
    """Load the CSV file at ``csvpath`` into a pandas DataFrame.

    The first column becomes the index and the index is parsed as dates
    where possible; these are the defaults that the removed
    ``DataFrame.from_csv`` applied.

    Parameters
    ----------
    csvpath : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file contents, indexed by the first CSV column.
    """
    # Kept as a function-local import, as in the original; presumably so the
    # function is self-contained when jarvis runs it -- TODO confirm.
    import pandas as pd
    # DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
    # read_csv with these arguments is the documented replacement.
    return pd.read_csv(csvpath, index_col=0, parse_dates=True)
    

# Wire dataframize to the training CSV artifact as a jarvis action.
do_dfize = ex.action(dataframize, [tr_data])

# Declare 'train_df.pkl' as the artifact tied to that action
# (presumably the pickled return value of dataframize -- confirm).
tr_data_df = ex.artifact('train_df.pkl', do_dfize)

@jarvis.func
def calculate_distance(data_df):
    """Add a ``distance`` column holding each ride's Manhattan distance.

    For every row the distance is
    ``|start_lng - end_lng| + |start_lat - end_lat|``, computed directly on
    the column values (no unit conversion is applied).

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must contain the columns ``start_lng``, ``start_lat``, ``end_lng``
        and ``end_lat``.

    Returns
    -------
    pandas.DataFrame
        The same ``data_df`` object; the column is added in place.
    """
    # Column-wise arithmetic replaces the per-row Python map: same values,
    # without building an intermediate list element by element.
    lng_gap = (data_df['start_lng'] - data_df['end_lng']).abs()
    lat_gap = (data_df['start_lat'] - data_df['end_lat']).abs()
    data_df['distance'] = lng_gap + lat_gap
    return data_df

# Trimmed some Notebook Cells without losing the log.

# Another thing about notebooks: variables continue to exist even after their cell has been clipped.

# Wire calculate_distance to the training DataFrame artifact and declare
# 'train_dist_df.pkl' as the artifact tied to that action.
do_calc_dist = ex.action(calculate_distance, [tr_data_df])
tr_data_dist_df = ex.artifact('train_dist_df.pkl', do_calc_dist)

@jarvis.func
def train(data_df):
    """Fit a ``LinearSVR`` that predicts ride duration.

    The features are ``start_hr`` (the hour of ``start_timestamp`` rounded
    to the nearest hour, derived here) and ``distance``; the target is
    ``duration``.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must contain ``start_timestamp`` (strings whose second
        space-separated field is ``HH:MM:SS``), ``distance`` and
        ``duration``. A ``start_hr`` column is added to it in place.

    Returns
    -------
    sklearn.svm.LinearSVR
        The fitted regressor.
    """
    from sklearn.svm import LinearSVR

    def roundtime(tstring):
        """Round an 'HH:MM:SS' string to the nearest hour, as a string."""
        hours, mins, secs = tstring.split(':')
        if int(mins) < 30:
            return hours
        # Rounding up from 23:xx wraps around to midnight.
        return '00' if hours == '23' else str(int(hours) + 1)

    data_df['start_hr'] = data_df['start_timestamp'].apply(
        lambda stamp: int(roundtime(stamp.split(' ')[1])))
    # Pin the seed: LinearSVR shuffles the data with a pseudo-random
    # generator, so an unseeded fit is not reproducible between runs.
    clf = LinearSVR(random_state=0)
    clf.fit(data_df[['start_hr', 'distance']].values, data_df['duration'].values)
    return clf

# Wire train to the training frame that carries the distance column and
# declare 'model.pkl' as the artifact tied to that action.
do_train = ex.action(train, [tr_data_dist_df])
model = ex.artifact('model.pkl', do_train)


# Repeat the first two preparation steps for the test CSV: dataframize ...
do_te_dfize = ex.action(dataframize, [te_data])
te_data_df = ex.artifact('test_df.pkl', do_te_dfize)

# ... then calculate_distance.
do_te_calcdist = ex.action(calculate_distance, [te_data_df])
te_data_dist_df = ex.artifact('test_dist_df.pkl', do_te_calcdist)

@jarvis.func
def calculate_duration(data_df):
    """Add a ``duration`` column: seconds between start and end timestamps.

    Both ``start_timestamp`` and ``end_timestamp`` are parsed with the
    format ``%Y-%m-%d %H:%M:%S``. The difference is always non-negative,
    whichever of the two timestamps is later.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must contain string columns ``start_timestamp`` and
        ``end_timestamp``.

    Returns
    -------
    pandas.DataFrame
        The same ``data_df`` object; the column is added in place.
    """
    from datetime import datetime

    stamp_format = '%Y-%m-%d %H:%M:%S'

    def tdiff(start, end):
        """Absolute gap between two timestamp strings, in seconds."""
        began = datetime.strptime(start, stamp_format)
        ended = datetime.strptime(end, stamp_format)
        return abs(ended - began).total_seconds()

    data_df['duration'] = [
        tdiff(start, end)
        for start, end in zip(data_df['start_timestamp'], data_df['end_timestamp'])
    ]
    return data_df



# Wire calculate_duration to the test frame that carries the distance column
# and declare 'test_dist_dur_df.pkl' as the artifact tied to that action.
do_te_calcdur = ex.action(calculate_duration, [te_data_dist_df])
te_data_full_df = ex.artifact('test_dist_dur_df.pkl', do_te_calcdur)

@jarvis.func
def test(model, test_df):
    def roundtime(tstring):
        hours, mins, secs = tstring.split(':')
        if int(mins) >= 30:
            if hours == '23':
                return '00'
            else:
                return str(int(hours) + 1)
        else:
            return hours
    test_df['start_hr'] = test_df['start_timestamp'].apply(lambda x: int(roundtime(x.split(' ')[1])))
    score = model.score(test_df[['start_hr', 'distance']].values, test_df['duration'].values)
    return score


# Wire test to the model artifact and the fully prepared test frame, and
# declare 'score.txt' as the artifact tied to that action.
do_test = ex.action(test, [model, te_data_full_df])
score = ex.artifact('score.txt', do_test)

@jarvis.func
def comp_predictions(model, test_df):
    """Return the model's predictions alongside the actual durations.

    A ``start_hr`` column (the hour of ``start_timestamp`` rounded to the
    nearest hour, as an int) is added to ``test_df`` in place. Then
    ``model.predict`` is called with the ``start_hr`` and ``distance``
    features.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X)``.
    test_df : pandas.DataFrame
        Must contain ``start_timestamp``, ``distance`` and ``duration``.

    Returns
    -------
    tuple
        ``(model.predict(features), test_df['duration'])``.
    """
    def roundtime(tstring):
        """Round an 'HH:MM:SS' string to the nearest hour, as a string."""
        hours, mins, secs = tstring.split(':')
        if int(mins) < 30:
            return hours
        # Rounding up from 23:xx wraps around to midnight.
        return '00' if hours == '23' else str(int(hours) + 1)

    # Convert to int, as train() and test() do; otherwise the feature matrix
    # is object-dtype with the hour held as a string such as '07'.
    test_df['start_hr'] = test_df['start_timestamp'].apply(
        lambda stamp: int(roundtime(stamp.split(' ')[1])))
    features = test_df[['start_hr', 'distance']].values
    predictions = (model.predict(features), test_df['duration'])
    return predictions

# Wire comp_predictions to the model artifact and the fully prepared test
# frame, and declare 'predictions.pkl' as the artifact tied to that action.
do_comp = ex.action(comp_predictions, [model, te_data_full_df])
preds = ex.artifact('predictions.pkl', do_comp)
    

In [3]:
# Inspect the predictions artifact. The cell output below shows a tuple of
# (array of predicted durations, Series of actual durations indexed by id).
# NOTE(review): whether peek() runs the pipeline or only reads an existing
# result is not visible here -- confirm against the jarvis documentation.
cached = preds.peek()
cached



(array([  565.75776228,  1183.1418543 ,   875.03667276, ...,   982.86631961,
          966.95257187,   534.34310514]), id
 0         398.0
 1        2520.0
 2         759.0
 3         395.0
 4         302.0
 5         148.0
 6        1547.0
 7         347.0
 8         533.0
 9         491.0
 10        293.0
 11       1123.0
 12        767.0
 13        535.0
 14       1063.0
 15       1739.0
 16        384.0
 17        558.0
 18        730.0
 19        194.0
 20        719.0
 21       1671.0
 22       1358.0
 23       1084.0
 24        472.0
 25       1064.0
 26       2419.0
 27        513.0
 28        222.0
 29        470.0
           ...  
 72902     607.0
 72903     776.0
 72904     341.0
 72905     927.0
 72906     346.0
 72907    3946.0
 72908     387.0
 72909     938.0
 72910     333.0
 72911     389.0
 72912    2529.0
 72913     319.0
 72914      81.0
 72915     722.0
 72916    1085.0
 72917     977.0
 72918     477.0
 72919     221.0
 72920      91.0
 72921     836.0
 72922     

In [2]:
# NOTE(review): this cell has execution count In [2] but appears after the
# In [3] cell, so the cells were not run top to bottom; re-run in order to
# confirm the notebook reproduces. What pull() does is not shown here;
# presumably it materialises the 'score.txt' artifact -- confirm.
score.pull()

