In [4]:
#This script reads in cleaned and merged data and fits a regression model to predict the
#duration of a bike trip (the Trip_Duration column) from the following features:
#start time, start station latitude and longitude, birth year, gender, holiday,
#precipitation, and temperature.

import pandas as pd
import numpy as np
import tensorflow as tf

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import itertools

tf.logging.set_verbosity(tf.logging.INFO)

#Build the input function that feeds a pandas data set to a TensorFlow estimator.
def get_input_fn(data_set, num_epochs, shuffle,batch_size):
    """Return a TensorFlow pandas input function for ``data_set``.

    The feature columns named in the module-level ``FEATURES`` list and the
    label column named by the module-level ``LABEL`` are copied out of
    ``data_set`` as raw value arrays, so the frame and series handed to
    TensorFlow both carry a fresh default index rather than the index of
    ``data_set``.

    Args:
        data_set: pandas DataFrame containing every FEATURES column and the
            LABEL column.
        num_epochs: forwarded to ``pandas_input_fn`` as ``num_epochs``.
        shuffle: forwarded to ``pandas_input_fn`` as ``shuffle``.
        batch_size: forwarded to ``pandas_input_fn`` as ``batch_size``.

    Returns:
        The callable produced by ``tf.estimator.inputs.pandas_input_fn``.
    """
    feature_arrays = {}
    for name in FEATURES:
        feature_arrays[name] = data_set[name].values
    feature_frame = pd.DataFrame(feature_arrays)
    label_series = pd.Series(data_set[LABEL].values)

    return tf.estimator.inputs.pandas_input_fn(
        x=feature_frame,
        y=label_series,
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=shuffle)
      
if __name__ == '__main__':

    COLUMNS = ['Trip_Duration','Start_Time','Start_Station_Latitude','Start_Station_Longitude','Birth_Year','Gender','Holiday','Precipitation','Temperature']
    FEATURES = ['Start_Time','Start_Station_Latitude','Start_Station_Longitude','Birth_Year','Gender','Holiday','Precipitation','Temperature']
    LABEL = 'Trip_Duration'

    #Seed for the train/validation/test split. A fixed seed makes every run
    #produce the same split, so a checkpoint trained in one run is evaluated on
    #rows that were held out in that run as well.
    #NOTE(review): a checkpoint trained before this seed was introduced was
    #fit on a different (unseeded) split - retrain before trusting the metrics.
    SPLIT_SEED = 0

    #Set to True to run one training pass before evaluating. The default keeps
    #the evaluation-only behaviour, which relies on a checkpoint already being
    #present in the model directory.
    RUN_TRAINING = False

    #Maximum number of test rows shown in the prediction comparison.
    NUM_EXAMPLES = 100

    #read data
    data = pd.read_csv('duration_prediction_data.csv')

    #drop rows with a NaN in any column the model uses. subset= restricts the
    #check to those columns; thresh= would count non-NaN values over every
    #column in the file and could keep incomplete rows if extra columns exist.
    data = data.dropna(subset=COLUMNS)

    #normalize features: centre on the mean and scale by the value range.
    for k in FEATURES:
        spread = data[k].max() - data[k].min()
        if spread == 0:
            #a constant column carries no information; avoid 0/0 -> NaN
            data[k] = 0.0
        else:
            data[k] = (data[k] - data[k].mean()) / spread
    #scale the label down by a factor of 100; predictions are in the same units
    data[LABEL] = data[LABEL] / 100

    #randomly split data set into roughly 70% training set, 20% validation set,
    #and 10% test set (66% / 34% of the remaining 30%).
    rng = np.random.RandomState(SPLIT_SEED)
    length = len(data)
    print(length)
    training = rng.rand(length) < 0.7
    training_set = data[training]
    valid_test = data[~training]

    valid = rng.rand(len(valid_test)) < 0.66
    validation_set = valid_test[valid]
    test_set = valid_test[~valid]

    #define feature columns
    feature_cols = [tf.feature_column.numeric_column(k) for k in FEATURES]

    #Implement regression on Tensorflow: a DNN with three hidden layers,
    #optimised with Adam. Checkpoints are read from / written to model_dir.
    regressor = tf.estimator.DNNRegressor(
        feature_columns=feature_cols,
        hidden_units=[64, 32, 64],
        optimizer=tf.train.AdamOptimizer(
            learning_rate=0.001,
            beta1=0.9,
            beta2=0.999,
            epsilon=1e-8),
        model_dir="/tmp/trip_duration_prediction6")

    if RUN_TRAINING:
        #100 steps with batches of 1/100 of the training set; the batch size
        #is kept at 1 or more so a tiny training set does not yield size 0.
        train_batch = max(1, training_set.shape[0] // 100)
        regressor.train(
            input_fn=get_input_fn(training_set, num_epochs=1, shuffle=True,
                                  batch_size=train_batch),
            steps=100)

    #Evaluate once on the whole validation set in a single batch.
    ev = regressor.evaluate(
        input_fn=get_input_fn(validation_set, num_epochs=1, shuffle=False,
                              batch_size=validation_set.shape[0]))
    print('Validation average_loss:', ev['average_loss'])

    #To generate examples of predictions, randomly select up to NUM_EXAMPLES
    #rows from the test set (fewer if the test set is smaller, since sampling
    #more rows than exist without replacement raises an error).
    sample_size = min(NUM_EXAMPLES, len(test_set))
    if sample_size > 0:
        snip = test_set.sample(sample_size)
        pred = regressor.predict(
            input_fn=get_input_fn(snip, num_epochs=1, shuffle=False,
                                  batch_size=sample_size))
        #each prediction is a length-1 array; take its single element
        pre = [float(p["predictions"][0])
               for p in itertools.islice(pred, sample_size)]
        comparison = pd.DataFrame(
            {'Actual_Duration': snip[LABEL].values,
             'Predicted_Duration': pre},
            index=snip.index,
            columns=['Actual_Duration', 'Predicted_Duration'])
        #print the rows in shuffled order
        print(comparison.sample(frac=1))
    else:
        print('Test set is empty; no example predictions to show.')

   
    


4591655
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/trip_duration_prediction6', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000028D8C3EE1D0>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Starting evaluation at 2018-01-07-05:49:35
INFO:tensorflow:Restoring parameters from /tmp/trip_duration_prediction6\model.ckpt-9601
INFO:tensorflow:Finished evaluation at 2018-01-07-05:49:45
INFO:tensorflow:Saving dict for global step 9601: average_loss = 29.8913, global_step = 9601, loss = 2.72051e+07
INFO:tensorflow:Restoring parameters from /tmp/trip_duration_prediction6\mo