In [25]:
#This script reads in cleaned and merged data and uses a TensorFlow classifier to predict the
#(bucketed) number of bikes departing a bike station given the following parameters:
#time bucket, station latitude and longitude, hourly temperature, and precipitation.
#NOTE: whether the day is a holiday/weekend is also meant to be a parameter, but no such column appears in the feature lists below.

import pandas as pd
import numpy as np
import tensorflow as tf

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import itertools

tf.logging.set_verbosity(tf.logging.INFO)

#Build the TensorFlow input function that feeds one data set to the estimator.
def get_input_fn(data_set, num_epochs,shuffle):
    """Wrap *data_set* in a ``pandas_input_fn`` for a tf.estimator model.

    The feature frame holds the columns named in the module-level
    ``FEATURES_WEIGHT`` list and the label series holds the ``LABEL`` column.
    Both are rebuilt from the raw ``.values`` arrays, so the original index
    of *data_set* is not carried over.

    Args:
        data_set: DataFrame containing every ``FEATURES_WEIGHT`` column and
            the ``LABEL`` column.
        num_epochs: forwarded unchanged to ``pandas_input_fn``.
        shuffle: forwarded unchanged to ``pandas_input_fn``.

    Returns:
        The input function produced by ``tf.estimator.inputs.pandas_input_fn``.
    """
    feature_frame = pd.DataFrame(
        {name: data_set[name].values for name in FEATURES_WEIGHT})
    label_series = pd.Series(data_set[LABEL].values)
    #One batch spans the whole data set (batch size == number of rows).
    row_count = data_set.shape[0]
    return tf.estimator.inputs.pandas_input_fn(
        x=feature_frame,
        y=label_series,
        batch_size=row_count,
        num_epochs=num_epochs,
        shuffle=shuffle)
     
if __name__ == '__main__':

    #Column groups of the input CSV.
    COLUMNS = ['Start_Time','Start_Station_Latitude','Start_Station_Longitude','Count','Precipitation','Temperature']
    FEATURES = ['Start_Time','Start_Station_Latitude','Start_Station_Longitude','Precipitation','Temperature']
    #FEATURES plus the per-row 'Weight' column. The weight gives under-represented classes
    #more influence, to combat the problem of imbalanced data.
    FEATURES_WEIGHT = ['Start_Time','Start_Station_Latitude','Start_Station_Longitude','Precipitation','Temperature','Weight']
    LABEL = 'Count'

    #read data
    data = pd.read_csv('demand_prediction_data.csv')

    #Bucket the number of bikes into 9 classes in order to perform classification.
    #pd.cut uses right-closed intervals, so for integer counts the classes are
    #0-4, 5-8, 9-12, ..., 29-32, and a last class for anything larger than 32.
    #The last edge is open-ended (np.inf): a finite edge would turn larger counts
    #into NaN and make the astype(int) below fail.
    bins = [-1,4,8,12,16,20,24,28,32,np.inf]
    data['Count'] = pd.cut(data['Count'],bins=bins,labels=False).astype(int)

    #Normalize the feature columns (not the label): centre on the mean and scale by the range.
    #NOTE(review): the statistics are computed on the full data set, i.e. before the
    #train/validation/test split below.
    for k in FEATURES:
        data[k] = (data[k] - data[k].mean()) / (data[k].max() - data[k].min())

    #Add a weight inversely proportional to the frequency of each class.
    #The weights are looked up by class label from the observed class counts, so a
    #bucket that never occurs in the data no longer causes a length mismatch.
    class_counts = data.groupby('Count').size()
    class_weight = 1 / (class_counts / np.linalg.norm(class_counts.values))
    data['Weight'] = data['Count'].map(class_weight)

    #Randomly split the data set into roughly 70% training, 20% validation and 10% test:
    #each row goes to training with probability 0.7, and each remaining row goes to
    #validation with probability 0.66. The split is unseeded, so it differs between runs.
    length = len(data['Count'])
    print(length)
    selection = np.random.rand(length)
    training = (selection < 0.7)
    training_set = data[training]
    valid_test = data[~training]

    length = len(valid_test['Count'])
    selection = np.random.rand(length)
    valid = (selection < 0.66)
    validation_set = valid_test[valid]
    test_set = valid_test[~valid]

    #define feature columns
    #NOTE(review): 'Weight' is derived from the class label and is also a model input here,
    #so the network can read the label from its input (label leakage); the reported accuracy
    #is likely inflated. Building the columns from FEATURES instead would fix this, but it
    #changes the network's input shape: checkpoints already in model_dir would no longer
    #restore, so it needs a fresh model_dir and retraining. 'Weight' must stay in the
    #input_fn features either way, because weight_column reads it from there.
    feature_cols = [tf.feature_column.numeric_column(k) for k in FEATURES_WEIGHT]

    #Single pass: build the classifier and evaluate it on the validation set.
    #Training is currently disabled (see the commented-out train call), so evaluate()
    #and predict() rely on a checkpoint that already exists in model_dir.
    for i in range(1):

        #Implement classification on Tensorflow.
        #The optimizer is left at the estimator default; FtrlOptimizer
        #(learning_rate=0.1, l1/l2 regularization strength 10) and AdamOptimizer
        #(learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8) were tried earlier.
        classifier = tf.estimator.DNNClassifier(feature_columns=feature_cols, hidden_units=[64,32,64],
                          n_classes=9,
                          model_dir="/tmp/demandClassifier15",
                          weight_column="Weight")

        #classifier.train(input_fn=get_input_fn(training_set,num_epochs=10, shuffle=True), steps=10)

        ev = classifier.evaluate(
            input_fn=get_input_fn(validation_set, num_epochs=1, shuffle=False))

    #To generate examples of predictions, randomly select up to 100 rows from the test set
    #(fewer when the test set itself is smaller, so sample() cannot fail on a small split).
    snip = test_set.sample(min(100, len(test_set)))
    pred = classifier.predict(
        input_fn=get_input_fn(snip, num_epochs=1, shuffle=False))
    #Each prediction's "classes" entry is a one-element array holding the class id as a
    #byte string; decode it to an int so it is directly comparable with Actual_Count.
    predictions = [int(p["classes"][0]) for p in pred]
    comparison = pd.DataFrame(
        {'Actual_Count': snip['Count'].values, 'Predicted_Count': predictions},
        index=snip.index)
    print(comparison.sample(frac=1))


1376328
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/demandClassifier15', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000024597EC3F28>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Starting evaluation at 2018-01-06-22:16:26
INFO:tensorflow:Restoring parameters from /tmp/demandClassifier15\model.ckpt-851
INFO:tensorflow:Finished evaluation at 2018-01-06-22:16:28
INFO:tensorflow:Saving dict for global step 851: accuracy = 0.888201, average_loss = 0.27995, global_step = 851, loss = 532100.0
INFO:tensorflow:Restoring parameters from /tmp/demandClassifier15\model.ck