In [1]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd

# Load the data into a Pandas DataFrame
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [2]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


In [3]:
# Get sample since the data is huge (2m+ data points)
df = df.sample(n=10000)
df.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
30107768,2013-12-17 23:06:00.000000212,16.5,2013-12-17 23:06:00 UTC,-73.95816,40.73335,-73.997957,40.716922,1
19161179,2010-08-13 00:12:32.0000004,9.3,2010-08-13 00:12:32 UTC,-73.977288,40.749886,-73.951201,40.779425,1
24778705,2014-08-08 09:49:26.0000001,5.0,2014-08-08 09:49:26 UTC,-73.979291,40.78167,-73.975894,40.792156,1
41567408,2009-09-03 08:43:09.0000003,4.9,2009-09-03 08:43:09 UTC,-73.992026,40.749563,-73.997884,40.737705,1
39437397,2013-01-28 15:50:00.00000070,7.0,2013-01-28 15:50:00 UTC,-73.968825,40.754425,-73.979912,40.761097,2


In [4]:
# Extract datetime
df['key'] = pd.to_datetime(df['key'])
df['year'] = df['key'].dt.year
df['month'] = df['key'].dt.month
df['day'] = df['key'].dt.day
df['hour'] = df['key'].dt.hour
df['minute'] = df['key'].dt.minute
df['second'] = df['key'].dt.second
df.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,day,hour,minute,second
30107768,2013-12-17 23:06:00.000000212,16.5,2013-12-17 23:06:00 UTC,-73.95816,40.73335,-73.997957,40.716922,1,2013,12,17,23,6,0
19161179,2010-08-13 00:12:32.000000400,9.3,2010-08-13 00:12:32 UTC,-73.977288,40.749886,-73.951201,40.779425,1,2010,8,13,0,12,32
24778705,2014-08-08 09:49:26.000000100,5.0,2014-08-08 09:49:26 UTC,-73.979291,40.78167,-73.975894,40.792156,1,2014,8,8,9,49,26
41567408,2009-09-03 08:43:09.000000300,4.9,2009-09-03 08:43:09 UTC,-73.992026,40.749563,-73.997884,40.737705,1,2009,9,3,8,43,9
39437397,2013-01-28 15:50:00.000000700,7.0,2013-01-28 15:50:00 UTC,-73.968825,40.754425,-73.979912,40.761097,2,2013,1,28,15,50,0


In [5]:
# Filter fare_amount > 0
df = df[df['fare_amount'] > 0]
len(df)

9998

In [6]:
# Filter passenger > 0
df = df[df['passenger_count'] > 0]
len(df)

9958

In [7]:
# Set boudaries for pickup and dropoff

In [8]:
a = min(df['pickup_latitude'].min(), df['dropoff_latitude'].min())
print(a)

-74.006615


In [9]:
b = max(df['pickup_latitude'].max(), df['dropoff_latitude'].max())
print(b)

41.480803


In [10]:
c = min(df['pickup_longitude'].min(), df['dropoff_longitude'].min())
print(c)

-75.500613


In [11]:
d = max(df['pickup_longitude'].max(), df['dropoff_longitude'].max())
print(d)

40.77653


In [12]:
def select_within_boundingbox(df,bb):
    return(
        (df['pickup_longitude'] >= bb[0]) & (df['pickup_longitude'] <= bb[1]) &
        (df['pickup_latitude'] >= bb[2]) & (df['pickup_latitude'] <= bb[3]) &
        (df['dropoff_longitude'] >= bb[0]) & (df['dropoff_longitude'] <= bb[1]) &
        (df['dropoff_latitude'] >= bb[2]) & (df['dropoff_latitude'] <= bb[3]))

In [13]:
bb = (a, b, c, d)

In [14]:
df = df[select_within_boundingbox(df, bb)]

In [15]:
print("New size {}".format(len(df)))

New size 6808


In [16]:
# Compute distance for new feature
import numpy as np
def distance(lat1, lon1, lat2, lon2):
  p = 0.017453292519943295 # Pi/180
  a = 0.5 - np.cos((lat2 - lat1) * p)/2 + np.cos(lat1 * p) * np.cos(lat2 * p) * (1 - np.cos((lon2 - lon1) * p))/2
  return 0.6213712 * 12742 * np.arcsin(np.sqrt(a))

In [17]:
df["distance_miles"] = distance(df["pickup_latitude"], df["pickup_longitude"], 
                                      df["dropoff_latitude"], df["dropoff_longitude"])

In [18]:
# Split the data into features and labels
features = df.drop(['fare_amount','key','pickup_datetime','passenger_count'], axis=1)
labels = df['fare_amount']

# Assign GPU as the device for TensorFlow to run on
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if len(physical_devices) > 0:
  tf.config.experimental.set_memory_growth(physical_devices[0], True)
  tf.config.experimental.set_visible_devices(physical_devices[0], 'GPU')
  print("Running on GPU:", physical_devices[0].name)
else:
  print("Running on CPU")

# Convert the DataFrame to a TensorFlow Dataset
dataset = tf.data.Dataset.from_tensor_slices((features.values, labels.values))

# Define the model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(features.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

# Compile the model
model.compile(optimizer=tf.optimizers.Adam(), loss='mean_squared_error')

# Train the model
history = model.fit(
    x=features, y=labels,
    batch_size=32, epochs=100,
    verbose=0
)

# Evaluate the model on the test data
test_loss = model.evaluate(
    x=features, y=labels,
    verbose=0
)

print("Test set loss: {loss:0.3f}".format(loss=test_loss))

Running on GPU: /physical_device:GPU:0


2023-02-01 07:30:01.490900: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-01 07:30:02.284591: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 30983 MB memory:  -> device: 0, name: Tesla V100-PCIE-32GB, pci bus id: 0000:86:00.0, compute capability: 7.0


Test set loss: 30.369


In [19]:
!nvidia-smi

Wed Feb  1 07:30:29 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  Off  | 00000000:86:00.0 Off |                  Off |
| N/A   51C    P0    42W / 250W |    611MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces