# Data Model Implementation

In [None]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# import tensorflow as tf

In [None]:
import pandas as pd
import sqlalchemy

In [None]:
# Load the whole ride_share_table from the SQLite database into a DataFrame.
# NOTE(review): the absolute /content/ path looks Colab-specific — confirm before running elsewhere.
query = "SELECT * FROM ride_share_table"
engine = sqlalchemy.create_engine('sqlite:////content/ride_share.db')
df = pd.read_sql(sql=query, con=engine)

In [None]:
# Preview the first rows of the table as loaded from the database
df.head()

Unnamed: 0,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,...,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag,cbd_congestion_fee,weather_condition
0,HV0003,B03404,B03404,2025-03-03 08:50:50,2025-03-03 08:51:10,2025-03-03 08:53:11,2025-03-03 09:04:35,161,162,0.57,...,0.0,0.0,7.7,N,N,N,N,N,1.5,Clear
1,HV0003,B03404,B03404,2025-03-30 10:01:05,2025-03-30 10:04:45,2025-03-30 10:05:02,2025-03-30 10:17:03,61,37,2.14,...,0.0,0.0,14.55,N,N,N,N,N,0.0,Cloudy
2,HV0003,B03404,B03404,2025-03-15 08:10:41,2025-03-15 08:14:21,2025-03-15 08:16:22,2025-03-15 08:20:29,200,220,0.92,...,0.0,0.0,6.0,N,N,N,N,Y,0.0,Overcast
3,HV0003,B03404,B03404,2025-03-01 05:42:30,2025-03-01 05:55:15,2025-03-01 05:56:45,2025-03-01 06:38:43,249,177,7.57,...,0.0,0.0,40.38,N,N,N,N,N,1.5,Cloudy
4,HV0003,B03404,B03404,2025-03-09 01:51:57,2025-03-09 01:54:59,2025-03-09 01:57:01,2025-03-09 03:15:13,157,61,3.77,...,0.0,0.0,16.35,N,N,N,N,N,0.0,Clear


In [None]:
# Drop the columns that are not used as model inputs
columns_to_drop = [
    # license / base identifiers
    "hvfhs_license_num",
    "dispatching_base_num",
    "originating_base_num",
    # timestamps
    "request_datetime",
    "on_scene_datetime",
    "pickup_datetime",
    "dropoff_datetime",
    # Y/N flags
    "shared_request_flag",
    "shared_match_flag",
    "access_a_ride_flag",
    "wav_request_flag",
    "wav_match_flag",
    # tolls, taxes and surcharges
    "tolls",
    "bcf",
    "sales_tax",
    "congestion_surcharge",
    "airport_fee",
]
# `columns=` already names the axis, so no separate axis argument is needed
df = df.drop(columns=columns_to_drop)

In [None]:
# Preview the frame again to confirm which columns remain after the drop
df.head()

Unnamed: 0,PULocationID,DOLocationID,trip_miles,trip_time,base_passenger_fare,tips,driver_pay,cbd_congestion_fee,weather_condition
0,161,162,0.57,685,14.97,0.0,7.7,1.5,Clear
1,61,37,2.14,721,25.07,0.0,14.55,0.0,Cloudy
2,200,220,0.92,247,9.59,0.0,6.0,0.0,Overcast
3,249,177,7.57,2518,40.87,0.0,40.38,1.5,Cloudy
4,157,61,3.77,1092,27.29,0.0,16.35,0.0,Clear


In [None]:
# Count distinct values per column to see which columns are high-cardinality
df.nunique()

Unnamed: 0,0
PULocationID,253
DOLocationID,256
trip_miles,7367
trip_time,3240
base_passenger_fare,5694
tips,1088
driver_pay,4918
cbd_congestion_fee,2
weather_condition,10


In [None]:
# Count trips per pickup zone (PULocationID) to find rare zones worth grouping under "Others"
pulocationid_count = df["PULocationID"].value_counts()
pulocationid_count

Unnamed: 0_level_0,count
PULocationID,Unnamed: 1_level_1
138,429
132,357
79,269
61,264
230,247
...,...
12,2
27,1
253,1
96,1


In [None]:
# Collapse pickup zones with fewer than 20 trips into a single "Others" bucket,
# so that one-hot encoding does not create a column for every sparsely-used zone.
pulocationid_to_replace = list(pulocationid_count[pulocationid_count < 20].index)

# One vectorized pass instead of a full-column replace() per rare zone:
# Series.where keeps values where the condition is True and writes "Others" elsewhere.
is_rare_pickup = df["PULocationID"].isin(pulocationid_to_replace)
df["PULocationID"] = df["PULocationID"].where(~is_rare_pickup, "Others")

# Check to make sure replacement was successful
df["PULocationID"].value_counts()

Unnamed: 0_level_0,count
PULocationID,Unnamed: 1_level_1
138,429
Others,412
132,357
79,269
61,264
...,...
131,22
252,22
73,22
98,21


In [None]:
# Count trips per drop-off zone (DOLocationID) to find rare zones worth grouping under "Others"
dolocationid_count = df["DOLocationID"].value_counts()
dolocationid_count

Unnamed: 0_level_0,count
DOLocationID,Unnamed: 1_level_1
265,882
132,445
138,397
61,294
37,241
...,...
30,1
5,1
154,1
184,1


In [None]:
# Collapse drop-off zones with fewer than 20 trips into a single "Others" bucket,
# so that one-hot encoding does not create a column for every sparsely-used zone.
dolocationid_to_replace = list(dolocationid_count[dolocationid_count < 20].index)

# One vectorized pass instead of a full-column replace() per rare zone:
# Series.where keeps values where the condition is True and writes "Others" elsewhere.
is_rare_dropoff = df["DOLocationID"].isin(dolocationid_to_replace)
df["DOLocationID"] = df["DOLocationID"].where(~is_rare_dropoff, "Others")

# Check to make sure replacement was successful
df["DOLocationID"].value_counts()

Unnamed: 0_level_0,count
DOLocationID,Unnamed: 1_level_1
265,882
Others,464
132,445
138,397
61,294
...,...
34,23
252,22
221,21
192,20


In [None]:
# Inspect the weather_condition value counts. Unlike the location columns, no "Others"
# bucketing follows: the column is later one-hot encoded with all of its categories.
weather_condition_count = df["weather_condition"].value_counts()
weather_condition_count

Unnamed: 0_level_0,count
weather_condition,Unnamed: 1_level_1
Cloudy,9010
Clear,6015
Fair,1980
Overcast,1355
Heavy Rain,476
Light Rain,421
Rain,375
Fog,211
Rain Shower,95
Heavy Rain Shower,62


In [None]:
# One-hot encode the categorical columns (pickup zone, drop-off zone, weather condition)
categorical_columns = ["PULocationID", "DOLocationID", "weather_condition"]
df = pd.get_dummies(data=df, columns=categorical_columns)

In [None]:
# Separate the regression target (driver_pay) from the feature matrix.
# NOTE(review): base_passenger_fare and tips stay in X — confirm both are known at prediction time.
target_column = "driver_pay"
X = df.drop(columns=[target_column])
y = df[target_column]

In [None]:
# Split the preprocessed data into a training and testing dataset.
# random_state pins the shuffle so the same rows land in each split on every run;
# no test_size is given, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state = 1)

In [None]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit on the training split only, so the scaling statistics never see the test rows
X_scaler = scaler.fit(X_train)

# Apply the same fitted transformation to both splits
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

# Compile, Train and Evaluate the Model

In [None]:
# from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import r2_score


In [None]:
# Define the model - a feed-forward neural net for regression with two hidden layers.
# The input width is the number of feature columns. shape[1] reads it from the array
# itself rather than measuring the row at index 1, which fails on a single-row array.
num_input_features = X_train_scaled.shape[1]
hidden_nodes_layer1 = 64
hidden_nodes_layer2 = 32

model = Sequential()

# First hidden layer (also declares the input width)
model.add(Dense(units = hidden_nodes_layer1, input_dim = num_input_features, activation = "relu"))

# Second hidden layer
model.add(Dense(units = hidden_nodes_layer2, activation = "relu"))

# Output layer: a single unit with no activation, so predictions are unbounded real values
model.add(Dense(units = 1))

# Check the structure of the model
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Compile the model: minimise mean squared error with Adam and report mean absolute error alongside it
model.compile(
    optimizer="adam",
    loss="mean_squared_error",
    metrics=["mae"],
)

In [None]:
# Train for 100 epochs, holding out 20% of the training data for per-epoch validation
model_fitted = model.fit(
    x=X_train_scaled,
    y=y_train,
    epochs=100,
    validation_split=0.2,
)

Epoch 1/100
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 360.5024 - mae: 12.7554 - val_loss: 37.9393 - val_mae: 4.5283
Epoch 2/100
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 29.1631 - mae: 3.7328 - val_loss: 20.8441 - val_mae: 2.9580
Epoch 3/100
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 14.9829 - mae: 2.4326 - val_loss: 18.1450 - val_mae: 2.6315
Epoch 4/100
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 12.4717 - mae: 2.1272 - val_loss: 17.1708 - val_mae: 2.5453
Epoch 5/100
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 9.9807 - mae: 1.9709 - val_loss: 17.0075 - val_mae: 2.4918
Epoch 6/100
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 10.6873 - mae: 1.9401 - val_loss: 16.8708 - val_mae: 2.4705
Epoch 7/100
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

In [None]:
# Make predictions on the scaled test set with the trained baseline model
y_pred = model.predict(X_test_scaled)

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [None]:
# Calculate R-squared of the test-set predictions against the true driver_pay values
r_squared = r2_score(y_test, y_pred)
print(f"R-squared: {r_squared:.2f}")

R-squared: 0.94


In [None]:
# Report whether the test-set R-squared reaches the 0.80 target
meets_threshold = r_squared >= 0.80
print(
    "The model demonstrates meaningful predictive power with at least 0.80 R-squared."
    if meets_threshold
    else "The model does not meet the 0.80 R-squared threshold."
)

The model demonstrates meaningful predictive power with at least 0.80 R-squared.


In [None]:
# Save the trained baseline model to a file in the working directory.
# NOTE(review): the .h5 suffix selects the legacy HDF5 format; Keras now recommends the native
# .keras format — confirm whether anything downstream still needs the HDF5 file.
model.save("ride_share_model.h5")




# Data Model Optimization

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from keras_tuner import RandomSearch

In [None]:
# Function to create the model
def create_model(hp):
    """Build and compile a regression network from a Keras Tuner search space.

    Args:
        hp: Hyperparameter container supplied by the tuner. This function
            registers "activation", "number_of_neurons_layer0" (first layer),
            "hidden_layer_number", one "number_of_neurons_layer{i}" per hidden
            layer with i starting at 1, and "learning_rate".

    Returns:
        A compiled Sequential model with a single output unit and no output
        activation, mean squared error loss and mean absolute error as metric.
    """
    # NOTE: the hidden-layer hyperparameter names differ from earlier versions of this
    # function, so tuner results saved on disk by an earlier version describe a different
    # search space and should be cleared before searching again.
    network = Sequential()

    # One activation function shared by every non-output layer
    activation = hp.Choice("activation", ["relu", "tanh"])

    # First layer; also declares the input width (number of feature columns)
    network.add(Dense(
        units=hp.Int("number_of_neurons_layer0", 8, 512, step=32),
        activation=activation,
        input_dim=X_train_scaled.shape[1]
    ))

    # 1 to 6 hidden layers. Indices start at 1 so that no hidden layer reuses the name
    # "number_of_neurons_layer0": the tuner returns one value per name, so a shared name
    # would tie the first hidden layer's width to the first layer's width.
    hidden_layer_count = hp.Int("hidden_layer_number", 1, 6)
    for layer_index in range(1, hidden_layer_count + 1):
        network.add(Dense(
            units=hp.Int(f"number_of_neurons_layer{layer_index}", 8, 512, step=32),
            activation=activation
        ))

    # Output layer: no activation for regression
    network.add(Dense(units=1))

    # Compile with MSE loss; the learning rate is itself a tuned choice
    network.compile(
        loss="mean_squared_error",
        optimizer=Adam(hp.Choice("learning_rate", values=[0.01, 0.001, 0.0001])),
        metrics=["mae"]
    )

    return network


In [None]:
# Initialize the tuner: random search over the search space defined in create_model.
# NOTE(review): no seed is passed, so the sampled trials may differ between fresh runs —
# confirm whether reproducibility matters here.
# NOTE(review): overwrite is not set; the recorded output shows saved state being reloaded from
# ./ride_share_model_optimization, so earlier trials may be reused instead of re-run — confirm intended.
tuner = RandomSearch(
    create_model,
    objective="val_mae",  # Rank trials by validation mean absolute error
    max_trials=5,
    project_name="ride_share_model_optimization"
)

Reloading Tuner from ./ride_share_model_optimization/tuner0.json


In [None]:
# Early stopping callback: stop a run once val_loss has not improved for 10 epochs
# and restore the weights from the best epoch.
# NOTE(review): this monitors val_loss (MSE) while the tuner objective is val_mae — confirm the mismatch is intended.
early_stopping = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True, verbose=1)

In [None]:
# Search for the best model.
# Validation uses a 20% hold-out carved from the training data rather than the test split.
# Ranking trials and early-stopping on (X_test_scaled, y_test) would let the test data shape
# the selected model, so an R-squared later computed on that same split would be optimistically biased.
tuner.search(X_train_scaled, y_train, epochs=100, validation_split=0.2, callbacks=[early_stopping])

In [None]:
# Retrieve the top-ranked model from the search and score it on the test split
best_models = tuner.get_best_models(num_models=1)
best_model = best_models[0]
y_pred = best_model.predict(X_test_scaled)
r_squared = r2_score(y_true=y_test, y_pred=y_pred)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  saveable.load_own_variables(weights_store.get(inner_path))


[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step


In [None]:
# Collect the tuned model's test-set R-squared and a short note.
# Values are wrapped in one-element lists so the dict maps directly to a one-row DataFrame.
optimization_results = {
    'Best R-squared': [r_squared],
    'Notes': ['Best model found using hyperparameter tuning']
}

In [None]:
# Convert results to a DataFrame and save to CSV in the working directory (row index omitted)
optimization_df = pd.DataFrame(optimization_results)
optimization_df.to_csv('model_optimization_results.csv', index=False)

In [None]:
# Confirm where the metrics were written, then echo the results table and the rounded R-squared
print("Model performance metrics saved to 'model_optimization_results.csv'.")
print(optimization_df)
print(f"Final R-squared value: {r_squared:.2f}")


Model performance metrics saved to 'model_optimization_results.csv'.
   Best R-squared                                         Notes
0        0.948458  Best model found using hyperparameter tuning
Final R-squared value: 0.95
