### Import Nessary library

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import gcsfs
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

###  Load dataset 

In [9]:
#loading CSV file in the GCS bucket
# gcs_path = 'gs://cab_bucket/cab-gcp-vertex-pipelines1/data/Final_Chicago_Train.csv'
gcs_path = 'Final_Chicago_Train.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(gcs_path)

# Display the DataFrame
df.head()


Unnamed: 0,unique_key,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_census_tract,dropoff_census_tract,pickup_community_area,dropoff_community_area,...,company,pickup_latitude,pickup_longitude,pickup_location,dropoff_latitude,dropoff_longitude,dropoff_location,trip_start_day,trip_start_month,trip_start_hour
0,4b0ff5638ed5905d99da624abc903b967f868cfd,3426c4a607f9804cc5f33ecb1cab61f70f63b3f0593c94...,2019-03-11 12:45:00+00:00,2019-03-11 12:45:00+00:00,340,0.84,17031840000.0,17031280000.0,32.0,28.0,...,24 Seven Taxi,41.880994,-87.632746,POINT (-87.6327464887 41.8809944707),41.8853,-87.642808,POINT (-87.6428084655 41.8853000224),11,3,12
1,eb868659d19997db5a97e8ae22097ce12fa3c8e0,5776c9e3fe3235f1c036220f324b07aa8728ebd6641bfc...,2019-03-11 22:45:00+00:00,2019-03-11 23:15:00+00:00,1213,7.66,,,32.0,6.0,...,Chicago Carriage Cab Corp,41.878866,-87.625192,POINT (-87.6251921424 41.8788655841),41.944227,-87.655998,POINT (-87.6559981815 41.9442266014),11,3,22
2,3cd47551b2424db69e6923b7ae4929fd7f04eed2,40dd26181941ac03ca95b3d4cf2f0d12b34ae0338e688b...,2023-05-12 07:00:00+00:00,2023-05-12 07:15:00+00:00,1134,5.47,,,23.0,28.0,...,Flash Cab,41.90007,-87.720918,POINT (-87.7209182385 41.9000696026),41.874005,-87.663518,POINT (-87.6635175498 41.874005383),12,5,7
3,5ded2218e2090f75607f2b03d00265ef69d9d71f,1d72db3a18692cc5b5e4ea41bb1de0e45d4149b495dbd0...,2021-06-13 12:45:00+00:00,2021-06-13 13:00:00+00:00,563,1.62,,,3.0,77.0,...,Flash Cab,41.965812,-87.655879,POINT (-87.6558787862 41.96581197),41.986712,-87.663416,POINT (-87.6634164054 41.9867117999),13,6,12
4,67fd7d0b6a607a3d228738481960eb1dd14079ad,48c3c22d766613be3982924a72efdc03758b9666b3996b...,2023-01-09 10:30:00+00:00,2023-01-09 10:45:00+00:00,420,1.4,,,28.0,28.0,...,"Taxicab Insurance Agency, LLC",41.874005,-87.663518,POINT (-87.6635175498 41.874005383),41.874005,-87.663518,POINT (-87.6635175498 41.874005383),9,1,10


In [10]:
# Convert the 'trip_start_timestamp' to datetime if it's not already.
df['trip_start_timestamp'] = pd.to_datetime(df['trip_start_timestamp'])

# Extract hour, day, and month from the timestamp
df['trip_start_hour'] = df['trip_start_timestamp'].dt.hour
df['trip_start_day'] = df['trip_start_timestamp'].dt.day
df['trip_start_month'] = df['trip_start_timestamp'].dt.month

In [11]:
df['trip_end_timestamp'] = pd.to_datetime(df['trip_end_timestamp'])

df['trip_end_hour'] = df['trip_end_timestamp'].dt.hour
df['trip_end_day'] = df['trip_end_timestamp'].dt.day
df['trip_end_month'] = df['trip_end_timestamp'].dt.month

In [None]:
columns_to_drop = ['unique_key', 'taxi_id', 'trip_start_timestamp', 
                   'trip_end_timestamp', 'dropoff_location', 'pickup_location']
# Dropping unnecessary columns 
df= df.drop(columns=columns_to_drop)


### Data Preprocessing 

In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.impute import SimpleImputer

def one_hot_encode(values, num_categories):
    """One-hot encode the values."""
    categories = sorted(set(values.dropna()))  # Handle NaN by dropping
    one_hot_encoded = []
    for v in values:
        encoding = [1 if v == category else 0 for category in categories]
        one_hot_encoded.append(encoding)
    return pd.DataFrame(one_hot_encoded, columns=[f"{values.name}_{c}" for c in categories], index=values.index)

def preprocess_data(df):
    numerical_features = ['trip_miles', 'trip_seconds', 'tips', 'tolls', 'extras', 'trip_total']
    bucket_features = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']
    categorical_numerical_features = [
        'trip_start_hour', 'trip_start_day', 'trip_start_month',
        'trip_end_hour', 'trip_end_day', 'trip_end_month',
        'pickup_census_tract', 'dropoff_census_tract', 'pickup_community_area',
        'dropoff_community_area'
    ]
    categorical_string_features = ['payment_type', 'company']
    
    # Handling missing values and scaling numerical features
    for feature in numerical_features:
        df[feature] = SimpleImputer(strategy='mean').fit_transform(df[[feature]])
        df[feature] = StandardScaler().fit_transform(df[[feature]])
    
    # Bucketizing geographical features
    for feature in bucket_features:
        df[feature] = SimpleImputer(strategy='mean').fit_transform(df[[feature]])
        discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
        df[feature] = discretizer.fit_transform(df[[feature]])
    
    # One-hot encoding for categorical string features
    for feature in categorical_string_features:
        df_filled = SimpleImputer(strategy='constant', fill_value='missing').fit_transform(df[[feature]].astype(str))
        df_encoded = one_hot_encode(pd.Series(df_filled.flatten(), name=feature), num_categories=None)
        df = pd.concat([df, df_encoded], axis=1)
        df.drop(columns=[feature], inplace=True)
    
    # One-hot encoding for categorical numerical features
    for feature in categorical_numerical_features:
        df_filled = SimpleImputer(strategy='most_frequent').fit_transform(df[[feature]].astype(str))
        df_encoded = one_hot_encode(pd.Series(df_filled.flatten(), name=feature), num_categories=None)
        df = pd.concat([df, df_encoded], axis=1)
        df.drop(columns=[feature], inplace=True)
    
    # Fill missing values for the label (fare)
    df['fare'] = SimpleImputer(strategy='mean').fit_transform(df[['fare']])
    
    return df

In [15]:
# Assuming your DataFrame after dropping columns is named df
df_processed = preprocess_data(df)

# Now you can display the processed DataFrame
df_processed.head()

Unnamed: 0,trip_seconds,trip_miles,fare,tips,tolls,extras,trip_total,pickup_latitude,pickup_longitude,dropoff_latitude,...,dropoff_community_area_71.0,dropoff_community_area_72.0,dropoff_community_area_73.0,dropoff_community_area_74.0,dropoff_community_area_75.0,dropoff_community_area_76.0,dropoff_community_area_77.0,dropoff_community_area_8.0,dropoff_community_area_9.0,dropoff_community_area_nan
0,-0.390372,-0.506731,5.75,-0.075512,-0.020947,-0.103364,-0.262552,6.0,7.0,6.0,...,0,0,0,0,0,0,0,0,0,0
1,0.081204,0.322058,21.5,-0.628153,-0.020947,-0.044446,-0.00109,6.0,7.0,7.0,...,0,0,0,0,0,0,0,0,0,0
2,0.03853,0.055922,16.75,-0.628153,-0.020947,-0.103364,-0.106592,6.0,5.0,6.0,...,0,0,0,0,0,0,0,0,0,0
3,-0.269912,-0.411943,8.0,-0.628153,-0.020947,-0.103364,-0.267139,8.0,6.0,9.0,...,0,0,0,0,0,0,1,0,0,0
4,-0.347157,-0.438678,7.25,-0.628153,-0.020947,-0.103364,-0.280901,6.0,6.0,6.0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
df.columns

Index(['trip_seconds', 'trip_miles', 'pickup_census_tract',
       'dropoff_census_tract', 'pickup_community_area',
       'dropoff_community_area', 'fare', 'tips', 'tolls', 'extras',
       'trip_total', 'payment_type', 'company', 'pickup_latitude',
       'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude',
       'trip_start_day', 'trip_start_month', 'trip_start_hour',
       'trip_end_hour', 'trip_end_day', 'trip_end_month'],
      dtype='object')

### Defining X & y

In [17]:
y =df_processed['fare']
# Dropping unnecessary columns 
X= df_processed.drop(columns='fare')


### Train Test Split

In [18]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

### Fitting the model and evaluating it

In [19]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Define the ANN functions
def _build_ann_model():
    """Creates a simple artificial neural network model."""
    ann_model = MLPRegressor(hidden_layer_sizes=(100, 70, 50, 20),
                             activation='relu',
                             solver='adam',
                             learning_rate_init=0.0005,
                             random_state=42)
    return ann_model

def _train_ann_model(model, train_data, train_labels):
    """Trains the artificial neural network model."""
    model.fit(train_data, train_labels)

def _evaluate_ann_model(model, eval_data, eval_labels):
    """Evaluates the artificial neural network model."""
    eval_predictions = model.predict(eval_data)
    mse = ((eval_predictions - eval_labels) ** 2).mean()
    return mse

# Initialize models including the ANN model
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "ANN": _build_ann_model()  
}

# Define a function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    if isinstance(model, MLPRegressor):
        # Train and evaluate ANN model separately
        _train_ann_model(model, X_train, y_train)
        mse = _evaluate_ann_model(model, X_test, y_test)
        return mse, np.sqrt(mse), None  # No R2 score for ANN model
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)
        return mse, rmse, r2

# Evaluate each model
results = {}
for model_name, model in models.items():
    mse, rmse, r2 = evaluate_model(model, X_train, X_test, y_train, y_test)
    results[model_name] = {"MSE": mse, "RMSE": rmse, "R2": r2}

# Print the results
for model_name, metrics in results.items():
    if metrics['R2'] is not None:
        print(f"{model_name} - MSE: {metrics['MSE']:.4f}, RMSE: {metrics['RMSE']:.4f}, R2: {metrics['R2']:.4f}")
    else:
        print(f"{model_name} - MSE: {metrics['MSE']:.4f}, RMSE: {metrics['RMSE']:.4f}")


Linear Regression - MSE: 322788644909691264.0000, RMSE: 568144915.4130, R2: -903044636083671.0000
Decision Tree - MSE: 23.9307, RMSE: 4.8919, R2: 0.9331
Random Forest - MSE: 42.9145, RMSE: 6.5509, R2: 0.8799
Gradient Boosting - MSE: 36.2956, RMSE: 6.0246, R2: 0.8985
ANN - MSE: 6.2948, RMSE: 2.5089


##### Here ANN model is giving the best result compared to other models. So we are selecting the ANN model

### Hyperparameter Tuning

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Define the ANN functions
def _build_ann_model():
    """Creates a simple artificial neural network model."""
    ann_model = MLPRegressor(random_state=42)
    return ann_model
def _train_ann_model(model, train_data, train_labels):
    """Trains the artificial neural network model."""
    model.fit(train_data, train_labels)
def _evaluate_ann_model(model, eval_data, eval_labels):
    """Evaluates the artificial neural network model."""
    eval_predictions = model.predict(eval_data)
    mse = mean_squared_error(eval_labels, eval_predictions)
    return mse
# Hyperparameter grids
param_grids = {
    "Linear Regression": {},
    "Decision Tree": {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 10, 20]
    },
    "Random Forest": {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 10, 20]
    },
    "Gradient Boosting": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },
    "ANN": {
        'hidden_layer_sizes': [(50,), (100,), (100, 50), (100, 70, 50, 20)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'sgd'],
        'learning_rate_init': [0.001, 0.0005, 0.0001]
    }
}
# Initialize models including the ANN model
models = {
    # "Linear Regression": LinearRegression(),
    # "Decision Tree": DecisionTreeRegressor(random_state=42),
    # "Random Forest": RandomForestRegressor(random_state=42),
    # "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "ANN": _build_ann_model()  # Include the ANN model here
}

# Define a function to evaluate models
def evaluate_model(model, param_grid, X_train, X_test, y_train, y_test):
    if isinstance(model, MLPRegressor):
        grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=3)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        _train_ann_model(best_model, X_train, y_train)
        mse = _evaluate_ann_model(best_model, X_test, y_test)
        return mse, np.sqrt(mse), None, best_model
    else:
        grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=3)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)
        return mse, rmse, r2, best_model

# Evaluate each model
results = {}
best_models = {}
for model_name, model in models.items():
    param_grid = param_grids[model_name]
    mse, rmse, r2, best_model = evaluate_model(model, param_grid, X_train, X_test, y_train, y_test)
    results[model_name] = {"MSE": mse, "RMSE": rmse, "R2": r2}
    best_models[model_name] = best_model

# Print the results
for model_name, metrics in results.items():
    if metrics['R2'] is not None:
        print(f"{model_name} - MSE: {metrics['MSE']:.4f}, RMSE: {metrics['RMSE']:.4f}, R2: {metrics['R2']:.4f}")
    else:
        print(f"{model_name} - MSE: {metrics['MSE']:.4f}, RMSE: {metrics['RMSE']:.4f}")

# Print the best model for each algorithm
for model_name, best_model in best_models.items():
    print(f"Best model for {model_name}: {best_model}")
