<a href="https://colab.research.google.com/github/tkoide01/BikeShareRiderPredictor_UChicago_MachineLearningFinalProject/blob/main/CapitalBikeshare_PredictiveModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import os
from matplotlib.ticker import FuncFormatter

# Set the display format for floating-point numbers
pd.options.display.float_format = '{:.2f}'.format

# Mount Google Drive: 
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### 1. Load Bike Sharing Service CSV data into dataframe

In [2]:

# Specify the directory path: 
path = "/content/drive/My Drive/MSCA31009MLPA_FinalProject/data"
!ls "/content/drive/My Drive/MSCA31009MLPA_FinalProject/data"

2010-2017  2021		     post-pandemic	      pre_pandemic_data2.csv
2018	   2022		     post_pandemic_data2.csv  pre_pandemic_data.csv
2019	   DF_2010-2022.csv  post_pandemic_data.csv
2020	   Merged_data	     pre-pandemic


In [3]:
# Load the two pre and post pandemic data
file_name1 = '/pre_pandemic_data2.csv'
file_name2 = '/post_pandemic_data2.csv'


pre_pandemic_data = pd.read_csv(path+file_name1)
print(pre_pandemic_data.head())
print(pre_pandemic_data.shape)
post_pandemic_data = pd.read_csv(path+file_name2)
print(post_pandemic_data.head())
print(post_pandemic_data.shape)

   Unnamed: 0  Start station number  End station number Member type  \
0           0                 31634               31208      Member   
1           1                 31258               31270      Casual   
2           2                 31289               31222      Casual   
3           3                 31289               31222      Casual   
4           4                 31258               31270      Casual   

  day_of_week    month  year  Total trip count  
0      Sunday  January  2017                 2  
1      Sunday  January  2017                 4  
2      Sunday  January  2017                19  
3      Sunday  January  2017                19  
4      Sunday  January  2017                 4  
(25915290, 8)


  post_pandemic_data = pd.read_csv(path+file_name2)


   Unnamed: 0 Start station number End station number Member type day_of_week  \
0           0             31318.00           31405.00      Casual      Friday   
1           1             31270.00           31663.00      Member      Friday   
2           2             31926.00           31036.00      Member    Thursday   
3           3             31907.00           31047.00      Member    Thursday   
4           4             31931.00           31047.00      Casual      Monday   

     month  year  Total trip count  
0  January  2021                 1  
1  January  2021                 1  
2  January  2021                 1  
3  January  2021                 1  
4  January  2021                 1  
(6045461, 8)


### 2. Run the Predictive models: LSTM Model and Random-Forest regressor as Ensemble model

In [4]:
# Import necessary libraries for running LSTM model and RF model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,  OneHotEncoder
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

Index(['Unnamed: 0', 'Start station number', 'End station number',
       'Member type', 'day_of_week', 'month', 'year', 'Total trip count'],
      dtype='object')

In [6]:
# Assign independent and dependent variables based on 
pre_pandemic_X = pre_pandemic_data[['Start station number','End station number', 'Member type','day_of_week', 'month', 'year']]
pre_pandemic_y = pre_pandemic_data['Total trip count']
post_pandemic_X = post_pandemic_data[['Start station number','End station number', 'Member type','day_of_week', 'month', 'year']]
post_pandemic_y = post_pandemic_data['Total trip count']

# One-hot encode the 'Member type' column
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
pre_pandemic_X_encoded = pd.DataFrame(encoder.fit_transform(pre_pandemic_X[['Member type']]))
pre_pandemic_X_encoded.columns = encoder.get_feature_names_out(['Member type'])
pre_pandemic_X = pd.concat([pre_pandemic_X.drop('Member type', axis=1), pre_pandemic_X_encoded], axis=1)


# Define hyperparameters
lstm_params = [
    {'units': 50, 'epochs': 50, 'batch_size': 32},
    {'units': 100, 'epochs': 100, 'batch_size': 64},
    {'units': 200, 'epochs': 100, 'batch_size': 128}
]

rf_params = [
    {'n_estimators': 100},
    {'n_estimators': 200},
    {'n_estimators': 300}
]



In [None]:
# Perform model training and evaluation for each set of hyperparameters
results = []

for lstm_param in lstm_params:
    for rf_param in rf_params:
        # LSTM Model
        # Split the pre-pandemic data into train and test sets
        X_train, X_test, y_train, y_test = train_test_split(pre_pandemic_X, pre_pandemic_y, test_size=0.2, random_state=42)
        
        # Normalize the data
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Reshape the input data for LSTM
        X_train_reshaped = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
        X_test_reshaped = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

        # Build the LSTM model
        model = Sequential()
        model.add(LSTM(lstm_param['units'], activation='relu', input_shape=(1, X_train_scaled.shape[1])))
        model.add(Dense(1))
        model.compile(optimizer='adam', loss='mse')

        # Train the LSTM model
        model.fit(X_train_reshaped, y_train, epochs=lstm_param['epochs'], batch_size=lstm_param['batch_size'], verbose=0)

        # Evaluate the LSTM model
        y_pred_lstm = model.predict(X_test_reshaped)
        mse_lstm = mean_squared_error(y_test, y_pred_lstm)

        # Ensemble Model
        # Train a Random Forest regressor on the post-pandemic data
        model_rf = RandomForestRegressor(**rf_param, random_state=42)
        model_rf.fit(post_pandemic_X, post_pandemic_y)

        # Predict using the Random Forest regressor
        y_pred_rf = model_rf.predict(X_test)
        mse_rf = mean_squared_error(y_test, y_pred_rf)

        # Combine the predictions using simple averaging
        y_pred_ensemble = (y_pred_lstm + y_pred_rf) / 2
        mse_ensemble = mean_squared_error(y_test, y_pred_ensemble)

        # Store the results
        result = {
            'LSTM Parameters': lstm_param,
            'Random Forest Parameters': rf_param,
            'LSTM MSE': mse_lstm,
            'Random Forest MSE': mse_rf,
            'Ensemble MSE': mse_ensemble
        }
        results.append(result)

In [None]:
# Print the results
for result in results:
    print(f"LSTM Parameters: {result['LSTM Parameters']}")
    print(f"Random Forest Parameters: {result['Random Forest Parameters']}")
    print(f"LSTM MSE: {result['LSTM MSE']}")
    print(f"Random Forest MSE: {result['Random Forest MSE']}")
    print(f"Ensemble MSE: {result['Ensemble MSE']}")
    print()