## Data Cleaning

In [1]:
# Load packages
import pandas as pd
import numpy as np

In [2]:
# Read the July 2021 and July 2022 yellow taxi trip records from parquet (pyarrow engine)
july21 = pd.read_parquet(
    '/Users/zach/Personal Projects/CSE 573 Final Project/yellow_tripdata_2021-07.parquet',
    engine='pyarrow',
)
july22 = pd.read_parquet(
    '/Users/zach/Personal Projects/CSE 573 Final Project/yellow_tripdata_2022-07.parquet',
    engine='pyarrow',
)

In [4]:
# Add a `duration` column holding the trip length in seconds (drop-off minus pickup).
# Both months receive identical treatment, so they are handled in one loop; the column
# assignments modify july21 / july22 in place.
for trips in (july21, july22):
    pickup = pd.to_datetime(trips['tpep_pickup_datetime'])
    dropoff = pd.to_datetime(trips['tpep_dropoff_datetime'])
    trips['tpep_pickup_datetime'] = pickup
    trips['tpep_dropoff_datetime'] = dropoff
    # Timedelta -> float seconds
    trips['duration'] = (dropoff - pickup).dt.total_seconds()

In [5]:
# Derive the hour of day at which each trip was picked up and dropped off
for trips in (july21, july22):
    trips['start_hour'] = trips['tpep_pickup_datetime'].dt.hour
    trips['end_hour'] = trips['tpep_dropoff_datetime'].dt.hour

In [6]:
# Remove the columns that are not used as model features.
# NOTE(review): total_amount presumably includes the tip, so keeping it would leak the
# target into the features - confirm against the TLC data dictionary.
unwanted_columns = [
    'VendorID',
    'tpep_pickup_datetime',
    'tpep_dropoff_datetime',
    'store_and_fwd_flag',
    'total_amount',
]

july21 = july21.drop(columns=unwanted_columns)
july22 = july22.drop(columns=unwanted_columns)

In [7]:
# Drop rows with misleading values, then incomplete and duplicated rows
def _drop_misleading_rows(trips):
    """Return `trips` restricted to plausible, complete, unique rows.

    A row is kept only when passenger_count is between 1 and 6, trip_distance and
    fare_amount are strictly positive, and RatecodeID is below 99. Rows with a
    missing value in any of those columns fail the comparison and are removed.
    Afterwards any remaining row containing a missing value is dropped, followed
    by exact duplicate rows.
    """
    plausible = (
        (trips['passenger_count'] > 0)
        & (trips['passenger_count'] < 7)
        & (trips['trip_distance'] > 0)
        & (trips['fare_amount'] > 0)
        & (trips['RatecodeID'] < 99)
    )
    return trips[plausible].dropna().drop_duplicates()


july21 = _drop_misleading_rows(july21)
july22 = _drop_misleading_rows(july22)

In [None]:
# Target encode payment_type and RatecodeID with the mean tip_amount of each category.
#
# The encodings are computed from July 2021 only and then applied unchanged to
# July 2022, so nothing from the month used for evaluation feeds into the features.
# The means are taken from the data rather than pasted in as literals, so they stay
# correct whenever the cleaning steps or the input file change.
#
# NOTE: this cell overwrites the original category codes. Re-running it without first
# re-running the cleaning cells would try to encode already-encoded values.
means_payment = july21['tip_amount'].groupby(july21['payment_type']).agg(['mean'])
means_ratecode = july21['tip_amount'].groupby(july21['RatecodeID']).agg(['mean'])

# Fallback for categories that appear in July 2022 but never in July 2021; without it
# those rows would become NaN and the models below could not use them.
overall_mean_tip = july21['tip_amount'].mean()

# Mapping with a Series looks each value up in the Series index (the category code)
july21['payment_type'] = july21['payment_type'].map(means_payment['mean'])
july21['RatecodeID'] = july21['RatecodeID'].map(means_ratecode['mean'])

july22['payment_type'] = july22['payment_type'].map(means_payment['mean']).fillna(overall_mean_tip)
july22['RatecodeID'] = july22['RatecodeID'].map(means_ratecode['mean']).fillna(overall_mean_tip)

In [None]:
# Split data sets into features and outcomes
x_21 = july21.drop('tip_amount', axis=1)
y_21 = july21['tip_amount']

x_22 = july22.drop('tip_amount', axis=1)
y_22 = july22['tip_amount']

## Linear Regression

In [None]:
# Load required packages
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Fit ordinary least squares on July 2021, predict July 2022 tips, and report the MSE
linreg = LinearRegression().fit(x_21, y_21)  # fit() returns the fitted estimator
linreg_preds = linreg.predict(x_22)
linreg_mse = mean_squared_error(y_true=y_22, y_pred=linreg_preds)
linreg_mse

## Ridge Regression

In [None]:
# Load required package
from sklearn.linear_model import Ridge

In [None]:
# Fit ridge regression (alpha = 0.5) on July 2021, predict July 2022 tips, and report the MSE
ridgereg = Ridge(alpha=0.5).fit(x_21, y_21)  # fit() returns the fitted estimator
ridgereg_preds = ridgereg.predict(x_22)
ridgereg_mse = mean_squared_error(y_true=y_22, y_pred=ridgereg_preds)
ridgereg_mse

## Gradient Boosted Tree

In [None]:
# Load required packages
import xgboost as xgb
import sklearn.model_selection as skl
import sklearn.preprocessing as sklp
import sklearn.decomposition as skld
from sklearn.model_selection import train_test_split

In [None]:
# Split the July 2021 data into train (80%) and validation (20%) sets.
# random_state is fixed so the split - and therefore the hyperparameter tuning table
# built from it - is the same on every run.
x_train, x_val, y_train, y_val = train_test_split(x_21, y_21, test_size=0.2, random_state=42)

In [None]:
# Grid-search XGBoost hyperparameters. Every combination is fitted on the training
# split and scored by MSE on the held-out validation split (a single hold-out split,
# not k-fold cross validation).
#
# Results are collected in a plain list and turned into a DataFrame once at the end,
# instead of enlarging the DataFrame one row at a time.
tuning_records = []

for learning_rate in [0.01, 0.05, 0.1]:
    for max_depth in [1, 3, 5, 7, 10, 15]:
        for min_child_weight in [1, 2, 3, 4, 5]:
            xg_model = xgb.XGBRegressor(objective='reg:squarederror', booster='gbtree', gamma=0.05,
                                        learning_rate=learning_rate, colsample_bytree=0.7,
                                        max_depth=max_depth, min_child_weight=min_child_weight,
                                        n_estimators=250)
            xg_model.fit(x_train, y_train)
            val_preds = xg_model.predict(x_val)
            mse = mean_squared_error(y_val, val_preds)
            tuning_records.append({'learning': learning_rate, 'depth': max_depth,
                                   'child_weight': min_child_weight, 'mse': mse})
            # Progress output: validation MSE of the combination just fitted
            print("MSE: %.2f" % mse)

# One row per combination; column names match what the later cells read back
param_tuning = pd.DataFrame(tuning_records, columns=['learning', 'depth', 'child_weight', 'mse'])

In [None]:
# Write the tuning table to CSV without the index column
# (the final XGBoost cell reads it back from this same path)
param_tuning.to_csv(
    r'/Users/zach/Personal Projects/CSE 573 Final Project/param_tuning.csv',
    index=False,
)

In [None]:
# Refit XGBoost on all of July 2021 using the tuned hyperparameters, then calculate
# the MSE of its predictions for July 2022
param_tuning = pd.read_csv('/Users/zach/Personal Projects/CSE 573 Final Project/param_tuning.csv')

# Row with the lowest validation MSE. `best` is a Series indexed by column name, so its
# fields are read by label; integer keys on a label-indexed Series are deprecated in pandas.
best = param_tuning.loc[param_tuning['mse'].idxmin()]

# Selecting a row of mixed int/float columns yields floats, hence the explicit casts
# for the two integer-valued hyperparameters.
xg_model = xgb.XGBRegressor(objective='reg:squarederror', booster='gbtree', gamma=0.05,
                            learning_rate=float(best['learning']), colsample_bytree=0.7,
                            max_depth=int(best['depth']),
                            min_child_weight=int(best['child_weight']), n_estimators=250)
xg_model.fit(x_21, y_21)
xg_preds = xg_model.predict(x_22)
xg_mse = mean_squared_error(y_22, xg_preds)
xg_mse

## K-Nearest Neighbors

In [None]:
# Load required package
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Fit a 10-nearest-neighbours regressor on July 2021, predict July 2022 tips, and report the MSE
knn = KNeighborsRegressor(n_neighbors=10).fit(x_21, y_21)  # fit() returns the fitted estimator
knn_preds = knn.predict(x_22)
knn_mse = mean_squared_error(y_true=y_22, y_pred=knn_preds)
knn_mse

## Decision Tree

In [None]:
# Load required package
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Train a depth-limited regression tree on July 2021, get predictions for July 2022,
# and calculate MSE.
# random_state is fixed because scikit-learn permutes the features at every split, so
# equally good splits could otherwise be resolved differently from run to run.
decT = DecisionTreeRegressor(criterion='squared_error', max_depth=15, min_samples_leaf=4,
                             min_impurity_decrease=0.001, random_state=42)
decT.fit(x_21, y_21)
decT_preds = decT.predict(x_22)
decT_mse = mean_squared_error(y_22, decT_preds)
decT_mse

## Random Forest

In [None]:
# Load required package
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Train a random forest on July 2021, get predictions for July 2022, and calculate MSE.
# Each tree is grown on a bootstrap sample and considers 75% of the features per split;
# random_state is fixed so those random draws - and the reported MSE - are reproducible.
rfT = RandomForestRegressor(criterion='squared_error', max_depth=15, min_samples_leaf=4,
                            max_features=0.75, random_state=42)
rfT.fit(x_21, y_21)
rfT_preds = rfT.predict(x_22)
rfT_mse = mean_squared_error(y_22, rfT_preds)
rfT_mse

## Support Vector Machine

In [None]:
# Load required packages
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [None]:
# Scale features to zero mean / unit variance.
# The scaler is fitted on the July 2021 (training) features only.
scX_21 = StandardScaler()
scaledX_21 = scX_21.fit_transform(x_21)

# July 2022 must be transformed with the statistics learned from July 2021: a model
# trained on scaledX_21 expects inputs on that exact scale, and fitting a second scaler
# on the evaluation month would both shift the features and use information from it.
scX_22 = scX_21  # name kept so any code that refers to scX_22 keeps working
scaledX_22 = scX_22.transform(x_22)

In [None]:
# Fit a linear-kernel support vector regressor on the scaled July 2021 features, predict
# July 2022 tips, and report the MSE - Takes too long to run
svmM = SVR(kernel='linear').fit(scaledX_21, y_21)  # fit() returns the fitted estimator
svm_preds = svmM.predict(scaledX_22)
svm_mse = mean_squared_error(y_true=y_22, y_pred=svm_preds)
svm_mse

## Output

In [None]:
# Gather each model's July 2022 MSE into one table and write it to CSV.
# The SVM is not listed here (svm_mse is not referenced by this cell).
data = [
    ['Linear Regression', linreg_mse],
    ['Ridge Regression', ridgereg_mse],
    ['XGBoost', xg_mse],
    ['KNN', knn_mse],
    ['Decision Tree', decT_mse],
    ['Random Forest', rfT_mse],
]
model_errors = pd.DataFrame(data=data, columns=['Model', 'MSE'])
model_errors.to_csv(
    r'/Users/zach/Personal Projects/CSE 573 Final Project/model_errors.csv',
    index=False,
)
