## 1. Preprocessing

In [3]:
from imp import reload
from src import config, data_utils, preprocessing
import pandas as pd

### Getting the data

In [4]:
app_train, app_test = data_utils.get_datasets()
app_train.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')

## Agregating columns

In [5]:

reload(data_utils)
app_train =  data_utils.agregate_columns(app_train)
app_test =  data_utils.agregate_columns(app_test)

## Split targets from dataset

In [6]:
(
 X_train,
 y_train_total_amount,
 y_train_duration_in_minutes,
 X_test,
 y_test_total_amount,
 y_test_duration_in_minutes
) = data_utils.get_feature_target(app_train, app_test)

## Stract validation dataset

In [7]:
(
X_train,
X_val,
y_train_total_amount,
y_val_total_amount,
y_train_duration_in_minutes,
y_val_duration_in_minutes
) = data_utils.get_train_val_sets(X_train, y_train_total_amount, y_train_duration_in_minutes)

In [132]:
print(X_train.shape, X_val.shape, X_test.shape)

(2767016, 21) (691755, 21) (3425676, 21)


## Clear and encode datasets

In [133]:
reload(preprocessing)

<module 'src.preprocessing' from '/Users/yulissaterancerquin/Documents/taxi-price-predictor/src/preprocessing.py'>

In [8]:
X_train, X_val, X_test = preprocessing.preprocess_data(X_train, X_val, X_test)

Input train data shape:  (2767016, 21)
Input val data shape:  (691755, 21)
Input test data shape:  (3425676, 21) 

columns Index(['passenger_count', 'PULocationID', 'DOLocationID', 'payment_type',
       'improvement_surcharge', 'airport_fee', 'pickup_year', 'pickup_day',
       'pickup_day_of_week', 'pickup_hour', 'pickup_minute'],
      dtype='object')


In [135]:
print(X_train.shape)
print(y_train_total_amount.shape)

(2767016, 11)
(2767016,)


## Models

### Linear Regression

[[0.2        0.90458015 0.94656489 ... 1.         0.95652174 0.10169492]
 [0.         1.         0.87022901 ... 0.66666667 0.69565217 0.50847458]
 [0.2        0.61832061 0.33969466 ... 1.         0.91304348 0.33898305]
 ...
 [0.         0.29770992 0.59923664 ... 0.66666667 0.56521739 0.88135593]
 [0.         0.5        0.8740458  ... 0.33333333 0.43478261 0.96610169]
 [0.         0.32824427 0.79389313 ... 0.66666667 0.52173913 0.96610169]]


#### Total amount prediction

In [137]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from src.save_model import save_total_trip_model,save_duration_trip_model
import math

# Create a linear regression model
model = LinearRegression()

# Fit the model
model.fit(X_train, y_train_total_amount)

save_total_trip_model(model)
# Predict using the model

y_pred_train = model.predict(X_train)
print('MAE:', mean_absolute_error(y_train_total_amount, y_pred_train))
print('MSE:', mean_squared_error(y_train_total_amount, y_pred_train))
print('RMSE:', math.sqrt(mean_squared_error(y_train_total_amount, y_pred_train)))
print('R2:', r2_score(y_train_total_amount, y_pred_train))
print('_________________________:')

y_pred_val = model.predict(X_val)
print('MAE:', mean_absolute_error(y_val_total_amount, y_pred_val))
print('MSE:', mean_squared_error(y_val_total_amount, y_pred_val))
print('RMSE:', math.sqrt(mean_squared_error(y_val_total_amount, y_pred_val)))
print('R2:', r2_score(y_val_total_amount, y_pred_val))

MAE: 8.499936202490034
MSE: 227.1152783615407
RMSE: 15.070344334537971
R2: 0.3392703528507055
_________________________:
MAE: 8.459752577545961
MSE: 204.6382722531288
RMSE: 14.305183405085334
R2: 0.35828908484859767


#### Trip duration prediction

In [138]:
# Create a linear regression model
model_duration = LinearRegression()

print(X_train.shape)
print(y_train_duration_in_minutes.shape)
# Fit the model
model_duration.fit(X_train, y_train_duration_in_minutes)

save_duration_trip_model(model_duration)
# Predict using the model

y_pred_train = model_duration.predict(X_train)
print('MAE:', mean_absolute_error(y_train_duration_in_minutes, y_pred_train))
print('MSE:', mean_squared_error(y_train_duration_in_minutes, y_pred_train))
print('RMSE:', math.sqrt(mean_squared_error(y_train_duration_in_minutes, y_pred_train)))
print('R2:', r2_score(y_train_duration_in_minutes, y_pred_train))
print('_________________________:')

y_pred_val = model_duration.predict(X_val)
print('MAE:', mean_absolute_error(y_val_duration_in_minutes, y_pred_val))
print('MSE:', mean_squared_error(y_val_duration_in_minutes, y_pred_val))
print('RMSE:', math.sqrt(mean_squared_error(y_val_duration_in_minutes, y_pred_val)))
print('R2:', r2_score(y_val_duration_in_minutes, y_pred_val))

(2767016, 11)
(2767016,)
MAE: 10.507362989449026
MSE: 2689.339306865035
RMSE: 51.85884019976763
R2: 0.017765573487283692
_________________________:
MAE: 10.524926375876877
MSE: 2739.279361595603
RMSE: 52.33812531602181
R2: 0.01666234906141406


### DecisionTreeRegressor

In [139]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import math

In [140]:
tree = DecisionTreeRegressor(max_depth=10)
tree.fit(X_train, y_train_total_amount)

# Evaluate the model
y_pred_train_DTR = tree.predict(X_train)



print('MAE:', mean_absolute_error(y_train_total_amount, y_pred_train_DTR))
print('MSE:', mean_squared_error(y_train_total_amount, y_pred_train_DTR))
print('RMSE:', math.sqrt(mean_squared_error(y_train_total_amount, y_pred_train_DTR)))
print('R2:', r2_score(y_train_total_amount, y_pred_train_DTR))

y_pred_val_DTR = tree.predict(X_val)

print('val:')

print('MAE:', mean_absolute_error(y_val_total_amount, y_pred_val_DTR))
print('MSE:', mean_squared_error(y_val_total_amount, y_pred_val_DTR))
print('RMSE:', math.sqrt(mean_squared_error(y_val_total_amount, y_pred_val_DTR)))
print('R2:', r2_score(y_val_total_amount, y_pred_val_DTR))

MAE: 5.927281148257893
MSE: 128.48125407285585
RMSE: 11.334957171196363
R2: 0.6262190096532394
val:
MAE: 5.91380714453071
MSE: 108.65590724153505
RMSE: 10.423814428582995
R2: 0.6592735029236191


### XGBRegressor

In [143]:
import xgboost as xgb
print('__version__',xgb.__version__)
from sklearn.datasets import make_regression
#XGBRegressor
model_xgb = xgb.XGBRegressor(objective='reg:squarederror', n_jobs=1, random_state=42,
                               n_estimators=500, max_depth=5, learning_rate=00.1, 
                               subsample=0.8, colsample_bytree=0.8)

# Fit the model
model_xgb.fit(X_train, y_train_total_amount)

y_pred_train_xgb = model.predict(X_train)

print('MAE:', mean_absolute_error(y_train_total_amount, y_pred_train_xgb))
print("MSE:", mean_squared_error(y_train_total_amount, y_pred_train_xgb))
print('RMSE:', math.sqrt(mean_squared_error(y_train_total_amount, y_pred_train_xgb)))
r2 = r2_score(y_train_total_amount, y_pred_train_xgb)
print(f"R2: {r2_score(y_train_total_amount, y_pred_train_xgb):.2f}")

# Predicting the Test set results
y_pred_val_xgb = model.predict(X_val)

print("Val:")
print('MAE:', mean_absolute_error(y_val_total_amount, y_pred_val_xgb))
print("MSE:", mean_squared_error(y_val_total_amount, y_pred_val_xgb))
print('RMSE:', math.sqrt(mean_squared_error(y_val_total_amount, y_pred_val_xgb)))
print(f"R2: {r2_score(y_val_total_amount, y_pred_val_xgb):.2f}")

__version__ 2.0.3
MAE: 8.499936202490034
MSE: 227.1152783615407
RMSE: 15.070344334537971
R2: 0.34
Val:
MAE: 8.459752577545961
MSE: 204.6382722531288
RMSE: 14.305183405085334
R2: 0.36


#### Random Forest Regressor

In [150]:
rf_reg = RandomForestRegressor(n_estimators=6, max_depth=5, random_state=42, n_jobs=1)
rf_reg.fit(X_train, y_train_total_amount)

# Evaluate the model
y_pred_train_RFR = rf_reg.predict(X_train)

print('MAE:', mean_absolute_error(y_train_total_amount, y_pred_train_RFR))
print("MSE:", mean_squared_error(y_train_total_amount, y_pred_train_RFR))
print('RMSE:', math.sqrt(mean_squared_error(y_train_total_amount, y_pred_train_RFR)))
r2 = r2_score(y_train_total_amount, y_pred_train_RFR)
print(f"R2: {r2_score(y_train_total_amount, y_pred_train_RFR):.2f}")

# Predicting the Test set results
y_pred_val_RFR = model.predict(X_val)

print("Val:")
print('MAE:', mean_absolute_error(y_val_total_amount, y_pred_val_RFR))
print("MSE:", mean_squared_error(y_val_total_amount, y_pred_val_RFR))
print('RMSE:', math.sqrt(mean_squared_error(y_val_total_amount, y_pred_val_RFR)))
print(f"R2: {r2_score(y_val_total_amount, y_pred_val_RFR):.2f}")

MAE: 7.254991895333542
MSE: 172.27699229871806
RMSE: 13.125433032807644
R2: 0.50
Val:
MAE: 8.459752577545961
MSE: 204.6382722531288
RMSE: 14.305183405085334
R2: 0.36


## Advanced cross validation



In [153]:
from sklearn.model_selection import cross_val_score, KFold
X, y = X_train, y_train_total_amount
# Create a random forest regression model
model = RandomForestRegressor(n_estimators=10)

# Configure the cross-validation procedure
cv = KFold(n_splits=10, shuffle=True)

# Evaluate the model using the cross-validation procedure
scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv)

# The scores are negative, which is how scikit-learn shows them for MSE (want to maximize negative MSE)
mse_scores = -scores

# Mean and standard deviation of MSE across all folds
print("MSE:", mse_scores.mean(), "Std:", mse_scores.std())

MSE: 115.78741919896211 Std: 49.406062188298016


In [154]:
from sklearn.ensemble import StackingRegressor
import xgboost as xgb
# Base models
model1 = RandomForestRegressor(random_state=42)
model2 = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, seed=123)
model3 = RandomForestRegressor(n_estimators=100)

# Defining the meta model, using a regressor 
meta_model = RandomForestRegressor(random_state=42)

# Create Stacking model
stacking_model = StackingRegressor(
    estimators=[('rf1', model1), ('xgb', model2), ('rf2', model3)],
    final_estimator=meta_model,
    cv=5
)

# Training the model
stacking_model.fit(X_train, y_train_total_amount)

# Predictions
y_pred = stacking_model.predict(X_test)

# Evaluation metrics
# Calculate MSE
mse = mean_squared_error(y_test_total_amount, y_pred)
print(f"Mean Squared Error: {mse:.2f}")

# Calculate R-squared
r2 = r2_score(y_test, y_pred)
print(f"R-squared: {r2:.2f}")