In [1]:
import sys

import pandas as pd
import numpy as np

# Add the project's feature code directory to the import search path.
# The path is relative, so it depends on the notebook's working directory.
sys.path.insert(1, "../src/features")

# Project-local cleaning helpers (presumably resolved via the path added above).
import data_cleaning

# allows all columns to be displayed
pd.set_option('display.max_columns', None)

In [2]:
# Locations of the two raw CSV inputs, relative to the notebook's working directory.
# NOTE: despite the *_DIR suffix, each constant is the path of a single CSV file
# (both are passed straight to pd.read_csv below).
OFFER_DATA_DIR = "../data/offer_acceptance_offers.csv"
ORDER_DATA_DIR = "../data/offer_acceptance_orders.csv"

### Data Cleaning

In [3]:
# Columns needed from each raw file. Passing them as `usecols` means the
# columns we would discard anyway are never parsed or held in memory.
OFFER_COLS = ["CARRIER_ID", "REFERENCE_NUMBER", "CREATED_ON_HQ", "RATE_USD",
              "OFFER_TYPE", "LOAD_DELIVERED_FROM_OFFER"]
ORDER_COLS = ["REFERENCE_NUMBER", "ORDER_DATETIME_PST", "PICKUP_DEADLINE_PST",
              "ORIGIN_3DIGIT_ZIP", "DESTINATION_3DIGIT_ZIP",
              "APPROXIMATE_DRIVING_ROUTE_MILEAGE", "PALLETIZED_LINEAR_FEET"]

# `usecols` keeps the file's own column order, so re-index with the list to
# get exactly the same column order as before.
offers = pd.read_csv(OFFER_DATA_DIR, usecols=OFFER_COLS, low_memory=False)[OFFER_COLS]
orders = pd.read_csv(ORDER_DATA_DIR, usecols=ORDER_COLS, low_memory=False)[ORDER_COLS]

# Convert the timestamp columns via the project helper.
offers = data_cleaning.change_to_date(offers, ["CREATED_ON_HQ"])
orders = data_cleaning.change_to_date(orders, ["ORDER_DATETIME_PST", "PICKUP_DEADLINE_PST"])

In [4]:
# Order-side parsing: zip codes first, then datetimes.
orders = data_cleaning.parse_datetime(data_cleaning.parse_zipcode(orders))

# Run flatten_ref_num on both tables (offers, then orders) before joining them.
offers, orders = (data_cleaning.flatten_ref_num(frame) for frame in (offers, orders))

# Inner join: only rows present in both tables survive.
merged = data_cleaning.join_offers_orders(offers, orders, how="inner")

# Remaining cleaning / feature steps on the joined table, applied in this exact order.
for step in (
    data_cleaning.get_remaining_time,
    data_cleaning.during_business_hours,
    data_cleaning.impute_mileage,
    data_cleaning.get_business_hours,
):
    merged = step(merged)

# Final table used by the rest of the notebook.
pooled = data_cleaning.get_prorated_rate(merged)

In [5]:
# Keep only the offers whose load was actually delivered.
delivered_mask = pooled["LOAD_DELIVERED_FROM_OFFER"] == True
pooled = pooled.loc[delivered_mask]

In [6]:
# Display the first row as a quick check of the columns available for modelling.
pooled.head(1)

Unnamed: 0,CARRIER_ID,REFERENCE_NUMBER,CREATED_ON_HQ,RATE_USD,LOAD_DELIVERED_FROM_OFFER,ORDER_DATETIME_PST,PICKUP_DEADLINE_PST,ORIGIN_3DIGIT_ZIP,DESTINATION_3DIGIT_ZIP,APPROXIMATE_DRIVING_ROUTE_MILEAGE,PALLETIZED_LINEAR_FEET,ORIGIN_CITY,DESTINATION_CITY,ORDER_DAY,ORDER_MONTH,ORDER_HOUR,PICKUP_DAY,PICKUP_MONTH,PICKUP_HOUR,REMAINIG_TIME,BUSINESS_HOURS,BUSINESS_HOURS_ORDER_PICKUP,PRORATED_RATE_USD
1,0629d63476c157a6cfe234f7a34422eb0ad94998c2d082...,ce3548db155049cc1ccce2da041cec607942e4f779fc2d...,2021-11-03 08:57:27,9159.0,True,2021-11-02 12:56:49,2021-11-03 16:00:00,945,601,2131.0,4.0,Oakland CA,Chicago IL,1,11,12,2,11,16,25353.0,True,13.053056,763.25


### Regression Model for Pooled Offers (OFFER_TYPE == pool)

In [7]:
import sklearn.preprocessing as pp
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import LogisticRegression

In [8]:
# Model inputs: continuous measures, calendar parts of the order and pickup
# times, and the origin/destination cities.
feature_cols = [
    "APPROXIMATE_DRIVING_ROUTE_MILEAGE",
    "PALLETIZED_LINEAR_FEET",
    "BUSINESS_HOURS",
    "REMAINIG_TIME",
    "ORDER_DAY",
    "ORDER_MONTH",
    "ORDER_HOUR",
    "PICKUP_DAY",
    "PICKUP_MONTH",
    "PICKUP_HOUR",
    "BUSINESS_HOURS_ORDER_PICKUP",
    "ORIGIN_CITY",
    "DESTINATION_CITY",
]
X = pooled[feature_cols]

# Regression target, as a plain Python list.
y = pooled["PRORATED_RATE_USD"].tolist()

# 70/30 split with a fixed seed so the reported scores are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [9]:
# Numerical columns and associated transformers
num_feat = ["APPROXIMATE_DRIVING_ROUTE_MILEAGE", "PALLETIZED_LINEAR_FEET",
            "BUSINESS_HOURS_ORDER_PICKUP", "REMAINIG_TIME"]
num_transformer = Pipeline(steps=[
    ('scaler', pp.MaxAbsScaler())
])

# Categorical columns and associated transformers
cat_feat = ["ORIGIN_CITY", "DESTINATION_CITY", "BUSINESS_HOURS",
            "ORDER_DAY", "ORDER_MONTH", "ORDER_HOUR",
            "PICKUP_DAY", "PICKUP_MONTH", "PICKUP_HOUR"]
cat_transformer = Pipeline(steps=[('onehot', pp.OneHotEncoder(max_categories = 30, handle_unknown = 'ignore'))
])

# Preprocessing pipeline (put them together)
preproc = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_feat),
        ('cat', cat_transformer, cat_feat)
    ])

In [10]:
# Baseline: Ridge regression with its default regularisation strength.
pl_ridge = Pipeline(steps=[
    ("preprocessor", preproc),
    ("regressor", Ridge()),
]).fit(X_train, y_train)

# Predict on both splits.
pred_train = pl_ridge.predict(X_train)
pred_test = pl_ridge.predict(X_test)

# Training MSE is stored for reference; the cell's displayed value is the test MSE.
mse_train = mean_squared_error(y_true=y_train, y_pred=pred_train)
mean_squared_error(y_true=y_test, y_pred=pred_test)

1172279.9069743403

In [11]:
# Baseline: Lasso regression with its default regularisation strength.
pl_lasso = Pipeline(steps=[
    ("preprocessor", preproc),
    ("regressor", Lasso()),
]).fit(X_train, y_train)

# Predict on both splits.
pred_train = pl_lasso.predict(X_train)
pred_test = pl_lasso.predict(X_test)

# Training MSE is stored for reference; the cell's displayed value is the test MSE.
mse_train = mean_squared_error(y_true=y_train, y_pred=pred_train)
mean_squared_error(y_true=y_test, y_pred=pred_test)

1176828.6487140632

In [12]:
# Grid search over the Ridge regularisation strength (alpha), scored by
# 5-fold cross-validated negative MSE on the training split.
# Left commented out to save runtime on kernel restarts; uncomment to re-run.
# params = {"regressor__alpha": np.arange(0.01, 50, 0.01)}
# ridge = GridSearchCV(estimator=pl_ridge, param_grid=params, 
#                      scoring="neg_mean_squared_error", 
#                      cv=5, n_jobs=-1)

# grid_search = ridge.fit(X_train, y_train)

# # best estimator
# print("Best alpha is {}".format(grid_search.best_params_))
# print("Best score is {}".format(grid_search.best_score_))

In [13]:
# Grid search over the Lasso regularisation strength (alpha), scored by
# 5-fold cross-validated negative MSE on the training split.
# Left commented out to save runtime on kernel restarts; uncomment to re-run.
# params = {"regressor__alpha": np.arange(0.01, 50, 0.01)}
# lasso = GridSearchCV(estimator=pl_lasso, param_grid=params, 
#                      scoring="neg_mean_squared_error", 
#                      cv=5, n_jobs=-1)

# grid_search = lasso.fit(X_train, y_train)

# # best estimator
# print("Best alpha is {}".format(grid_search.best_params_))
# print("Best score is {}".format(grid_search.best_score_))

In [14]:
# Ridge with a hand-set alpha. NOTE(review): 2.88 is presumably the best value
# from the commented-out Ridge grid search — confirm before changing it.
# This rebinds `pl_ridge`, replacing the default-alpha model fitted earlier.
pl_ridge = Pipeline(steps=[
    ("preprocessor", preproc),
    ("regressor", Ridge(alpha=2.88)),
]).fit(X_train, y_train)

# Predict on both splits.
pred_train = pl_ridge.predict(X_train)
pred_test = pl_ridge.predict(X_test)

# Training MSE is stored for reference; the cell's displayed value is the test MSE.
mse_train = mean_squared_error(y_true=y_train, y_pred=pred_train)
mean_squared_error(y_true=y_test, y_pred=pred_test)

1171811.8520371814

In [15]:
# Lasso with a hand-set alpha. NOTE(review): 0.21 is presumably the best value
# from the commented-out Lasso grid search — confirm before changing it.
# This rebinds `pl_lasso`, replacing the default-alpha model fitted earlier.
pl_lasso = Pipeline(steps=[
    ("preprocessor", preproc),
    ("regressor", Lasso(alpha=0.21)),
]).fit(X_train, y_train)

# Predict on both splits.
pred_train = pl_lasso.predict(X_train)
pred_test = pl_lasso.predict(X_test)

# Training MSE is stored for reference; the cell's displayed value is the test MSE.
mse_train = mean_squared_error(y_true=y_train, y_pred=pred_train)
mean_squared_error(y_true=y_test, y_pred=pred_test)

1171483.6213386317

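The grid search cells are commented out to save runtime when the kernel is restarted. After adding all the features, Lasso regression with alpha = 0.21 had the best performance on the test set (test MSE ≈ 1,171,484, compared with ≈ 1,171,812 for Ridge with alpha = 2.88).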