In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

import sys
sys.path.insert(1, "../src/features")
import data_cleaning

# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None

### Data cleaning

In [2]:
# Locations of the raw offer and order CSV exports (these are file paths).
OFFER_DATA_DIR = "../data/offer_acceptance_offers.csv"
ORDER_DATA_DIR = "../data/offer_acceptance_orders.csv"

# Only these columns are kept from each raw table.
offer_columns = [
    "CARRIER_ID",
    "REFERENCE_NUMBER",
    "CREATED_ON_HQ",
    "RATE_USD",
    "OFFER_TYPE",
    "LOAD_DELIVERED_FROM_OFFER",
]
order_columns = [
    "REFERENCE_NUMBER",
    "ORDER_DATETIME_PST",
    "PICKUP_DEADLINE_PST",
    "ORIGIN_3DIGIT_ZIP",
    "DESTINATION_3DIGIT_ZIP",
    "APPROXIMATE_DRIVING_ROUTE_MILEAGE",
    "PALLETIZED_LINEAR_FEET",
]

offers = pd.read_csv(OFFER_DATA_DIR, low_memory=False)[offer_columns]
orders = pd.read_csv(ORDER_DATA_DIR, low_memory=False)[order_columns]

# The data_cleaning helpers are applied in a fixed order; each one takes a
# frame and returns the frame that the next step consumes.
offers = data_cleaning.change_to_date(offers, ["CREATED_ON_HQ"])
orders = (
    orders
    .pipe(data_cleaning.change_to_date, ["ORDER_DATETIME_PST", "PICKUP_DEADLINE_PST"])
    .pipe(data_cleaning.parse_zipcode)
    .pipe(data_cleaning.parse_datetime)
)

offers = data_cleaning.flatten_ref_num(offers)
orders = data_cleaning.flatten_ref_num(orders)

# Inner join: only offers with a matching order (and vice versa) survive.
merged = (
    data_cleaning.join_offers_orders(offers, orders, how="inner")
    .pipe(data_cleaning.get_remaining_time)
    .pipe(data_cleaning.during_business_hours)
)

# `pooled` is derived from `merged` *before* mileage imputation and the
# business-hours step below are applied.
pooled = data_cleaning.get_prorated_rate(merged)

merged = (
    merged
    .pipe(data_cleaning.impute_mileage)
    .pipe(data_cleaning.get_business_hours)
)

In [4]:
# To refresh the on-disk cache, uncomment the four writes below and run once.
# merged.to_pickle("../data/pickels/merged.pkl")
# pooled.to_pickle("../data/pickels/pooled.pkl")
# orders.to_pickle("../data/pickels/orders.pkl")
# offers.to_pickle("../data/pickels/offers.pkl")

# Load the cached frames. This rebinds merged / pooled / orders / offers, so
# any in-memory frames with those names are replaced by the pickled versions.
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
merged, pooled, orders, offers = [
    pd.read_pickle(pickle_path)
    for pickle_path in (
        "../data/pickels/merged.pkl",
        "../data/pickels/pooled.pkl",
        "../data/pickels/orders.pkl",
        "../data/pickels/offers.pkl",
    )
]

In [5]:
# Display the column index of the merged frame so the feature names used by
# the modelling cells can be checked against what is actually available.
merged.columns

Index(['CARRIER_ID', 'REFERENCE_NUMBER', 'CREATED_ON_HQ', 'RATE_USD',
       'OFFER_TYPE', 'LOAD_DELIVERED_FROM_OFFER', 'ORDER_DATETIME_PST',
       'PICKUP_DEADLINE_PST', 'ORIGIN_3DIGIT_ZIP', 'DESTINATION_3DIGIT_ZIP',
       'APPROXIMATE_DRIVING_ROUTE_MILEAGE', 'PALLETIZED_LINEAR_FEET',
       'ORIGIN_CITY', 'DESTINATION_CITY', 'ORDER_DAY', 'ORDER_MONTH',
       'ORDER_HOUR', 'PICKUP_DAY', 'PICKUP_MONTH', 'PICKUP_HOUR',
       'REMAINIG_TIME', 'BUSINESS_HOURS', 'BUSINESS_HOURS_ORDER_PICKUP'],
      dtype='object')

In [6]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MaxAbsScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [7]:
# Build the modelling frame for quote-rate regression from `merged`:
# validate the input, keep delivered "quote" offers, and split into
# train / test features (df_X_*) and RATE_USD labels (df_y_*).
df_full = merged

# Upper bound on one-hot categories per feature (used by the encoder cell).
max_categories=30

# check df_full is a DataFrame
# (the exception must be *raised*; merely constructing it is a no-op)
if not isinstance(df_full, pd.DataFrame):
    raise AssertionError("Parameter must be Pandas DataFrame")

# checks if dataframe has required columns
# NOTE: 'REMAINIG_TIME' is spelled this way in the data itself.
req_cols = ['RATE_USD', 'APPROXIMATE_DRIVING_ROUTE_MILEAGE', 'PALLETIZED_LINEAR_FEET',
      'ORIGIN_CITY', 'DESTINATION_CITY', 'ORDER_DAY', 'ORDER_MONTH',
      'ORDER_HOUR', 'PICKUP_DAY', 'PICKUP_MONTH', 'PICKUP_HOUR',
      'REMAINIG_TIME', 'BUSINESS_HOURS', 'BUSINESS_HOURS_ORDER_PICKUP', "OFFER_TYPE", "LOAD_DELIVERED_FROM_OFFER"]

# Fail here with a clear message instead of a KeyError further down.
if not set(req_cols).issubset(set(df_full.columns)):
    raise AssertionError("DataFrame does not contain required columns")

# filter for delivered offers (the column is used directly as a boolean mask)
df_full = df_full[df_full["LOAD_DELIVERED_FROM_OFFER"]].reset_index(drop=True)

# select required columns only
df_full = df_full[req_cols]

# only quote offers; OFFER_TYPE is constant afterwards, so drop it
df_full = df_full[df_full["OFFER_TYPE"] == "quote"].reset_index(drop=True)
df_full = df_full.drop(["OFFER_TYPE"], axis=1)

# split features and labels (RATE_USD is the regression target)
df_X = df_full.drop(["RATE_USD"], axis=1)
df_y = df_full["RATE_USD"]

# split train test: 70 % train / 30 % test, fixed seed for reproducibility
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(df_X, df_y, test_size=0.3, random_state=42)

In [8]:
# Numeric features: scaled by their maximum absolute value.
num_feat = [
    "APPROXIMATE_DRIVING_ROUTE_MILEAGE",
    "PALLETIZED_LINEAR_FEET",
    "REMAINIG_TIME",
    "BUSINESS_HOURS_ORDER_PICKUP",
]
num_transformer = Pipeline([("scaler", MaxAbsScaler())])

# Categorical features: one-hot encoded, capped at `max_categories` output
# categories per feature; categories unseen during fit are ignored at
# transform time instead of raising.
cat_feat = [
    "ORIGIN_CITY",
    "DESTINATION_CITY",
    "BUSINESS_HOURS",
    "ORDER_DAY",
    "ORDER_MONTH",
    "ORDER_HOUR",
    "PICKUP_DAY",
    "PICKUP_MONTH",
    "PICKUP_HOUR",
]
cat_transformer = Pipeline([
    ("onehot", OneHotEncoder(max_categories=max_categories, handle_unknown="ignore")),
])

# Route each column group through its own transformer.
preproc = ColumnTransformer([
    ("numerical", num_transformer, num_feat),
    ("categorization", cat_transformer, cat_feat),
])

### Grid search to find the best type of model and best parameter

In [9]:
from sklearn.metrics import mean_squared_error

In [10]:
# Coarse sweep over the regularisation strength for Ridge and Lasso.
# For every alpha a fresh pipeline is fit on the training split, scored with
# MSE on the test split, and the score is appended to the matching list.
# NOTE(review): the alphas are compared on the test split itself, so these
# scores are not an unbiased estimate of out-of-sample error.
Ridge_MSE = []
Lasso_MSE = []

sweeps = (
    # (label used in the printout, regressor class, alphas to try, result list)
    ("Ridge_", Ridge, [0.001, 0.01, 0.1, 1, 10, 20, 30, 40], Ridge_MSE),
    ("Lasso_", Lasso, [0.001, 0.01, 0.1, 1, 10], Lasso_MSE),
)

for label, regressor_cls, alphas, scores in sweeps:
    for parameter in alphas:
        pl = Pipeline(steps=[
            ('preprocessor', preproc),
            ("regressor", regressor_cls(alpha=parameter)),
        ])
        pl.fit(df_X_train, df_y_train)

        y_preds = pl.predict(df_X_test)
        mse = mean_squared_error(df_y_test, y_preds)
        scores.append(mse)

        print("MSE of %s%f is %s"%(label, parameter, mse))
        print("-------------------------")

MSE of Ridge_0.001000 is 1259154.8791528998
-------------------------
MSE of Ridge_0.010000 is 1259153.3268063997
-------------------------
MSE of Ridge_0.100000 is 1259137.5028663229
-------------------------
MSE of Ridge_1.000000 is 1258982.4854549037
-------------------------
MSE of Ridge_10.000000 is 1257857.4503400302
-------------------------
MSE of Ridge_20.000000 is 1257379.6250992888
-------------------------
MSE of Ridge_30.000000 is 1257180.91192431
-------------------------
MSE of Ridge_40.000000 is 1257228.1366947063
-------------------------


  model = cd_fast.sparse_enet_coordinate_descent(


MSE of Lasso_0.001000 is 1258846.7434316224
-------------------------


  model = cd_fast.sparse_enet_coordinate_descent(


MSE of Lasso_0.010000 is 1258842.9863877585
-------------------------
MSE of Lasso_0.100000 is 1258554.3791530626
-------------------------
MSE of Lasso_1.000000 is 1264086.2147975056
-------------------------
MSE of Lasso_10.000000 is 1406826.8964651367
-------------------------


In [11]:
# Fine-grained, 8-fold cross-validated search over the Ridge penalty on the
# training split, followed by an MSE report on the held-out test split.
pl = Pipeline(steps=[('preprocessor', preproc), ('regressor', Ridge(alpha = 1))])

# Candidate alphas: 34.0, 34.1, ... up to (but excluding) 36.
# NOTE(review): if the selected alpha lands on an edge of this range, the
# optimum may lie outside it - consider widening the grid.
parameters = {
    "regressor__alpha":np.arange(34, 36, 0.1)
}

# initialize
# NOTE(review): no `scoring` is given, so candidates are ranked by the
# estimator's default score rather than by the MSE reported below - confirm
# this is intended.
grid_pipeline = GridSearchCV(pl,parameters, cv = 8)

grid_pipeline.fit(df_X_train,df_y_train)

# GridSearchCV refits the best candidate on the full training split by
# default (refit=True), so reuse that fitted pipeline instead of rebuilding
# and training an identical one a second time.
pl = grid_pipeline.best_estimator_

_preds = pl.predict(df_X_test)

mse = mean_squared_error(df_y_test, _preds)
print("MSE of %s is %s"%("Ridge", mse))

MSE of Ridge is 1257143.2762379143


In [12]:
# Show the hyper-parameters chosen by the grid search currently bound to
# `grid_pipeline` (the Ridge search when cells are run top to bottom).
grid_pipeline.best_params_

{'regressor__alpha': 34.0}

In [13]:
# Fine-grained, 8-fold cross-validated search over the Lasso penalty on the
# training split, followed by an MSE report on the held-out test split.
pl = Pipeline(steps=[('preprocessor', preproc), ('regressor', Lasso(alpha = 1))])

# Candidate alphas: 0.20, 0.21, ... in steps of 0.01 up to roughly 0.30.
# NOTE(review): if the selected alpha lands on an edge of this range, the
# optimum may lie outside it - consider widening the grid.
parameters = {
    "regressor__alpha":np.arange(0.2, 0.3, 0.01)
}

# initialize
# NOTE(review): no `scoring` is given, so candidates are ranked by the
# estimator's default score rather than by the MSE reported below - confirm
# this is intended.
grid_pipeline = GridSearchCV(pl,parameters, cv = 8)

grid_pipeline.fit(df_X_train,df_y_train)

# GridSearchCV refits the best candidate on the full training split by
# default (refit=True), so reuse that fitted pipeline instead of rebuilding
# and training an identical one a second time.
pl = grid_pipeline.best_estimator_

_preds = pl.predict(df_X_test)

mse = mean_squared_error(df_y_test, _preds)
print("MSE of %s is %s"%("Lasso", mse))

MSE of Lasso is 1258450.652228233


In [16]:
# Show the hyper-parameters chosen by the grid search currently bound to
# `grid_pipeline` (the Lasso search when cells are run top to bottom).
grid_pipeline.best_params_

{'regressor__alpha': 0.2}

Ridge with alpha = 34 achieves a lower test MSE (≈ 1,257,143) than Lasso with alpha = 0.2 (≈ 1,258,451), so we use Ridge as the final model.

In [17]:
# Make the project's model package importable, then build the pipeline via
# the packaged helper rather than the inline cells.
sys.path.insert(1, "../src/models")
import quote_regression_model

# The helper returns the fitted pipeline plus a (features, labels) pair that
# is used here as the evaluation split.
pl, holdout = quote_regression_model.generate_quote_regression_pipeline(merged)
df_X_test, df_y_test = holdout

# Score the packaged pipeline with the same metric as the experiments.
_preds = pl.predict(df_X_test)
mse = mean_squared_error(df_y_test, _preds)

print("MSE of %s is %s"%("Ridge", mse))

MSE of Ridge is 1257143.2762379143
