In [1]:
import sys

import pandas as pd
import numpy as np

# Put ../src/features on the module search path (at index 1, i.e. right after the
# first entry) so that the project-local `data_cleaning` import below can be resolved.
# The path is relative, so it is interpreted against the current working directory.
sys.path.insert(1, "../src/features")

import data_cleaning

# Lift the column cap so wide DataFrames are rendered without truncated columns
pd.options.display.max_columns = None

### Reading Data

Only keep relevant columns

In [2]:
# Locations of the raw CSV exports, relative to the notebook's working directory.
OFFER_DATA_DIR = "../data/offer_acceptance_offers.csv"
ORDER_DATA_DIR = "../data/offer_acceptance_orders.csv"

# Columns retained from each file; all other columns are skipped while parsing.
OFFER_COLUMNS = [
    "CARRIER_ID",
    "REFERENCE_NUMBER",
    "CREATED_ON_HQ",
    "RATE_USD",
    "OFFER_TYPE",
    "LOAD_DELIVERED_FROM_OFFER",
]
ORDER_COLUMNS = [
    "REFERENCE_NUMBER",
    "ORDER_DATETIME_PST",
    "PICKUP_DEADLINE_PST",
    "ORIGIN_3DIGIT_ZIP",
    "DESTINATION_3DIGIT_ZIP",
    "APPROXIMATE_DRIVING_ROUTE_MILEAGE",
    "PALLETIZED_LINEAR_FEET",
]

# `usecols` keeps pandas from parsing (and holding in memory) columns that would be
# thrown away immediately. `usecols` alone preserves the file's column order, so the
# trailing `[...]` selection restores the order listed above.
offers = pd.read_csv(OFFER_DATA_DIR, usecols=OFFER_COLUMNS, low_memory=False)[OFFER_COLUMNS]
orders = pd.read_csv(ORDER_DATA_DIR, usecols=ORDER_COLUMNS, low_memory=False)[ORDER_COLUMNS]

### Data Cleaning

Convert date columns to DateTime objects

In [3]:
# Hand each frame, together with the names of its date columns, to the
# change_to_date helper and keep the returned frame.
offers = offers.pipe(data_cleaning.change_to_date, ["CREATED_ON_HQ"])
orders = orders.pipe(
    data_cleaning.change_to_date,
    ["ORDER_DATETIME_PST", "PICKUP_DEADLINE_PST"],
)

Extract nearest city and state name from zipcode

In [4]:
# Run the orders frame through the parse_zipcode helper and keep its result.
orders = orders.pipe(data_cleaning.parse_zipcode)

Offers can have multiple REFERENCE_NUMBER values if the offer is being made to multiple orders (i.e., PTL, or the carrier offers to pick up multiple orders with one truck).  
We need to flatten offers with multiple REFERENCE_NUMBER values; the same flattening step is applied to orders.

In [5]:
# Apply the same flatten_ref_num helper to both frames, one after the other.
offers = offers.pipe(data_cleaning.flatten_ref_num)
orders = orders.pipe(data_cleaning.flatten_ref_num)

Match each offer with respective order

In [6]:
# Combine the two frames with the join_offers_orders helper (offers first, orders second).
merged = offers.pipe(data_cleaning.join_offers_orders, orders)

Drop the rows that contain NA values to avoid a ValueError later

In [7]:
# Discard every row that has at least one missing value
# (axis=0 and how="any" are dropna's defaults, spelled out here for clarity).
merged = merged.dropna(axis=0, how="any")

Calculate time remaining from when offer was made

In [8]:
# Run the merged frame through the get_remaining_time helper and keep its result.
merged = merged.pipe(data_cleaning.get_remaining_time)

Calculate whether offer was made during business hours

In [9]:
# Run the merged frame through the during_business_hours helper and keep its result.
merged = merged.pipe(data_cleaning.during_business_hours)

Calculate the prorated rate of pooled items

In [10]:
# Derive the `pooled` frame from `merged` via the get_prorated_rate helper;
# `merged` itself is left bound to its previous value.
pooled = merged.pipe(data_cleaning.get_prorated_rate)

Count the number of hours between order time and pickup time that are in business hours

In [None]:
# Run the pooled frame through the get_business_hours helper and keep its result.
pooled = pooled.pipe(data_cleaning.get_business_hours)

Count the number of hours between order time and pickup time that are outside business hours

In [None]:
# Run the pooled frame through the get_off_business_hours helper and keep its result.
pooled = pooled.pipe(data_cleaning.get_off_business_hours)

Get the weekday of PICKUP_DEADLINE_PST

In [None]:
# Run the pooled frame through the get_weekday helper and keep its result.
pooled = pooled.pipe(data_cleaning.get_weekday)

**Note:** To save runtime, `get_business_hours`, `get_off_business_hours`, and `get_weekday` were applied only to the pooled dataset.
They should work the same way when applied to the unpooled dataset (`merged`).

In [None]:
# Display only the first row of `merged` as a quick look at its columns.
merged.head(1)

In [None]:
# Display only the first row of `pooled` as a quick look at its columns.
pooled.head(1)