In [1]:
import pandas as pd
import numpy as np

# import sys
# sys.path.insert(1, "../src/features")

# Show every column when a DataFrame is rendered (None removes the column limit)
pd.options.display.max_columns = None

# Removing Outliers

### Read In Data

Read in the "offers" dataframe which contains all offers FlockFreight ever received, merged with their respective order information

In [2]:
# Location of the pickled "offers" table, relative to this notebook.
offers_pickle_path = "../data/pickles/merged_clean2.pkl"

# NOTE: unpickling can execute arbitrary code, so only load pickles from a trusted source.
offers = pd.read_pickle(offers_pickle_path)

Create an aggregated table of "orders" which includes the total number of offers and the lowest offer rate each order received

In [3]:
# Sort offers cheapest-first so the first row seen for each order carries its lowest RATE_USD.
# (sort_values already returns a new frame, so `offers` itself is left untouched.)
cheapest_first = offers.sort_values("RATE_USD", ascending=True)
offers_by_order = cheapest_first.groupby("REFERENCE_NUMBER")

# Collapse to one row per REFERENCE_NUMBER (which becomes the index), ordered by order time.
# NOTE(review): groupby().first() takes the first NON-NULL value of each column independently,
# so if the cheapest offer row has nulls, values from a later row of the same order are used.
orders = (
    cheapest_first
    .assign(
        # number of offer rows (with a non-null ORDER_DATETIME_PST) per order
        OFFER_COUNT=offers_by_order["ORDER_DATETIME_PST"].transform("count"),
        # per-order sum of the OFFER_IS_FTL flag
        FTL_OFFER_COUNT=offers_by_order["OFFER_IS_FTL"].transform("sum"),
    )
    .groupby("REFERENCE_NUMBER")
    .first()
    .sort_values("ORDER_DATETIME_PST")
)

### Remove Anomalies

Create a set of offer and order anomalies to remove

In [4]:
# Accumulators for everything flagged as anomalous by the cells below:
#   orders_to_drop -- REFERENCE_NUMBERs of whole orders to remove
#   offers_to_drop -- index labels of individual rows in the "offers" table to remove
orders_to_drop, offers_to_drop = set(), set()

##### Identify Abnormal Offers

From the "offers" dataframe, which contains all offers ever received, flag for removal the offers with abnormally high or low rates: those more than 2.5 standard deviations from the mean rate (|z-score| > 2.5), with the lower bound clamped at 0.

In [5]:
rates = offers["RATE_USD"]

# Centre and half-width of the accepted band: mean +/- 2.5 population standard deviations
# (ddof=0 is what np.std applies).
rate_mean = rates.mean()
rate_band = 2.5 * rates.std(ddof=0)

lower_threshold = np.maximum(0, rate_mean - rate_band)  # lower edge is never below 0
upper_threshold = rate_mean + rate_band

# index labels of the offers whose rate falls outside the band
is_rate_outlier = rates.lt(lower_threshold) | rates.gt(upper_threshold)
offer_outlier = set(offers.index[is_rate_outlier])

# queue those offers for removal
offers_to_drop = offers_to_drop | offer_outlier

##### Identify Abnormal Orders

The orders dataframe contains aggregated values of unique orders, each of which holds the best offer rate the carriers have offered. 

This cell flags for removal the orders whose best offer rate is abnormally high or low: more than 2.5 standard deviations from the mean (|z-score| > 2.5).

In [6]:
rates = orders["RATE_USD"]

# Accepted band: mean +/- 2.5 population standard deviations (ddof=0, as np.std uses)
best_rate_mean = rates.mean()
best_rate_band = 2.5 * rates.std(ddof=0)

lower_threshold = np.maximum(0, best_rate_mean - best_rate_band)  # never below 0
upper_threshold = best_rate_mean + best_rate_band

# index labels of `orders` (REFERENCE_NUMBERs) whose rate lies outside the band
is_rate_outlier = rates.lt(lower_threshold) | rates.gt(upper_threshold)
order_outlier = set(orders.index[is_rate_outlier])

# queue those orders for removal
orders_to_drop = orders_to_drop | order_outlier

From the aggregated "orders" dataframe, flag for removal all orders that received an abnormally high or low number of offers: more than 2.5 standard deviations from the mean (|z-score| > 2.5).

In [7]:
offer_count = orders["OFFER_COUNT"]

# Accepted band: mean +/- 2.5 population standard deviations (ddof=0, as np.std uses)
count_mean = offer_count.mean()
count_band = 2.5 * offer_count.std(ddof=0)

lower_threshold = np.maximum(0, count_mean - count_band)  # never below 0
upper_threshold = count_mean + count_band

# index labels of `orders` (REFERENCE_NUMBERs) whose offer count lies outside the band
is_count_outlier = offer_count.lt(lower_threshold) | offer_count.gt(upper_threshold)
order_outlier = set(orders.index[is_count_outlier])

# queue those orders for removal
orders_to_drop = orders_to_drop | order_outlier

From the aggregated "orders" dataframe, flag for removal all orders that were placed with an abnormally small or large amount of time until the pickup deadline: more than 2.5 standard deviations from the mean (|z-score| > 2.5).

In [8]:
hours = orders["GIVEN_HOURS"]

# Accepted band: mean +/- 2.5 population standard deviations (ddof=0, as np.std uses)
hours_mean = hours.mean()
hours_band = 2.5 * hours.std(ddof=0)

lower_threshold = np.maximum(0, hours_mean - hours_band)  # never below 0
upper_threshold = hours_mean + hours_band

# index labels of `orders` (REFERENCE_NUMBERs) whose GIVEN_HOURS lies outside the band
is_hours_outlier = hours.lt(lower_threshold) | hours.gt(upper_threshold)
order_outlier = set(orders.index[is_hours_outlier])

# queue those orders for removal
orders_to_drop = orders_to_drop | order_outlier

From the aggregated "orders" dataframe, flag for removal all orders that have abnormally small or large Palletized Linear Feet values: more than 2.5 standard deviations from the mean (|z-score| > 2.5).

In [9]:
length = orders["PALLETIZED_LINEAR_FEET"]

# Accepted band: mean +/- 2.5 population standard deviations (ddof=0, as np.std uses)
length_mean = length.mean()
length_band = 2.5 * length.std(ddof=0)

lower_threshold = np.maximum(0, length_mean - length_band)  # never below 0
upper_threshold = length_mean + length_band

# index labels of `orders` (REFERENCE_NUMBERs) whose linear feet lie outside the band
is_length_outlier = length.lt(lower_threshold) | length.gt(upper_threshold)
order_outlier = set(orders.index[is_length_outlier])

# queue those orders for removal
orders_to_drop = orders_to_drop | order_outlier

##### Remove anomalies from the "offers" dataframe

In [10]:
# Build a single boolean keep-mask over the rows of `offers`. A row survives only if BOTH hold:
#   * its order (REFERENCE_NUMBER) is not in orders_to_drop, and
#   * its own index label is not in offers_to_drop.
#
# The two masks must be combined element-wise with `&`. Wrapping each mask in a list and
# joining the lists with `and` only evaluates list truthiness and returns the second list,
# which silently ignored offers_to_drop (abnormal offers were never removed).
order_is_kept = ~offers["REFERENCE_NUMBER"].isin(orders_to_drop)
# Index.isin returns a positional boolean array, so it lines up with the rows of `offers`
# regardless of what labels the index holds.
offer_is_kept = ~offers.index.isin(offers_to_drop)
offer_id_to_keep = order_is_kept & offer_is_kept

# NOTE: offers_to_drop holds labels of the index as it was BEFORE this reset_index, so this
# cell is not safe to re-run on the already-filtered frame without rebuilding the drop sets.
offers = offers[offer_id_to_keep].reset_index(drop=True)

Save the new "offers" dataframe

In [11]:
# NOTE(review): the write below is commented out, so running this cell saves nothing.
# Uncomment it to persist the filtered "offers" dataframe.
# offers.to_pickle("../data/pickles/merged_clean2_outlier_removed.pkl")

### Recreate "orders" dataframe based on the new "offers" dataframe

In [12]:
# Rebuild the per-order aggregate from the filtered offers.
# Sort cheapest-first so the first row seen for each order carries its lowest RATE_USD;
# sort_values already returns a new frame, so `offers` itself is left untouched.
cheapest_first = offers.sort_values("RATE_USD", ascending=True)
offers_by_order = cheapest_first.groupby("REFERENCE_NUMBER")

# Collapse to one row per REFERENCE_NUMBER (which becomes the index), ordered by order time.
# NOTE(review): groupby().first() takes the first NON-NULL value of each column independently,
# so if the cheapest offer row has nulls, values from a later row of the same order are used.
orders = (
    cheapest_first
    .assign(
        # number of offer rows (with a non-null ORDER_DATETIME_PST) per order
        OFFER_COUNT=offers_by_order["ORDER_DATETIME_PST"].transform("count"),
        # per-order sum of the OFFER_IS_FTL flag
        FTL_OFFER_COUNT=offers_by_order["OFFER_IS_FTL"].transform("sum"),
    )
    .groupby("REFERENCE_NUMBER")
    .first()
    .sort_values("ORDER_DATETIME_PST")
)

Save the new "orders" dataframe

In [13]:
# NOTE(review): the write below is commented out, so running this cell saves nothing.
# Uncomment it to persist the rebuilt "orders" dataframe.
# orders.to_pickle("../data/pickles/orders_clean2_outlier_removed.pkl")