<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1">Imports</a></span></li><li><span><a href="#Set-parameters-and-load-filenames" data-toc-modified-id="Set-parameters-and-load-filenames-2">Set parameters and load filenames</a></span></li><li><span><a href="#Functions" data-toc-modified-id="Functions-3">Functions</a></span></li><li><span><a href="#Extract-data-from-files" data-toc-modified-id="Extract-data-from-files-4">Extract data from files</a></span></li><li><span><a href="#Cross-validation" data-toc-modified-id="Cross-validation-5">Cross-validation</a></span></li></ul></div>

# Imports

In [8]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import datetime
import os.path
import random
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
import sklearn.metrics 
from sklearn import tree, ensemble
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
import utils.bookdroputils as bd
import sklearn.model_selection as model_selection


# Set parameters and load filenames

In [9]:
# Specify the dataset to use
data_set = 'boardgames' # 'boardgames' or 'paperbacks'
# Find directory for that dataset
data_folder = os.path.join('data',data_set)

# Collect list of all files
file_list = [f for f in os.listdir(data_folder) if os.path.isfile(os.path.join(data_folder, f))]
# Split list into 4 sets for cross-validation
P, Q = train_test_split(file_list, test_size=0.5, random_state=111)
A, B = train_test_split(P, test_size=0.5, random_state=222)
C, D = train_test_split(Q, test_size=0.5, random_state=333)
file_sets = [A, B, C, D]

# Number of timepoints to search ahead for a price drop
future = 60
# Fraction of the price considered a 'drop'
# E.g., 0.1 for a 10% price drop
drop_frac = .1

# Functions

In [10]:
def extract_feature_data(files, data_set, future, drop_frac, verbose=False):
    # Store features and labels
    X = []
    Y = []
    # Also store dollars spent and time waited for each strategy
    dollars_spent = []
    time_waited = []
    
    # Display messages when progress crosses this threshold
    progress_threshold = 0
    num_processed = 0
    
    # For each product
    for file_name in files:
        # Read file
        df = pd.read_csv(os.path.join('data', data_set, file_name))
        # Drop missing values
        df.dropna(inplace=True)
        # Convert prices to array
        prices = np.array(df['price'])

        # Sample data at 1-month intervals, starting at a random point in time
        for k in range(
                random.randint(0, 30) + 120 + 5, df.shape[0] - future - 30,
                60):

            # Extract price features
            features = bd.compute_features(prices[:k + 1], drop_frac)

            # Store the features
            X.append(features)

            # Store the class label
            if np.any(prices[(k + 1):(k + future + 1)] < (1 - drop_frac) *
                      prices[k]):
                Y.append(1)
            else:
                Y.append(0)

            # Extract the prices in the lookahead period
            future_prices = prices[(k + 1):(k + future + 1)]

            # Compute dollars spent and time waited for each strategy
            if Y[-1] == 1:
                idx_below = np.argwhere(future_prices < (1 - drop_frac) *
                                        prices[k])[0].astype(int)[0]
                bookdrop_price = future_prices[idx_below]
                bookdrop_wait = idx_below + 1
                tracker_price = future_prices[idx_below]
                tracker_wait = idx_below + 1
                perfect_price = future_prices[idx_below]
                perfect_wait = idx_below + 1

            else:
                bookdrop_price = prices[k + future]
                bookdrop_wait = future
                tracker_price = prices[k + future]
                tracker_wait = future
                perfect_price = prices[k]
                perfect_wait = 0

            # Store the dollars spent
            dollars_spent.append(
                np.array([
                    perfect_price, bookdrop_price, tracker_price, prices[k],
                    prices[k + future]
                ]))

            # Store the time waited
            time_waited.append(
                np.array([perfect_wait, bookdrop_wait, tracker_wait]))

        # Show progress
        if verbose:
            num_processed += 1
            if num_processed / len(files) > progress_threshold:
                print(str(round(progress_threshold * 100)) + '% done')
                progress_threshold += .2
            
    return np.array(X), np.array(Y), np.array(dollars_spent), np.array(time_waited)

In [11]:
# Define a function to evaluate the performance of a model on a test set
def evaluate_model(model, x_test, y_test, dollars_spent_test, time_waited_test, future):
    # Make predictions on test set
    y_pred = model.predict(x_test)
    # Calculate accuracy
    accuracy = 100 * sum(y_pred == y_test)/len(y_pred)
    
    # Compute errors for the confusion matrix
    TN = np.sum(np.bitwise_and(y_test == 0, y_pred == 0))
    FP = np.sum(np.bitwise_and(y_test == 0, y_pred == 1))
    FN = np.sum(np.bitwise_and(y_test == 1, y_pred == 0))
    TP = np.sum(np.bitwise_and(y_test == 1, y_pred == 1))
    
    # Calculate precision
    precision = TP / (TP + FP)
    # Calculate recall
    recall = TP / (TP + FN)
    
    # Compute spending for each strategy
    cost_perfect = np.sum(dollars_spent_test[:, 0])
    cost_bookdrop = np.sum(dollars_spent_test[y_pred == 0, 3]) + np.sum(
        dollars_spent_test[y_pred == 1, 1])
    cost_tracker = np.sum(dollars_spent_test[:, 2])
    cost_now = np.sum(dollars_spent_test[:, 3])
    cost_wait_only = np.sum(dollars_spent_test[:, 4])
    
    # Spending for each strategy
    cost_perfect = dollars_spent_test[:, 0]
    cost_bookdrop = dollars_spent_test[np.arange(len(dollars_spent_test)), [3 if y == 0 else 1 for y in y_pred]]
    cost_tracker = dollars_spent_test[:, 2]
    cost_now = dollars_spent_test[:, 3]
    cost_wait_only = dollars_spent_test[:, 4]
    
    cost_perfect = sum(cost_perfect)
    cost_bookdrop = sum(cost_bookdrop)
    cost_tracker = sum(cost_tracker)
    cost_now = sum(cost_now)
    cost_wait_only = sum(cost_wait_only)
    
    # Time waited for each strategy
    lag_perfect = time_waited_test[:, 0]
    lag_bookdrop = np.append(time_waited_test[y_pred == 1, 1],np.zeros((sum(y_pred==0),1)))
    lag_tracker = time_waited_test[:, 2]
    
    # Median cost savings for each strategy
    perfect_savings = 100 * (1 - np.median(cost_perfect / cost_now))
    bookdrop_savings = 100 * (1 - np.median(cost_bookdrop / cost_now))
    tracker_savings = 100 * (1 - np.median(cost_tracker / cost_now))
    wait_only_savings = 100 * (1 - np.median(cost_wait_only / cost_now))
      
    # Efficiency
    bookdrop_efficiency = bookdrop_savings / perfect_savings
    tracker_efficiency = tracker_savings / perfect_savings
    wait_only_efficiency = wait_only_savings / perfect_savings
    
    # Days waited
    perfect_days = np.mean(lag_perfect) / 2
    bookdrop_days = np.mean(lag_bookdrop) / 2
    tracker_days = np.mean(lag_tracker) / 2
    wait_only_days = future / 2
     
    # Time saved
    perfect_time_savings = 100 - 100 * (perfect_days / wait_only_days)
    bookdrop_time_savings = 100 - 100 * (bookdrop_days / wait_only_days)
    
    simulation_results = pd.DataFrame(
        [[perfect_savings, 1, perfect_days,perfect_time_savings],
         [bookdrop_savings, bookdrop_efficiency, bookdrop_days, bookdrop_time_savings],
         [tracker_savings,tracker_efficiency,tracker_days,''],
         [wait_only_savings,wait_only_efficiency,wait_only_days,'']
        ],
        columns=['median_savings', 'efficiency', 'mean_wait_time', 'time_saved'],
        index=['Perfect','BookDrop','Tracker only','Wait'])
    
    return accuracy, precision, recall, simulation_results, TP, FP, TN, FN

# Extract data from files

In [12]:
# Set random seeds for reproducibility
random.seed(99)
np.random.seed(99)

# Initialize data storage
X = []
Y = []
dollars_spent = []
time_waited = []

# Collect data from each file set
for idx, S in enumerate(file_sets):
    X_S, Y_S, dollars_spent_S, time_waited_S = extract_feature_data(S, data_set, future, drop_frac)
    X.append(X_S)
    Y.append(Y_S)
    dollars_spent.append(dollars_spent_S)
    time_waited.append(time_waited_S)

# Cross-validation

In [24]:
# Initialize list of dataframes
results_list = []

# Define parameters for the random forest model
parameters = {
    'bootstrap': True,
    'max_depth': 10,
    'max_features': 'sqrt',
    'min_samples_leaf': 4,
    'min_samples_split': 4,
    'n_estimators': 300
}

# Calculate cross-validated metrics
for fold in range(4):
    # Choose test set
    X_test = X[fold]
    Y_test = Y[fold]
    dollars_spent_test = dollars_spent[fold]
    time_waited_test = time_waited[fold]
    
    # Build training set
    train_idx = list(set(range(4)) - set([fold]))
    X_train = X[train_idx[0]]
    Y_train = Y[train_idx[0]]
    for i in range(1,3):
        X_train = np.append(X_train, X[train_idx[i]], axis=0)
        Y_train = np.append(Y_train, Y[train_idx[i]], axis=0)
    
    # Train the model
    RF_model = BalancedRandomForestClassifier(**parameters)
    RF_model.fit(X_train, Y_train)
    
    # Evaluate the model
    accuracy, precision, recall, simulation_results, TP, FP, TN, FN = \
        evaluate_model(RF_model, X_test, Y_test, dollars_spent_test, time_waited_test, future)
    results_list.append(simulation_results)

In [36]:
# Display cross-validated metrics

# Average across folds
results_cv = np.array(results_list[0])
results_cv[2:4,3] = 0 # handle blank cells
for i in range(1,4):
    temp = np.array(results_list[i])
    temp[2:4,3] = 0
    results_cv += temp
results_cv = results_cv / 4
results_cv[2:4,3] = ''

# Display results
pd.DataFrame(
    results_cv,
    columns=['median_savings', 'efficiency', 'mean_wait_time', 'time_saved'],
    index=['Perfect','BookDrop','Tracker only','Wait'])

Unnamed: 0,median_savings,efficiency,mean_wait_time,time_saved
Perfect,5.84232,1.0,3.79802,87.3399
BookDrop,3.29354,0.565333,8.91934,70.2689
Tracker only,3.28814,0.560374,24.6831,
Wait,-0.00467501,-0.00145954,30.0,
