# Imports

In [18]:
# Third-party modules
from IPython.display import display, HTML
from math import log
import csv
import itertools
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import os
import pandas as pd
import math
import pylab
import random
import scipy.stats as ss

# Custom modules
from common.localio import *
from common.summary import *

# Functions

In [29]:
def rmse(pred, obs):
    """Return the root-mean-square error between predictions and observations.

    Values are paired by position, not by pandas index label, so a
    prediction Series and an observation Series that carry different
    indexes (for example after boolean filtering) are still compared
    element by element.

    Parameters
    ----------
    pred : array-like
        Predicted values: a list, NumPy array, pandas Series, or
        single-column DataFrame.
    obs : array-like
        Observed (actual) values; must contain as many values as ``pred``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((pred - obs) ** 2))``.

    Raises
    ------
    ValueError
        If the inputs differ in length, or if they are empty.
    """
    # Flatten both inputs to plain float arrays. This drops any pandas index
    # (label alignment would otherwise introduce NaNs) and lets plain lists
    # be subtracted from each other.
    pred_values = np.asarray(pred, dtype=float).ravel()
    obs_values = np.asarray(obs, dtype=float).ravel()

    if pred_values.size != obs_values.size:
        raise ValueError('The number of predictions and observations do not match.')
    if obs_values.size == 0:
        # Fail with a clear message instead of a bare ZeroDivisionError.
        raise ValueError('Cannot compute the RMSE of empty inputs.')

    return np.sqrt(np.mean((pred_values - obs_values) ** 2))

# Datasets

Load each dataset and display the first few entries.

In [2]:
# Read each CSV under ../datasets/ into its own CSVReader.
# The two transaction files pass purchase_date through datetime_cols.

hist = CSVReader(
    '../datasets/historical_transactions.csv',
    datetime_cols=['purchase_date'],
)
merch = CSVReader('../datasets/merchants.csv')
new_merch = CSVReader(
    '../datasets/new_merchant_transactions.csv',
    datetime_cols=['purchase_date'],
)
train = CSVReader('../datasets/train.csv')
test = CSVReader('../datasets/test.csv')

In [37]:
# Summary statistics of the training target, kept as a one-column frame.
train.data.loc[:, ['target']].describe()

Unnamed: 0,target
count,201917.0
mean,-0.393636
std,3.8505
min,-33.219281
25%,-0.88311
50%,-0.023437
75%,0.765453
max,17.965068


In [40]:
# Constant prediction: the sample mean of the target, repeated once per training row.
mean_vec = list(itertools.repeat(train.data['target'].mean(),
                                 len(train.data['target'])))

Calculate the RMSE of always guessing the output value to be the sample mean of the target.

This approach is quite basic (even crude): it ignores every feature, so its error is as large as the spread of the target values themselves. It serves as a minimum benchmark for assessing other predictions; we demand that more sophisticated techniques significantly outperform it.

In [42]:
# RMSE of the constant-mean prediction against the full training target.
print(rmse(pred=mean_vec, obs=train.data['target']))

3.85049046062


We have also observed that there are significant outliers below -30, and the rest of the target values form a normal-like distribution. Let us try to split the target values into two parts, then apply the mean-value guess approach to each part:

In [52]:
# Split the target at -30: values above it are "regular", the rest are outliers.
train_regular = train.data.loc[train.data['target'] > -30, 'target']
train_outliers = train.data.loc[train.data['target'] <= -30, 'target']

In [56]:
# Constant-mean baseline restricted to the non-outlier targets.
print(
    "RMSE for estimating non-outliers using mean:",
    rmse(pred=[train_regular.mean()] * len(train_regular), obs=train_regular),
)

RMSE for estimating non-outliers using mean: 1.71778862724


In [55]:
# Constant-mean baseline restricted to the outlier targets.
print(
    "RMSE for estimating outliers using mean:",
    rmse(pred=[train_outliers.mean()] * len(train_outliers), obs=train_outliers),
)

RMSE for estimating outliers using mean: 1.19371179608e-12


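Both values are a significant improvement over the crude single-mean guess across the entire target set (RMSE ≈ 3.85). The outlier RMSE is effectively zero (≈ 1e-12, i.e. floating-point noise), which indicates that all outliers share essentially the same target value. Being able to identify outliers correctly, therefore, is the first step toward enhancing performance significantly.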