In [1]:
# PyData
import pandas
import numpy
from pandas import DataFrame, Series
from sklearn import datasets, linear_model, preprocessing, cross_validation
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, Ridge

# System
import datetime
import os
import math

# Graphing
%matplotlib inline 
import seaborn

# Spark
# import pyspark
# from pyspark.sql import SQLContext

# from pyspark.ml import Pipeline
# from pyspark.ml.regression import RandomForestRegressor
# from pyspark.ml.feature import VectorIndexer
# from pyspark.ml.evaluation import RegressionEvaluator
# from pyspark.mllib.util import MLUtils
# from pyspark.mllib.regression import LabeledPoint

# sc = pyspark.SparkContext('local[8]')
# sqlContext = SQLContext(sc)

### Random Forests are popular for problems that have non-linear solutions. How do they perform on linear problems?


## The simplest linear problem

To warm up, let's model the easiest problem possible: y = x

In [2]:
# The integers 0..999, built in one step. The original loop rebuilt the list
# with `values + [i]` on every iteration, copying it each time.
values = list(range(0, 1000))

In [3]:
# Two identical columns built from `values`: the target ("Actual") and the
# single feature ("Predictor"), i.e. the y = x toy problem.
toy_data = {"Actual": values, "Predictor": values}
all_df = DataFrame(toy_data)
all_df.head()

Unnamed: 0,Actual,Predictor
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4


In [4]:
# Hold out a test set using scikit-learn's default split fraction. The seed is
# fixed so the split is identical on every Restart & Run All.
train, test = cross_validation.train_test_split(all_df, random_state=0)

# DataFrame.sort() was deprecated in pandas 0.17 and removed in 0.20. Called
# without arguments it ordered rows by index label, which sort_index() does
# explicitly (and returns a new frame, so later column assignments are safe).
train = train.sort_index()
test = test.sort_index()

Not surprisingly, Linear regression gets a perfect score

In [5]:
def score(prediction, actual):
    """Return the root-mean-square percentage error (RMSPE) of a prediction.

    The per-row error is ``(actual - prediction) / actual``; the result is the
    square root of the mean of those errors squared.

    Rows whose ``actual`` value is exactly 0 are excluded: their percentage
    error is undefined, and dividing by zero would otherwise yield ``inf`` and
    make the whole score infinite.

    Parameters
    ----------
    prediction : pandas.Series or numpy.ndarray
        Predicted values.
    actual : pandas.Series or numpy.ndarray
        True values, same length (and, for Series, same index) as
        ``prediction``.

    Returns
    -------
    float
        The RMSPE as a fraction (0.1 means 10%). NaN when no row has a
        non-zero ``actual`` value.
    """
    # Mask out zero targets before dividing, so no inf/NaN is ever produced.
    nonzero = actual != 0
    kept_actual = actual[nonzero]
    pcts = (kept_actual - prediction[nonzero]) / kept_actual
    return math.sqrt((pcts * pcts).mean())


In [7]:
# Baseline: ordinary least squares on the noise-free predictor.
regressor = LinearRegression()

# Mean cross-validated score over the training rows, using every core.
cv_scores = cross_validation.cross_val_score(
    regressor,
    train[['Predictor']],
    train['Actual'],
    n_jobs=-1,
)
print("Cross Validation Score: ", cv_scores.mean())

# Refit on the full training set, then evaluate on the held-out rows.
regressor.fit(train[['Predictor']], train['Actual'])
test['Prediction-LinearRegression'] = regressor.predict(test[['Predictor']])
test_rmspe = score(test['Prediction-LinearRegression'], test['Actual'])
print("Test RMSPE: ", test_rmspe)



Cross Validation Score:  1.0
Test RMSPE:  2.1075795787320918e-15


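Random forests flop: the cross-validation score is negative, and the test RMSPE is about 1.4% versus essentially zero for linear regression. Part of the negative cross-validation score probably comes from the setup itself: `train` is sorted, so unshuffled K-fold holds out contiguous ranges of `x`, and a tree cannot extrapolate beyond the values it was trained on.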

In [8]:
# Same experiment as the linear baseline, with a 100-tree random forest.
regressor = RandomForestRegressor(n_estimators=100)
train_X, train_y = train[['Predictor']], train['Actual']

mean_cv_score = cross_validation.cross_val_score(regressor, train_X, train_y, n_jobs=-1).mean()
print("Cross Validation Score: ", mean_cv_score)

# Fit on all training rows and store the forest's predictions for the test rows.
regressor.fit(train_X, train_y)
forest_prediction = regressor.predict(test[['Predictor']])
test['Prediction-RandomForest'] = forest_prediction
print("Test RMSPE: ", score(test['Prediction-RandomForest'], test['Actual']))

Cross Validation Score:  -1.96374249647
Test RMSPE:  0.01428864957459743


I often find better results with scikit-learn's Extra Trees regressor. Experimenting with hyperparameters is usually worthwhile.


In [9]:
# Extremely-randomised trees (100 estimators) on the noise-free predictor.
regressor = ExtraTreesRegressor(n_estimators=100)

fold_scores = cross_validation.cross_val_score(
    regressor, train[['Predictor']], train['Actual'], n_jobs=-1
)
print("Cross Validation Score: ", fold_scores.mean())

# Train on every training row, then record and score the test predictions.
regressor.fit(train[['Predictor']], train['Actual'])
test['Prediction-ExtraTrees'] = regressor.predict(test[['Predictor']])
extra_trees_rmspe = score(test['Prediction-ExtraTrees'], test['Actual'])
print("Test RMSPE: ", extra_trees_rmspe)

Cross Validation Score:  -1.62742319924
Test RMSPE:  0.002759360874809161


In [None]:
# Line-plot the columns of `test` (the actual values, the predictor and the
# prediction columns added by the cells above) against the row index.
test.plot()


##### [GRAPHS]

### Stir with randomness

In [17]:

# Add multiplicative noise: scale every predictor value by a factor drawn from
# a Gaussian with mean=1, stdev=.1, i.e. add or remove a few percent.
# A seeded generator keeps the noise (and therefore every score that follows)
# identical on each Restart & Run All.
noise_rng = numpy.random.RandomState(0)

random_percents = noise_rng.normal(1, .1, len(train['Predictor']))
train['Random'] = random_percents
train['Predictor-Rnd'] = train['Predictor'] * random_percents

# Independent draw for the test rows (same distribution, same generator).
random_percents = noise_rng.normal(1, .1, len(test['Predictor']))
test['Random'] = random_percents
test['Predictor-Rnd'] = test['Predictor'] * random_percents


The data is "roughly" the same, and Linear Regression should do great.

What is the actual error introduced by the randomness?

In [18]:
# RMSPE of the noisy predictor taken as-is against the true value, i.e. the
# error injected by the random scaling alone, before any model is fitted.
score(train['Predictor-Rnd'], train['Actual'])

0.10363934541265082

No surprise: the root-mean-square percentage error (about 0.10) is approximately the standard deviation of our random multipliers (0.1).

In [19]:
# Linear regression again, this time fed only the noisy predictor.
regressor = LinearRegression()
noisy_train_X = train[['Predictor-Rnd']]

cv_mean = cross_validation.cross_val_score(
    regressor, noisy_train_X, train['Actual'], n_jobs=-1
).mean()
print("Cross Validation Score: ", cv_mean)

# Fit on the full training set and overwrite the earlier linear predictions.
regressor.fit(noisy_train_X, train['Actual'])
test['Prediction-LinearRegression'] = regressor.predict(test[['Predictor-Rnd']])
print("Test RMSPE: ", score(test['Prediction-LinearRegression'], test['Actual']))

Cross Validation Score:  0.493137491452
Test RMSPE:  0.3601612793949942


In [20]:
# Extra Trees (50 estimators) fed only the noisy predictor.
regressor = ExtraTreesRegressor(n_estimators=50)

noisy_cv_scores = cross_validation.cross_val_score(
    regressor,
    train[['Predictor-Rnd']],
    train['Actual'],
    n_jobs=-1,
)
print("Cross Validation Score: ", noisy_cv_scores.mean())

# Fit on all training rows; this replaces the earlier Extra Trees column.
regressor.fit(train[['Predictor-Rnd']], train['Actual'])
noisy_prediction = regressor.predict(test[['Predictor-Rnd']])
test['Prediction-ExtraTrees'] = noisy_prediction
print("Test RMSPE: ", score(test['Prediction-ExtraTrees'], test['Actual']))

Cross Validation Score:  -2.39079215879
Test RMSPE:  0.13836775018433242


What if we give the model a hint by telling it what the random scaling was?

In [27]:
# Linear regression with two inputs: the noisy predictor and the exact
# multiplier that produced it.
features = ['Predictor-Rnd', 'Random']
regressor = LinearRegression()

hint_cv_scores = cross_validation.cross_val_score(
    regressor, train[features], train['Actual'], n_jobs=-1
)
print("Cross Validation Score: ", hint_cv_scores.mean())

# Refit on the whole training set and score the held-out rows.
regressor.fit(train[features], train['Actual'])
test['Prediction-LinearRegression'] = regressor.predict(test[features])
hint_rmspe = score(test['Prediction-LinearRegression'], test['Actual'])
print("Test RMSPE: ", hint_rmspe)

Cross Validation Score:  0.756845572826
Test RMSPE:  0.9925517607141426


Linear Regression's test RMSPE actually gets worse (0.36 → 0.99) when we give it the random hint, even though its cross-validation score improves (0.49 → 0.76).

In [33]:
# Extra Trees (50 estimators) given both the noisy predictor and the
# multiplier that produced it.
features = ['Predictor-Rnd', 'Random']
regressor = ExtraTreesRegressor(n_estimators=50)
hint_train_X, hint_train_y = train[features], train['Actual']

mean_score = cross_validation.cross_val_score(regressor, hint_train_X, hint_train_y, n_jobs=-1).mean()
print("Cross Validation Score: ", mean_score)

# Fit on every training row, then record and evaluate the test predictions.
regressor.fit(hint_train_X, hint_train_y)
test['Prediction-ExtraTrees'] = regressor.predict(test[features])
print("Test RMSPE: ", score(test['Prediction-ExtraTrees'], test['Actual']))

Cross Validation Score:  -1.94586594326
Test RMSPE:  0.025160345419728884


## Results:
- With a purely linear mapping, Linear Regression performs best
- With some noise, Extra Trees performs better
- When there's noise, plus a feature that "explains" the noise, Extra Trees performs even better, while Linear Regression degrades
- Still, even with predictors that fully explain the result, Extra Trees doesn't quite hit perfection (test RMSPE of about 2.5%)

# What if we built an ensemble?

## Linear Regression on top of Linear Estimates from multiple models rocks.

## What if we add a chunk of non-linearity?

In [None]:
# NOTE(review): `df` is not defined in any cell visible above, so this raises
# NameError on a fresh kernel. Presumably `all_df` was intended -- confirm.
df.head()

In [None]:
def reduceIfSmall
df['Predictor-nonlinear'] = df['Predictor'].map(lambda x)