# Zilow Zestimate

In [1]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from random import shuffle
from gc import collect

from bokeh.charts import show
from bokeh.plotting import figure
from bokeh.layouts import gridplot
from bokeh.io import output_notebook
output_notebook()

import warnings

warnings.filterwarnings("ignore")

The bokeh.charts API has moved to a separate 'bkcharts' package.

This compatibility shim will remain until Bokeh 1.0 is released.
After that, if you want to use this API you will have to install
the bkcharts package explicitly.

  warn(message)


# The Data

In [2]:
def normalize(column):
    norm_column = (column - column.mean()) / (column.max() - column.min())
    return norm_column
    
def normalize_columns(data, columns):
    df = data.copy()
    for column in columns:
        df[column] = normalize(df[column])
    return df

In [3]:
# data = pd.read_csv("../data/train_complete_2016.csv")
# data.shape

train = pd.read_csv("../data/train.csv", index_col=0)
test = pd.read_csv("../data/test.csv", index_col=0)

# test_target = test["logerror"]
# train_target = train["logerror"]

# del test["logerror"]
# del train["logerror"]

In [4]:
train.head()

Unnamed: 0,parcelid,logerror,transactiondate,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
66450,12149460,0.0363,2016-08-12,1.0,,,3.0,4.0,,4.0,...,,,180637.0,722923.0,2015.0,542286.0,8080.91,,,60373000000000.0
16762,11962551,0.0218,2016-03-16,1.0,,,4.0,3.0,,4.0,...,,,182151.0,291440.0,2015.0,109289.0,3690.38,,,60371860000000.0
63556,10734901,0.01,2016-08-04,1.0,,,3.0,3.0,,4.0,...,,,141380.0,193171.0,2015.0,51791.0,6234.7,,,60371370000000.0
36025,11104067,0.006,2016-05-17,,,,3.0,4.0,,7.0,...,,,163569.0,272612.0,2015.0,109043.0,4848.37,,,60379200000000.0
30783,12678699,-0.0111,2016-04-29,,,,2.0,3.0,,7.0,...,,,27108.0,91344.0,2015.0,64236.0,1305.15,,,60376510000000.0


In [5]:
test.head()

Unnamed: 0,parcelid,logerror,transactiondate,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
1,14366692,-0.1684,2016-01-01,,,,3.5,4.0,,,...,,,346458.0,585529.0,2015.0,239071.0,10153.02,,,
3,12643413,0.0218,2016-01-02,1.0,,,2.0,2.0,,4.0,...,,,171518.0,244880.0,2015.0,73362.0,3048.74,,,60372960000000.0
4,14432541,-0.005,2016-01-02,,,,2.5,4.0,,,...,2.0,,169574.0,434551.0,2015.0,264977.0,5488.96,,,60590420000000.0
5,11509835,-0.2705,2016-01-02,1.0,,,4.0,4.0,,1.0,...,,,880650.0,2447951.0,2015.0,1567301.0,27126.57,,,60376210000000.0
11,11672170,-0.0161,2016-01-03,1.0,,,4.0,5.0,,1.0,...,,,559040.0,1090127.0,2015.0,531087.0,13428.94,,,60372630000000.0


In [6]:
numeric_columns = ['basementsqft', 'bathroomcnt', 'bedroomcnt', 
                   'threequarterbathnbr', 'finishedfloor1squarefeet', 
                   'calculatedfinishedsquarefeet', 
                   'finishedsquarefeet6', 'finishedsquarefeet12', 'finishedsquarefeet13', 
                   'finishedsquarefeet15', 'finishedsquarefeet50', 'fireplacecnt', 
                   'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'latitude', 
                   'longitude', 'lotsizesquarefeet', 'poolsizesum', 'roomcnt', 'unitcnt', 
                   'yardbuildingsqft17', 'yardbuildingsqft26', 'yearbuilt', 'taxamount',
                   'structuretaxvaluedollarcnt', 'landtaxvaluedollarcnt']

### Attribute correlation
Plotting attribute correlation with the target column.

In [7]:
def plot_correlation(data, column_name, alpha=0.2, sample_ratio=1):
    p = figure(plot_width=300, plot_height=300, title=column_name + " vs " + "logerror", tools=["xwheel_zoom", "xpan", "save"])
    n_sample = int(sample_ratio * len(data))
    data_plot = data.sample(n_sample)
    p.circle(data_plot[column_name], data_plot["logerror"], alpha=alpha)
    return p

def correlations_df(data, column_name):
    corr = data[column_name].corr(data["logerror"])
    sign = '+' if corr >= 0 else '-'
    corr = corr if corr >= 0 else corr * -1
    
    return corr_df.append(pd.DataFrame([{"attribute": column_name, "sign": sign, "corr": corr}]))
    

In [63]:
corr_df = pd.DataFrame()
grid = [[]]

for column_name in numeric_columns: 
    corr_df = correlations_df(train, column_name)
    
    p = plot_correlation(train, column_name, alpha=0.1, sample_ratio=0.25)
    
    if len(grid[-1]) % 3 == 0:
        grid.append([p])
    else:
        grid[-1].append(p)
        
show(gridplot(grid))

W-1002 (EMPTY_LAYOUT): Layout has no children: Row(id='40a585a6-ea2e-4e8e-8a70-96c414ccad4f', ...)


In [9]:
# corr_df.sort_values("corr", ascending=False)

### Missing values
Replacing missing values with column <b>mode</b>

In [10]:
def column_mode(column):
    return column.value_counts().sort_values(ascending=False).index.tolist()[0]

In [65]:
data = train.append(test)

In [12]:
sample5 = train.sample(5)
sample5[numeric_columns]

Unnamed: 0,basementsqft,bathroomcnt,bedroomcnt,threequarterbathnbr,finishedfloor1squarefeet,calculatedfinishedsquarefeet,finishedsquarefeet6,finishedsquarefeet12,finishedsquarefeet13,finishedsquarefeet15,...,lotsizesquarefeet,poolsizesum,roomcnt,unitcnt,yardbuildingsqft17,yardbuildingsqft26,yearbuilt,taxamount,structuretaxvaluedollarcnt,landtaxvaluedollarcnt
41094,,1.0,3.0,,,1441.0,,1441.0,,,...,5498.0,,0.0,1.0,,,1920.0,8052.24,107000.0,480000.0
49510,,1.5,3.0,1.0,,1721.0,,1721.0,,,...,6060.0,,7.0,,,,1962.0,1429.14,49963.0,25779.0
6381,,2.0,4.0,,,1488.0,,,,1488.0,...,5015.0,,0.0,2.0,,,1966.0,1780.89,95645.0,35139.0
84643,,2.0,2.0,,,1492.0,,1492.0,,,...,7440.0,,5.0,,,,1955.0,5182.6,95836.0,337880.0
80937,,2.0,3.0,,,1396.0,,1396.0,,,...,10095.0,,0.0,1.0,,,1962.0,1245.93,51382.0,25681.0


In [13]:
for column_name in data.columns.tolist():
    mode = column_mode(data[column_name])
    data[column_name] = data[column_name].fillna(mode)

In [14]:
sample5 = data.loc[sample5.index.tolist()]
sample5[numeric_columns]

Unnamed: 0,basementsqft,bathroomcnt,bedroomcnt,threequarterbathnbr,finishedfloor1squarefeet,calculatedfinishedsquarefeet,finishedsquarefeet6,finishedsquarefeet12,finishedsquarefeet13,finishedsquarefeet15,...,lotsizesquarefeet,poolsizesum,roomcnt,unitcnt,yardbuildingsqft17,yardbuildingsqft26,yearbuilt,taxamount,structuretaxvaluedollarcnt,landtaxvaluedollarcnt
41094,1528.0,1.0,3.0,1.0,1260.0,1441.0,720.0,1441.0,1440.0,1680.0,...,5498.0,450.0,0.0,1.0,240.0,120.0,1920.0,8052.24,107000.0,480000.0
49510,1528.0,1.5,3.0,1.0,1260.0,1721.0,720.0,1721.0,1440.0,1680.0,...,6060.0,450.0,7.0,1.0,240.0,120.0,1962.0,1429.14,49963.0,25779.0
6381,1528.0,2.0,4.0,1.0,1260.0,1488.0,720.0,1200.0,1440.0,1488.0,...,5015.0,450.0,0.0,2.0,240.0,120.0,1966.0,1780.89,95645.0,35139.0
84643,1528.0,2.0,2.0,1.0,1260.0,1492.0,720.0,1492.0,1440.0,1680.0,...,7440.0,450.0,5.0,1.0,240.0,120.0,1955.0,5182.6,95836.0,337880.0
80937,1528.0,2.0,3.0,1.0,1260.0,1396.0,720.0,1396.0,1440.0,1680.0,...,10095.0,450.0,0.0,1.0,240.0,120.0,1962.0,1245.93,51382.0,25681.0


In [15]:
for column in data.columns.tolist():
    #columns must not contains nan
    assert (False == (True in data[column].isnull().tolist()))

##### Normalizing the data

In [66]:
norm_data = normalize_columns(data, numeric_columns)

sample5 = norm_data.loc[sample5.index.tolist()]
sample5[numeric_columns]

Unnamed: 0,basementsqft,bathroomcnt,bedroomcnt,threequarterbathnbr,finishedfloor1squarefeet,calculatedfinishedsquarefeet,finishedsquarefeet6,finishedsquarefeet12,finishedsquarefeet13,finishedsquarefeet15,...,lotsizesquarefeet,poolsizesum,roomcnt,unitcnt,yardbuildingsqft17,yardbuildingsqft26,yearbuilt,taxamount,structuretaxvaluedollarcnt,landtaxvaluedollarcnt
41094,0.000265,-0.063891,-0.00198,-0.000385,-0.000876,-0.014276,-0.001053,-0.013649,2.4e-05,-0.001239,...,-0.002997,-0.000433,-0.081655,-0.0005,-0.00077,-0.000149,-0.371849,0.006508,-0.007266,0.008293
49510,0.000265,-0.038891,-0.00198,-0.000385,-0.000876,-0.001963,-0.001053,0.000343,2.4e-05,-0.001239,...,-0.002917,-0.000433,0.307234,-0.0005,-0.00077,-0.000149,-0.048772,-0.014068,-0.013,-0.010246
6381,0.000265,-0.013891,0.06052,-0.000385,-0.000876,-0.012209,-0.001053,-0.025692,2.4e-05,-0.009895,...,-0.003066,-0.000433,-0.081655,0.006543,-0.00077,-0.000149,-0.018003,-0.012975,-0.008408,-0.009864
84643,0.000265,-0.013891,-0.06448,-0.000385,-0.000876,-0.012034,-0.001053,-0.0111,2.4e-05,-0.001239,...,-0.002719,-0.000433,0.196123,-0.0005,-0.00077,-0.000149,-0.102618,-0.002407,-0.008389,0.002493
80937,0.000265,-0.013891,-0.00198,-0.000385,-0.000876,-0.016255,-0.001053,-0.015898,2.4e-05,-0.001239,...,-0.002338,-0.000433,-0.081655,-0.0005,-0.00077,-0.000149,-0.048772,-0.014637,-0.012857,-0.01025


In [81]:
data = norm_data
train = data.iloc[train.index.tolist()]
test = data.iloc[test.index.tolist()]

data = None
norm_data = None
collect()

7410

# Modeling v1 - Raw Attributes

In [18]:
from sklearn.linear_model import SGDRegressor, Ridge
from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest
from numpy import median

In [19]:
def scorer(estimator, X, y):
    pred = estimator.predict(X)
    return mean_absolute_error(pred, y)

def scores(model, train, train_target, test, test_target, k, scorer=scorer):
    trained_model = model.fit(train, train_target)
    
    test_score = scorer(trained_model, test, test_target)
    train_score = scorer(trained_model, train, train_target)

    cv_scores = cross_val_score(model, train, train_target, cv=k, scoring=scorer)
    
    return {"train": train_score, "test": test_score, "validation": median(cv_scores)}

##### Use columns

In [20]:
skb = SelectKBest(k=10)
fit = skb.fit(train[numeric_columns], train["logerror"])

use_columns = train[numeric_columns].columns[fit.get_support()].tolist()
use_columns

['bathroomcnt',
 'threequarterbathnbr',
 'calculatedfinishedsquarefeet',
 'finishedsquarefeet12',
 'finishedsquarefeet15',
 'fullbathcnt',
 'yearbuilt',
 'taxamount',
 'structuretaxvaluedollarcnt',
 'landtaxvaluedollarcnt']

In [21]:
results = []
k = 5

##### Train and test partitioning

In [22]:
# train, test = partition(data[use_columns + ["logerror"]], train_proportion=0.7)

In [23]:
collect()

7

### <font color="blue">Linear Regression</font>

##### Stochastic Gradient Descent Regressor

In [None]:
model = SGDRegressor(alpha=0.001, max_iter=1000)
scores_dict = scores(model, train[use_columns], train["logerror"], 
                            test[use_columns],  test["logerror"], k, scorer=scorer)
scores_dict

In [None]:
results.append({"model": "linear SGDRegressor", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "alpha0.001, max_iter=1000"})

In [None]:
model = SGDRegressor(alpha=0.0001, max_iter=2000)
scores_dict = scores(model, train[use_columns], train["logerror"], 
                            test[use_columns],  test["logerror"], k, scorer=scorer)
scores_dict

In [None]:
results.append({"model": "linear SGDRegressor", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "alpha0.0001, max_iter=2000"})

In [None]:
model = SGDRegressor(alpha=0.0001, max_iter=1000)
scores_dict = scores(model, train[use_columns], train["logerror"], 
                            test[use_columns],  test["logerror"], k, scorer=scorer)
scores_dict

In [None]:
results.append({"model": "linear SGDRegressor", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "alpha0.0001, max_iter=1000"})

##### Ridge

In [None]:
model = Ridge()
scores_dict = scores(model, train[use_columns], train["logerror"], 
                            test[use_columns],  test["logerror"], k, scorer=scorer)
scores_dict

In [None]:
results.append({"model": "linear Ridge", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"]})

In [None]:
model = Ridge(alpha=2.0)
scores_dict = scores(model, train[use_columns], train["logerror"], 
                            test[use_columns],  test["logerror"], k, scorer=scorer)
scores_dict

In [None]:
results.append({"model": "linear Ridge", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "alpha=2"})

In [None]:
model = Ridge(alpha=10.0)
scores_dict = scores(model, train[use_columns], train["logerror"], 
                            test[use_columns],  test["logerror"], k, scorer=scorer)
scores_dict

In [None]:
results.append({"model": "linear Ridge", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "alpha=10"})

In [None]:
model = Ridge(alpha=0.5)
scores_dict = scores(model, train[use_columns], train["logerror"], 
                            test[use_columns],  test["logerror"], k, scorer=scorer)
scores_dict

In [None]:
results.append({"model": "linear Ridge", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "alpha=0.5"})

In [None]:
model = Ridge(alpha=0.2)
scores_dict = scores(model, train[use_columns], train["logerror"], 
                            test[use_columns],  test["logerror"], k, scorer=scorer)
scores_dict

In [None]:
results.append({"model": "linear Ridge", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "alpha=0.2"})

### <font color="blue">Polynomial Regression</font>

In [24]:
from sklearn.preprocessing import PolynomialFeatures

def polynomial_features(df, degree=2):
    poly = PolynomialFeatures(degree=degree)
    polinomial_data = pd.DataFrame(poly.fit_transform(df))
    polinomial_data.columns = poly.get_feature_names()
    return polinomial_data

In [25]:
poly_test = polynomial_features(test[use_columns], degree=3)
poly_test.head()

poly_train = polynomial_features(train[use_columns], degree=3)
poly_train.head()

Unnamed: 0,1,x0,x1,x2,x3,x4,x5,x6,x7,x8,...,x7^3,x7^2 x8,x7^2 x9,x7 x8^2,x7 x8 x9,x7 x9^2,x8^3,x8^2 x9,x8 x9^2,x9^3
0,1.0,0.036109,-0.000385,0.048611,0.057812,-0.001239,0.040175,0.297382,0.02772,0.020478,...,2.129953e-05,1.573498e-05,1.072184e-06,1.162418e-05,7.920732e-07,5.397197e-08,8.587337e-06,5.851423e-07,3.987167e-08,2.716861e-09
1,1.0,0.036109,-0.000385,0.028074,0.034475,-0.001239,0.040175,-0.141079,0.024734,0.005759,...,1.513096e-05,3.523404e-06,1.544065e-05,8.204617e-07,3.595519e-06,1.575668e-05,1.910531e-07,8.372544e-07,3.66911e-06,1.607918e-05
2,1.0,-0.013891,-0.000385,-0.012165,-0.01125,-0.001239,-0.012456,0.135844,-0.007943,-0.003389,...,-5.011686e-07,-2.138371e-07,-6.098546e-07,-9.123938e-08,-2.602109e-07,-7.421108e-07,-3.892974e-08,-1.11026e-07,-3.166416e-07,-9.030488e-07
3,1.0,-0.013891,-0.000385,-0.023863,-0.024543,-0.001239,-0.012456,-0.11031,-0.015733,-0.015343,...,-3.894692e-06,-3.797994e-06,-2.563864e-06,-3.703697e-06,-2.500209e-06,-1.687785e-06,-3.611742e-06,-2.438133e-06,-1.64588e-06,-1.111064e-06
4,1.0,-0.013891,-0.000385,-0.007988,-0.006503,-0.001239,-0.012456,-0.341079,0.004268,0.002437,...,7.77279e-08,4.439313e-08,8.133659e-08,2.535448e-08,4.645418e-08,8.511281e-08,1.448083e-08,2.653161e-08,4.861091e-08,8.906435e-08


In [26]:
skb = SelectKBest(k=10)
fit = skb.fit(poly_train, train["logerror"])

use_poly_columns = poly_train.columns[fit.get_support()].tolist()
poly_train = poly_train[use_poly_columns]
print(poly_train.columns.tolist())
poly_train.head()

['x0 x4 x9', 'x2 x4 x7', 'x2 x4 x9', 'x4^2 x7', 'x4^2 x9', 'x4 x5 x9', 'x4 x7^2', 'x4 x7 x9', 'x4 x8 x9', 'x4 x9^2']


Unnamed: 0,x0 x4 x9,x2 x4 x7,x2 x4 x9,x4^2 x7,x4^2 x9,x4 x5 x9,x4 x7^2,x4 x7 x9,x4 x8 x9,x4 x9^2
0,-6.241286e-08,-1.669161e-06,-8.402281e-08,4.253379e-08,2.141081e-09,-6.944187e-08,-9.51814e-07,-4.791276e-08,-3.539544e-08,-2.41185e-09
1,-1.128945e-06,-8.601249e-07,-8.777294e-07,3.795185e-08,3.872862e-08,-1.256089e-06,-7.577915e-07,-7.733015e-07,-1.800714e-07,-7.89129e-07
2,-1.663218e-07,-1.197004e-07,-1.456593e-07,-1.218818e-08,-1.483137e-08,-1.491399e-07,-7.815581e-08,-9.510508e-08,-4.057915e-08,-1.157301e-07
3,-1.782209e-07,-4.650818e-07,-3.06162e-07,-2.414173e-08,-1.589243e-08,-1.598097e-07,-3.066343e-07,-2.018565e-07,-1.968448e-07,-1.328815e-07
4,7.684463e-08,4.222622e-08,4.418666e-08,6.548418e-09,6.852442e-09,6.890617e-08,-2.256091e-08,-2.360835e-08,-1.348356e-08,-2.470442e-08


##### SGD Regressor

In [None]:
model = SGDRegressor(max_iter=2000)
scores_dict = scores(model, poly_train, train["logerror"], 
                            poly_test[poly_train.columns.tolist()],  test["logerror"], k, scorer=scorer)
scores_dict

In [None]:
results.append({"model": "Polynomial SGD Regressor", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"],
                "score_cv": scores_dict["validation"], 
                "tags": "degree=4"})

##### Ridge

In [None]:
model = Ridge()
scores_dict = scores(model, poly_train, train["logerror"], 
                            poly_test[poly_train.columns.tolist()],  test["logerror"], k, scorer=scorer)
scores_dict

In [None]:
results.append({"model": "Polynomial Ridge", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"],
                "score_cv": scores_dict["validation"], 
                "tags": "degree=4"})

In [None]:
model = Ridge(alpha=0.5)
scores_dict = scores(model, poly_train, train["logerror"], 
                            poly_test[poly_train.columns.tolist()],  test["logerror"], k, scorer=scorer)
scores_dict

In [None]:
results.append({"model": "Polynomial Ridge", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "degree=4, alpha=0.5"})

In [None]:
model = Ridge(alpha=10)
scores_dict = scores(model, poly_train, train["logerror"], 
                            poly_test[poly_train.columns.tolist()],  test["logerror"], k, scorer=scorer)
scores_dict

In [None]:
results.append({"model": "Polynomial Ridge", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "degree=4, alpha=10.0"})

In [None]:
results_df = pd.DataFrame(results)
results_df.sort_values("score_cv")

In [None]:
results_df.to_csv("results_df.csv", index=False)

In [27]:
mlp_results = []

##### Multi Layer Perceptron Regressor

In [None]:
# [int(len(train[use_columns].columns)) * 1.5] * 10

In [28]:
for hidden_layer_sizes in [10,50,100,300,500,700,900]:
    model = MLPRegressor(hidden_layer_sizes=[int(len(train[use_columns].columns) * 1.5)] * hidden_layer_sizes)
    print("hidden_layer_sizes:", hidden_layer_sizes)
    scores_dict = scores(model, train[use_columns], train["logerror"], 
                            test[use_columns],  test["logerror"], k, scorer=scorer)
    mlp_results.append({"model": "linear MLP", "score_test": scores_dict["test"], "score_train": scores_dict["train"], "score_cv": scores_dict["validation"], "tags": "hidden_layer_sizes=" + str(hidden_layer_sizes)})
    print(pd.DataFrame(mlp_results))

hidden_layer_sizes: 10
        model  score_cv  score_test  score_train                   tags
0  linear MLP  0.069097    0.069247     0.069418  hidden_layer_sizes=10
hidden_layer_sizes: 50
        model  score_cv  score_test  score_train                   tags
0  linear MLP  0.069097    0.069247     0.069418  hidden_layer_sizes=10
1  linear MLP  0.069145    0.069128     0.069323  hidden_layer_sizes=50
hidden_layer_sizes: 100
        model  score_cv  score_test  score_train                    tags
0  linear MLP  0.069097    0.069247     0.069418   hidden_layer_sizes=10
1  linear MLP  0.069145    0.069128     0.069323   hidden_layer_sizes=50
2  linear MLP  0.071099    0.069052     0.069254  hidden_layer_sizes=100
hidden_layer_sizes: 300
        model  score_cv  score_test  score_train                    tags
0  linear MLP  0.069097    0.069247     0.069418   hidden_layer_sizes=10
1  linear MLP  0.069145    0.069128     0.069323   hidden_layer_sizes=50
2  linear MLP  0.071099    0.069052

In [29]:
for hidden_layer_sizes in [10,50,100,300,500,700, 900]:
    model = MLPRegressor(hidden_layer_sizes=[int(len(poly_train.columns) * 1)] * hidden_layer_sizes)
    print("hidden_layer_sizes:", hidden_layer_sizes)
    scores_dict = scores(model, poly_train, train["logerror"], 
                            poly_test[poly_train.columns.tolist()],  test["logerror"], k, scorer=scorer)
    mlp_results.append({"model": "polynomial MLP", "score_test": scores_dict["test"], "score_train": scores_dict["train"], "score_cv": scores_dict["validation"], "tags": "hidden_layer_sizes=" + str(hidden_layer_sizes)})
    print(pd.DataFrame(mlp_results))

hidden_layer_sizes: 10
            model  score_cv  score_test  score_train                    tags
0      linear MLP  0.069097    0.069247     0.069418   hidden_layer_sizes=10
1      linear MLP  0.069145    0.069128     0.069323   hidden_layer_sizes=50
2      linear MLP  0.071099    0.069052     0.069254  hidden_layer_sizes=100
3      linear MLP  0.069216    0.069340     0.069522  hidden_layer_sizes=300
4      linear MLP  0.069920    0.069137     0.069332  hidden_layer_sizes=500
5      linear MLP  0.069643    0.069040     0.069243  hidden_layer_sizes=700
6      linear MLP  0.069072    0.068937     0.069147  hidden_layer_sizes=900
7  polynomial MLP  0.069099    0.069272     0.069457   hidden_layer_sizes=10
hidden_layer_sizes: 50
            model  score_cv  score_test  score_train                    tags
0      linear MLP  0.069097    0.069247     0.069418   hidden_layer_sizes=10
1      linear MLP  0.069145    0.069128     0.069323   hidden_layer_sizes=50
2      linear MLP  0.071099   

In [31]:
mlp_results_df = pd.DataFrame(mlp_results).sort_values("score_cv")

In [32]:
mlp_results_df["hidden_layer_sizes"] = mlp_results_df["tags"].apply(lambda value: int(value.split("=")[1]))
mlp_results_df = mlp_results_df.sort_values("hidden_layer_sizes")
mlp_results_df.sort_values("score_cv")

Unnamed: 0,model,score_cv,score_test,score_train,tags,hidden_layer_sizes
6,linear MLP,0.069072,0.068937,0.069147,hidden_layer_sizes=900,900
0,linear MLP,0.069097,0.069247,0.069418,hidden_layer_sizes=10,10
7,polynomial MLP,0.069099,0.069272,0.069457,hidden_layer_sizes=10,10
1,linear MLP,0.069145,0.069128,0.069323,hidden_layer_sizes=50,50
3,linear MLP,0.069216,0.06934,0.069522,hidden_layer_sizes=300,300
9,polynomial MLP,0.069269,0.070406,0.070532,hidden_layer_sizes=100,100
12,polynomial MLP,0.069286,0.071103,0.071202,hidden_layer_sizes=700,700
13,polynomial MLP,0.069315,0.069185,0.069451,hidden_layer_sizes=900,900
11,polynomial MLP,0.069315,0.069193,0.06946,hidden_layer_sizes=500,500
5,linear MLP,0.069643,0.06904,0.069243,hidden_layer_sizes=700,700


In [33]:
mlp_results_df.to_csv("mlp_results_df.csv", index=False)

In [34]:
from bokeh.plotting import figure
from bokeh.io import output_notebook, show
from bokeh.layouts import gridplot, row
output_notebook()

In [35]:
poli_mlp = mlp_results_df[mlp_results_df["model"] == "polynomial MLP"]
linear_mlp = mlp_results_df[mlp_results_df["model"] == "linear MLP"]

In [59]:
def plot_mlp_scores(data, range_y, title):
    p = figure(width=400, height=400, tools=["save", "xpan", "xwheel_zoom", "reset"], 
               x_axis_label = "n_layers", y_axis_label = "score", title=title, y_range=range_y)
    
    p.line(x=data["hidden_layer_sizes"], y=data["score_train"], 
       line_width=2, color="red", legend="score_train")
    
    p.line(x=data["hidden_layer_sizes"], y=data["score_cv"], 
       line_width=2, color="green", legend="score_cv")
    
#     p.line(x=data["hidden_layer_sizes"], y=data["score_test"], 
#        line_width=2, color="blue", legend="score_test")
    
    return p

In [62]:
range_y = (0.068, 0.073)

p1 = plot_mlp_scores(linear_mlp, range_y, title="Linear MLP")
p2 = plot_mlp_scores(poli_mlp, range_y, title="Polynomial MLP")
grid = gridplot([[p1, p2]])
show(grid)