# Modeling v2 - The New Features 

In [1]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from numpy import median
from gc import collect

# from bokeh.charts import show
# from bokeh.plotting import figure
# from bokeh.layouts import gridplot
# from bokeh.io import output_notebook
# output_notebook()

import warnings

warnings.filterwarnings("ignore")

from sklearn.neighbors import NearestNeighbors
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import SGDRegressor, Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score

In [2]:
def column_mode(column):
    return column.value_counts().sort_values(ascending=False).index.tolist()[0]

def scorer(estimator, X, y):
    pred = estimator.predict(X)
    return mean_absolute_error(pred, y)

def scores(model, train, train_target, test, test_target, k, scorer=scorer):
    trained_model = model.fit(train, train_target)
    
    test_score = scorer(trained_model, test, test_target)
    train_score = scorer(trained_model, train, train_target)

    cv_scores = cross_val_score(model, train, train_target, cv=k, scoring=scorer)
    
    return {"train": train_score, "test": test_score, "validation": median(cv_scores)}

In [3]:
numeric_columns = ['basementsqft', 'bathroomcnt', 'bedroomcnt', 
                   'threequarterbathnbr', 'finishedfloor1squarefeet', 
                   'calculatedfinishedsquarefeet', 
                   'finishedsquarefeet6', 'finishedsquarefeet12', 'finishedsquarefeet13', 
                   'finishedsquarefeet15', 'finishedsquarefeet50', 'fireplacecnt', 
                   'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'latitude', 
                   'longitude', 'lotsizesquarefeet', 'poolsizesum', 'roomcnt', 'unitcnt', 
                   'yardbuildingsqft17', 'yardbuildingsqft26', 'yearbuilt', 'taxamount',
                   'structuretaxvaluedollarcnt', 'landtaxvaluedollarcnt']

### Geolocalization Data

In [4]:
geoloc_train = pd.read_csv("../data/train.csv")[["parcelid", "logerror" ,"latitude", "longitude"]]
geoloc_train = geoloc_train[geoloc_train["latitude"].notnull()]
geoloc_train.head()

Unnamed: 0,parcelid,logerror,latitude,longitude
0,12149460,0.0363,34225911.0,-118238480.0
1,11962551,0.0218,34109753.0,-118227743.0
2,10734901,0.01,34172194.0,-118644314.0
3,11104067,0.006,34429140.0,-118466425.0
4,12678699,-0.0111,33835813.0,-118358044.0


In [5]:
geoloc_test = pd.read_csv("../data/test.csv")[["parcelid", "logerror" ,"latitude", "longitude"]]

for column_name in ["latitude", "longitude"]:
    mode = column_mode(geoloc_test[column_name])
    geoloc_test[column_name] = geoloc_test[column_name].fillna(mode)
geoloc_test.head()

Unnamed: 0,parcelid,logerror,latitude,longitude
0,14366692,-0.1684,33668120.0,-117677556.0
1,12643413,0.0218,33755800.0,-118309000.0
2,14432541,-0.005,33485643.0,-117700234.0
3,11509835,-0.2705,33870089.0,-118402768.0
4,11672170,-0.0161,34072220.0,-118547322.0


In [6]:
nbrs = NearestNeighbors(n_neighbors=4).fit(geoloc_train[["latitude", "longitude"]])

In [7]:
distances_train, indices_train = nbrs.kneighbors(geoloc_train[["latitude", "longitude"]])
geoloc_nn_train = pd.DataFrame(indices_train)
geoloc_nn_train.head()

Unnamed: 0,0,1,2,3
0,0,49784,48917,33909
1,1,5722,45897,48565
2,2,59270,46722,59939
3,3,42881,21948,4544
4,4,59642,5195,40087


In [8]:
distances_test, indices_test = nbrs.kneighbors(geoloc_test[["latitude", "longitude"]])
geoloc_nn_test = pd.DataFrame(indices_test)
geoloc_nn_test.head()

Unnamed: 0,0,1,2,3
0,33827,930,4178,47902
1,49528,24109,19081,34443
2,6283,50993,58290,34681
3,21109,23291,2776,50785
4,62319,31040,53464,34332


### New feature: 3nn_logerror_median

The new feature <b>3nn_logerror_median</b> represents the logerror median of the three known closests neighbors.

In [9]:
def map_index_logerror(index, data):
    return data.iloc[index]["logerror"]

def map_index_parcelid(index, data):
    return int(data.iloc[index]["parcelid"])

def get_n_neighbors_logerror(k, geoloc_data, neighbors_indexes, is_train):
    nn_logerror = pd.DataFrame()
    if is_train:
        from_index = 1
    else:
        from_index = 0
        
    k = k + from_index
        
    print(list(range(from_index, k)))
    for column in list(range(from_index, k)):
        nn_logerror[str(column)] = neighbors_indexes[column].apply(lambda value: map_index_logerror(value, geoloc_data))

    return nn_logerror

In [10]:
nn_logerror_train = get_n_neighbors_logerror(2, geoloc_data=geoloc_train, neighbors_indexes=geoloc_nn_train, is_train=True)
nn_logerror_train.head()

[1, 2]


Unnamed: 0,1,2
0,0.0354,-0.0661
1,0.1115,0.0677
2,-0.0398,0.0545
3,0.0296,0.006
4,-0.0315,0.0129


In [11]:
nn_logerror_test = get_n_neighbors_logerror(2, geoloc_data=geoloc_train, neighbors_indexes=geoloc_nn_test, is_train=False)
nn_logerror_test.head()

[0, 1]


Unnamed: 0,0,1
0,-0.1625,-0.0471
1,0.009,-0.005
2,-0.0212,0.0478
3,-0.0954,0.2343
4,0.001,-0.007


In [12]:
nn_logerror_median_train = nn_logerror_train.apply(lambda row: median(row.tolist()), axis=1)
geoloc_train["nn_logerror_median"] = nn_logerror_median_train.tolist()
print(len(geoloc_train))
geoloc_train.head()

63196


Unnamed: 0,parcelid,logerror,latitude,longitude,nn_logerror_median
0,12149460,0.0363,34225911.0,-118238480.0,-0.01535
1,11962551,0.0218,34109753.0,-118227743.0,0.0896
2,10734901,0.01,34172194.0,-118644314.0,0.00735
3,11104067,0.006,34429140.0,-118466425.0,0.0178
4,12678699,-0.0111,33835813.0,-118358044.0,-0.0093


In [13]:
nn_logerror_median_test = nn_logerror_test.apply(lambda row: median(row.tolist()), axis=1)
geoloc_test["nn_logerror_median"] = nn_logerror_median_test.tolist()
print(len(geoloc_test))
geoloc_test.head()

27244


Unnamed: 0,parcelid,logerror,latitude,longitude,nn_logerror_median
0,14366692,-0.1684,33668120.0,-117677556.0,-0.1048
1,12643413,0.0218,33755800.0,-118309000.0,0.002
2,14432541,-0.005,33485643.0,-117700234.0,0.0133
3,11509835,-0.2705,33870089.0,-118402768.0,0.06945
4,11672170,-0.0161,34072220.0,-118547322.0,-0.003


In [14]:
train = pd.read_csv("../data/train.csv", index_col=0)
test = pd.read_csv("../data/test.csv", index_col=0)

In [15]:
train = train.merge(geoloc_train[["parcelid", "nn_logerror_median"]], on="parcelid", how="inner") 
train.head()

Unnamed: 0,parcelid,logerror,transactiondate,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,...,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,nn_logerror_median
0,12149460,0.0363,2016-08-12,1.0,,,3.0,4.0,,4.0,...,,180637.0,722923.0,2015.0,542286.0,8080.91,,,60373000000000.0,-0.01535
1,11962551,0.0218,2016-03-16,1.0,,,4.0,3.0,,4.0,...,,182151.0,291440.0,2015.0,109289.0,3690.38,,,60371860000000.0,0.0896
2,10734901,0.01,2016-08-04,1.0,,,3.0,3.0,,4.0,...,,141380.0,193171.0,2015.0,51791.0,6234.7,,,60371370000000.0,0.00735
3,11104067,0.006,2016-05-17,,,,3.0,4.0,,7.0,...,,163569.0,272612.0,2015.0,109043.0,4848.37,,,60379200000000.0,0.0178
4,12678699,-0.0111,2016-04-29,,,,2.0,3.0,,7.0,...,,27108.0,91344.0,2015.0,64236.0,1305.15,,,60376510000000.0,-0.0093


In [16]:
test = test.merge(geoloc_test[["parcelid", "nn_logerror_median"]], on="parcelid", how="inner") 
test.head()

Unnamed: 0,parcelid,logerror,transactiondate,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,...,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,nn_logerror_median
0,14366692,-0.1684,2016-01-01,,,,3.5,4.0,,,...,,346458.0,585529.0,2015.0,239071.0,10153.02,,,,-0.1048
1,12643413,0.0218,2016-01-02,1.0,,,2.0,2.0,,4.0,...,,171518.0,244880.0,2015.0,73362.0,3048.74,,,60372960000000.0,0.002
2,14432541,-0.005,2016-01-02,,,,2.5,4.0,,,...,,169574.0,434551.0,2015.0,264977.0,5488.96,,,60590420000000.0,0.0133
3,11509835,-0.2705,2016-01-02,1.0,,,4.0,4.0,,1.0,...,,880650.0,2447951.0,2015.0,1567301.0,27126.57,,,60376210000000.0,0.06945
4,11672170,-0.0161,2016-01-03,1.0,,,4.0,5.0,,1.0,...,,559040.0,1090127.0,2015.0,531087.0,13428.94,,,60372630000000.0,-0.003


### Missing Values

In [17]:
for column_name in train.columns.tolist():
    mode = column_mode(train.append(test)[column_name])
    train[column_name] = train[column_name].fillna(mode)
    
train.head()

Unnamed: 0,parcelid,logerror,transactiondate,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,...,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,nn_logerror_median
0,12149460,0.0363,2016-08-12,1.0,7.0,1528.0,3.0,4.0,4.0,4.0,...,True,180637.0,722923.0,2015.0,542286.0,8080.91,Y,14.0,60373000000000.0,-0.01535
1,11962551,0.0218,2016-03-16,1.0,7.0,1528.0,4.0,3.0,4.0,4.0,...,True,182151.0,291440.0,2015.0,109289.0,3690.38,Y,14.0,60371860000000.0,0.0896
2,10734901,0.01,2016-08-04,1.0,7.0,1528.0,3.0,3.0,4.0,4.0,...,True,141380.0,193171.0,2015.0,51791.0,6234.7,Y,14.0,60371370000000.0,0.00735
3,11104067,0.006,2016-05-17,1.0,7.0,1528.0,3.0,4.0,4.0,7.0,...,True,163569.0,272612.0,2015.0,109043.0,4848.37,Y,14.0,60379200000000.0,0.0178
4,12678699,-0.0111,2016-04-29,1.0,7.0,1528.0,2.0,3.0,4.0,7.0,...,True,27108.0,91344.0,2015.0,64236.0,1305.15,Y,14.0,60376510000000.0,-0.0093


In [18]:
for column_name in test.columns.tolist():
    mode = column_mode(train.append(test)[column_name])
    test[column_name] = test[column_name].fillna(mode)
    
test.head()

Unnamed: 0,parcelid,logerror,transactiondate,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,...,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,nn_logerror_median
0,14366692,-0.1684,2016-01-01,1.0,7.0,1528.0,3.5,4.0,4.0,7.0,...,True,346458.0,585529.0,2015.0,239071.0,10153.02,Y,14.0,60379200000000.0,-0.1048
1,12643413,0.0218,2016-01-02,1.0,7.0,1528.0,2.0,2.0,4.0,4.0,...,True,171518.0,244880.0,2015.0,73362.0,3048.74,Y,14.0,60372960000000.0,0.002
2,14432541,-0.005,2016-01-02,1.0,7.0,1528.0,2.5,4.0,4.0,7.0,...,True,169574.0,434551.0,2015.0,264977.0,5488.96,Y,14.0,60590420000000.0,0.0133
3,11509835,-0.2705,2016-01-02,1.0,7.0,1528.0,4.0,4.0,4.0,1.0,...,True,880650.0,2447951.0,2015.0,1567301.0,27126.57,Y,14.0,60376210000000.0,0.06945
4,11672170,-0.0161,2016-01-03,1.0,7.0,1528.0,4.0,5.0,4.0,1.0,...,True,559040.0,1090127.0,2015.0,531087.0,13428.94,Y,14.0,60372630000000.0,-0.003


### Normalizing Data

In [19]:
def normalize(column):
    norm_column = (column - column.mean()) / (column.max() - column.min())
    return norm_column
    
def normalize_columns(data, columns):
    df = data.copy()
    for column in columns:
        df[column] = normalize(df[column])
    return df

In [20]:
norm_data = normalize_columns(train.append(test), numeric_columns + ["nn_logerror_median"])

norm_train = norm_data.iloc[0 : train.shape[0]]
norm_test = norm_data.iloc[train.shape[0]: train.shape[0] + test.shape[0]]

train = None
test = None

#### Use Columns

In [21]:
skb = SelectKBest(k=10)
fit = skb.fit(norm_train[numeric_columns + ["nn_logerror_median"]], norm_train["logerror"])

use_columns = norm_train[numeric_columns + ["nn_logerror_median"]].columns[fit.get_support()].tolist()
use_columns

['calculatedfinishedsquarefeet',
 'finishedsquarefeet6',
 'finishedsquarefeet12',
 'finishedsquarefeet15',
 'garagecarcnt',
 'yearbuilt',
 'taxamount',
 'structuretaxvaluedollarcnt',
 'landtaxvaluedollarcnt',
 'nn_logerror_median']

In [22]:
norm_train["nn_logerror_median"].corr(norm_train["logerror"])

0.079943765104766265

In [23]:
train = norm_train
test = norm_test

### <font color="blue">Linear Regression</font>

In [24]:
results = []
k = 5

##### Stochastic Gradient Descent Regressor

In [25]:
model = SGDRegressor(alpha=0.001, max_iter=1000)
scores_dict = scores(model, train[use_columns], train["logerror"], 
                            test[use_columns],  test["logerror"], k, scorer=scorer)
scores_dict

{'test': 0.06995937486929614,
 'train': 0.068096881097302761,
 'validation': 0.06742710158840956}

In [26]:
results.append({"model": "linear SGDRegressor", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "alpha0.001, max_iter=1000"})

In [27]:
model = SGDRegressor(alpha=0.0001, max_iter=2000)
scores_dict = scores(model, train[use_columns], train["logerror"], 
                            test[use_columns],  test["logerror"], k, scorer=scorer)
scores_dict

{'test': 0.070071746029917073,
 'train': 0.068098698480517819,
 'validation': 0.06754335292475995}

In [28]:
results.append({"model": "linear SGDRegressor", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "alpha0.0001, max_iter=2000"})

In [29]:
model = SGDRegressor(alpha=0.0001, max_iter=1000)
scores_dict = scores(model, train[use_columns], train["logerror"], 
                            test[use_columns],  test["logerror"], k, scorer=scorer)
scores_dict

{'test': 0.070179709583395505,
 'train': 0.068193906526026901,
 'validation': 0.067598544323541954}

In [30]:
results.append({"model": "linear SGDRegressor", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "alpha0.0001, max_iter=1000"})

##### Ridge

In [31]:
model = Ridge()
scores_dict = scores(model, norm_train[use_columns], norm_train["logerror"], 
                            norm_test[use_columns],  norm_test["logerror"], k, scorer=scorer)
scores_dict

{'test': 0.07032271549471962,
 'train': 0.06828175646913319,
 'validation': 0.067716473140865874}

In [32]:
results.append({"model": "linear Ridge", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"]})

In [33]:
model = Ridge(alpha=2.0)
scores_dict = scores(model, norm_train[use_columns], norm_train["logerror"], 
                            norm_test[use_columns],  norm_test["logerror"], k, scorer=scorer)
scores_dict

{'test': 0.070272109712326378,
 'train': 0.068250506803358424,
 'validation': 0.067671120854921482}

In [34]:
results.append({"model": "linear Ridge", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "alpha=2"})

In [35]:
model = Ridge(alpha=10.0)
scores_dict = scores(model, norm_train[use_columns], norm_train["logerror"], 
                            norm_test[use_columns],  norm_test["logerror"], k, scorer=scorer)
scores_dict

{'test': 0.07005649104426577,
 'train': 0.068093665664245351,
 'validation': 0.067458125976738059}

In [36]:
results.append({"model": "linear Ridge", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "alpha=10"})

In [37]:
model = Ridge(alpha=0.5)
scores_dict = scores(model, norm_train[use_columns], norm_train["logerror"], 
                            norm_test[use_columns],  norm_test["logerror"], k, scorer=scorer)
scores_dict

{'test': 0.070359309454329791,
 'train': 0.068303173572004619,
 'validation': 0.067747598760950017}

In [38]:
results.append({"model": "linear Ridge", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "alpha=0.5"})

In [39]:
model = Ridge(alpha=0.2)
scores_dict = scores(model, norm_train[use_columns], norm_train["logerror"], 
                            norm_test[use_columns],  norm_test["logerror"], k, scorer=scorer)
scores_dict

{'test': 0.070395271896179293,
 'train': 0.06832359356279083,
 'validation': 0.067778055603993581}

In [40]:
results.append({"model": "linear Ridge", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "alpha=0.2"})

### <font color="blue">Polynomial Regression</font>

In [41]:
from sklearn.preprocessing import PolynomialFeatures

def polynomial_features(df, degree=2):
    poly = PolynomialFeatures(degree=degree)
    polinomial_data = pd.DataFrame(poly.fit_transform(df))
    polinomial_data.columns = poly.get_feature_names()
    return polinomial_data

In [42]:
poly_test = polynomial_features(norm_test[use_columns + ["nn_logerror_median"]], degree=3)
poly_test.head()

poly_train = polynomial_features(norm_train[use_columns + ["nn_logerror_median"]], degree=3)
poly_train.head()

Unnamed: 0,1,x0,x1,x2,x3,x4,x5,x6,x7,x8,...,x8^3,x8^2 x9,x8^2 x10,x8 x9^2,x8 x9 x10,x8 x10^2,x9^3,x9^2 x10,x9 x10^2,x10^3
0,1.0,-0.004175,-0.001056,-0.002158,-0.001245,0.002592,0.243215,0.006541,0.000109,0.010794,...,1.257528e-06,-5.820364e-07,-5.820364e-07,2.693907e-07,2.693907e-07,2.693907e-07,-1.246853e-07,-1.246853e-07,-1.246853e-07,-1.246853e-07
1,1.0,0.017945,-0.001056,0.022978,-0.001245,0.002592,0.297061,-0.007099,0.000261,-0.00688,...,-3.25605e-07,6.819477e-07,6.819477e-07,-1.428272e-06,-1.428272e-06,-1.428272e-06,2.991375e-06,2.991375e-06,2.991375e-06,2.991375e-06
2,1.0,0.023575,-0.001056,0.029374,-0.001245,0.002592,0.050907,0.000806,-0.003837,-0.009226,...,-7.85428e-07,-6.799505e-08,-6.799505e-08,-5.886379e-09,-5.886379e-09,-5.886379e-09,-5.095879e-10,-5.095879e-10,-5.095879e-10,-5.095879e-10
3,1.0,0.024718,-0.001056,0.030674,-0.001245,0.002592,-0.002939,-0.003501,-0.001606,-0.00689,...,-3.270328e-07,5.379865e-08,5.379865e-08,-8.850166e-09,-8.850166e-09,-8.850166e-09,1.4559e-09,1.4559e-09,1.4559e-09,1.4559e-09
4,1.0,-0.026648,-0.001056,-0.027694,-0.001245,0.002592,-0.095247,-0.014509,-0.015324,-0.008719,...,-6.627145e-07,-2.947153e-07,-2.947153e-07,-1.310626e-07,-1.310626e-07,-1.310626e-07,-5.828474e-08,-5.828474e-08,-5.828474e-08,-5.828474e-08


In [55]:
skb = SelectKBest(k=10)
fit = skb.fit(poly_train, train["logerror"])

use_poly_columns = poly_train.columns[fit.get_support()].tolist()
poly_train = poly_train[use_poly_columns]
poly_train.head()

Unnamed: 0,x3 x4,x0 x1 x7,x0 x3 x4,x1^2 x7,x3^2 x4,x3 x4 x6,x3 x4 x8,x3 x6 x7,x3 x7 x8,x3 x8 x10
0,-3e-06,4.818817e-10,1.34703e-08,1.218663e-10,4.015822e-09,-2.110434e-08,-3.482372e-08,-8.899974e-10,-1.468562e-09,6.711961e-08
1,-3e-06,-4.954962e-09,-5.7897e-08,2.915445e-10,4.015822e-09,2.290209e-08,2.219557e-08,2.310539e-09,2.23926e-09,1.233841e-07
2,-3e-06,9.550861e-08,-7.605806e-08,-4.277774e-09,4.015822e-09,-2.599713e-09,2.976721e-08,3.848368e-09,-4.406454e-08,-9.173048e-09
3,-3e-06,4.192639e-08,-7.974703e-08,-1.790992e-09,4.015822e-09,1.129552e-08,2.222797e-08,-7.000565e-09,-1.377611e-08,9.719549e-09
4,-3e-06,-4.31165e-07,8.597267e-08,-1.708455e-08,4.015822e-09,4.680933e-08,2.812839e-08,-2.767384e-07,-1.66296e-07,-4.207579e-08


##### Stochastic Gradient Descent Regressor

In [45]:
model = SGDRegressor(max_iter=2000)
scores_dict = scores(model, poly_train, train["logerror"], 
                            poly_test[poly_train.columns.tolist()],  test["logerror"], k, scorer=scorer)
scores_dict

{'test': 0.069908209979089295,
 'train': 0.068110226143618308,
 'validation': 0.067508550880775828}

In [46]:
results.append({"model": "Polynomial SGD Regressor", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "degree=4"})

In [47]:
model = Ridge()
scores_dict = scores(model, poly_train, train["logerror"], 
                            poly_test[poly_train.columns.tolist()],  test["logerror"], k, scorer=scorer)
scores_dict

{'test': 0.070010821969964471,
 'train': 0.068203632015274629,
 'validation': 0.0675219928205298}

In [48]:
results.append({"model": "Polynomial Ridge", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "degree=4"})

In [49]:
model = Ridge(alpha=0.5)
scores_dict = scores(model, poly_train, train["logerror"], 
                            poly_test[poly_train.columns.tolist()],  test["logerror"], k, scorer=scorer)
scores_dict

{'test': 0.070010752525431666,
 'train': 0.068203545734068075,
 'validation': 0.067522017501242107}

In [50]:
results.append({"model": "Polynomial Ridge", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "degree=4, alpha=0.5"})

In [51]:
model = Ridge(alpha=10)
scores_dict = scores(model, poly_train, train["logerror"], 
                            poly_test[poly_train.columns.tolist()],  test["logerror"], k, scorer=scorer)
scores_dict

{'test': 0.070010885403783682,
 'train': 0.068203710757182567,
 'validation': 0.067521970451618799}

In [52]:
results.append({"model": "Polynomial Ridge", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "degree=4, alpha=10.0"})

In [53]:
results_df = pd.DataFrame(results)
results_df.sort_values("score_cv")

Unnamed: 0,model,score_cv,score_test,score_train,tags
0,linear SGDRegressor,0.067427,0.069959,0.068097,"alpha0.001, max_iter=1000"
5,linear Ridge,0.067458,0.070056,0.068094,alpha=10
8,Polynomial SGD Regressor,0.067509,0.069908,0.06811,degree=4
11,Polynomial Ridge,0.067522,0.070011,0.068204,"degree=4, alpha=10.0"
9,Polynomial Ridge,0.067522,0.070011,0.068204,degree=4
10,Polynomial Ridge,0.067522,0.070011,0.068204,"degree=4, alpha=0.5"
1,linear SGDRegressor,0.067543,0.070072,0.068099,"alpha0.0001, max_iter=2000"
2,linear SGDRegressor,0.067599,0.07018,0.068194,"alpha0.0001, max_iter=1000"
4,linear Ridge,0.067671,0.070272,0.068251,alpha=2
3,linear Ridge,0.067716,0.070323,0.068282,


In [54]:
results_df.to_csv("nn_results_df.csv", index=False)

##### Multi Layer Perceptron Regressor

In [56]:
mlp_results = []

In [57]:
for hidden_layer_sizes in [10,50,100,300,500,700]:
    model = MLPRegressor(hidden_layer_sizes=[int(len(train[use_columns].columns) * 1.5)] * hidden_layer_sizes)
    print("hidden_layer_sizes:", hidden_layer_sizes)
    scores_dict = scores(model, train[use_columns], train["logerror"], 
                            test[use_columns],  test["logerror"], k, scorer=scorer)
    mlp_results.append({"model": "linear MLP", "score_test": scores_dict["test"], "score_train": scores_dict["train"], "score_cv": scores_dict["validation"], "tags": "hidden_layer_sizes=" + str(hidden_layer_sizes)})
    print(pd.DataFrame(mlp_results))

hidden_layer_sizes: 10
        model  score_cv  score_test  score_train                   tags
0  linear MLP  0.069422    0.069914     0.067981  hidden_layer_sizes=10
hidden_layer_sizes: 50
        model  score_cv  score_test  score_train                   tags
0  linear MLP  0.069422    0.069914     0.067981  hidden_layer_sizes=10
1  linear MLP  0.068362    0.072330     0.070871  hidden_layer_sizes=50
hidden_layer_sizes: 100
        model  score_cv  score_test  score_train                    tags
0  linear MLP  0.069422    0.069914     0.067981   hidden_layer_sizes=10
1  linear MLP  0.068362    0.072330     0.070871   hidden_layer_sizes=50
2  linear MLP  0.069256    0.072646     0.070743  hidden_layer_sizes=100
hidden_layer_sizes: 300
        model  score_cv  score_test  score_train                    tags
0  linear MLP  0.069422    0.069914     0.067981   hidden_layer_sizes=10
1  linear MLP  0.068362    0.072330     0.070871   hidden_layer_sizes=50
2  linear MLP  0.069256    0.072646

In [58]:
for hidden_layer_sizes in [10,50,100,300,500,700]:
    model = MLPRegressor(hidden_layer_sizes=[int(len(poly_train.columns) * 1.5)] * hidden_layer_sizes)
    print("hidden_layer_sizes:", hidden_layer_sizes)
    scores_dict = scores(model, poly_train, train["logerror"], 
                            poly_test[poly_train.columns.tolist()],  test["logerror"], k, scorer=scorer)
    mlp_results.append({"model": "polynomial MLP", "score_test": scores_dict["test"], "score_train": scores_dict["train"], "score_cv": scores_dict["validation"], "tags": "hidden_layer_sizes=" + str(hidden_layer_sizes)})
    print(pd.DataFrame(mlp_results))

hidden_layer_sizes: 10
            model  score_cv  score_test  score_train                    tags
0      linear MLP  0.069422    0.069914     0.067981   hidden_layer_sizes=10
1      linear MLP  0.068362    0.072330     0.070871   hidden_layer_sizes=50
2      linear MLP  0.069256    0.072646     0.070743  hidden_layer_sizes=100
3      linear MLP  0.068844    0.069920     0.068120  hidden_layer_sizes=300
4      linear MLP  0.069729    0.070668     0.069105  hidden_layer_sizes=500
5      linear MLP  0.069410    0.069604     0.067878  hidden_layer_sizes=700
6  polynomial MLP  0.068510    0.070471     0.068630   hidden_layer_sizes=10
hidden_layer_sizes: 50
            model  score_cv  score_test  score_train                    tags
0      linear MLP  0.069422    0.069914     0.067981   hidden_layer_sizes=10
1      linear MLP  0.068362    0.072330     0.070871   hidden_layer_sizes=50
2      linear MLP  0.069256    0.072646     0.070743  hidden_layer_sizes=100
3      linear MLP  0.068844   

In [59]:
mlp_results_df = pd.DataFrame(mlp_results).sort_values("score_cv")

In [60]:
mlp_results_df["hidden_layer_sizes"] = mlp_results_df["tags"].apply(lambda value: int(value.split("=")[1]))
mlp_results_df = mlp_results_df.sort_values("hidden_layer_sizes")
mlp_results_df.to_csv("nn_mlp_results_df.csv", index=False)
mlp_results_df.sort_values("score_cv")

Unnamed: 0,model,score_cv,score_test,score_train,tags,hidden_layer_sizes
11,polynomial MLP,0.067126,0.071801,0.069914,hidden_layer_sizes=700,700
9,polynomial MLP,0.067525,0.071771,0.069885,hidden_layer_sizes=300,300
10,polynomial MLP,0.067771,0.071121,0.069252,hidden_layer_sizes=500,500
7,polynomial MLP,0.068061,0.070801,0.068944,hidden_layer_sizes=50,50
8,polynomial MLP,0.068166,0.070451,0.068611,hidden_layer_sizes=100,100
1,linear MLP,0.068362,0.07233,0.070871,hidden_layer_sizes=50,50
6,polynomial MLP,0.06851,0.070471,0.06863,hidden_layer_sizes=10,10
3,linear MLP,0.068844,0.06992,0.06812,hidden_layer_sizes=300,300
2,linear MLP,0.069256,0.072646,0.070743,hidden_layer_sizes=100,100
5,linear MLP,0.06941,0.069604,0.067878,hidden_layer_sizes=700,700


In [None]:
from bokeh.plotting import figure
from bokeh.io import output_notebook, show
from bokeh.layouts import gridplot, row
output_notebook()

In [None]:
poli_mlp = mlp_results_df[mlp_results_df["model"] == "polynomial MLP"]
linear_mlp = mlp_results_df[mlp_results_df["model"] == "linear MLP"]

In [None]:
def plot_mlp_scores(data, range_y, title):
    p = figure(width=400, height=400, tools=["save", "xpan", "xwheel_zoom", "reset"], 
               x_axis_label = "n_layers", y_axis_label = "score", title=title, y_range=range_y)
    
    p.line(x=data["hidden_layer_sizes"], y=data["score_train"], 
       line_width=2, color="green", legend="score_train")
    
    p.line(x=data["hidden_layer_sizes"], y=data["score_cv"], 
       line_width=2, color="red", legend="score_cv")
    
        p.line(x=data["hidden_layer_sizes"], y=data["score_test"], 
       line_width=2, color="red", legend="score_test")
    
    
    
    return p

In [None]:
range_y = (0.066, 0.0754)

p1 = plot_mlp_scores(linear_mlp, range_y, title="Linear MLP")
p2 = plot_mlp_scores(poli_mlp, range_y, title="Polynomial MLP")
grid = gridplot([[p1, p2]])
show(grid)

In [62]:
train[use_columns].head()

Unnamed: 0,calculatedfinishedsquarefeet,finishedsquarefeet6,finishedsquarefeet12,finishedsquarefeet15,garagecarcnt,yearbuilt,taxamount,structuretaxvaluedollarcnt,landtaxvaluedollarcnt,nn_logerror_median
0,-0.004175,-0.001056,-0.002158,-0.001245,0.002592,0.243215,0.006541,0.000109,0.010794,-0.004996
1,0.017945,-0.001056,0.022978,-0.001245,0.002592,0.297061,-0.007099,0.000261,-0.00688,0.014409
2,0.023575,-0.001056,0.029374,-0.001245,0.002592,0.050907,0.000806,-0.003837,-0.009226,-0.000799
3,0.024718,-0.001056,0.030674,-0.001245,0.002592,-0.002939,-0.003501,-0.001606,-0.00689,0.001133
4,-0.026648,-0.001056,-0.027694,-0.001245,0.002592,-0.095247,-0.014509,-0.015324,-0.008719,-0.003877
