# Zilow Zestimate

In [68]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from random import shuffle
from gc import collect

from bokeh.charts import show
from bokeh.plotting import figure
from bokeh.layouts import gridplot
from bokeh.io import output_notebook
output_notebook()

import warnings

warnings.filterwarnings("ignore")

# The Data

In [69]:
def normalize(column):
    norm_column = (column - column.min()) / (column.max() - column.min())
    return norm_column
    
def normalize_columns(data, columns):
    df = data.copy()
    for column in columns:
        df[column] = normalize(df[column])
    return df

In [70]:
# data = pd.read_csv("../data/train_complete_2016.csv")
# data.shape

train = pd.read_csv("../data/train.csv", index_col=0)
test = pd.read_csv("../data/test.csv", index_col=0)

# test_target = test["logerror"]
# train_target = train["logerror"]

# del test["logerror"]
# del train["logerror"]

In [71]:
train.head()

Unnamed: 0,parcelid,logerror,transactiondate,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
66450,12149460,0.0363,2016-08-12,1.0,,,3.0,4.0,,4.0,...,,,180637.0,722923.0,2015.0,542286.0,8080.91,,,60373000000000.0
16762,11962551,0.0218,2016-03-16,1.0,,,4.0,3.0,,4.0,...,,,182151.0,291440.0,2015.0,109289.0,3690.38,,,60371860000000.0
63556,10734901,0.01,2016-08-04,1.0,,,3.0,3.0,,4.0,...,,,141380.0,193171.0,2015.0,51791.0,6234.7,,,60371370000000.0
36025,11104067,0.006,2016-05-17,,,,3.0,4.0,,7.0,...,,,163569.0,272612.0,2015.0,109043.0,4848.37,,,60379200000000.0
30783,12678699,-0.0111,2016-04-29,,,,2.0,3.0,,7.0,...,,,27108.0,91344.0,2015.0,64236.0,1305.15,,,60376510000000.0


In [72]:
test.head()

Unnamed: 0,parcelid,logerror,transactiondate,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
1,14366692,-0.1684,2016-01-01,,,,3.5,4.0,,,...,,,346458.0,585529.0,2015.0,239071.0,10153.02,,,
3,12643413,0.0218,2016-01-02,1.0,,,2.0,2.0,,4.0,...,,,171518.0,244880.0,2015.0,73362.0,3048.74,,,60372960000000.0
4,14432541,-0.005,2016-01-02,,,,2.5,4.0,,,...,2.0,,169574.0,434551.0,2015.0,264977.0,5488.96,,,60590420000000.0
5,11509835,-0.2705,2016-01-02,1.0,,,4.0,4.0,,1.0,...,,,880650.0,2447951.0,2015.0,1567301.0,27126.57,,,60376210000000.0
11,11672170,-0.0161,2016-01-03,1.0,,,4.0,5.0,,1.0,...,,,559040.0,1090127.0,2015.0,531087.0,13428.94,,,60372630000000.0


In [73]:
numeric_columns = ['basementsqft', 'bathroomcnt', 'bedroomcnt', 
                   'threequarterbathnbr', 'finishedfloor1squarefeet', 
                   'calculatedfinishedsquarefeet', 
                   'finishedsquarefeet6', 'finishedsquarefeet12', 'finishedsquarefeet13', 
                   'finishedsquarefeet15', 'finishedsquarefeet50', 'fireplacecnt', 
                   'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'latitude', 
                   'longitude', 'lotsizesquarefeet', 'poolsizesum', 'roomcnt', 'unitcnt', 
                   'yardbuildingsqft17', 'yardbuildingsqft26', 'yearbuilt', 'taxamount',
                   'structuretaxvaluedollarcnt', 'landtaxvaluedollarcnt']

### Attribute correlation
Plotting attribute correlation with the target column.

In [74]:
def plot_correlation(data, column_name, alpha=0.2, sample_ratio=1):
    p = figure(plot_width=300, plot_height=300, title=column_name + " vs " + "logerror", tools=["xwheel_zoom", "xpan"])
    n_sample = int(sample_ratio * len(data))
    data_plot = data.sample(n_sample)
    p.circle(data_plot[column_name], data_plot["logerror"], alpha=alpha)
    return p

def correlations_df(data, column_name):
    corr = data[column_name].corr(data["logerror"])
    sign = '+' if corr >= 0 else '-'
    corr = corr if corr >= 0 else corr * -1
    
    return corr_df.append(pd.DataFrame([{"attribute": column_name, "sign": sign, "corr": corr}]))
    

In [75]:
corr_df = pd.DataFrame()
grid = [[]]

for column_name in numeric_columns: 
    corr_df = correlations_df(train, column_name)
    
    p = plot_correlation(train, column_name, alpha=0.1, sample_ratio=0.25)
    
    if len(grid[-1]) % 3 == 0:
        grid.append([p])
    else:
        grid[-1].append(p)
        
# show(gridplot(grid))

In [108]:
corr_df.sort_values("corr", ascending=False)

Unnamed: 0,attribute,corr,sign
0,basementsqft,0.226881,+
0,finishedsquarefeet13,0.054005,-
0,yardbuildingsqft26,0.050029,+
0,finishedsquarefeet6,0.046738,+
0,poolsizesum,0.04186,+
0,finishedsquarefeet12,0.040015,+
0,calculatedfinishedsquarefeet,0.036703,+
0,finishedfloor1squarefeet,0.030734,+
0,finishedsquarefeet50,0.029448,+
0,fullbathcnt,0.026546,+


### Missing values
Replacing missing values with column <b>mode</b>

In [76]:
def column_mode(column):
    return column.value_counts().sort_values(ascending=False).index.tolist()[0]

In [77]:
data = train.append(test)

In [78]:
sample5 = train.sample(5)
sample5[numeric_columns]

Unnamed: 0,basementsqft,bathroomcnt,bedroomcnt,threequarterbathnbr,finishedfloor1squarefeet,calculatedfinishedsquarefeet,finishedsquarefeet6,finishedsquarefeet12,finishedsquarefeet13,finishedsquarefeet15,...,lotsizesquarefeet,poolsizesum,roomcnt,unitcnt,yardbuildingsqft17,yardbuildingsqft26,yearbuilt,taxamount,structuretaxvaluedollarcnt,landtaxvaluedollarcnt
17547,,2.0,3.0,,,1131.0,,1131.0,,,...,4961.0,,0.0,1.0,,,1950.0,4992.75,140043.0,242449.0
80034,,2.0,2.0,,,1089.0,,1089.0,,,...,116052.0,,0.0,1.0,,,1990.0,3138.58,126932.0,111065.0
47040,,2.5,5.0,1.0,1130.0,2417.0,,2417.0,,,...,6226.0,,9.0,,,,1999.0,7122.82,219000.0,407000.0
82414,,1.0,1.0,,,678.0,,678.0,,,...,,,0.0,,,,1990.0,2987.84,85824.0,156905.0
66603,,4.5,4.0,1.0,4425.0,4425.0,,4425.0,,,...,23528.0,,9.0,,,,2014.0,16543.6,535106.0,994000.0


In [79]:
for column_name in data.columns.tolist():
    mode = column_mode(data[column_name])
    data[column_name] = data[column_name].fillna(mode)

In [80]:
sample5 = data.loc[sample5.index.tolist()]
sample5[numeric_columns]

Unnamed: 0,basementsqft,bathroomcnt,bedroomcnt,threequarterbathnbr,finishedfloor1squarefeet,calculatedfinishedsquarefeet,finishedsquarefeet6,finishedsquarefeet12,finishedsquarefeet13,finishedsquarefeet15,...,lotsizesquarefeet,poolsizesum,roomcnt,unitcnt,yardbuildingsqft17,yardbuildingsqft26,yearbuilt,taxamount,structuretaxvaluedollarcnt,landtaxvaluedollarcnt
17547,1528.0,2.0,3.0,1.0,1260.0,1131.0,720.0,1131.0,1440.0,1680.0,...,4961.0,450.0,0.0,1.0,240.0,120.0,1950.0,4992.75,140043.0,242449.0
80034,1528.0,2.0,2.0,1.0,1260.0,1089.0,720.0,1089.0,1440.0,1680.0,...,116052.0,450.0,0.0,1.0,240.0,120.0,1990.0,3138.58,126932.0,111065.0
47040,1528.0,2.5,5.0,1.0,1130.0,2417.0,720.0,2417.0,1440.0,1680.0,...,6226.0,450.0,9.0,1.0,240.0,120.0,1999.0,7122.82,219000.0,407000.0
82414,1528.0,1.0,1.0,1.0,1260.0,678.0,720.0,678.0,1440.0,1680.0,...,6000.0,450.0,0.0,1.0,240.0,120.0,1990.0,2987.84,85824.0,156905.0
66603,1528.0,4.5,4.0,1.0,4425.0,4425.0,720.0,4425.0,1440.0,1680.0,...,23528.0,450.0,9.0,1.0,240.0,120.0,2014.0,16543.6,535106.0,994000.0


In [81]:
for column in data.columns.tolist():
    #columns must not contains nan
    assert (False == (True in data[column].isnull().tolist()))

##### Normalizing the data

In [82]:
norm_data = normalize_columns(data, numeric_columns)

sample5 = norm_data.loc[sample5.index.tolist()]
sample5[numeric_columns]

Unnamed: 0,basementsqft,bathroomcnt,bedroomcnt,threequarterbathnbr,finishedfloor1squarefeet,calculatedfinishedsquarefeet,finishedsquarefeet6,finishedsquarefeet12,finishedsquarefeet13,finishedsquarefeet15,...,lotsizesquarefeet,poolsizesum,roomcnt,unitcnt,yardbuildingsqft17,yardbuildingsqft26,yearbuilt,taxamount,structuretaxvaluedollarcnt,landtaxvaluedollarcnt
17547,0.981443,0.1,0.1875,0.0,0.160401,0.04965,0.066456,0.056419,0.727273,0.050494,...,0.000688,0.245064,0.0,0.0,0.08104,0.075668,0.5,0.015358,0.014067,0.009895
80034,0.981443,0.1,0.125,0.0,0.160401,0.047803,0.066456,0.05432,0.727273,0.050494,...,0.016624,0.245064,0.0,0.0,0.08104,0.075668,0.807692,0.009598,0.012749,0.004532
47040,0.981443,0.125,0.3125,0.0,0.143253,0.106205,0.066456,0.120684,0.727273,0.050494,...,0.000869,0.245064,0.5,0.0,0.08104,0.075668,0.876923,0.021976,0.022004,0.016611
82414,0.981443,0.05,0.0625,0.0,0.160401,0.029729,0.066456,0.033781,0.727273,0.050494,...,0.000837,0.245064,0.0,0.0,0.08104,0.075668,0.807692,0.00913,0.008617,0.006403
66603,0.981443,0.225,0.25,0.0,0.577892,0.194512,0.066456,0.221028,0.727273,0.050494,...,0.003351,0.245064,0.5,0.0,0.08104,0.075668,0.992308,0.051243,0.05378,0.040571


In [83]:
data = norm_data
train = data.iloc[train.index.tolist()]
test = data.iloc[test.index.tolist()]

data = None
norm_data = None
collect()

9091

# Modeling v1 - Raw Attributes

In [17]:
from sklearn.linear_model import SGDRegressor, Ridge
from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest
from numpy import median

In [18]:
def scorer(estimator, X, y):
    pred = estimator.predict(X)
    return mean_absolute_error(pred, y)

# def partition(data, train_proportion=0.7):
#     train = data.sample( int(len(data) * train_proportion) )

#     test_indexes = set(data.index.tolist()) - set(train.index.tolist())
#     test = data.loc[test_indexes]
    
#     return train, test

def scores(model, train, target, k, scorer=scorer):
    train_score = scorer(model.fit(train, target), train, target)

    cv_scores = cross_val_score(model, train, target, cv=k, scoring=scorer)
    
    return {"train": train_score, "validation": median(cv_scores)}

##### Use columns

In [20]:
skb = SelectKBest(k=10)
fit = skb.fit(train[numeric_columns], train["logerror"])

use_columns = train[numeric_columns].columns[fit.get_support()].tolist()
use_columns

['bathroomcnt',
 'threequarterbathnbr',
 'calculatedfinishedsquarefeet',
 'finishedsquarefeet12',
 'finishedsquarefeet15',
 'fullbathcnt',
 'yearbuilt',
 'taxamount',
 'structuretaxvaluedollarcnt',
 'landtaxvaluedollarcnt']

In [21]:
results = []
k = 5

##### Train and test partitioning

In [23]:
# train, test = partition(data[use_columns + ["logerror"]], train_proportion=0.7)

### <font color="blue">Linear Regression</font>

##### Stochastic Gradient Descent Regressor

In [24]:
#model = SGDRegressor(alpha=0.001, max_iter=1000)
# cv_scores = cross_val_score(model, train[use_columns], train["logerror"], cv=k, scoring=scorer)
# print("median:", median(cv_scores))
# cv_scores

model = SGDRegressor(alpha=0.001, max_iter=1000)
scores_dict = scores(model, train[use_columns], train["logerror"], k, scorer=scorer)
scores_dict

{'train': 0.069143339639002876, 'validation': 0.069195049576681697}

In [25]:
results.append({"model": "linear SGDRegressor", "score_train": scores_dict["train"], "score_cv": scores_dict["validation"], "tags": "alpha0.001, max_iter=1000"})

##### Ridge

In [26]:
# model = Ridge()
# cv_scores = cross_val_score(model, train[use_columns], train["logerror"], cv=k, scoring=scorer)
# print("median:", median(cv_scores))
# cv_scores

model = Ridge()
scores_dict = scores(model, train[use_columns], train["logerror"], k, scorer=scorer)
scores_dict

{'train': 0.069254386114012356, 'validation': 0.069241820182844749}

In [27]:
results.append({"model": "linear Ridge", "score_train": scores_dict["train"], "score_cv": scores_dict["validation"]})

In [28]:
# model = Ridge(alpha=2.0)
# cv_scores = cross_val_score(model, train[use_columns], train["logerror"], cv=k, scoring=scorer)
# print("median:", median(cv_scores))
# cv_scores

model = Ridge(alpha=2.0)
scores_dict = scores(model, train[use_columns], train["logerror"], k, scorer=scorer)
scores_dict

{'train': 0.069249229942448476, 'validation': 0.069222834904631922}

In [29]:
results.append({"model": "linear Ridge", "score_train": scores_dict["train"], "score_cv": scores_dict["validation"], "tags": "alpha=2"})

In [30]:
# model = Ridge(alpha=10.0)
# cv_scores = cross_val_score(model, train[use_columns], train["logerror"], cv=k, scoring=scorer)
# print("median:", median(cv_scores))
# cv_scores

model = Ridge(alpha=10.0)
scores_dict = scores(model, train[use_columns], train["logerror"], k, scorer=scorer)
scores_dict

{'train': 0.069227841108432733, 'validation': 0.069201428365949003}

In [31]:
results.append({"model": "linear Ridge", "score_train": scores_dict["train"], "score_cv": scores_dict["validation"], "tags": "alpha=10"})

In [32]:
model = Ridge(alpha=0.5)
scores_dict = scores(model, train[use_columns], train["logerror"], k, scorer=scorer)
scores_dict

{'train': 0.069261427274144513, 'validation': 0.06926358705731521}

In [33]:
results.append({"model": "linear Ridge", "score_train": scores_dict["train"], "score_cv": scores_dict["validation"], "tags": "alpha=0.5"})

In [34]:
model = Ridge(alpha=0.2)
scores_dict = scores(model, train[use_columns], train["logerror"], k, scorer=scorer)
scores_dict

{'train': 0.069273219167527003, 'validation': 0.069276719752139795}

In [35]:
results.append({"model": "linear Ridge", "score_train": scores_dict["train"], "score_cv": scores_dict["validation"], "tags": "alpha=0.2"})

### <font color="blue">Polynomial Regression</font>

In [36]:
from sklearn.preprocessing import PolynomialFeatures

def polynomial_features(df, degrees=2):
    poly = PolynomialFeatures(degree=2)
    polinomial_data = pd.DataFrame(poly.fit_transform(df))
    polinomial_data.columns = poly.get_feature_names()
    return polinomial_data

In [41]:
poly_test = polynomial_features(test[use_columns], degrees=5)
poly_test.head()

poly_train = polynomial_features(train[use_columns], degrees=5)
poly_train.head()

Unnamed: 0,1,x0,x1,x2,x3,x4,x5,x6,x7,x8,...,x6^2,x6 x7,x6 x8,x6 x9,x7^2,x7 x8,x7 x9,x8^2,x8 x9,x9^2
0,1.0,0.15,0.0,0.126171,0.143371,0.050494,0.105263,0.938462,0.046075,0.03849,...,0.88071,0.04324,0.036122,0.011912,0.002123,0.001773,0.000585,0.001481,0.000489,0.000161112
1,1.0,0.15,0.0,0.105633,0.120034,0.050494,0.105263,0.5,0.043089,0.023772,...,0.25,0.021545,0.011886,0.018269,0.001857,0.001024,0.001574,0.000565,0.000869,0.001334992
2,1.0,0.1,0.0,0.065394,0.074309,0.050494,0.052632,0.776923,0.010412,0.014623,...,0.603609,0.00809,0.011361,0.001268,0.000108,0.000152,1.7e-05,0.000214,2.4e-05,2.662896e-06
3,1.0,0.1,0.0,0.053696,0.061016,0.050494,0.052632,0.530769,0.002622,0.002669,...,0.281716,0.001392,0.001417,0.000499,7e-06,7e-06,2e-06,7e-06,3e-06,8.842156e-07
4,1.0,0.1,0.0,0.069572,0.079057,0.050494,0.052632,0.3,0.022623,0.02045,...,0.09,0.006787,0.006135,0.004729,0.000512,0.000463,0.000357,0.000418,0.000322,0.0002484861


##### SGD Regressor

In [42]:
# model = SGDRegressor(max_iter=2000)
# cv_scores = cross_val_score(model, poly_train, train["logerror"], cv=k, scoring=scorer)
# print("median:", median(cv_scores))
# cv_scores

model = SGDRegressor(max_iter=2000)
scores_dict = scores(model, poly_train, train["logerror"], k, scorer=scorer)
scores_dict

{'train': 0.069393688996941261, 'validation': 0.069134476015951962}

In [43]:
results.append({"model": "Polynomial SGD Regressor", "score_train": scores_dict["train"], "score_cv": scores_dict["validation"], "tags": "degree=5"})

##### Ridge

In [44]:
# model = Ridge()
# cv_scores = cross_val_score(model, poly_train, train["logerror"], cv=k, scoring=scorer)
# print("median:", median(cv_scores))
# cv_scores

model = Ridge()
scores_dict = scores(model, poly_train, train["logerror"], k, scorer=scorer)
scores_dict

{'train': 0.069267568773309848, 'validation': 0.069280391365674418}

In [45]:
results.append({"model": "Polynomial Ridge", "score_train": scores_dict["train"], "score_cv": scores_dict["validation"], "tags": "degree=5"})

In [46]:
model = Ridge(alpha=0.5)
scores_dict = scores(model, poly_train, train["logerror"], k, scorer=scorer)
scores_dict

{'train': 0.069267102171698708, 'validation': 0.069301282283425164}

In [47]:
results.append({"model": "Polynomial Ridge", "score_train": scores_dict["train"], "score_cv": scores_dict["validation"], "tags": "degree=5, alpha=0.5"})

In [48]:
model = Ridge(alpha=10)
scores_dict = scores(model, poly_train, train["logerror"], k, scorer=scorer)
scores_dict

{'train': 0.06924949854472974, 'validation': 0.06922250059498268}

In [49]:
results.append({"model": "Polynomial Ridge", "score_train": scores_dict["train"], "score_cv": scores_dict["validation"], "tags": "degree=5, alpha=10.0"})

In [50]:
results_df = pd.DataFrame(results)
results_df.sort_values("score_cv")

Unnamed: 0,model,score_cv,score_train,tags
6,Polynomial SGD Regressor,0.069134,0.069394,degree=5
0,linear SGDRegressor,0.069195,0.069143,"alpha0.001, max_iter=1000"
3,linear Ridge,0.069201,0.069228,alpha=10
9,Polynomial Ridge,0.069223,0.069249,"degree=5, alpha=10.0"
2,linear Ridge,0.069223,0.069249,alpha=2
1,linear Ridge,0.069242,0.069254,
4,linear Ridge,0.069264,0.069261,alpha=0.5
5,linear Ridge,0.069277,0.069273,alpha=0.2
7,Polynomial Ridge,0.06928,0.069268,degree=5
8,Polynomial Ridge,0.069301,0.069267,"degree=5, alpha=0.5"


In [51]:
# results_df.to_csv("results_df.csv", index=False)

In [52]:
mlp_results = []

##### Multi Layer Perceptron Regressor

In [53]:
for hidden_layer_sizes in [10,50,100,150,300,450,600,1000,3000,5000,7000,9000]:
    model = MLPRegressor(hidden_layer_sizes=hidden_layer_sizes)
    print("hidden_layer_sizes:", hidden_layer_sizes)
    scores_dict = scores(model, train[use_columns], train["logerror"], k, scorer=scorer)
    mlp_results.append({"model": "linear MLP", "score_train": scores_dict["train"], "score_cv": scores_dict["validation"], "tags": "hidden_layer_sizes=" + str(hidden_layer_sizes)})
    print(pd.DataFrame(mlp_results))

hidden_layer_sizes: 10
        model  score_cv  score_train                   tags
0  linear MLP  0.069149     0.069428  hidden_layer_sizes=10
hidden_layer_sizes: 50
        model  score_cv  score_train                   tags
0  linear MLP  0.069149     0.069428  hidden_layer_sizes=10
1  linear MLP  0.069432     0.069342  hidden_layer_sizes=50
hidden_layer_sizes: 100
        model  score_cv  score_train                    tags
0  linear MLP  0.069149     0.069428   hidden_layer_sizes=10
1  linear MLP  0.069432     0.069342   hidden_layer_sizes=50
2  linear MLP  0.069399     0.069053  hidden_layer_sizes=100
hidden_layer_sizes: 150
        model  score_cv  score_train                    tags
0  linear MLP  0.069149     0.069428   hidden_layer_sizes=10
1  linear MLP  0.069432     0.069342   hidden_layer_sizes=50
2  linear MLP  0.069399     0.069053  hidden_layer_sizes=100
3  linear MLP  0.070091     0.070335  hidden_layer_sizes=150
hidden_layer_sizes: 300
        model  score_cv  score_tr

In [54]:
for hidden_layer_sizes in [10,50,100,150,300,450,600,1000,3000,5000,7000,9000]:
    model = MLPRegressor(hidden_layer_sizes=hidden_layer_sizes)
    print("hidden_layer_sizes:", hidden_layer_sizes)
    scores_dict = scores(model, poly_train, train["logerror"], k, scorer=scorer)
    mlp_results.append({"model": "polynomial MLP", "score_train": scores_dict["train"], "score_cv": scores_dict["validation"], "tags": "hidden_layer_sizes=" + str(hidden_layer_sizes)})
    print(pd.DataFrame(mlp_results))

hidden_layer_sizes: 10
             model  score_cv  score_train                     tags
0       linear MLP  0.069149     0.069428    hidden_layer_sizes=10
1       linear MLP  0.069432     0.069342    hidden_layer_sizes=50
2       linear MLP  0.069399     0.069053   hidden_layer_sizes=100
3       linear MLP  0.070091     0.070335   hidden_layer_sizes=150
4       linear MLP  0.069267     0.069040   hidden_layer_sizes=300
5       linear MLP  0.070196     0.069056   hidden_layer_sizes=450
6       linear MLP  0.069703     0.069069   hidden_layer_sizes=600
7       linear MLP  0.072752     0.069774  hidden_layer_sizes=1000
8       linear MLP  0.069816     0.069478  hidden_layer_sizes=3000
9       linear MLP  0.069677     0.070703  hidden_layer_sizes=5000
10      linear MLP  0.069322     0.068949  hidden_layer_sizes=7000
11      linear MLP  0.069279     0.070523  hidden_layer_sizes=9000
12  polynomial MLP  0.070166     0.071925    hidden_layer_sizes=10
hidden_layer_sizes: 50
             mod

             model  score_cv  score_train                     tags
0       linear MLP  0.069149     0.069428    hidden_layer_sizes=10
1       linear MLP  0.069432     0.069342    hidden_layer_sizes=50
2       linear MLP  0.069399     0.069053   hidden_layer_sizes=100
3       linear MLP  0.070091     0.070335   hidden_layer_sizes=150
4       linear MLP  0.069267     0.069040   hidden_layer_sizes=300
5       linear MLP  0.070196     0.069056   hidden_layer_sizes=450
6       linear MLP  0.069703     0.069069   hidden_layer_sizes=600
7       linear MLP  0.072752     0.069774  hidden_layer_sizes=1000
8       linear MLP  0.069816     0.069478  hidden_layer_sizes=3000
9       linear MLP  0.069677     0.070703  hidden_layer_sizes=5000
10      linear MLP  0.069322     0.068949  hidden_layer_sizes=7000
11      linear MLP  0.069279     0.070523  hidden_layer_sizes=9000
12  polynomial MLP  0.070166     0.071925    hidden_layer_sizes=10
13  polynomial MLP  0.069862     0.069176    hidden_layer_size

In [55]:
# mlp_results_df = pd.DataFrame(mlp_results).sort_values("score_cv")

In [58]:
mlp_results_df["hidden_layer_sizes"] = mlp_results_df["tags"].apply(lambda value: int(value.split("=")[1]))
mlp_results_df = mlp_results_df.sort_values("hidden_layer_sizes")
# mlp_results_df.to_csv("mlp_results_df.csv", index=False)
mlp_results_df

Unnamed: 0,model,score_cv,score_train,tags,hidden_layer_sizes
0,linear MLP,0.069149,0.069428,hidden_layer_sizes=10,10
12,polynomial MLP,0.070166,0.071925,hidden_layer_sizes=10,10
1,linear MLP,0.069432,0.069342,hidden_layer_sizes=50,50
13,polynomial MLP,0.069862,0.069176,hidden_layer_sizes=50,50
14,polynomial MLP,0.069386,0.072895,hidden_layer_sizes=100,100
2,linear MLP,0.069399,0.069053,hidden_layer_sizes=100,100
3,linear MLP,0.070091,0.070335,hidden_layer_sizes=150,150
15,polynomial MLP,0.069532,0.069174,hidden_layer_sizes=150,150
4,linear MLP,0.069267,0.06904,hidden_layer_sizes=300,300
16,polynomial MLP,0.069929,0.06909,hidden_layer_sizes=300,300


In [59]:
from bokeh.plotting import figure
from bokeh.io import output_notebook, show
from bokeh.layouts import gridplot, row
output_notebook()

In [60]:
poli_mlp = mlp_results_df[mlp_results_df["model"] == "polynomial MLP"]
linear_mlp = mlp_results_df[mlp_results_df["model"] == "linear MLP"]

In [62]:
p = figure(width=600, height=400, tools=["save", "xpan", "xwheel_zoom", "reset"], x_axis_label = "n_layers", y_axis_label = "score cv")

In [63]:
p = figure(width=600, height=400, tools=["save", "xpan", "xwheel_zoom", "reset"], x_axis_label = "n_layers", y_axis_label = "cv_score")

p.line(x=poli_mlp["hidden_layer_sizes"], y=poli_mlp["score_cv"], 
       line_width=2, color="green", legend="polynomial")

p.line(x=linear_mlp["hidden_layer_sizes"], y=linear_mlp["score_cv"], 
       line_width=2, color="red", legend="linear")

show(p)

In [85]:
geoloc_train.head()

Unnamed: 0,parcelid,logerror,latitude,longitude
0,12149460,0.0363,34225911.0,-118238480.0
1,11962551,0.0218,34109753.0,-118227743.0
2,10734901,0.01,34172194.0,-118644314.0
3,11104067,0.006,34429140.0,-118466425.0
4,12678699,-0.0111,33835813.0,-118358044.0


# Modeling v2 - The New Features 

In [None]:
from sklearn.neighbors import NearestNeighbors

In [86]:
geoloc_train = pd.read_csv("../data/train.csv")[["parcelid", "logerror" ,"latitude", "longitude"]]
geoloc_train = geoloc_train[geoloc_train["latitude"].notnull()]
geoloc_train.head()

Unnamed: 0,parcelid,logerror,latitude,longitude
0,12149460,0.0363,34225911.0,-118238480.0
1,11962551,0.0218,34109753.0,-118227743.0
2,10734901,0.01,34172194.0,-118644314.0
3,11104067,0.006,34429140.0,-118466425.0
4,12678699,-0.0111,33835813.0,-118358044.0


In [89]:
geoloc_test = pd.read_csv("../data/test.csv")[["parcelid", "logerror" ,"latitude", "longitude"]]

for column_name in ["latitude", "longitude"]:
    mode = column_mode(geoloc_test[column_name])
    geoloc_test[column_name] = geoloc_test[column_name].fillna(mode)
geoloc_test.head()

(27244, 4)

In [90]:
nbrs = NearestNeighbors(n_neighbors=4).fit(geoloc_train[["latitude", "longitude"]])

distances_train, indices_train = nbrs.kneighbors(geoloc_train[["latitude", "longitude"]])
distances_test, indices_test = nbrs.kneighbors(geoloc_test[["latitude", "longitude"]])

### New feature: 3nn_logerror_median

The new feature <b>3nn_logerror_median</b> represents the logerror median of the three known closests neighbors.

In [102]:
geoloc_nn_train = pd.DataFrame(indices_train)
geoloc_nn_test = pd.DataFrame(indices_test)
print(geoloc_nn_train.head())
print(geoloc_nn_test.head())

   0      1      2      3
0  0  49784  48917  33909
1  1   5722  45897  48565
2  2  59270  46722  59939
3  3  42881  21948   4544
4  4  59642   5195  40087
       0      1      2      3
0  33827    930   4178  47902
1  49528  24109  19081  34443
2   6283  50993  58290  34681
3  21109  23291   2776  50785
4  62319  31040  53464  34332


In [92]:
def map_index_logerror(index, data):
    return data.iloc[index]["logerror"]

def map_index_parcelid(index, data):
    return int(data.iloc[index]["parcelid"])

In [96]:
nn_logerror_train = pd.DataFrame()
for column in geoloc_nn_train.columns.tolist()[1:]:
    print(column)
    nn_logerror_train[str(column)] = geoloc_nn_train[column].apply(lambda value: map_index_logerror(value, geoloc_train))

nn_logerror_train.head()

1
2
3


Unnamed: 0,1,2,3
0,0.0354,-0.0661,-0.1301
1,0.1115,0.0677,0.0469
2,-0.0398,0.0545,-0.1143
3,0.0296,0.006,0.002
4,-0.0315,0.0129,0.1856


In [98]:
nn_logerror_test = pd.DataFrame()
for column in geoloc_nn_test.columns.tolist()[1:]:
    print(column)
    nn_logerror_test[str(column)] = geoloc_nn_test[column].apply(lambda value: map_index_logerror(value, geoloc_train))

nn_logerror_test.head()

1
2
3


Unnamed: 0,1,2,3
0,-0.0471,-0.1054,0.002
1,-0.005,0.2601,0.0227
2,0.0478,0.001,-0.0131
3,0.2343,0.9884,0.0469
4,-0.007,0.0149,0.0564


In [103]:
nn_logerror_median_train = nn_logerror_train.apply(lambda row: median([row["1"], row["2"], row["3"]]), axis=1)
geoloc_train["3nn_logerror_median"] = nn_logerror_median_train.tolist()
print(len(geoloc_train))
geoloc_train.head()

63196


Unnamed: 0,parcelid,logerror,latitude,longitude,3nn_logerror_median
0,12149460,0.0363,34225911.0,-118238480.0,-0.0661
1,11962551,0.0218,34109753.0,-118227743.0,0.0677
2,10734901,0.01,34172194.0,-118644314.0,-0.0398
3,11104067,0.006,34429140.0,-118466425.0,0.006
4,12678699,-0.0111,33835813.0,-118358044.0,0.0129


In [104]:
nn_logerror_median_test = nn_logerror_test.apply(lambda row: median([row["1"], row["2"], row["3"]]), axis=1)
geoloc_test["3nn_logerror_median"] = nn_logerror_median_test.tolist()
print(len(geoloc_test))
geoloc_test.head()

27244


Unnamed: 0,parcelid,logerror,latitude,longitude,3nn_logerror_median
0,14366692,-0.1684,33668120.0,-117677556.0,-0.0471
1,12643413,0.0218,33755800.0,-118309000.0,0.0227
2,14432541,-0.005,33485643.0,-117700234.0,0.001
3,11509835,-0.2705,33870089.0,-118402768.0,0.2343
4,11672170,-0.0161,34072220.0,-118547322.0,0.0149


In [110]:
geoloc_train["3nn_logerror_median"].corr(geoloc_train["logerror"])

0.054552937942001105

In [113]:
train["3nn_logerror_median"] = geoloc_train["3nn_logerror_median"]
train.head()

Unnamed: 0,parcelid,logerror,transactiondate,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,...,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,3nn_logerror_median
9876,13025861,0.0296,2016-02-17,1.0,7.0,0.981443,0.15,0.1875,4.0,4.0,...,True,0.03849,694000.0,2015.0,0.012693,0.046075,Y,14.0,60374010000000.0,0.0602
52158,11647851,0.1151,2016-07-01,1.0,7.0,0.981443,0.15,0.1875,4.0,4.0,...,True,0.023772,1131772.0,2015.0,0.036538,0.043089,Y,14.0,60372650000000.0,-0.0274
46312,11198752,-0.0111,2016-06-15,1.0,7.0,0.981443,0.1,0.1875,4.0,7.0,...,True,0.014623,185572.0,2015.0,0.001632,0.010412,Y,14.0,60379110000000.0,0.0411
79157,13915698,-0.0429,2016-09-23,1.0,7.0,0.981443,0.1,0.1875,4.0,7.0,...,True,0.002669,49715.0,2015.0,0.00094,0.002622,Y,14.0,60590110000000.0,
24214,11882403,-0.0834,2016-04-11,1.0,7.0,0.981443,0.1,0.1875,4.0,7.0,...,True,0.02045,589759.0,2015.0,0.015763,0.022623,Y,14.0,60374810000000.0,0.044


In [None]:
skb = SelectKBest(k=10)
fit = skb.fit(train[numeric_columns], train["logerror"])

use_columns = train[numeric_columns].columns[fit.get_support()].tolist()
use_columns