In [51]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import gc
import warnings

from bokeh.charts import Histogram, Bar, BoxPlot, Scatter, show
from bokeh.plotting import figure
from bokeh.io import output_notebook
from bokeh.layouts import gridplot

output_notebook()
pd.set_option('display.float_format', lambda x: '%.3f' % x)
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

In [4]:
train_2016_short = pd.read_csv("../data/train_2016.csv", parse_dates=["transactiondate"])
prop_2016 = pd.read_csv("../data/properties_2016.csv", low_memory=False)

In [7]:
train = train_2016_short.merge(prop_2016, on="parcelid", how="left")
train = train.set_index(train["parcelid"])
train = train[["latitude", "longitude", "logerror"]]
train.head()

Unnamed: 0_level_0,latitude,longitude,logerror
parcelid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11016594,34280990.0,-118488536.0,0.0276
14366692,33668120.0,-117677556.0,-0.1684
12098116,34136312.0,-118175032.0,-0.004
12643413,33755800.0,-118309000.0,0.0218
14432541,33485643.0,-117700234.0,-0.005


In [220]:
def fit_knn(df, cols, k):
    df = df.dropna(subset=cols)
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='brute').fit(df[cols])
    return nbrs

def k_neighbors_logerror_mean(df_train, df_test, cols, new_column, nbrs):
    logerror_means = []

    df_train = df_train.dropna(subset=cols).copy()
    df_test = df_test.dropna(subset=cols).copy()

    for index, row in df_test.iterrows():
        kn_distances, kn_indexes = nbrs.kneighbors(row[cols])
        kn_indexes = list(kn_indexes[0])

        neighbors = df_train.iloc[kn_indexes]
        neighbors = neighbors[neighbors.index != index]

        neighbors_logerror_mean = neighbors["logerror"].mean()

        logerror_means.append(neighbors_logerror_mean)
        
    df_test[new_column] = logerror_means
        
    return df_test[[new_column]]

In [132]:
cols = ["latitude", "longitude"]
nbrs = fit_knn(train, cols, 2)

nearest_logerror = k_neighbors_logerror_mean(train, train, cols, "nearest_logerror", nbrs)

use_train = train.join(nearest_logerror, how="inner")


### Correlations between <font color='blue'>logerror</font>  and <font color='blue'>logerror of the nearest neighbor</font>

abs means that it was considered the absolute values of both columns logerror and nearest_logerror

In [167]:
print("pearson     :", use_train["logerror"].corr(use_train["nearest_logerror"], method="pearson"))
print("spearman    :", use_train["logerror"].corr(use_train["nearest_logerror"], method="spearman"))
print("abs pearson :", abs(use_train["logerror"]).corr(abs(use_train["nearest_logerror"]), method="pearson"))
print("abs spearman:", abs(use_train["logerror"]).corr(abs(use_train["nearest_logerror"]), method="spearman"))
use_train.head()

pearson     : 0.0483033176882
spearman    : 0.0678755405492
abs pearson : 0.0717932717364
abs spearman: 0.105052610434


Unnamed: 0_level_0,latitude,longitude,logerror,error,nearest_logerror
parcelid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10711738,34220381.0,-118620802.0,0.028,1.0,-0.026
10711755,34222040.0,-118622240.0,-0.018,-1.0,0.028
10711805,34220427.0,-118618549.0,-0.101,-1.0,-0.012
10711816,34222390.0,-118618631.0,-0.012,-1.0,-0.048
10711858,34222544.0,-118617961.0,-0.048,-1.0,-0.012


### Scatter plot of <font color='blue'>x = logerror</font> and <font color='blue'>y = nearest_logerror</font>

In blue with signal, in green without (abs)

In [157]:
p = figure(plot_width=450, plot_height=450, title="")
p.circle(use_train['logerror'], use_train['nearest_logerror'], size=3, alpha=0.2)

p_abs = figure(plot_width=450, plot_height=450, title="")
p_abs.circle(abs(use_train['logerror']), abs(use_train['nearest_logerror']), color="green", size=3, alpha=0.2)

show(gridplot([[p, p_abs]]))

In [144]:
def k_neighbors_ll(df_train, df_test, cols, new_column, nbrs):
    neighbors_lat = []
    neighbors_lon = []
    df_train = df_train.dropna(subset=cols).copy()
    df_test = df_test.dropna(subset=cols).copy()

    for index, row in df_test.iterrows():

        kn_distances, kn_indexes = nbrs.kneighbors(row[cols])
        kn_indexes = list(kn_indexes[0])

        neighbors = df_train.iloc[kn_indexes]
        neighbors = neighbors[neighbors.index != index]
    
        neighbor_lat = neighbors["latitude"].item()
        neighbor_lon = neighbors["longitude"].item()
        
        neighbors_lat.append(neighbor_lat)
        neighbors_lon.append(neighbor_lon)
        
    df_test["n_lat"] = neighbors_lat
    df_test["n_lon"] = neighbors_lon
        
    return df_test[["n_lat", "n_lon"]]

In [145]:
cols = ["latitude", "longitude"]
nbrs = fit_knn(train.head(50), cols, 2)

nearest_logerror = k_neighbors_ll(train.head(50), train.head(50), cols, "nearest_logerror", nbrs)

train_ll = train.join(nearest_logerror, how="inner")

### We can check that the algorithm is working good by checking a line between a point and its closest neighbor

In [181]:
p = figure(plot_width=450, plot_height=350, title="Closest Neighbors")

for index, row in train_ll.head(50).iterrows():  
    p.circle(row['longitude'], row['latitude'], size=5, alpha=1)
    line = pd.DataFrame({"x": [row['longitude'], row["n_lon"]], "y": [row['latitude'], row["n_lat"]] })
    p.line(line["x"], line["y"], line_width=1, color="magenta")
    
show(p)


### Checking the logerror signal of the neighbors

In [262]:
train["error_signal"] = train["logerror"] / abs(train["logerror"])
train["error_signal"] = train["error_signal"].fillna(value=0)

cols = ["longitude", "longitude"]

k_geo = []
for k in list(range(2,10)) + list(range(11,50,4)) + list(range(51,500,20)):
    print("k:", k)
    
    nbrs = fit_knn(train, cols, k)

    err_signal = k_neighbors_logerror_mean(train, train, cols, "n_error_signal", nbrs)
    train_signal = train.join(err_signal, how="inner")
    train_signal["n_error_signal"] = train_signal["n_error_signal"] / abs(train_signal["n_error_signal"])
    train_signal["error_signal"] = train_signal["error_signal"].fillna(value=0)
    
    freq_correct_signal = (train_signal["error_signal"] == train_signal["n_error_signal"]).value_counts()
    ac = freq_correct_signal[True] / len(train_signal)

    k_geo.append({"k": k, "accuracy": ac})
    print(ac)
    print()
    
k_geo_df = pd.DataFrame(k_geo)
k_geo_df = k_geo_df.sort_values(by="accuracy", ascending=False)

k: 2
0.504103748053

k: 3
0.510300794238

k: 4
0.512300197731

k: 5
0.515249594044

k: 6
0.515580986888

k: 7
0.516564118992

k: 8
0.516807140411

k: 9
0.520220486706

k: 11
0.520629204547

k: 15
0.522264075911

k: 19
0.524970450805

k: 23
0.525854165056

k: 27
0.525257657936

k: 31
0.525787886487

k: 35
0.527146597148

k: 39
0.527069272151

k: 43
0.528240193533

k: 47
0.528306472102

k: 51
0.527234968573

k: 71
0.533387829045

k: 91
0.531841329106

k: 111
0.534625028997

k: 131
0.534359914722

k: 151
0.536083157511

k: 171
0.535398278966

k: 191
0.535862228948

k: 211
0.536425596783

k: 231
0.536635478918

k: 251
0.538524418129

k: 271
0.539938360931

k: 291
0.540943585892

k: 311
0.541705789433

k: 331
0.541716835861

k: 351
0.542147646558

k: 371
0.542324389409

k: 391
0.542666828681

k: 411
0.5432633358

k: 431
0.544246467905

k: 451
0.543926121489

k: 471
0.543970307201

k: 491
0.544301700045



In [264]:
freq_correct_signal = (train_signal["error_signal"] == train_signal["n_error_signal"]).value_counts()
print(freq_correct_signal[True] / len(train_signal))
freq_correct_signal

0.544301700045


True     49274
False    41253
dtype: int64

In [266]:
train_signal["error_signal"].value_counts()

1.000     49870
-1.000    39806
0.000       851
Name: error_signal, dtype: int64

PLOTAR GRAFICO DE X = LOGERROR e Y = MÉDIA ERROS DE SINAL

In [233]:
error_0 = train[train["logerror"] == 0]
print(len(error_0))

848


In [229]:
train_signal[train_signal["error_signal"] != train_signal["n_error_signal"]].head()


Unnamed: 0_level_0,latitude,longitude,logerror,error_signal,n_error_signal
parcelid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [197]:
print(len(train["logerror"]), len(train.dropna(subset=["error_signal"])))
print(len(train_signal["logerror"]))
print(len(err_signal))

#(train["logerror"] == train_signal["logerror"]).value_counts()

90811 89963
90219
89963


In [213]:
del train_signal["n_error_signal"]