In [1]:
import pandas as pd
import gc

from bokeh.charts import Histogram, Bar, BoxPlot, Scatter, show
from bokeh.io import output_notebook
from bokeh.layouts import gridplot

output_notebook()

In [2]:
DATA_DIR = "../data/"
NEW_FEATURES_DIR = "../feat-engineering/new features/"

In [3]:
train_2016_short = pd.read_csv(DATA_DIR + "train_2016.csv", parse_dates=["transactiondate"])
prop_2016 = pd.read_csv(DATA_DIR + "properties_2016.csv", low_memory=False)

In [4]:
new_feat_filenames = ["knn-longitude-latitude.csv",
                      "knn-latitude-longitude.csv",
                      "knn-latitude-longitude-2.csv",
                      "knn-latitude-finishedsquarefeet12.csv", 
                      "knn-taxvaluedollarcnt-calculatedfinishedsquarefeet.csv",
                      "knn-taxvaluedollarcnt-finishedsquarefeet12.csv"]

In [5]:
def load_new_features(train_df):
    df = train_df.copy()
    for filename in new_feat_filenames:
        feat_df = pd.read_csv(NEW_FEATURES_DIR + filename)
        print(filename, ">>>",feat_df.columns.tolist())
        df = df.merge(feat_df, on="parcelid", how="left")
        gc.collect()
    return df

In [6]:
features = load_new_features(train_2016_short)
print(len(features))
print(len(prop_2016))
features.head(30)

knn-longitude-latitude.csv >>> ['parcelid', 'longitude--latitude']
knn-latitude-longitude.csv >>> ['parcelid', 'nbrs_logerror_mean', 'basementsqft']
knn-latitude-longitude-2.csv >>> ['parcelid', 'nbrs_logerror_mean']
knn-latitude-finishedsquarefeet12.csv >>> ['parcelid', 'latitude--finishedsquarefeet12']
knn-taxvaluedollarcnt-calculatedfinishedsquarefeet.csv >>> ['parcelid', 'taxvaluedollarcnt--calculatedfinishedsquarefeet']
knn-taxvaluedollarcnt-finishedsquarefeet12.csv >>> ['parcelid', 'taxvaluedollarcnt--finishedsquarefeet12']
90811
2985217


Unnamed: 0,parcelid,logerror,transactiondate,longitude--latitude,nbrs_logerror_mean_x,basementsqft,nbrs_logerror_mean_y,latitude--finishedsquarefeet12,taxvaluedollarcnt--calculatedfinishedsquarefeet,taxvaluedollarcnt--finishedsquarefeet12
0,11016594,0.0276,2016-01-01,0.1098,0.0687,,,0.0208,-0.1087,-0.1087
1,14366692,-0.1684,2016-01-01,-0.1625,-0.16545,,-0.16545,-0.1625,0.0149,0.0149
2,12098116,-0.004,2016-01-01,0.0564,0.0262,,,0.077,0.0411,0.0411
3,12643413,0.0218,2016-01-02,-0.005,0.002,,,-0.005,0.044,0.044
4,14432541,-0.005,2016-01-02,-0.0212,-0.0131,,,0.003,-0.079,-0.079
5,11509835,-0.2705,2016-01-02,0.0421,-0.1142,,,0.2608,0.1178,0.1178
6,12286022,0.044,2016-01-02,0.0926,0.0683,,,0.0296,0.003,0.003
7,17177301,0.1638,2016-01-02,0.0751,0.11945,,,0.0129,0.0129,0.0129
8,14739064,-0.003,2016-01-02,-0.001,-0.002,,-2.0,-0.001,0.006,0.006
9,14677559,0.0843,2016-01-03,-0.0131,0.076,,,0.0188,-0.007,-0.007


In [7]:
for col in features.columns:
    print(col)
    print("NAs", features[col].isnull().sum(), features[col].isnull().sum() / len(features))
    print()

features["logerror"].describe()

parcelid
NAs 0 0.0

logerror
NAs 0 0.0

transactiondate
NAs 0 0.0

longitude--latitude
NAs 536 0.00590236865578

nbrs_logerror_mean_x
NAs 536 0.00590236865578

basementsqft
NAs 90768 0.999526489082

nbrs_logerror_mean_y
NAs 59086 0.650648049245

latitude--finishedsquarefeet12
NAs 5215 0.0574269636938

taxvaluedollarcnt--calculatedfinishedsquarefeet
NAs 1198 0.0131922344209

taxvaluedollarcnt--finishedsquarefeet12
NAs 5216 0.0574379755756



count    90811.000000
mean         0.010816
std          0.163288
min         -4.605000
25%         -0.026300
50%          0.005000
75%          0.039200
max          4.737000
Name: logerror, dtype: float64

In [8]:
del features["basementsqft"]
gc.collect()

284

In [9]:
features.corr()

Unnamed: 0,parcelid,logerror,longitude--latitude,nbrs_logerror_mean_x,nbrs_logerror_mean_y,latitude--finishedsquarefeet12,taxvaluedollarcnt--calculatedfinishedsquarefeet,taxvaluedollarcnt--finishedsquarefeet12
parcelid,1.0,0.002075,0.004915,0.006064,0.004658,0.005305,0.003972,0.005062
logerror,0.002075,1.0,0.08365,0.691713,0.157977,0.039949,0.006536,0.009997
longitude--latitude,0.004915,0.08365,1.0,0.651559,0.109688,0.058004,0.008565,0.005266
nbrs_logerror_mean_x,0.006064,0.691713,0.651559,1.0,0.245846,0.065135,0.010932,0.011388
nbrs_logerror_mean_y,0.004658,0.157977,0.109688,0.245846,1.0,0.030214,-0.00151,0.000771
latitude--finishedsquarefeet12,0.005305,0.039949,0.058004,0.065135,0.030214,1.0,0.004683,0.006495
taxvaluedollarcnt--calculatedfinishedsquarefeet,0.003972,0.006536,0.008565,0.010932,-0.00151,0.004683,1.0,0.930616
taxvaluedollarcnt--finishedsquarefeet12,0.005062,0.009997,0.005266,0.011388,0.000771,0.006495,0.930616,1.0


In [10]:
train_2016_complete = train_2016_short.merge(prop_2016, on="parcelid", how="left")
print(len(train_2016_complete))

train_2016_complete = train_2016_complete.merge(features[["parcelid", "nbrs_logerror_mean"]], on="parcelid", how="left")

train_2016_complete.to_csv("../train.csv", index=False)

90811


KeyError: "['nbrs_logerror_mean'] not in index"

In [12]:
abs(features["logerror"]).corr(abs(features["longitude--latitude"]), method="spearman")


0.13762187796195702

In [9]:
d = pd.read_csv("/home/tales/dev/projects/zillow-zestimate/z-evaluate/data/new_features/knn-longitude-latitude.csv")
d["knn-longitude-latitude-signal"] = d["knn-longitude-latitude"] / abs(d["knn-longitude-latitude"])
d.head(10)


Unnamed: 0,parcelid,knn-longitude-latitude,knn-longitude-latitude-signal
0,10754147,0.0645,1.0
1,10759547,-0.0001,-1.0
2,10843547,0.0616,1.0
3,10859147,0.0497,1.0
4,10879947,-0.0954,-1.0
5,10898347,-0.0292,-1.0
6,10933547,0.15955,1.0
7,10940747,-0.02555,-1.0
8,10954547,0.06995,1.0
9,10976347,-0.1015,-1.0


In [10]:
del d["knn-longitude-latitude"]

In [11]:
d.to_csv("/home/tales/dev/projects/zillow-zestimate/z-evaluate/data/new_features/knn-longitude-latitude.csv", index=False)

Unnamed: 0,parcelid,knn-longitude-latitude-signal
0,10754147,1.0
1,10759547,-1.0
2,10843547,1.0
3,10859147,1.0
4,10879947,-1.0
