In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


In [None]:
# Load the two raw inputs of the notebook.
# The training file is fetched from Google Drive. The properties file is read
# from local disk; its directory can be overridden with the DATA_DIR environment
# variable so the notebook is not tied to one machine. When DATA_DIR is unset
# the original location is used, so existing runs behave as before.
TRAIN_2016_URL = 'https://drive.google.com/uc?id=15GlcdLJ79bc5_WhVNViepQaXvsE1vYb8'
PROPERTIES_DIR = Path(os.environ.get('DATA_DIR', '/Users/yang/Downloads'))

train_2016 = pd.read_csv(TRAIN_2016_URL)
properties_2016 = pd.read_csv(PROPERTIES_DIR / 'properties_2016.csv')

In [None]:
# Join the training rows to the property table on parcelid. The inner join
# keeps only rows whose parcelid appears in both frames.
training_data = train_2016.merge(properties_2016, how='inner', on=['parcelid'])
# Displayed as the cell output: (rows, columns) of the merged frame.
training_data.shape

# Check feature correlations

In [None]:
# Check correlation to logerror
# For every feature column, compute its correlation with logerror and show it
# next to the column's coverage (share of non-null values).
#
# Only numeric/bool columns are correlated. Non-numeric (text) columns have no
# numeric correlation, and pandas versions that do not drop them automatically
# raise when corrwith meets one.
feature_columns = (
    training_data
    .drop(columns=['logerror', 'transactiondate'])
    .select_dtypes(include=['number', 'bool'])
)
correlations = (
    feature_columns
    .corrwith(training_data['logerror'])
    .sort_values(ascending=False)
    .to_frame('corr')
)
# Coverage is computed for every column of training_data, not only the numeric ones.
coverage = (1 - training_data.isnull().sum() / training_data.shape[0]).to_frame('coverage')
# Left join on the column name: one row per correlated feature.
correlations.join(coverage)

In [None]:
# Check correlation of high coverage features
# Columns whose share of non-null values exceeds 80%.
cols = coverage[coverage.coverage > 0.8].index.tolist()

# Pairwise Spearman correlation between those columns, excluding the target and
# the transaction date. Restricted to numeric/bool columns: a high-coverage
# text column would otherwise make DataFrame.corr raise on pandas versions
# that do not drop non-numeric columns automatically.
corrmat = (
    training_data[cols]
    .drop(columns=['logerror', 'transactiondate'])
    .select_dtypes(include=['number', 'bool'])
    .corr(method='spearman')
)
f, ax = plt.subplots(figsize=(8, 8))

# Draw the heatmap using seaborn, explicitly on the axes created above rather
# than on whatever axes happens to be current.
sns.heatmap(corrmat, vmax=1., square=True, ax=ax)
ax.set_title("Correlation of high coverage variables", fontsize=15)
plt.show()


# Check the distribution of absolute logerror against single features

In [None]:
# Density of the absolute logerror, one curve per bedroom-count quartile bin.
plt.figure(figsize=(20, 10))

# Derived columns used only for this plot: |logerror| and the quartile bin of
# bedroomcnt produced by pd.qcut.
bedroom_frame = training_data.assign(
    abs_err=training_data['logerror'].abs(),
    bed_bins=pd.qcut(training_data['bedroomcnt'], q=4),
)

# common_norm=False normalises each bin's curve on its own, so curves are
# comparable in shape regardless of how many rows fall in each bin.
sns.kdeplot(
    data=bedroom_frame,
    x='abs_err',
    hue='bed_bins',
    fill=True,
    clip=(-0.1, 1.0),
    common_norm=False,
)

In [None]:
# Density of the absolute logerror, one curve per heatingorsystemtypeid value.
#
# The id is cast to a categorical before plotting: seaborn maps a numeric hue
# onto a continuous colour ramp, which suggests an ordering between ids and
# makes neighbouring ids hard to tell apart. A categorical hue gets one
# qualitative colour per value instead.
# NOTE(review): assumes heatingorsystemtypeid is an unordered type code -
# confirm against the data dictionary.
plt.figure(figsize=(20, 10))
sns.kdeplot(
    data=training_data.assign(
        abs_err=np.abs(training_data.logerror),
        heatingorsystemtypeid=training_data.heatingorsystemtypeid.astype('category'),
    ),
    x='abs_err',
    hue='heatingorsystemtypeid',
    fill=True,
    clip=(-0.1, 1.0),
    # Normalise each type's curve independently so rare types remain visible.
    common_norm=False
)

In [None]:
# Density of the absolute logerror, one curve per buildingqualitytypeid value.
plt.figure(figsize=(20, 10))

# Same frame as training_data plus an |logerror| column for the x axis.
quality_frame = training_data.assign(abs_err=training_data['logerror'].abs())

# Each group's density is normalised separately (common_norm=False).
sns.kdeplot(
    data=quality_frame,
    x='abs_err',
    hue='buildingqualitytypeid',
    fill=True,
    clip=(-0.1, 1.0),
    common_norm=False,
)

In [None]:
# Density of the absolute logerror, split by whether basementsqft is populated.
plt.figure(figsize=(20, 10))

# has_basement is True wherever basementsqft is non-null.
basement_frame = training_data.assign(
    abs_err=training_data['logerror'].abs(),
    has_basement=training_data['basementsqft'].notnull(),
)

# Separate normalisation per group keeps the smaller group's curve readable.
sns.kdeplot(
    data=basement_frame,
    x='abs_err',
    hue='has_basement',
    fill=True,
    clip=(-0.1, 1.0),
    common_norm=False,
)

In [None]:
# Density of the absolute logerror, split by whether yardbuildingsqft26 is populated.
plt.figure(figsize=(20, 10))

# has_shed is True wherever yardbuildingsqft26 is non-null.
shed_frame = training_data.assign(
    abs_err=training_data['logerror'].abs(),
    has_shed=training_data['yardbuildingsqft26'].notnull(),
)

# Each group's density is normalised on its own (common_norm=False).
sns.kdeplot(
    data=shed_frame,
    x='abs_err',
    hue='has_shed',
    fill=True,
    clip=(-0.1, 1.0),
    common_norm=False,
)

In [None]:
# Density of the absolute logerror, split by whether garagecarcnt is populated.
plt.figure(figsize=(20, 10))

# has_garage is True wherever garagecarcnt is non-null.
garage_frame = training_data.assign(
    abs_err=training_data['logerror'].abs(),
    has_garage=training_data['garagecarcnt'].notnull(),
)

# Each group's density is normalised on its own (common_norm=False).
sns.kdeplot(
    data=garage_frame,
    x='abs_err',
    hue='has_garage',
    fill=True,
    clip=(-0.1, 1.0),
    common_norm=False,
)

# Some feature conclusions
- Having a basement increases the error by a lot, but most homes don't have basements.
- 2-4 bedrooms seems to be the sweet spot; with more or fewer bedrooms, the error gets worse.
- For many low-coverage features, presence or absence (i.e. a binary variable) is a fairly important indicator of the prediction error.