In [1]:
# Load the autoreload extension; mode 2 re-imports modules before each cell runs,
# so edits to local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Suppress all Python warnings for the rest of the session.
# NOTE(review): this also hides deprecation and pandas copy warnings.
import warnings
warnings.filterwarnings('ignore')

from IPython.display import display

# Handle to the running IPython shell; its traceback display is replaced further down.
ipython = get_ipython()

def exception_handler(exception_type, exception, traceback):
    """Print a one-line ``Type: message`` summary of an exception to stderr.

    Intended as a replacement for the IPython shell's traceback display, so
    errors show up as a single short line instead of a full stack trace.

    Args:
        exception_type: The exception class; only its ``__name__`` is printed.
        exception: The exception instance (or any object); rendered with ``%s``.
        traceback: Accepted for signature compatibility and ignored.

    Returns:
        None. The summary is written to ``sys.stderr``.
    """
    # Imported locally so the handler works even when `sys` is not leaked into
    # the notebook namespace by `from utils import *`; without it every error
    # would surface as a NameError raised inside the handler itself.
    import sys

    print("%s: %s" % (exception_type.__name__, exception), file=sys.stderr)

# Swap the shell's traceback renderer for the one-line handler defined above.
# NOTE(review): `_showtraceback` is a private IPython attribute and may change between versions.
ipython._showtraceback = exception_handler

# Star import: names used later without a visible import (`pd`, `np`, `plt`,
# `haversine_array`, `bearing_array`) presumably come from here — confirm in utils.
from utils import *

Using TensorFlow backend.


In [3]:
# Plotting defaults: ggplot theme first, then figure size and font size on top of it.
plt.style.use('ggplot')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

In [4]:
# Raise the pandas display limits so wide/long frames are shown without truncation.
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 10000)
pd.set_option('display.max_colwidth', 1000)

In [5]:
# Raw train / test tables.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

# Tag training rows, then drop the outlier rows: very large living area
# (> 12000 sqft) sold for less than 3,000,000.
train_copy = train.assign(data='train')
outlier_mask = (train_copy['sqft_living'] > 12000) & (train_copy['price'] < 3000000)
train_copy = train_copy.loc[~outlier_mask].reset_index(drop=True)

# Test rows get the same tag column and a NaN price so both frames share columns.
test_copy = test.assign(data='test', price=np.nan)

# Stack both splits on a fresh 0..n-1 index and keep the training column order.
data = (
    pd.concat([train_copy, test_copy], sort=False)
    .reset_index(drop=True)
)[train_copy.columns]
data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,data
0,0,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,train
1,1,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062,train
2,2,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,train
3,3,20140627T000000,257500.0,3,2.25,1715,6819,2.0,0,0,3,7,1715,0,1995,0,98003,47.3097,-122.327,2238,6819,train
4,4,20150115T000000,291850.0,3,1.5,1060,9711,1.0,0,0,3,7,1060,0,1963,0,98198,47.4095,-122.315,1650,9711,train


In [6]:
# Bounding box of all listings (train + test), computed once and reused below.
lat_min, lat_max = data['lat'].min(), data['lat'].max()
long_min, long_max = data['long'].min(), data['long'].max()
print(lat_min, lat_max, long_min, long_max)

# Distance between the (min lat, min long) and (max lat, max long) corners.
haversine_dist = haversine_array(lat_min, long_min, lat_max, long_max)
print(haversine_dist)

# Bearing from the first corner to the second.
bearing = bearing_array(lat_min, long_min, lat_max, long_max)
print(bearing)

47.1559 47.7776 -122.519 -121.315
113.88352317220664
52.18389277379239


In [7]:
# Load the precomputed neighbour table. The cells below read its `id`, `neighbor_id`,
# `distance`, `data`, `neighbor_*` and `*_diff` columns.
# NOTE(review): presumably one row per (id, neighbor_id) pair — confirm against the script that builds it.
neighbor_info_df = pd.read_csv('../input/neighbor_info.csv')

In [8]:
# Per-`id` summary statistics over neighbours flagged data == 'train' with distance <= 0.5.
# NOTE(review): the "1km" label goes with a 0.5 cutoff — presumably a radius in km; confirm units.
nb_stat_funcs = ['min', 'max', 'mean', 'median', 'std', 'skew']
nb_feature_names = ['sqft_living', 'sqft_lot', 'bedrooms', 'bathrooms', 'grade', 'view', 'condition']
nb_col_prefix = 'nb_1km_'

within_cutoff = (neighbor_info_df['data'] == 'train') & (neighbor_info_df['distance'] <= 0.5)
neighbor_1km = neighbor_info_df.loc[within_cutoff].copy()
neighbor_1km['neighbor_price_log'] = np.log1p(neighbor_1km['neighbor_price'])

# Aggregation plan, in output-column order: count, distance stats, mean log-price,
# then the six statistics for every neighbour attribute.
nb_agg_spec = {
    'neighbor_id': 'count',
    'distance': nb_stat_funcs,
    'neighbor_price_log': ['mean'],
}
nb_agg_spec.update({'neighbor_' + feat: nb_stat_funcs for feat in nb_feature_names})

neighbor_1km_stat = neighbor_1km.groupby('id').agg(nb_agg_spec).reset_index()

# Flatten the two-level column index into plain names, following the plan's order.
# Note that `nb_1km_price_mean` is the mean of log1p(neighbor_price), not of the raw price.
neighbor_1km_stat.columns = (
    ['id', 'neighbor_1km_count']
    + [nb_col_prefix + 'distance_' + stat for stat in nb_stat_funcs]
    + [nb_col_prefix + 'price_mean']
    + [nb_col_prefix + feat + '_' + stat for feat in nb_feature_names for stat in nb_stat_funcs]
)

print(neighbor_1km_stat.shape)
neighbor_1km_stat.head()

(21251, 50)


Unnamed: 0,id,neighbor_1km_count,nb_1km_distance_min,nb_1km_distance_max,nb_1km_distance_mean,nb_1km_distance_median,nb_1km_distance_std,nb_1km_distance_skew,nb_1km_sqft_living_min,nb_1km_sqft_living_max,nb_1km_sqft_living_mean,nb_1km_sqft_living_median,nb_1km_sqft_living_std,nb_1km_sqft_living_skew,nb_1km_sqft_lot_min,nb_1km_sqft_lot_max,nb_1km_sqft_lot_mean,nb_1km_sqft_lot_median,nb_1km_sqft_lot_std,nb_1km_sqft_lot_skew,nb_1km_bedrooms_min,nb_1km_bedrooms_max,nb_1km_bedrooms_mean,nb_1km_bedrooms_median,nb_1km_bedrooms_std,nb_1km_bedrooms_skew,nb_1km_bathrooms_min,nb_1km_bathrooms_max,nb_1km_bathrooms_mean,nb_1km_bathrooms_median,nb_1km_bathrooms_std,nb_1km_bathrooms_skew,nb_1km_grade_min,nb_1km_grade_max,nb_1km_grade_mean,nb_1km_grade_median,nb_1km_grade_std,nb_1km_grade_skew,nb_1km_view_min,nb_1km_view_max,nb_1km_view_mean,nb_1km_view_median,nb_1km_view_std,nb_1km_view_skew,nb_1km_condition_min,nb_1km_condition_max,nb_1km_condition_mean,nb_1km_condition_median,nb_1km_condition_std,nb_1km_condition_skew
0,0,27,0.044478,0.493093,0.3207,0.402396,0.152579,-0.513874,580,3700,1930.074074,2010.0,726.084437,0.261805,4000,11205,6590.37037,6250.0,1691.667228,1.153565,2,6,3.62963,4.0,1.005682,0.350796,0.5,3.5,1.87963,1.75,0.812671,0.380755,5,9,7.074074,7.0,0.957799,-0.440191,0,3,0.925926,0.0,1.14105,0.657999,3,5,3.592593,3.0,0.797074,0.903528
1,1,17,0.143364,0.483951,0.296784,0.305922,0.117966,0.111079,1060,3530,2308.294118,2440.0,776.970219,-0.290922,5803,28405,9418.235294,8345.0,5418.354807,2.956274,2,4,3.294118,3.0,0.587868,-0.108579,1.0,3.0,2.294118,2.5,0.574424,-1.575515,7,10,8.176471,8.0,1.074436,0.292851,0,0,0.0,0.0,0.0,0.0,3,4,3.058824,3.0,0.242536,4.123106
2,2,23,0.011119,0.488489,0.326838,0.386112,0.140826,-0.98619,1350,4170,2063.043478,1900.0,628.290862,1.897091,4322,40510,10103.869565,8022.0,7887.222839,3.224402,3,4,3.304348,3.0,0.470472,0.910939,1.75,3.5,2.304348,2.5,0.426112,0.533675,7,10,8.043478,8.0,0.705708,0.788838,0,0,0.0,0.0,0.0,0.0,3,4,3.086957,3.0,0.288104,3.1404
3,3,8,0.143684,0.402694,0.275891,0.265539,0.092658,0.029932,1380,1840,1530.625,1470.0,177.410692,1.127788,1302,8362,4767.75,4666.0,3331.54712,0.006778,2,3,2.625,3.0,0.517549,-0.644061,1.75,2.5,2.125,2.0,0.267261,0.467707,7,7,7.0,7.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,3,4,3.125,3.0,0.353553,2.828427
4,4,5,0.078469,0.449924,0.230414,0.200151,0.140441,1.006589,1010,3300,2034.0,1840.0,874.631351,0.573797,9009,11525,9982.6,9600.0,956.325415,1.258676,3,4,3.4,3.0,0.547723,0.608581,1.5,2.5,1.85,1.75,0.41833,1.088512,7,8,7.2,7.0,0.447214,2.236068,0,0,0.0,0.0,0.0,0.0,3,5,4.2,4.0,0.83666,-0.512241


In [9]:
# Save the 0.5-cutoff neighbour statistics as CSV; index=False leaves the row index out of the file.
neighbor_1km_stat.to_csv('../input/neighbor_1km_stat.csv', index=False)

In [10]:
# Per-`id` summary statistics over neighbours flagged data == 'train' with distance <= 1.5.
# NOTE(review): the "3km" label goes with a 1.5 cutoff — presumably a radius in km; confirm units.
nb_stat_funcs = ['min', 'max', 'mean', 'median', 'std', 'skew']
nb_feature_names = ['sqft_living', 'sqft_lot', 'bedrooms', 'bathrooms', 'grade', 'view', 'condition']
nb_col_prefix = 'nb_3km_'

within_cutoff = (neighbor_info_df['data'] == 'train') & (neighbor_info_df['distance'] <= 1.5)
neighbor_3km = neighbor_info_df.loc[within_cutoff].copy()
neighbor_3km['neighbor_price_log'] = np.log1p(neighbor_3km['neighbor_price'])

# Aggregation plan, in output-column order: count, distance stats, mean log-price,
# then the six statistics for every neighbour attribute.
nb_agg_spec = {
    'neighbor_id': 'count',
    'distance': nb_stat_funcs,
    'neighbor_price_log': ['mean'],
}
nb_agg_spec.update({'neighbor_' + feat: nb_stat_funcs for feat in nb_feature_names})

neighbor_3km_stat = neighbor_3km.groupby('id').agg(nb_agg_spec).reset_index()

# Flatten the two-level column index into plain names, following the plan's order.
# Note that `nb_3km_price_mean` is the mean of log1p(neighbor_price), not of the raw price.
neighbor_3km_stat.columns = (
    ['id', 'neighbor_3km_count']
    + [nb_col_prefix + 'distance_' + stat for stat in nb_stat_funcs]
    + [nb_col_prefix + 'price_mean']
    + [nb_col_prefix + feat + '_' + stat for feat in nb_feature_names for stat in nb_stat_funcs]
)

print(neighbor_3km_stat.shape)
neighbor_3km_stat.head()

(21490, 50)


Unnamed: 0,id,neighbor_3km_count,nb_3km_distance_min,nb_3km_distance_max,nb_3km_distance_mean,nb_3km_distance_median,nb_3km_distance_std,nb_3km_distance_skew,nb_3km_sqft_living_min,nb_3km_sqft_living_max,nb_3km_sqft_living_mean,nb_3km_sqft_living_median,nb_3km_sqft_living_std,nb_3km_sqft_living_skew,nb_3km_sqft_lot_min,nb_3km_sqft_lot_max,nb_3km_sqft_lot_mean,nb_3km_sqft_lot_median,nb_3km_sqft_lot_std,nb_3km_sqft_lot_skew,nb_3km_bedrooms_min,nb_3km_bedrooms_max,nb_3km_bedrooms_mean,nb_3km_bedrooms_median,nb_3km_bedrooms_std,nb_3km_bedrooms_skew,nb_3km_bathrooms_min,nb_3km_bathrooms_max,nb_3km_bathrooms_mean,nb_3km_bathrooms_median,nb_3km_bathrooms_std,nb_3km_bathrooms_skew,nb_3km_grade_min,nb_3km_grade_max,nb_3km_grade_mean,nb_3km_grade_median,nb_3km_grade_std,nb_3km_grade_skew,nb_3km_view_min,nb_3km_view_max,nb_3km_view_mean,nb_3km_view_median,nb_3km_view_std,nb_3km_view_skew,nb_3km_condition_min,nb_3km_condition_max,nb_3km_condition_mean,nb_3km_condition_median,nb_3km_condition_std,nb_3km_condition_skew
0,0,144,0.044478,1.493888,0.857278,0.853951,0.374291,-0.119308,580,4940,1722.236111,1660.0,714.094291,1.180153,1989,18205,7326.034722,6554.0,2995.095465,1.829384,1,6,3.229167,3.0,1.008487,0.271571,0.5,3.75,1.748264,1.75,0.7154,0.715232,5,10,6.916667,7.0,0.779995,0.595264,0,4,0.826389,0.0,1.313352,1.340491,3,5,3.402778,3.0,0.702836,1.45503
1,1,186,0.143364,1.488608,0.944375,0.969687,0.320936,-0.50027,900,3700,1884.107527,1820.0,633.994645,0.525955,1116,48788,10087.569892,9421.5,5329.618378,4.008781,2,5,3.424731,3.0,0.647015,0.283784,1.0,3.5,2.010753,2.0,0.62915,-0.28976,5,10,7.456989,7.0,0.785531,1.022949,0,3,0.016129,0.0,0.219971,13.638182,3,5,3.317204,3.0,0.531621,1.429559
2,2,154,0.011119,1.482715,1.010126,1.125882,0.380047,-0.720021,970,5360,2332.928571,2270.0,807.997202,0.738208,3885,65340,13330.064935,10071.5,10727.891683,2.787299,2,6,3.480519,3.0,0.697529,0.772703,1.0,4.25,2.344156,2.5,0.590002,0.220127,6,10,8.25974,8.0,0.861714,0.028706,0,3,0.064935,0.0,0.336754,6.342599,2,5,3.214286,3.0,0.470909,1.754842
3,3,64,0.143684,1.496924,1.023828,1.067302,0.366598,-0.86882,760,3560,2039.546875,1820.0,752.056312,0.534044,1302,18200,9002.671875,8526.5,3080.053772,0.230293,2,6,3.328125,3.0,0.757076,1.177871,1.0,4.0,2.085938,2.125,0.560291,0.330518,6,11,7.765625,8.0,1.034979,1.201171,0,0,0.0,0.0,0.0,0.0,2,5,3.28125,3.0,0.518507,0.973835
4,4,87,0.078469,1.499869,1.121808,1.190439,0.316411,-1.45999,830,5370,1863.126437,1730.0,815.301924,1.355401,5000,32481,10430.287356,9112.0,4781.123,2.202759,1,6,3.218391,3.0,0.94531,-0.031793,1.0,3.5,1.761494,1.75,0.701854,0.571493,5,11,6.931034,7.0,0.937502,1.092931,0,4,0.781609,0.0,1.165645,1.205812,2,5,3.505747,3.0,0.66251,0.224049


In [11]:
# Save the 1.5-cutoff neighbour statistics as CSV; index=False leaves the row index out of the file.
neighbor_3km_stat.to_csv('../input/neighbor_3km_stat.csv', index=False)

In [12]:
# Per-`id` summary statistics over neighbours flagged data == 'train' with distance <= 2.5.
# NOTE(review): the "5km" label goes with a 2.5 cutoff — presumably a radius in km; confirm units.
nb_stat_funcs = ['min', 'max', 'mean', 'median', 'std', 'skew']
nb_feature_names = ['sqft_living', 'sqft_lot', 'bedrooms', 'bathrooms', 'grade', 'view', 'condition']
nb_col_prefix = 'nb_5km_'

within_cutoff = (neighbor_info_df['data'] == 'train') & (neighbor_info_df['distance'] <= 2.5)
neighbor_5km = neighbor_info_df.loc[within_cutoff].copy()
neighbor_5km['neighbor_price_log'] = np.log1p(neighbor_5km['neighbor_price'])

# Aggregation plan, in output-column order: count, distance stats, mean log-price,
# then the six statistics for every neighbour attribute.
nb_agg_spec = {
    'neighbor_id': 'count',
    'distance': nb_stat_funcs,
    'neighbor_price_log': ['mean'],
}
nb_agg_spec.update({'neighbor_' + feat: nb_stat_funcs for feat in nb_feature_names})

neighbor_5km_stat = neighbor_5km.groupby('id').agg(nb_agg_spec).reset_index()

# Flatten the two-level column index into plain names, following the plan's order.
# Note that `nb_5km_price_mean` is the mean of log1p(neighbor_price), not of the raw price.
neighbor_5km_stat.columns = (
    ['id', 'neighbor_5km_count']
    + [nb_col_prefix + 'distance_' + stat for stat in nb_stat_funcs]
    + [nb_col_prefix + 'price_mean']
    + [nb_col_prefix + feat + '_' + stat for feat in nb_feature_names for stat in nb_stat_funcs]
)

print(neighbor_5km_stat.shape)
neighbor_5km_stat.head()

(21499, 50)


Unnamed: 0,id,neighbor_5km_count,nb_5km_distance_min,nb_5km_distance_max,nb_5km_distance_mean,nb_5km_distance_median,nb_5km_distance_std,nb_5km_distance_skew,nb_5km_sqft_living_min,nb_5km_sqft_living_max,nb_5km_sqft_living_mean,nb_5km_sqft_living_median,nb_5km_sqft_living_std,nb_5km_sqft_living_skew,nb_5km_sqft_lot_min,nb_5km_sqft_lot_max,nb_5km_sqft_lot_mean,nb_5km_sqft_lot_median,nb_5km_sqft_lot_std,nb_5km_sqft_lot_skew,nb_5km_bedrooms_min,nb_5km_bedrooms_max,nb_5km_bedrooms_mean,nb_5km_bedrooms_median,nb_5km_bedrooms_std,nb_5km_bedrooms_skew,nb_5km_bathrooms_min,nb_5km_bathrooms_max,nb_5km_bathrooms_mean,nb_5km_bathrooms_median,nb_5km_bathrooms_std,nb_5km_bathrooms_skew,nb_5km_grade_min,nb_5km_grade_max,nb_5km_grade_mean,nb_5km_grade_median,nb_5km_grade_std,nb_5km_grade_skew,nb_5km_view_min,nb_5km_view_max,nb_5km_view_mean,nb_5km_view_median,nb_5km_view_std,nb_5km_view_skew,nb_5km_condition_min,nb_5km_condition_max,nb_5km_condition_mean,nb_5km_condition_median,nb_5km_condition_std,nb_5km_condition_skew
0,0,296,0.044478,2.498402,1.483615,1.512759,0.698354,-0.19347,390,5470,1682.202703,1620.0,705.781038,1.274557,1612,67953,7348.60473,6437.0,4683.554608,7.951805,0,7,3.260135,3.0,1.030007,0.342137,0.5,4.0,1.712838,1.75,0.707924,0.691393,4,11,6.827703,7.0,0.790222,0.68978,0,4,0.52027,0.0,1.10143,2.001803,2,5,3.371622,3.0,0.701383,1.295947
1,1,365,0.143364,2.49621,1.485641,1.460579,0.633707,-0.063039,900,4890,2008.928767,1870.0,721.385812,0.947724,1116,67269,10294.09863,9266.0,6178.757661,4.133409,2,6,3.465753,3.0,0.731591,0.457891,1.0,4.25,2.104795,2.25,0.630452,-0.100403,5,10,7.547945,7.0,0.852293,1.04097,0,4,0.093151,0.0,0.510421,5.758344,2,5,3.328767,3.0,0.570685,1.458106
2,2,351,0.011119,2.498239,1.5866,1.595807,0.614502,-0.387332,970,6260,2601.85755,2500.0,903.870773,0.740459,2839,284011,13893.293447,10283.0,18291.591865,9.96051,2,7,3.623932,4.0,0.745192,0.652354,1.0,6.5,2.480769,2.5,0.64461,1.221015,6,11,8.595442,9.0,1.08569,0.11664,0,4,0.105413,0.0,0.563407,5.904052,2,5,3.150997,3.0,0.424254,2.229745
3,3,185,0.143684,2.492903,1.715901,1.815454,0.593478,-0.681005,760,4850,1967.902703,1840.0,790.057883,0.806748,1302,56628,9464.243243,8400.0,5650.200473,4.935777,2,6,3.416216,3.0,0.776528,0.881445,1.0,4.5,2.067568,2.25,0.714867,0.185287,6,11,7.518919,7.0,1.083934,1.050943,0,0,0.0,0.0,0.0,0.0,2,5,3.27027,3.0,0.491706,0.752276
4,4,178,0.078469,2.484723,1.541274,1.50912,0.511404,-0.235382,620,5370,1788.634831,1675.0,742.098693,1.444615,4583,54977,10981.134831,9326.0,6587.342951,3.820764,1,6,3.280899,3.0,0.876418,0.131443,1.0,4.5,1.745787,1.75,0.672542,0.87219,5,11,6.983146,7.0,0.819775,1.089187,0,4,0.466292,0.0,0.96345,1.975037,2,5,3.41573,3.0,0.607095,0.711163


In [13]:
# Save the 2.5-cutoff neighbour statistics as CSV; index=False leaves the row index out of the file.
neighbor_5km_stat.to_csv('../input/neighbor_5km_stat.csv', index=False)

In [14]:
# Candidate pool: rows flagged data == 'train' whose distance is at most 4.
# NOTE(review): the saved preview under this cell lists data == 'test' rows, which this
# filter would exclude — the output may predate the filter; re-run to confirm.
candidate_rows = (neighbor_info_df['data'] == 'train') & (neighbor_info_df['distance'] <= 4)

# Ranking priority within each `id`, all ascending: grade difference first, then living
# area differences, bathrooms, distance, and the remaining attribute differences.
similarity_keys = [
    'id', 'grade_diff', 'sqft_living_diff', 'sqft_living15_diff',
    'bathrooms_diff', 'distance', 'bedrooms_diff', 'view_diff', 'sqft_lot_diff',
    'condition_diff', 'waterfront_diff',
]
nearest_neighbor = neighbor_info_df.loc[candidate_rows].copy().sort_values(similarity_keys)

# 1-based position of every neighbour inside its `id` group, following the sort above.
nearest_neighbor['nb_order'] = nearest_neighbor.groupby('id').cumcount().add(1)
print(nearest_neighbor.shape)
nearest_neighbor.head()

(15818934, 37)


Unnamed: 0,id,neighbor_id,distance,neighbor_sqft_living,neighbor_sqft_living15,neighbor_sqft_lot,neighbor_sqft_lot15,neighbor_bedrooms,neighbor_bathrooms,neighbor_grade,neighbor_waterfront,neighbor_view,neighbor_condition,data,neighbor_price,sqft_living,sqft_living15,sqft_lot,sqft_lot15,bedrooms,bathrooms,grade,waterfront,view,condition,price,sqft_living_diff,sqft_living15_diff,sqft_lot_diff,sqft_lot15_diff,bedrooms_diff,bathrooms_diff,grade_diff,waterfront_diff,view_diff,condition_diff,nb_order
575,0,19126,2.391279,1180,1430,4000,4000,3,1.75,7,0,0,3,test,,1180,1340,5650,5650,3,1.0,7,0,0,3,221900.0,0,90,1650,1650,0,0.75,0,0,0,0,1
555,0,18258,1.26393,1190,1410,11400,11400,2,1.0,7,0,0,3,test,,1180,1340,5650,5650,3,1.0,7,0,0,3,221900.0,10,70,5750,5750,1,0.0,0,0,0,0,2
170,0,5596,1.844594,1190,1200,6000,6000,3,1.75,7,0,0,3,train,203000.0,1180,1340,5650,5650,3,1.0,7,0,0,3,221900.0,10,140,350,350,0,0.75,0,0,0,0,3
323,0,10207,2.406229,1190,1190,3000,3000,3,2.5,7,0,0,3,train,229000.0,1180,1340,5650,5650,3,1.0,7,0,0,3,221900.0,10,150,2650,2650,0,1.5,0,0,0,0,4
397,0,12489,2.036385,1200,1380,10703,8068,3,1.0,7,0,0,2,train,200000.0,1180,1340,5650,5650,3,1.0,7,0,0,3,221900.0,20,40,5053,2418,0,0.0,0,0,0,1,5


In [15]:
# Per-`id` summary statistics over the 5 best-ranked neighbours (nb_order <= 5).
nb_stat_funcs = ['min', 'max', 'mean', 'median', 'std', 'skew']
nb_feature_names = ['sqft_living', 'sqft_lot', 'bedrooms', 'bathrooms', 'grade', 'view', 'condition']
nb_col_prefix = 'n_5_nb_'

top_ranked = nearest_neighbor['nb_order'] <= 5
nearest_5_neighbor = nearest_neighbor.loc[top_ranked].reset_index(drop=True).copy()
nearest_5_neighbor['neighbor_price_log'] = np.log1p(nearest_5_neighbor['neighbor_price'])

# Aggregation plan, in output-column order: count, distance stats, mean log-price,
# then the six statistics for every neighbour attribute.
nb_agg_spec = {
    'neighbor_id': 'count',
    'distance': nb_stat_funcs,
    'neighbor_price_log': ['mean'],
}
nb_agg_spec.update({'neighbor_' + feat: nb_stat_funcs for feat in nb_feature_names})

nearest_5_neighbor_stat = nearest_5_neighbor.groupby('id').agg(nb_agg_spec).reset_index()

# Flatten the two-level column index into plain names, following the plan's order.
# Note that `n_5_nb_price_mean` is the mean of log1p(neighbor_price), not of the raw price.
nearest_5_neighbor_stat.columns = (
    ['id', nb_col_prefix + 'count']
    + [nb_col_prefix + 'distance_' + stat for stat in nb_stat_funcs]
    + [nb_col_prefix + 'price_mean']
    + [nb_col_prefix + feat + '_' + stat for feat in nb_feature_names for stat in nb_stat_funcs]
)

print(nearest_5_neighbor_stat.shape)
nearest_5_neighbor_stat.head()

(21499, 50)


Unnamed: 0,id,n_5_nb_count,n_5_nb_distance_min,n_5_nb_distance_max,n_5_nb_distance_mean,n_5_nb_distance_median,n_5_nb_distance_std,n_5_nb_distance_skew,n_5_nb_sqft_living_min,n_5_nb_sqft_living_max,n_5_nb_sqft_living_mean,n_5_nb_sqft_living_median,n_5_nb_sqft_living_std,n_5_nb_sqft_living_skew,n_5_nb_sqft_lot_min,n_5_nb_sqft_lot_max,n_5_nb_sqft_lot_mean,n_5_nb_sqft_lot_median,n_5_nb_sqft_lot_std,n_5_nb_sqft_lot_skew,n_5_nb_bedrooms_min,n_5_nb_bedrooms_max,n_5_nb_bedrooms_mean,n_5_nb_bedrooms_median,n_5_nb_bedrooms_std,n_5_nb_bedrooms_skew,n_5_nb_bathrooms_min,n_5_nb_bathrooms_max,n_5_nb_bathrooms_mean,n_5_nb_bathrooms_median,n_5_nb_bathrooms_std,n_5_nb_bathrooms_skew,n_5_nb_grade_min,n_5_nb_grade_max,n_5_nb_grade_mean,n_5_nb_grade_median,n_5_nb_grade_std,n_5_nb_grade_skew,n_5_nb_view_min,n_5_nb_view_max,n_5_nb_view_mean,n_5_nb_view_median,n_5_nb_view_std,n_5_nb_view_skew,n_5_nb_condition_min,n_5_nb_condition_max,n_5_nb_condition_mean,n_5_nb_condition_median,n_5_nb_condition_std,n_5_nb_condition_skew
0,0,5,1.26393,2.406229,1.988483,2.036385,0.470301,-0.981319,1180,1200,1190.0,1190,7.071068,0.0,3000,11400,7020.6,6000.0,3842.853341,0.295963,2,3,2.8,3,0.447214,-2.236068,1.0,2.5,1.6,1.75,0.627495,0.512241,7,7,7.0,7.0,0.0,0.0,0,0,0.0,0,0.0,0.0,2,3,2.8,3.0,0.447214,-2.236068
1,1,5,2.892035,3.94079,3.302202,2.93863,0.523212,0.641529,720,1040,892.0,920,134.4247,-0.333587,5820,15120,10344.4,8282.0,4395.88635,0.419003,1,3,2.0,2,0.707107,0.0,0.75,1.0,0.95,1.0,0.111803,-2.236068,6,6,6.0,6.0,0.0,0.0,0,1,0.2,0,0.447214,2.236068,3,5,3.6,3.0,0.894427,1.257788
2,2,5,0.416497,3.162922,2.012995,2.880931,1.329633,-0.60339,1650,1720,1670.0,1650,30.82207,1.451455,4218,10098,6197.0,4648.0,2519.0131,1.193112,3,3,3.0,3,0.0,0.0,2.0,2.5,2.4,2.5,0.223607,-2.236068,8,8,8.0,8.0,0.0,0.0,0,0,0.0,0,0.0,0.0,3,4,3.2,3.0,0.447214,2.236068
3,3,5,3.204587,3.991923,3.703148,3.830455,0.305116,-1.39445,1710,1730,1720.0,1720,7.071068,0.0,6400,9753,8221.6,8320.0,1430.830283,-0.24633,3,4,3.6,4,0.547723,-0.608581,1.75,2.25,1.95,1.75,0.273861,0.608581,7,7,7.0,7.0,0.0,0.0,0,0,0.0,0,0.0,0.0,3,4,3.6,4.0,0.547723,-0.608581
4,4,5,1.190439,3.815175,2.612285,2.402673,1.176787,0.030783,1030,1070,1050.0,1060,18.708287,-0.381802,4583,9954,7822.0,8223.0,2049.866703,-1.09789,3,3,3.0,3,0.0,0.0,1.0,1.5,1.1,1.0,0.223607,2.236068,7,7,7.0,7.0,0.0,0.0,0,0,0.0,0,0.0,0.0,3,4,3.8,4.0,0.447214,-2.236068


In [16]:
# Save the top-5 neighbour statistics as CSV; index=False leaves the row index out of the file.
nearest_5_neighbor_stat.to_csv('../input/nearest_5_neighbor_stat.csv', index=False)

In [17]:
# Per-`id` summary statistics over the 10 best-ranked neighbours (nb_order <= 10).
nb_stat_funcs = ['min', 'max', 'mean', 'median', 'std', 'skew']
nb_feature_names = ['sqft_living', 'sqft_lot', 'bedrooms', 'bathrooms', 'grade', 'view', 'condition']
nb_col_prefix = 'n_10_nb_'

top_ranked = nearest_neighbor['nb_order'] <= 10
nearest_10_neighbor = nearest_neighbor.loc[top_ranked].reset_index(drop=True).copy()
nearest_10_neighbor['neighbor_price_log'] = np.log1p(nearest_10_neighbor['neighbor_price'])

# Aggregation plan, in output-column order: count, distance stats, mean log-price,
# then the six statistics for every neighbour attribute.
nb_agg_spec = {
    'neighbor_id': 'count',
    'distance': nb_stat_funcs,
    'neighbor_price_log': ['mean'],
}
nb_agg_spec.update({'neighbor_' + feat: nb_stat_funcs for feat in nb_feature_names})

nearest_10_neighbor_stat = nearest_10_neighbor.groupby('id').agg(nb_agg_spec).reset_index()

# Flatten the two-level column index into plain names, following the plan's order.
# Note that `n_10_nb_price_mean` is the mean of log1p(neighbor_price), not of the raw price.
nearest_10_neighbor_stat.columns = (
    ['id', nb_col_prefix + 'count']
    + [nb_col_prefix + 'distance_' + stat for stat in nb_stat_funcs]
    + [nb_col_prefix + 'price_mean']
    + [nb_col_prefix + feat + '_' + stat for feat in nb_feature_names for stat in nb_stat_funcs]
)

print(nearest_10_neighbor_stat.shape)
nearest_10_neighbor_stat.head()

(21499, 50)


Unnamed: 0,id,n_10_nb_count,n_10_nb_distance_min,n_10_nb_distance_max,n_10_nb_distance_mean,n_10_nb_distance_median,n_10_nb_distance_std,n_10_nb_distance_skew,n_10_nb_sqft_living_min,n_10_nb_sqft_living_max,n_10_nb_sqft_living_mean,n_10_nb_sqft_living_median,n_10_nb_sqft_living_std,n_10_nb_sqft_living_skew,n_10_nb_sqft_lot_min,n_10_nb_sqft_lot_max,n_10_nb_sqft_lot_mean,n_10_nb_sqft_lot_median,n_10_nb_sqft_lot_std,n_10_nb_sqft_lot_skew,n_10_nb_bedrooms_min,n_10_nb_bedrooms_max,n_10_nb_bedrooms_mean,n_10_nb_bedrooms_median,n_10_nb_bedrooms_std,n_10_nb_bedrooms_skew,n_10_nb_bathrooms_min,n_10_nb_bathrooms_max,n_10_nb_bathrooms_mean,n_10_nb_bathrooms_median,n_10_nb_bathrooms_std,n_10_nb_bathrooms_skew,n_10_nb_grade_min,n_10_nb_grade_max,n_10_nb_grade_mean,n_10_nb_grade_median,n_10_nb_grade_std,n_10_nb_grade_skew,n_10_nb_view_min,n_10_nb_view_max,n_10_nb_view_mean,n_10_nb_view_median,n_10_nb_view_std,n_10_nb_view_skew,n_10_nb_condition_min,n_10_nb_condition_max,n_10_nb_condition_mean,n_10_nb_condition_median,n_10_nb_condition_std,n_10_nb_condition_skew
0,0,10,0.796354,3.723943,2.135682,2.005488,0.919863,0.540445,1160,1210,1188.0,1190.0,16.865481,-0.826855,3000,11400,6620.7,5850.0,2797.021513,0.735803,2,4,2.9,3.0,0.567646,-0.09112,1.0,2.5,1.525,1.625,0.519749,0.484093,7,7,7.0,7.0,0.0,0.0,0,2,0.3,0.0,0.674949,2.276596,2,5,3.2,3.0,0.788811,1.290369
1,1,10,1.092273,3.94079,2.754516,2.936933,1.018474,-0.745347,720,1380,1079.0,1050.0,233.116566,-0.120485,5728,209959,30463.6,9137.5,63246.278788,3.129419,1,4,2.4,2.0,0.843274,0.389108,0.75,1.75,1.1,1.0,0.293447,1.587087,6,6,6.0,6.0,0.0,0.0,0,1,0.1,0.0,0.316228,3.162278,1,5,3.3,3.0,1.159502,-0.192445
2,2,10,0.011119,3.162922,1.508829,1.239888,1.241564,0.165722,1640,1790,1710.0,1700.0,62.893208,0.194283,3993,15258,7749.3,7276.0,3742.064227,0.935612,3,3,3.0,3.0,0.0,0.0,2.0,2.5,2.375,2.5,0.212459,-1.357727,8,8,8.0,8.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,3,4,3.1,3.0,0.316228,3.162278
3,3,10,1.496205,3.991923,3.212492,3.588086,0.852923,-1.310236,1690,1730,1711.0,1715.0,15.238839,-0.207227,6400,11390,8826.3,9507.0,1601.42763,-0.266729,2,4,3.2,3.0,0.632456,-0.131762,1.0,2.25,1.7,1.75,0.437798,-0.552419,7,7,7.0,7.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,3,4,3.6,4.0,0.516398,-0.484123
4,4,10,0.078469,3.878557,2.269261,2.12282,1.476905,-0.062292,1010,1100,1047.0,1045.0,33.681515,0.302278,4583,13444,8536.1,8611.5,2577.024272,0.219592,2,3,2.8,3.0,0.421637,-1.778781,1.0,2.5,1.3,1.0,0.483046,1.959293,7,7,7.0,7.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,3,4,3.5,3.5,0.527046,0.0


In [18]:
# Save the top-10 neighbour statistics as CSV; index=False leaves the row index out of the file.
nearest_10_neighbor_stat.to_csv('../input/nearest_10_neighbor_stat.csv', index=False)

In [19]:
# Per-`id` summary statistics over the 20 best-ranked neighbours (nb_order <= 20).
nb_stat_funcs = ['min', 'max', 'mean', 'median', 'std', 'skew']
nb_feature_names = ['sqft_living', 'sqft_lot', 'bedrooms', 'bathrooms', 'grade', 'view', 'condition']
nb_col_prefix = 'n_20_nb_'

top_ranked = nearest_neighbor['nb_order'] <= 20
nearest_20_neighbor = nearest_neighbor.loc[top_ranked].reset_index(drop=True).copy()
nearest_20_neighbor['neighbor_price_log'] = np.log1p(nearest_20_neighbor['neighbor_price'])

# Aggregation plan, in output-column order: count, distance stats, mean log-price,
# then the six statistics for every neighbour attribute.
nb_agg_spec = {
    'neighbor_id': 'count',
    'distance': nb_stat_funcs,
    'neighbor_price_log': ['mean'],
}
nb_agg_spec.update({'neighbor_' + feat: nb_stat_funcs for feat in nb_feature_names})

nearest_20_neighbor_stat = nearest_20_neighbor.groupby('id').agg(nb_agg_spec).reset_index()

# Flatten the two-level column index into plain names, following the plan's order.
# Note that `n_20_nb_price_mean` is the mean of log1p(neighbor_price), not of the raw price.
nearest_20_neighbor_stat.columns = (
    ['id', nb_col_prefix + 'count']
    + [nb_col_prefix + 'distance_' + stat for stat in nb_stat_funcs]
    + [nb_col_prefix + 'price_mean']
    + [nb_col_prefix + feat + '_' + stat for feat in nb_feature_names for stat in nb_stat_funcs]
)

print(nearest_20_neighbor_stat.shape)
nearest_20_neighbor_stat.head()

(21499, 50)


Unnamed: 0,id,n_20_nb_count,n_20_nb_distance_min,n_20_nb_distance_max,n_20_nb_distance_mean,n_20_nb_distance_median,n_20_nb_distance_std,n_20_nb_distance_skew,n_20_nb_sqft_living_min,n_20_nb_sqft_living_max,n_20_nb_sqft_living_mean,n_20_nb_sqft_living_median,n_20_nb_sqft_living_std,n_20_nb_sqft_living_skew,n_20_nb_sqft_lot_min,n_20_nb_sqft_lot_max,n_20_nb_sqft_lot_mean,n_20_nb_sqft_lot_median,n_20_nb_sqft_lot_std,n_20_nb_sqft_lot_skew,n_20_nb_bedrooms_min,n_20_nb_bedrooms_max,n_20_nb_bedrooms_mean,n_20_nb_bedrooms_median,n_20_nb_bedrooms_std,n_20_nb_bedrooms_skew,n_20_nb_bathrooms_min,n_20_nb_bathrooms_max,n_20_nb_bathrooms_mean,n_20_nb_bathrooms_median,n_20_nb_bathrooms_std,n_20_nb_bathrooms_skew,n_20_nb_grade_min,n_20_nb_grade_max,n_20_nb_grade_mean,n_20_nb_grade_median,n_20_nb_grade_std,n_20_nb_grade_skew,n_20_nb_view_min,n_20_nb_view_max,n_20_nb_view_mean,n_20_nb_view_median,n_20_nb_view_std,n_20_nb_view_skew,n_20_nb_condition_min,n_20_nb_condition_max,n_20_nb_condition_mean,n_20_nb_condition_median,n_20_nb_condition_std,n_20_nb_condition_skew
0,0,20,0.796354,3.723943,2.26917,2.370574,0.931268,0.024036,1130,1240,1188.5,1195.0,32.163235,-0.450803,3000,11571,6922.95,6000.0,2348.655572,0.630116,2,4,2.9,3.0,0.447214,-0.54921,1.0,2.5,1.425,1.5,0.437547,0.773604,7,7,7.0,7.0,0.0,0.0,0,2,0.15,0.0,0.48936,3.435747,2,5,3.3,3.0,0.732695,1.231018
1,1,20,1.092273,3.94079,2.810732,2.936933,0.829468,-0.797978,720,2400,1453.0,1390.0,474.963711,0.389981,4473,209959,20658.8,9649.0,44787.114909,4.396585,1,4,2.75,3.0,0.786398,-0.225465,0.75,2.0,1.3,1.0,0.402296,0.565858,6,6,6.0,6.0,0.0,0.0,0,3,0.2,0.0,0.695852,3.873885,1,5,3.55,3.0,0.998683,-0.328489
2,2,20,0.011119,3.210191,1.757771,1.874027,1.137866,-0.224324,1550,1820,1710.0,1740.0,97.710636,-0.42389,2935,27003,8402.95,7276.0,5494.443226,2.219809,3,4,3.1,3.0,0.307794,2.887939,2.0,2.5,2.35,2.5,0.205196,-0.913762,8,8,8.0,8.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,3,4,3.05,3.0,0.223607,4.472136
3,3,20,1.496205,3.991923,2.942383,3.054536,0.818174,-0.365728,1677,1760,1717.35,1720.0,28.292225,0.136275,6397,12728,8613.95,8400.0,1638.682349,0.790445,2,5,3.55,4.0,0.686333,-0.198068,1.0,2.75,1.9125,1.875,0.467883,-0.234309,7,7,7.0,7.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,3,4,3.5,3.5,0.512989,0.0
4,4,20,0.078469,3.987179,2.722658,3.298578,1.233201,-0.802805,960,1140,1043.5,1030.0,59.93637,0.335011,4583,15190,8882.3,8327.5,2422.558806,0.881233,2,3,2.8,3.0,0.410391,-1.624466,1.0,2.5,1.225,1.0,0.412789,2.046848,7,7,7.0,7.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,2,4,3.35,3.0,0.587143,-0.212354


In [20]:
# Save the top-20 neighbour statistics as CSV; index=False leaves the row index out of the file.
nearest_20_neighbor_stat.to_csv('../input/nearest_20_neighbor_stat.csv', index=False)