In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# local .py files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Silence every Python warning so library notices do not clutter cell output.
import warnings
warnings.filterwarnings('ignore')

from IPython.display import display

# Handle to the running IPython shell; used further down to install a custom
# traceback hook.
ipython = get_ipython()

def exception_handler(exception_type, exception, traceback):
    """Print a one-line ``ExceptionType: message`` summary to stderr.

    Assigned to ``ipython._showtraceback`` in this notebook so that a failing
    cell shows a short message instead of a full stack trace.

    Parameters
    ----------
    exception_type : type
        Class of the raised exception; only its ``__name__`` is printed.
    exception : BaseException
        The raised instance; rendered with ``%s``.
    traceback : object
        Accepted to match the hook signature; not used.
    """
    # Imported here because the notebook never imports ``sys`` explicitly; the
    # handler must not depend on a star import happening to expose it.
    import sys

    print("%s: %s" % (exception_type.__name__, exception), file=sys.stderr)

# Replace IPython's traceback display with the terse one-line handler.
# NOTE(review): _showtraceback is a private IPython attribute and may change
# between IPython versions.
ipython._showtraceback = exception_handler

# NOTE(review): pd, np, plt, gc, haversine_array and bearing_array are used in
# the cells below but never imported explicitly in this notebook; presumably
# this star import provides them — confirm against utils.py.
from utils import *

Using TensorFlow backend.


In [3]:
# Plotting defaults: ggplot theme, 12x8 figures, base font size 12
plt.style.use('ggplot')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

In [4]:
# Raise pandas display limits so wide/long frames are shown without truncation
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 10000)
pd.set_option('display.max_colwidth', 1000)

In [5]:
# Load the raw train / test tables
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

# Tag training rows, then drop the outlier: rows with sqft_living above 12000
# whose price is below 3,000,000
train_copy = (
    train.assign(data='train')
    .loc[lambda df: ~((df['sqft_living'] > 12000) & (df['price'] < 3000000))]
    .reset_index(drop=True)
)

# Tag test rows and set their price to NaN so both splits share the same columns
test_copy = test.assign(data='test', price=np.nan)

# Stack both splits into a single frame laid out in the training column order
data = (
    pd.concat([train_copy, test_copy], sort=False)
    .reset_index(drop=True)
    [train_copy.columns]
)
data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,data
0,0,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,train
1,1,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062,train
2,2,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,train
3,3,20140627T000000,257500.0,3,2.25,1715,6819,2.0,0,0,3,7,1715,0,1995,0,98003,47.3097,-122.327,2238,6819,train
4,4,20150115T000000,291850.0,3,1.5,1060,9711,1.0,0,0,3,7,1060,0,1963,0,98198,47.4095,-122.315,1650,9711,train


In [6]:
# Coordinate extremes over all rows (train + test)
lat_min, lat_max = data['lat'].min(), data['lat'].max()
long_min, long_max = data['long'].min(), data['long'].max()
print(lat_min, lat_max, long_min, long_max)

# Haversine distance and bearing from the (min lat, min long) corner to the
# (max lat, max long) corner
haversine_dist = haversine_array(lat_min, long_min, lat_max, long_max)
print(haversine_dist)

bearing = bearing_array(lat_min, long_min, lat_max, long_max)
print(bearing)

47.1559 47.7776 -122.519 -121.315
113.88352317220664
52.18389277379239


In [8]:
from tqdm import tqdm_notebook as tqdm

# Two houses count as neighbors when their haversine distance is at most this
# value, expressed in whatever unit haversine_array returns (presumably
# kilometres — confirm against utils).
MAX_NEIGHBOR_DISTANCE = 4

ids = data['id'].values
lat2 = data['lat'].values
long2 = data['long'].values

# Build one small frame per house and concatenate once at the end.
# Growing a DataFrame with .append() inside the loop re-copies every
# accumulated row on each iteration (quadratic), and DataFrame.append no longer
# exists in pandas >= 2.0.
neighbor_frames = []
for house_id, lat1, long1 in tqdm(zip(ids, lat2, long2), total=data.shape[0]):
    # Distance from this house to every house in `data` (itself included).
    dist_arr = haversine_array(lat1, long1, lat2, long2)

    # Keep houses inside the radius, excluding the house itself.
    is_neighbor = (dist_arr <= MAX_NEIGHBOR_DISTANCE) & (ids != house_id)

    # The index is the neighbor's row label in `data`, matching what filtering
    # a full-length frame would have produced.
    neighbor_frames.append(pd.DataFrame({
        'id': np.full(is_neighbor.sum(), house_id),
        'neighbor_id': ids[is_neighbor],
        'distance': dist_arr[is_neighbor],
    }, index=data.index[is_neighbor]))

if neighbor_frames:
    neighbor_df = pd.concat(neighbor_frames)
else:
    # pd.concat raises on an empty list; keep the expected columns so that the
    # downstream merges still find their keys.
    neighbor_df = pd.DataFrame(columns=['id', 'neighbor_id', 'distance'])

print(neighbor_df.shape)
neighbor_df.head()

HBox(children=(IntProgress(value=0, max=21502), HTML(value='')))


(15818934, 3)


Unnamed: 0,id,neighbor_id,distance
16,0,16,2.897145
80,0,80,1.329938
81,0,81,3.822482
83,0,83,2.186419
84,0,84,1.003934


In [10]:
# Attributes pulled from each neighbor's row. After the merge every one of them
# except the 'data' flag is renamed with a 'neighbor_' prefix.
neighbor_attrs = ['sqft_living','sqft_living15','sqft_lot','sqft_lot15','bedrooms','bathrooms',
                  'grade','waterfront','view','condition','data','price']

# Same table keyed by 'neighbor_id' so it can be joined onto the neighbor side of each pair
data_df = data.rename(index=str, columns={'id': 'neighbor_id'})

neighbor_info_df = (
    neighbor_df
    .merge(data_df[['neighbor_id'] + neighbor_attrs], on='neighbor_id')
    .rename(columns={name: 'neighbor_' + name for name in neighbor_attrs if name != 'data'})
    .sort_values(['id','neighbor_id'])
    .reset_index(drop=True)
)
print(neighbor_info_df.shape)
neighbor_info_df.head()

(15818934, 15)


Unnamed: 0,id,neighbor_id,distance,neighbor_sqft_living,neighbor_sqft_living15,neighbor_sqft_lot,neighbor_sqft_lot15,neighbor_bedrooms,neighbor_bathrooms,neighbor_grade,neighbor_waterfront,neighbor_view,neighbor_condition,data,neighbor_price
0,0,16,2.897145,3050,4110,44867,20336,3,2.75,9,0,4,3,train,2000000.0
1,0,80,1.329938,880,1190,6780,6780,2,1.0,6,0,0,4,train,205425.0
2,0,81,3.822482,1570,1880,9600,9000,4,2.0,6,0,0,3,train,171800.0
3,0,83,2.186419,2100,1850,4400,4400,4,1.75,7,0,0,5,train,445000.0
4,0,84,1.003934,2100,2660,8201,8712,3,2.25,8,0,2,3,train,445000.0


In [11]:
# Join each house's own (un-prefixed) attributes next to its neighbors' attributes
own_attrs = ['id','sqft_living','sqft_living15','sqft_lot','sqft_lot15','bedrooms','bathrooms','grade',
             'waterfront','view','condition','price']
neighbor_info_df = (
    neighbor_info_df
    .merge(data[own_attrs], on='id', how='inner')
    .reset_index(drop=True)
)
print(neighbor_info_df.shape)
neighbor_info_df.head()

(15818934, 26)


Unnamed: 0,id,neighbor_id,distance,neighbor_sqft_living,neighbor_sqft_living15,neighbor_sqft_lot,neighbor_sqft_lot15,neighbor_bedrooms,neighbor_bathrooms,neighbor_grade,neighbor_waterfront,neighbor_view,neighbor_condition,data,neighbor_price,sqft_living,sqft_living15,sqft_lot,sqft_lot15,bedrooms,bathrooms,grade,waterfront,view,condition,price
0,0,16,2.897145,3050,4110,44867,20336,3,2.75,9,0,4,3,train,2000000.0,1180,1340,5650,5650,3,1.0,7,0,0,3,221900.0
1,0,80,1.329938,880,1190,6780,6780,2,1.0,6,0,0,4,train,205425.0,1180,1340,5650,5650,3,1.0,7,0,0,3,221900.0
2,0,81,3.822482,1570,1880,9600,9000,4,2.0,6,0,0,3,train,171800.0,1180,1340,5650,5650,3,1.0,7,0,0,3,221900.0
3,0,83,2.186419,2100,1850,4400,4400,4,1.75,7,0,0,5,train,445000.0,1180,1340,5650,5650,3,1.0,7,0,0,3,221900.0
4,0,84,1.003934,2100,2660,8201,8712,3,2.25,8,0,2,3,train,445000.0,1180,1340,5650,5650,3,1.0,7,0,0,3,221900.0


In [12]:
cols = ['sqft_living','sqft_living15','sqft_lot','sqft_lot15','bedrooms','bathrooms','grade','waterfront','view','condition']

# One '<attribute>_diff' column per attribute: absolute gap between the house
# and its neighbor
for attr in cols:
    own_values = neighbor_info_df[attr]
    neighbor_values = neighbor_info_df['neighbor_' + attr]
    neighbor_info_df[attr + '_diff'] = own_values.sub(neighbor_values).abs()

print(neighbor_info_df.shape)
neighbor_info_df.head()

(15818934, 36)


Unnamed: 0,id,neighbor_id,distance,neighbor_sqft_living,neighbor_sqft_living15,neighbor_sqft_lot,neighbor_sqft_lot15,neighbor_bedrooms,neighbor_bathrooms,neighbor_grade,neighbor_waterfront,neighbor_view,neighbor_condition,data,neighbor_price,sqft_living,sqft_living15,sqft_lot,sqft_lot15,bedrooms,bathrooms,grade,waterfront,view,condition,price,sqft_living_diff,sqft_living15_diff,sqft_lot_diff,sqft_lot15_diff,bedrooms_diff,bathrooms_diff,grade_diff,waterfront_diff,view_diff,condition_diff
0,0,16,2.897145,3050,4110,44867,20336,3,2.75,9,0,4,3,train,2000000.0,1180,1340,5650,5650,3,1.0,7,0,0,3,221900.0,1870,2770,39217,14686,0,1.75,2,0,4,0
1,0,80,1.329938,880,1190,6780,6780,2,1.0,6,0,0,4,train,205425.0,1180,1340,5650,5650,3,1.0,7,0,0,3,221900.0,300,150,1130,1130,1,0.0,1,0,0,1
2,0,81,3.822482,1570,1880,9600,9000,4,2.0,6,0,0,3,train,171800.0,1180,1340,5650,5650,3,1.0,7,0,0,3,221900.0,390,540,3950,3350,1,1.0,1,0,0,0
3,0,83,2.186419,2100,1850,4400,4400,4,1.75,7,0,0,5,train,445000.0,1180,1340,5650,5650,3,1.0,7,0,0,3,221900.0,920,510,1250,1250,1,0.75,0,0,0,2
4,0,84,1.003934,2100,2660,8201,8712,3,2.25,8,0,2,3,train,445000.0,1180,1340,5650,5650,3,1.0,7,0,0,3,221900.0,920,1320,2551,3062,0,1.25,1,0,2,0


In [14]:
# Persist the pairwise neighbor feature table as CSV, without the row index
neighbor_info_df.to_csv(path_or_buf='../input/neighbor_info.csv', index=False)