In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
%matplotlib inline
pd.set_option('display.max_columns', 100)

In [2]:
home30day = pd.read_csv('../data/home30day.csv',low_memory=False)
university = pd.read_csv('../data/university.csv',low_memory=False)

In [3]:
home30day_5 = home30day.head(5)
home30day_5_grouped = home30day_5.groupby('state')
university_grouped = university.groupby('state')
dfMin = pd.DataFrame()

In [5]:
home30day.describe()

Unnamed: 0,latitude,longitude,numbed,num_bath_full,num_bath_part,rentalprice_min,rentalprice_max,saleprice,floor_plan,size,pool,gatedCommunity,stainlessAppliances,fireplace,renovation
count,183460.0,183460.0,183460.0,183460.0,183460.0,183460.0,183460.0,183460.0,0.0,183460.0,183460.0,183460.0,183460.0,183460.0,183460.0
mean,33.545487,-93.689711,1.966156,1.393595,-0.6971,984.602317,-1.0,133827.3,,1222.413938,0.221765,0.068609,0.09928,0.242985,0.089807
std,5.212622,14.886998,1.897425,1.669656,0.716976,3306.473516,0.0,641216.6,,1223.742908,0.415435,0.252789,0.299039,0.428887,0.285906
min,19.090885,-161.78659,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,,-1.0,0.0,0.0,0.0,0.0,0.0
25%,30.361464,-94.455809,1.0,1.0,-1.0,-1.0,-1.0,-1.0,,-1.0,0.0,0.0,0.0,0.0,0.0
50%,33.323529,-87.718835,3.0,2.0,-1.0,-1.0,-1.0,0.0,,1157.0,0.0,0.0,0.0,0.0,0.0
75%,36.088063,-85.468457,3.0,2.0,-1.0,1395.0,-1.0,135000.0,,1812.0,0.0,0.0,0.0,0.0,0.0
max,64.97801,-70.71887,12.0,47.0,1.0,200000.0,-1.0,150000000.0,,30000.0,1.0,1.0,1.0,1.0,1.0


In [6]:
def distance(x1,x2,y1,y2):
    # return the Euclidean distance between two points (x1,y1) and (x2,y2)
    return np.linalg.norm(np.array([x1,y1]) - np.array([x2,y2]))

def nearestPair(homes,universities):
    # This function finds the nearest university to each home and append the university info to the home entry.
    # To reduce computation, data are partitioned by state. While partition by city or zipcode can further reduce 
    # the computation, it may also introduce error because the nearest university to a home may not neccessarily be
    # in the same city/zip. (Consider a home on city boundary)
    minDistPair = pd.DataFrame()
    states = home['state'].unique()
    homes_grouped = homes.groupby('state')
    university_grouped = universities.groupby('state')
    for state in states:
        print state
        home_state = homes_grouped.get_group(state)
        university_state = university_grouped.get_group(state)
        minDistPair_state = nearestInState(home_state, university_state)
        minDistPair = pd.concat([minDistPair, minDistPair_state])
    return minDistPair

def nearestInState(h, u):
    # This function finds the nearest university to each home and append the university info to the home entry.
    # pair a house to every university in the same state
    mergeDf = h.merge(u, left_on='state', right_on='state', how='inner')
    # find distance between a house and an university
    mergeDf['distance'] = mergeDf.apply(lambda row: distance(row['longitude_x'],row['longitude_y'],row['latitude_x'],row['latitude_y']), axis=1)
    # find the distance from a house to the nearest university
    mergeDfGrouped = mergeDf.groupby(['latitude_x','longitude_x'],as_index=False)
    # filter the house-university pairs by the smallest distance found above
    minDistPair = pd.DataFrame()
    for name, group in mergeDfGrouped:
        pair = mergeDf[mergeDf['distance'] == mergeDfGrouped.get_group(name).min()['distance']].head(1)
        minDistPair = pd.concat([minDistPair, pair])
    return minDistPair
        
home = home30day
allPair = nearestPair(home,university).reset_index(drop=True)
allPair.shape

CA
AL
TN
LA
DE
DC
CO
AR
FL
MN
MO
AZ
OR
IN
UT
MI
RI
NV
NH
AK
WY
HI
VT


(149348, 56)

In [15]:
# look at the merged dataframe to make sure we have the result as wanted.
print list(allPair.columns.values)
print allPair.shape
print allPair.info()

['website_x', 'home_url', 'property_type', 'record_type', 'parser_type', 'latitude_x', 'longitude_x', 'streetaddr_x', 'city_x', 'state', 'zipcode', 'country', 'numbed', 'num_bath_full', 'num_bath_part', 'rentalprice_min', 'rentalprice_max', 'saleprice', 'yearbuilt', 'floor_plan', 'garage', 'lotsize', 'stories', 'size', 'pool', 'style', 'numofparking', 'gatedCommunity', 'stainlessAppliances', 'fireplace', 'floorType', 'kitchenCountertop', 'renovation', 'elevation', 'elevationType', 'description', 'crawl_time', 'datalisted', 'inactive', 'UniversityName', 'streetaddr_y', 'city_y', 'postcode', 'acceptanceRate', 'ranking', 'enroll', 'latitude_y', 'longitude_y', 'type', 'endowment', 'acadStaff', 'students', 'underGrad', 'postGrad', 'website_y', 'distance']
(149348, 56)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149348 entries, 0 to 149347
Data columns (total 56 columns):
website_x              149348 non-null object
home_url               149343 non-null object
property_type          

In [11]:
# calculate correlations between each numerical feature pairs. We are interested in saleprice's correlations
# particularly. 
corr = allPair.corr()
price_corr = corr["saleprice"]
price_corr = price_corr.to_dict()
# print price_corr
for ele in sorted(price_corr.items(), key = lambda x: -abs(x[1])):
    print("{0}: \t{1}".format(*ele))

size: 	0.212434739444
num_bath_part: 	0.0259196241384
rentalprice_max: 	nan
floor_plan: 	nan
saleprice: 	1.0
fireplace: 	0.114797524331
longitude_y: 	-0.101855836736
num_bath_full: 	0.101698415286
longitude_x: 	-0.101392110915
numbed: 	0.0905193630614
pool: 	0.058742727049
rentalprice_min: 	-0.0545353930517
gatedCommunity: 	0.0460546172569
latitude_y: 	-0.0272588998541
latitude_x: 	-0.0272049802005
stainlessAppliances: 	0.0259795581264
renovation: 	0.0128040081913
distance: 	0.00965132092317


Size is the most correlated feature to saleprice. However, the correlations are weak. We need to do some more careful analysis.