In [1]:
# Import packages and read data

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#plt.style.use('seaborn')
%matplotlib inline

In [2]:
raw_data = pd.read_csv('data/kc_house_data.csv')

pd.set_option('display.max_columns', 21)
raw_data.head(2)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,,NONE,Average,7 Average,1180,0.0,1955,0.0,98178,47.5112,-122.257,1340,5650
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,NO,NONE,Average,7 Average,2170,400.0,1951,1991.0,98125,47.721,-122.319,1690,7639


In [None]:
raw_data.shape

In [None]:
# from geopy.geocoders import Nominatim
# def get_city(lat, long):
#     geolocator = Nominatim(user_agent="geoapiExercises")
#     location = geolocator.reverse(lat+","+long)
#     address = location.raw['address']
#     city = address.get('city', '')
#     town = address.get('town', '')
#     if city == '':
#         return town
#     else:
#         return city

In [None]:
# get_city(str(47.5427), str(-122.288))

In [None]:
# for i in range(10):
#     print(get_city(str(raw_data.loc[i, 'lat']), str(raw_data.loc[i, 'long'])))

In [None]:
# Create city column
# raw_data['city'] = raw_data.apply(lambda row: get_city(str(row.lat), str(row.long)), axis=1)

In [3]:
# Data prep and cleaning

# Change to datetime and add month column
raw_data['date'] = pd.to_datetime(raw_data['date'])
raw_data['month'] = pd.DatetimeIndex(raw_data['date']).month

# Change waterfront missing value to No
raw_data.loc[raw_data.waterfront.isnull(), 'waterfront'] = "NO"
raw_data['waterfront'] = raw_data['waterfront'].apply(lambda x: 0 if x == 'NO' else 1)

# Change view missing value to None
raw_data.loc[raw_data.view.isnull(), 'view'] = "NONE"
view_dict = {'NONE':0, 'FAIR':1, 'AVERAGE':2, 'GOOD':3, 'EXCELLENT':4}
raw_data['view'].replace(view_dict, inplace=True)

# Change condition to numerical value
cond_dict = {'Poor':0, 'Fair':1, 'Average':2, 'Good':3, 'Very Good':4}
raw_data['condition'].replace(cond_dict, inplace=True)

# Change grade to numerical value
raw_data['grade'] = raw_data['grade'].map(lambda x: int(x.split(' ')[0]))

# Add has_basement column
raw_data['basement'] = raw_data['sqft_basement'].apply(lambda x: 0 if x == 0 else 1)

# Change some yr_renovated missing value to 0 and add renovated column
raw_data.loc[raw_data.yr_renovated.isnull(), 'yr_renovated'] = 0
raw_data['renovated'] = raw_data['yr_renovated'].apply(lambda x: 0 if x == 0 else 1)

# Add house_age column
raw_data['age'] = raw_data['date'].dt.year - raw_data['yr_built']

In [28]:
raw_data.head(2)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,zipcode,lat,long,sqft_living15,sqft_lot15,month,basement,renovated,age,distance
0,7129300520,2014-10-13,221900.0,3,1.0,1180,5650,1.0,0,0,...,98178,47.5112,-122.257,1340,5650,10,1,0,59,7.617592
1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0,0,...,98125,47.721,-122.319,1690,7639,12,1,1,63,7.842822


In [27]:
from haversine import haversine, Unit

def get_dist(lat, long):
    seattle = (47.608013, -122.335167)
    house = (lat, long)
    return haversine(seattle, house, unit=Unit.MILES)

# Add distance_from_seattle column
raw_data['distance'] = raw_data.apply(lambda x: get_dist(x.lat, x.long), axis=1)

In [29]:
def get_dist_red(lat, long):
    redmond = (47.673988, -122.121513)
    house = (lat, long)
    return haversine(redmond, house, unit=Unit.MILES)

# Add distance_from_seattle column
raw_data['distance_r'] = raw_data.apply(lambda x: get_dist_red(x.lat, x.long), axis=1)

In [30]:
raw_data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,lat,long,sqft_living15,sqft_lot15,month,basement,renovated,age,distance,distance_r
0,7129300520,2014-10-13,221900.0,3,1.0,1180,5650,1.0,0,0,...,47.5112,-122.257,1340,5650,10,1,0,59,7.617592,12.898239
1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0,0,...,47.721,-122.319,1690,7639,12,1,1,63,7.842822,9.741242
2,5631500400,2015-02-25,180000.0,2,1.0,770,10000,1.0,0,0,...,47.7379,-122.233,2720,8062,2,1,0,82,10.155421,6.809569
3,2487200875,2014-12-09,604000.0,4,3.0,1960,5000,1.0,0,0,...,47.5208,-122.393,1360,5000,12,1,0,49,6.601563,16.49327
4,1954400510,2015-02-18,510000.0,3,2.0,1680,8080,1.0,0,0,...,47.6168,-122.045,1800,7503,2,1,0,28,13.529254,5.319599


In [None]:
from math import sin, cos, sqrt, atan2, radians

# approximate radius of earth in km
R = 3958.8

lat1 = radians(47.608013)
lon1 = radians(-122.335167)

lat2 = radians(raw_data.loc[0, 'lat'])
lon2 = radians(raw_data.loc[0, 'long'])

dlon = lon2 - lon1
dlat = lat2 - lat1

a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
c = 2 * atan2(sqrt(a), sqrt(1 - a))

distance = R * c

print("Result:", distance)

In [None]:
seattle = (47.608013, -122.335167)
house = (raw_data.loc[0, 'lat'], raw_data.loc[0, 'long'])

haversine(seattle, house, unit=Unit.MILES)

In [None]:
from haversine import haversine, Unit

lyon = (45.7597, 4.8422) # (lat, lon)
paris = (48.8567, 2.3508)

haversine(lyon, paris)
#392.2172595594006  # in kilometers

#haversine(lyon, paris, unit=Unit.MILES)

In [None]:
less_than_million = raw_data.loc[(raw_data.price < 1_000_000)]
million_house = raw_data.loc[(raw_data.price >= 1_000_000)]

In [None]:
# import libraries
import geopandas as gpd
from shapely.geometry import Point, Polygon
import matplotlib.pyplot as plt
# import street map
street_map = gpd.read_file('Map/Incorporated_Areas_of_King_County___city_area.shp')

In [None]:
less_geometry = [Point(xy) for xy in zip(less_than_million['long'], less_than_million['lat'])]

less_df = gpd.GeoDataFrame(less_than_million, crs = 'EPSG:4326', geometry = less_geometry)

In [None]:
fig, ax = plt.subplots(figsize=(15,15))

street_map.plot(ax=ax, alpha=0.4,color='grey')

less_df.plot(column='price',ax=ax,alpha=0.5, legend=True, markersize=10, cmap='Spectral')

plt.title('House Prices in KingCounty', fontsize=15,fontweight='bold')

plt.xlim(raw_data.long.min(),raw_data.long.max())
plt.ylim(raw_data.lat.min(),raw_data.lat.max())

plt.show()

In [None]:
mill_geometry = [Point(xy) for xy in zip(million_house['long'], million_house['lat'])]

mill_df = gpd.GeoDataFrame(million_house, crs = 'EPSG:4326', geometry = mill_geometry)

In [None]:
fig, ax = plt.subplots(figsize=(15,15))

street_map.plot(ax=ax, alpha=0.4,color='grey')

mill_df.plot(column='price',ax=ax,alpha=0.5, legend=True, markersize=10, cmap='Spectral')

plt.title('House Prices in KingCounty', fontsize=15,fontweight='bold')

plt.xlim(raw_data.long.min(),raw_data.long.max())
plt.ylim(raw_data.lat.min(),raw_data.lat.max())

plt.show()

In [None]:
# Drop columns
raw_data.drop(columns=['id', 'date', 'yr_renovated', 'sqft_above', 'sqft_basement',
                      'yr_built', 'yr_renovated'], inplace=True)

In [None]:
raw_data.head(2)

In [None]:
raw_data.bedrooms.value_counts()

In [6]:
raw_data.loc[(raw_data.bedrooms == 33), :]

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,month,basement,renovated,age
15856,2402100895,2014-06-25,640000.0,33,1.75,1620,6000,1.0,0,0,...,0.0,98103,47.6878,-122.331,1330,4700,6,1,0,67


In [None]:
raw_data.bathrooms.value_counts()

In [5]:
raw_data.loc[(raw_data.bathrooms >6), :]

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,month,basement,renovated,age
4020,9175600025,2014-10-07,800000.0,7,6.75,7480,41664,2.0,0,2,...,0.0,98166,47.4643,-122.368,2810,33190,10,1,0,61
7245,6762700020,2014-10-13,7700000.0,6,8.0,12050,27600,2.5,0,3,...,1987.0,98102,47.6298,-122.323,3940,8800,10,1,1,104
8085,1924059029,2014-06-17,4670000.0,5,6.75,9640,13068,1.0,1,4,...,2009.0,98040,47.557,-122.21,3270,10454,6,1,1,31
8537,424049043,2014-08-11,450000.0,9,7.5,4050,6504,2.0,0,0,...,0.0,98144,47.5923,-122.301,1448,3866,8,1,0,18
9245,9208900037,2014-09-19,6890000.0,6,7.75,9890,31374,2.0,0,4,...,0.0,98039,47.6305,-122.24,4540,42730,9,1,0,13
12764,1225069038,2014-05-05,2280000.0,7,8.0,13540,307752,3.0,0,4,...,0.0,98053,47.6675,-121.986,4850,217800,5,1,0,15
14542,2303900035,2014-06-11,2890000.0,5,6.25,8670,64033,2.0,0,4,...,0.0,98177,47.7295,-122.372,4140,81021,6,1,0,49
18288,6072800246,2014-07-02,3300000.0,5,6.25,8020,21738,2.0,0,0,...,0.0,98006,47.5675,-122.189,4160,18969,7,1,0,13
20562,424069279,2015-03-28,1180000.0,6,6.5,6260,10955,2.0,0,0,...,0.0,98075,47.5947,-122.039,2710,12550,3,1,0,8
21490,2524069097,2014-05-09,2240000.0,5,6.5,7270,130017,2.0,0,0,...,0.0,98027,47.5371,-121.982,1800,44890,5,1,0,4


- Discrete: bedrooms, bathrooms, floors, condition, grade, age
- Continuous: price, sqft_living, sqft_lot, sqft_living15, sqft_lot15
- Categorical: waterfront, view, zipcode, month or season, basement, renovated

In [None]:
def corr_check(df, threshold):
    '''
    Enter dataframe and threshold for correlation
    Returns table of the highly correlated pairs
    '''
    corr_df = df.corr().abs().stack().reset_index().sort_values(0, ascending=False)
    corr_df['pairs'] = list(zip(corr_df.level_0, corr_df.level_1))
    corr_df.set_index(['pairs'], inplace = True)
    corr_df.drop(columns=['level_1', 'level_0'], inplace = True)
    corr_df.columns = ['cc']
    corr_df = corr_df.drop_duplicates()
    corr_df = corr_df[(corr_df['cc'] > threshold) & (corr_df['cc'] < 1)]
    return corr_df

corr_check(raw_data, .5)

In [None]:
to_drop = ['price', 'waterfront', 'view', 'zipcode', 
           'month', 'basement', 'renovated', 'lat', 'long', 
           'sqft_living15', 'sqft_lot15', 'grade', 'bathrooms', 'sqft_lot', 'age', 'condition']
y = raw_data.price
X = raw_data.drop(to_drop, axis=1)

In [None]:
a = X < 0
a.sum()

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
print(len(X_train), len(X_test), len(y_train), len(y_test))

In [None]:
import statsmodels.api as sm

model = sm.OLS(y_train, sm.add_constant(X_train)).fit()
model.rsquared

In [None]:
# to_transform = ['sqft_living']
# log_df = X_train.copy()

# for col in to_transform:
#     log_df[col] = np.log(log_df[col])

In [None]:
X_train = np.log(X_train)

In [None]:
model = sm.OLS(y_train, sm.add_constant(X_train)).fit()
model.summary()

In [None]:
from sklearn.preprocessing import StandardScaler 
scaler = StandardScaler()

X_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns, index = X_train.index)

model = sm.OLS(y_train, sm.add_constant(X_scaled)).fit()
model.summary()