In [1]:
import pandas as pd
from scipy import stats
from utils import create_connection, calculate_distance
import numpy as np
import pickle as pkl
import seaborn as sns
sns.set_theme()
import matplotlib.pyplot as plt
import plotly.express as px
import json
import random

In [2]:
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

In [3]:
from shapely.geometry import Polygon, Point

In [4]:
# Load the geojson of the neighborhoods.
# GeoJSON is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default text encoding.
with open('./data/external/seattle-neighborhoods.geojson', encoding='utf-8') as f:
    neighborhoods = json.load(f)

In [5]:
# Load the geojson of the zipcodes.
# GeoJSON is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default text encoding.
with open('./data/external/seattle-zipcodes.geojson', encoding='utf-8') as f:
    zipcodes = json.load(f)

# Generate the latitude/longitude pairs

In [45]:
# Bounding box (in degrees) inside which the grid of candidate points is built
min_lat, max_lat = 47.42, 47.745
min_lon, max_lon = -122.45, -122.253

# Ratio of the box's longitude span to its latitude span
# (not referenced by any other cell visible in this notebook)
scale_factor = abs(max_lon - min_lon) / (max_lat - min_lat)

# Regular grid: 67 longitude steps by 100 latitude steps, flattened to 1-D
lon_steps = np.linspace(min_lon, max_lon, 67)
lat_steps = np.linspace(min_lat, max_lat, 100)
xx, yy = np.meshgrid(lon_steps, lat_steps)
xc, yc = xx.flatten(), yy.flatten()

# One Point per grid node, built as (longitude, latitude)
points = [Point(lon, lat) for lon, lat in zip(xc, yc)]

# The neighborhood label starts out empty and is filled in by the polygon loop
points_df = pd.DataFrame({'point': points, 'latitude': yc, 'longitude': xc})
points_df['neighborhood'] = None

# Get polygon label for where point occurs
def check_point_within_poly(point, poly, nhood_name):
    """Return ``nhood_name`` if ``point`` lies within ``poly``, otherwise ``None``.

    Parameters
    ----------
    point
        Object exposing ``within(poly)``; a shapely ``Point`` in this notebook.
    poly
        Geometry passed to ``point.within``; a shapely ``Polygon`` in this notebook.
    nhood_name
        Label to hand back when the point is inside the polygon.

    Returns
    -------
    ``nhood_name`` on a hit, ``None`` on a miss.
    """
    # Plain truthiness test instead of `== True`; the miss case is now explicit
    return nhood_name if point.within(poly) else None
    
# Loop through all polygons and label every grid point that falls inside one
for nhood in neighborhoods['features']:
    nhood_name = nhood['properties']['S_HOOD']
    # Only the first coordinate ring of the geometry is used.
    # NOTE(review): assumes every feature is a single Polygon - confirm there
    # are no MultiPolygon features in the file.
    poly = Polygon([tuple(corr) for corr in nhood['geometry']['coordinates'][0]])

    # Per-point result: the neighborhood name on a hit, None on a miss
    mask = points_df.point.apply(check_point_within_poly, poly=poly, nhood_name=nhood_name)

    # np.where uses the truthiness of each mask entry, so an empty-string
    # name would never be written; existing labels are kept on a miss.
    points_df['neighborhood'] = np.where(mask, nhood_name, points_df['neighborhood'])

# Keep only the labelled points and drop the geometry column.
# A non-inplace drop returns a fresh DataFrame, so later cells that add
# columns are not writing into a frame flagged as a copy of a slice.
points_df = points_df[points_df.neighborhood.notnull()].drop(columns=['point'])
points_df.head()


Unnamed: 0,latitude,longitude,neighborhood
1632,47.498788,-122.378364,Arbor Heights
1633,47.498788,-122.375379,Arbor Heights
1669,47.498788,-122.267924,Rainier View
1670,47.498788,-122.264939,Rainier View
1671,47.498788,-122.261955,Rainier View


In [47]:
# Visual sanity check: plot every retained grid point on a base map
fig = px.scatter_mapbox(
    points_df,
    lat="latitude",
    lon="longitude",
    mapbox_style='carto-positron',
    zoom=10,
)

# Square figure with no margins
no_margin = {"r": 0, "t": 0, "l": 0, "b": 0}
fig.update_layout(height=1000, width=1000, margin=no_margin)
fig.show()

# Add the location data

In [48]:
# Ensure both coordinate columns are float dtype
points_df['latitude'] = points_df['latitude'].astype(float)
points_df['longitude'] = points_df['longitude'].astype(float)

In [49]:
# Reference point for downtown Seattle
downtown_seattle = {"lat": 47.604013, "lon": -122.335167}


def _distance_to_downtown(row):
    """Distance from one grid row to the downtown reference point, via calculate_distance."""
    return calculate_distance(
        float(row.latitude),
        float(row.longitude),
        downtown_seattle['lat'],
        downtown_seattle['lon'],
    )


# Row-wise distance from every grid point to downtown
points_df['dist_seattle'] = points_df.apply(_distance_to_downtown, axis=1)
points_df.head()

Unnamed: 0,latitude,longitude,neighborhood,dist_seattle
1632,47.498788,-122.378364,Arbor Heights,7.544257
1633,47.498788,-122.375379,Arbor Heights,7.508289
1669,47.498788,-122.267924,Rainier View,7.917756
1670,47.498788,-122.264939,Rainier View,7.973906
1671,47.498788,-122.261955,Rainier View,8.032076


In [50]:
# Only the stop identifier and its coordinates are kept from each stops file
stop_columns = ["stop_id", "stop_lat", "stop_lon"]

# Train stop locations
train_stops_raw = pd.read_csv("./data/external/seattle_train_stops.txt")
train_stops = train_stops_raw[stop_columns]

# Bus stop locations
bus_stops_raw = pd.read_csv("./data/external/seattle_bus_stops.txt")
bus_stops = bus_stops_raw[stop_columns]

In [51]:
def _nearest_stop_distance(row, stops):
    """Smallest calculate_distance result between one grid row and all rows of `stops`."""
    distances = calculate_distance(
        float(row.latitude),
        float(row.longitude),
        stops['stop_lat'],
        stops['stop_lon'],
    )
    return distances.min()


# Distance from each grid point to its nearest bus stop and nearest train stop
points_df['distBus'] = points_df.apply(_nearest_stop_distance, axis=1, stops=bus_stops)
points_df['distTrain'] = points_df.apply(_nearest_stop_distance, axis=1, stops=train_stops)

In [52]:
# Distance to the nearest transit stop of either kind: take the bus distance
# where it is strictly smaller, otherwise the train distance.
bus_is_closer = points_df.distBus < points_df.distTrain
points_df['dist_transit'] = np.where(bus_is_closer, points_df.distBus, points_df.distTrain)

# The per-mode distances are no longer needed. Reassign rather than drop in
# place so the cell yields a fresh frame, and name the columns with a list.
points_df = points_df.drop(columns=['distBus', 'distTrain'])
points_df.head()

Unnamed: 0,latitude,longitude,neighborhood,dist_seattle,dist_transit
1632,47.498788,-122.378364,Arbor Heights,7.544257,0.051343
1633,47.498788,-122.375379,Arbor Heights,7.508289,0.085326
1669,47.498788,-122.267924,Rainier View,7.917756,0.11285
1670,47.498788,-122.264939,Rainier View,7.973906,0.068639
1671,47.498788,-122.261955,Rainier View,8.032076,0.083283


In [54]:
# Load the location clusters (an object exposing .predict on latitude/longitude).
# NOTE: pickle.load can execute arbitrary code - only load pickle files that
# were produced by this project.
with open('./data/seattle_clusters.pickle', 'rb') as handle:
    kmeans = pkl.load(handle)

# Load the location clusters' average rent per sqft, used as a lookup by cluster id
with open('./data/seattle_cluster_rent_per_sqft.pickle', 'rb') as handle:
    cluster_rent_per_sqft = pkl.load(handle)

# Assign every grid point to a cluster and map the cluster id to its rent value.
# The ids live in a temporary Series aligned on points_df's index, so no helper
# column has to be added to the frame and then dropped in place.
cluster_ids = pd.Series(
    kmeans.predict(points_df[['latitude', 'longitude']]),
    index=points_df.index,
)
points_df['cluster_rent_per_sqft'] = cluster_ids.map(cluster_rent_per_sqft)
points_df.head()

Unnamed: 0,latitude,longitude,neighborhood,dist_seattle,dist_transit,cluster_rent_per_sqft
1632,47.498788,-122.378364,Arbor Heights,7.544257,0.051343,2.333471
1633,47.498788,-122.375379,Arbor Heights,7.508289,0.085326,2.333471
1669,47.498788,-122.267924,Rainier View,7.917756,0.11285,3.022761
1670,47.498788,-122.264939,Rainier View,7.973906,0.068639,3.022761
1671,47.498788,-122.261955,Rainier View,8.032076,0.083283,3.022761


# Set up the User Inputs

In [55]:
# Set up the user input data: one hypothetical unit applied to every grid point.
# `sqft` is stored as the natural log of 500.
# car_charging, pool, elevator, income_restrictions and dishwasher are
# intentionally not supplied.
amenities_not_present = [
    'fitness_center',
    'air_conditioning',
    'in_unit_washer_dryer',
    'laundry_facilities',
    'roof',
    'concierge',
    'garage',
]

user_inputs = {'beds_times_baths': 1, 'sqft': np.log(500)}
user_inputs.update(dict.fromkeys(amenities_not_present, 0.0))
user_inputs['pets_allowed'] = 1.0

In [56]:
# Add the user inputs to the data: each key of user_inputs becomes a column
# holding that single value on every row. assign() returns a new DataFrame,
# which is rebound to points_df.
points_df = points_df.assign(**user_inputs)
points_df.head()

Unnamed: 0,latitude,longitude,neighborhood,dist_seattle,dist_transit,cluster_rent_per_sqft,beds_times_baths,sqft,fitness_center,air_conditioning,in_unit_washer_dryer,laundry_facilities,roof,concierge,garage,pets_allowed
1632,47.498788,-122.378364,Arbor Heights,7.544257,0.051343,2.333471,1,6.214608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1633,47.498788,-122.375379,Arbor Heights,7.508289,0.085326,2.333471,1,6.214608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1669,47.498788,-122.267924,Rainier View,7.917756,0.11285,3.022761,1,6.214608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1670,47.498788,-122.264939,Rainier View,7.973906,0.068639,3.022761,1,6.214608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1671,47.498788,-122.261955,Rainier View,8.032076,0.083283,3.022761,1,6.214608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Get predictions for all data points

In [57]:
# Load the pickled rent-prediction model (only this one model file is read here)
model_path = './models/seattle_rent_prediction.pickle'
with open(model_path, 'rb') as model_file:
    xgb = pkl.load(model_file)


In [58]:
# Feature columns, in the order they are handed to the model
# (presumably the order used when the model was trained - confirm)
column_order = [
    # unit size and amenities
    'sqft',
    'fitness_center',
    'air_conditioning',
    'in_unit_washer_dryer',
    'laundry_facilities',
    'roof',
    'concierge',
    'garage',
    # location features
    'dist_seattle',
    'dist_transit',
    'cluster_rent_per_sqft',
    # remaining unit features
    'beds_times_baths',
    'pets_allowed',
]

In [59]:
# Feature matrix: the model's columns, selected in column_order
X = points_df.loc[:, column_order]
X.head()


Unnamed: 0,sqft,fitness_center,air_conditioning,in_unit_washer_dryer,laundry_facilities,roof,concierge,garage,dist_seattle,dist_transit,cluster_rent_per_sqft,beds_times_baths,pets_allowed
1632,6.214608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.544257,0.051343,2.333471,1,1.0
1633,6.214608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.508289,0.085326,2.333471,1,1.0
1669,6.214608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.917756,0.11285,3.022761,1,1.0
1670,6.214608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.973906,0.068639,3.022761,1,1.0
1671,6.214608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.032076,0.083283,3.022761,1,1.0


In [60]:
# Exponentiate the model output, then cast to int (which truncates the fraction)
raw_pred = xgb.predict(X)
points_df['pred_rent'] = np.exp(raw_pred).astype(int)
points_df.head()

Unnamed: 0,latitude,longitude,neighborhood,dist_seattle,dist_transit,cluster_rent_per_sqft,beds_times_baths,sqft,fitness_center,air_conditioning,in_unit_washer_dryer,laundry_facilities,roof,concierge,garage,pets_allowed,pred_rent
1632,47.498788,-122.378364,Arbor Heights,7.544257,0.051343,2.333471,1,6.214608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1617
1633,47.498788,-122.375379,Arbor Heights,7.508289,0.085326,2.333471,1,6.214608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1519
1669,47.498788,-122.267924,Rainier View,7.917756,0.11285,3.022761,1,6.214608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1310
1670,47.498788,-122.264939,Rainier View,7.973906,0.068639,3.022761,1,6.214608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1483
1671,47.498788,-122.261955,Rainier View,8.032076,0.083283,3.022761,1,6.214608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1462


# Plot the data

In [61]:
# Only grid points closer to transit than this limit contribute to the medians
# (same units as dist_transit)
dist_limit = 0.1

# Median predicted rent per neighborhood, as whole numbers, with
# blank-named neighborhoods removed
nhood_rent = (
    points_df.loc[points_df.dist_transit < dist_limit]
    .groupby(['neighborhood'], as_index=False)['pred_rent']
    .median()
    .assign(pred_rent=lambda rents: rents.pred_rent.astype(int))
    .loc[lambda rents: rents['neighborhood'].str.strip().astype(bool)]
)
nhood_rent.head()

Unnamed: 0,neighborhood,pred_rent
0,Adams,1742
1,Alki,1647
2,Arbor Heights,1519
3,Atlantic,1637
4,Belltown,1981


In [62]:
# Map centre for the choropleth; same coordinates as the dist_seattle reference point
downtown_seattle = dict(lat=47.604013, lon=-122.335167)

In [63]:
# Choropleth of the median predicted rent per neighborhood; features are
# matched to rows through the S_HOOD property of the neighborhoods geojson.
fig = px.choropleth_mapbox(
    nhood_rent,
    geojson=neighborhoods,
    locations='neighborhood',
    featureidkey="properties.S_HOOD",
    color='pred_rent',
    mapbox_style="carto-positron",
    zoom=10.8,
    center={"lat": downtown_seattle['lat'], "lon": downtown_seattle['lon']},
    opacity=0.7,
    hover_name="neighborhood",
)

# user_inputs['sqft'] holds the log of the square footage fed to the model.
# Undo the log so the title always states the size that was actually
# predicted for (the previous hard-coded "584sqft" did not match log(500)).
title_sqft = np.exp(user_inputs['sqft'])

title_settings = {
    'text': f"Predicted Rent ($): 1 Bed, 1 Bath, {title_sqft:.0f}sqft",
    'font_size': 25,
    'y': 0.975,
    'x': 0.5,
    'xanchor': 'center',
    'yanchor': 'top',
}

fig.update_layout(height=1000, width=1000, title=title_settings)
fig.show()

In [65]:
# Persist only the location-derived columns (no user inputs, no predictions).
# Per the original note, the app uses this file to make its predictions.
cols_to_save = [
    'neighborhood',
    'latitude',
    'longitude',
    'dist_seattle',
    'dist_transit',
    'cluster_rent_per_sqft',
]

location_features = points_df[cols_to_save]
location_features.to_csv('./data/processed/seattle_prediction_location.csv', index=False)