In [1]:
import pandas as pd
from scipy import stats
from utils import create_connection, calculate_distance
import numpy as np
import pickle as pkl
import seaborn as sns
sns.set_theme()
import matplotlib.pyplot as plt
import plotly.express as px
import json
import random

In [2]:
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

In [3]:
from shapely.geometry import Polygon, Point

In [22]:
# Load the geojson of the neighborhoods
with open('./data/external/seattle-neighborhoods.geojson') as f:
    neighborhoods = json.load(f)

In [5]:
# Load the geojson of the zipcodes

with open('./data/external/seattle-zipcodes.geojson') as f:
    zipcodes = json.load(f)

# Generate the latitude/longitude pairs

In [6]:
def random_points_within(poly, num_points, nhood_name):
    """Generates n random points within a given polygon"""
    min_x, min_y, max_x, max_y = poly.bounds

    points = []

    while len(points) < num_points:
        random_point = Point([random.uniform(min_x, max_x), random.uniform(min_y, max_y)])
        if (random_point.within(poly)):
            points.append({'neighborhood': nhood_name, 'latitude': random_point.y, 'longitude': random_point.x})

    return points

In [7]:
# Generate points for each neighborhood
points =[]
for nhood in neighborhoods['features']:
    nhood_name = nhood['properties']['S_HOOD']
    poly = Polygon([tuple(corr) for corr in nhood['geometry']['coordinates'][0]])
    
    points.extend(random_points_within(poly, 100, nhood_name))
    
locations = pd.DataFrame(points)
locations = locations[locations.neighborhood != '']
locations.head()

Unnamed: 0,neighborhood,latitude,longitude
0,OOO,47.695397,-122.274673
1,OOO,47.695434,-122.274801
2,OOO,47.695379,-122.274558
3,OOO,47.695339,-122.274196
4,OOO,47.695427,-122.27473


In [8]:
# Add the zipcode to the locations dataframe
def check_point_in_polygon(poly, points):
    return points.within(poly)

locations['point']=locations[['longitude', 'latitude']].apply(Point, axis=1)
locations['zipcode'] = None


# Add the neighborhood name
def check_point_within_poly(point, poly, zipcode_id):
    if point.within(poly) == True:
        return zipcode_id
    else:
        pass

for zipcode in zipcodes['features']:
    zipcode_id = zipcode['properties']['ZCTA5CE10']
    poly = Polygon([tuple(corr) for corr in zipcode['geometry']['coordinates'][0]])
        
    # Create a mask for the dataframe
    mask = locations.point.apply(check_point_within_poly, poly=poly, zipcode_id=zipcode_id)
        
    locations['zipcode'] = np.where(mask,zipcode_id,locations['zipcode'])

locations.drop(columns={'point'}, inplace=True)
locations.head()

Unnamed: 0,neighborhood,latitude,longitude,zipcode
0,OOO,47.695397,-122.274673,98115
1,OOO,47.695434,-122.274801,98115
2,OOO,47.695379,-122.274558,98115
3,OOO,47.695339,-122.274196,98115
4,OOO,47.695427,-122.27473,98115


# Add the location data

In [9]:
locations.latitude=locations.latitude.astype(float)
locations.longitude=locations.longitude.astype(float)

In [10]:
# Filter out locations far from seattle
locations = locations[(locations.latitude > 47.42) & (locations.longitude < -122.08) & (locations.longitude > -122.42)]

In [11]:
# Calculate the distance to downtown seattle

downtown_seattle = {"lat": 47.604013, "lon": -122.335167}

locations['dist_seattle']=locations.apply(lambda row: calculate_distance(float(row.latitude), float(row.longitude), downtown_seattle['lat'], downtown_seattle['lon']), axis=1)
locations.head()

Unnamed: 0,neighborhood,latitude,longitude,zipcode,dist_seattle
0,OOO,47.695397,-122.274673,98115,6.913399
1,OOO,47.695434,-122.274801,98115,6.913285
2,OOO,47.695379,-122.274558,98115,6.914464
3,OOO,47.695339,-122.274196,98115,6.918804
4,OOO,47.695427,-122.27473,98115,6.914211


In [12]:
# Open the train stop location data
train_stops = pd.read_csv("./data/external/seattle_train_stops.txt")[
    ["stop_id", "stop_lat", "stop_lon"]
]

# Open the bus stop location data
bus_stops = pd.read_csv("./data/external/seattle_bus_stops.txt")[
    ["stop_id", "stop_lat", "stop_lon"]
]

In [13]:
# Calculate the distances to the bus and train stops for each apartment complex
locations['distBus']=locations.apply(lambda row: calculate_distance(float(row.latitude), float(row.longitude), bus_stops['stop_lat'], bus_stops['stop_lon']).min(), axis=1)
locations['distTrain']=locations.apply(lambda row: calculate_distance(float(row.latitude), float(row.longitude), train_stops['stop_lat'], train_stops['stop_lon']).min(), axis=1)

In [14]:
locations['dist_transit'] = np.where(locations.distBus < locations.distTrain,
                              locations.distBus,
                              locations.distTrain)

locations.drop(columns={'distBus', 'distTrain'}, inplace=True)
locations.head()

Unnamed: 0,neighborhood,latitude,longitude,zipcode,dist_seattle,dist_transit
0,OOO,47.695397,-122.274673,98115,6.913399,0.064774
1,OOO,47.695434,-122.274801,98115,6.913285,0.065302
2,OOO,47.695379,-122.274558,98115,6.914464,0.065794
3,OOO,47.695339,-122.274196,98115,6.918804,0.072512
4,OOO,47.695427,-122.27473,98115,6.914211,0.065827


In [15]:
# Load the zipcode clusters and map to the data set
with open('./data/seattle_zipcode_clusters.pickle', 'rb') as handle:
    zipcode_cluster_ids = pkl.load(handle)
    
locations['zipcode_cluster_id'] = locations.zipcode.map(zipcode_cluster_ids)
locations.head()

Unnamed: 0,neighborhood,latitude,longitude,zipcode,dist_seattle,dist_transit,zipcode_cluster_id
0,OOO,47.695397,-122.274673,98115,6.913399,0.064774,3.0
1,OOO,47.695434,-122.274801,98115,6.913285,0.065302,3.0
2,OOO,47.695379,-122.274558,98115,6.914464,0.065794,3.0
3,OOO,47.695339,-122.274196,98115,6.918804,0.072512,3.0
4,OOO,47.695427,-122.27473,98115,6.914211,0.065827,3.0


In [16]:
# Get dummies for the borough column
locations = locations.join(pd.get_dummies(locations.zipcode_cluster_id, prefix='cluster_id'))

# Drop the zipcode clusters column
locations.drop(columns={'zipcode_cluster_id'}, inplace=True)

locations.head()

Unnamed: 0,neighborhood,latitude,longitude,zipcode,dist_seattle,dist_transit,cluster_id_0.0,cluster_id_1.0,cluster_id_2.0,cluster_id_3.0,cluster_id_4.0,cluster_id_5.0
0,OOO,47.695397,-122.274673,98115,6.913399,0.064774,0,0,0,1,0,0
1,OOO,47.695434,-122.274801,98115,6.913285,0.065302,0,0,0,1,0,0
2,OOO,47.695379,-122.274558,98115,6.914464,0.065794,0,0,0,1,0,0
3,OOO,47.695339,-122.274196,98115,6.918804,0.072512,0,0,0,1,0,0
4,OOO,47.695427,-122.27473,98115,6.914211,0.065827,0,0,0,1,0,0


# Setup the User Inputs

In [17]:
# Setup the user input data
user_inputs = {'beds': 1.0,
        'baths': 1,
        'sqft': np.log(500),
        'fitness_center': 0.0,
        'air_conditioning': 0.0,
        'in_unit_washer_dryer': 0.0,
        'laundry_facilities': 0.0,
        #'car_charging': 0.0,
        'roof': 0.0,
        'concierge': 0.0,
        #'pool': 0.0,
        #'elevator': 1.0,
        'garage': 0.0,
        #'income_restrictions': 0.0,
        #'dishwasher': 1.0,
        'pets_allowed': 1.0
        }

In [18]:
# Add the user inputs to the data
locations = locations.assign(**user_inputs)
locations.head()

Unnamed: 0,neighborhood,latitude,longitude,zipcode,dist_seattle,dist_transit,cluster_id_0.0,cluster_id_1.0,cluster_id_2.0,cluster_id_3.0,...,baths,sqft,fitness_center,air_conditioning,in_unit_washer_dryer,laundry_facilities,roof,concierge,garage,pets_allowed
0,OOO,47.695397,-122.274673,98115,6.913399,0.064774,0,0,0,1,...,1,6.214608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,OOO,47.695434,-122.274801,98115,6.913285,0.065302,0,0,0,1,...,1,6.214608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,OOO,47.695379,-122.274558,98115,6.914464,0.065794,0,0,0,1,...,1,6.214608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,OOO,47.695339,-122.274196,98115,6.918804,0.072512,0,0,0,1,...,1,6.214608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,OOO,47.695427,-122.27473,98115,6.914211,0.065827,0,0,0,1,...,1,6.214608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Get predictions for all data points

In [20]:
# Load the xbg and rf models
with open('.../models/seattle_rent_prediction.pickle', 'rb') as handle:
    xgb = pkl.load(handle)


FileNotFoundError: [Errno 2] No such file or directory: '.../models/seattle_rent_prediction.pickle'

In [None]:
# Set the column order for the model
column_order = ['beds', 'baths', 'sqft', 'fitness_center', 'air_conditioning',
       'in_unit_washer_dryer', 'laundry_facilities', 'roof', 'concierge',
       'garage', 'dist_seattle', 'dist_transit', 'pets_allowed',
       'cluster_id_0.0', 'cluster_id_1.0', 'cluster_id_2.0', 'cluster_id_3.0', 'cluster_id_4.0', 'cluster_id_5.0']

In [None]:
X = locations[column_order]
X.head()


In [None]:
locations['pred_rent'] = np.exp(xgb.predict(X)).astype(int)
locations.head()

# Plot the data

In [None]:
dist_limit = 0.05

nhood_rent = locations[locations.dist_transit < dist_limit].groupby(['neighborhood'], as_index=False)['pred_rent'].median()
nhood_rent.pred_rent = nhood_rent.pred_rent.astype(int)
nhood_rent = nhood_rent[nhood_rent['neighborhood'].str.strip().astype(bool)]
nhood_rent.head()

In [None]:
downtown_seattle = {"lat": 47.604013, "lon": -122.335167}

In [None]:
fig = px.choropleth_mapbox(nhood_rent, 
                        geojson=neighborhoods, 
                        locations='neighborhood', 
                        featureidkey="properties.S_HOOD", 
                        color='pred_rent',
                        mapbox_style="carto-positron",
                        zoom=10.8, center={"lat": downtown_seattle['lat'], "lon": downtown_seattle['lon']},
                        opacity=0.7,
                        hover_name="neighborhood"
)

title_settings={
        'text': "Predicted Rent ($): 1 Bed, 1 Bath, 584sqft",
        'font_size': 25,
        'y':0.975,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'}

fig.update_layout(height=1000, width=1000)
fig.update_layout(title=title_settings)
fig.show()

In [None]:
# Save the locations dataframe w/out the user inputs
# This will be used by the app to make the predictions
locations = locations[locations.neighborhood != "OOO"]
locations = locations[locations.neighborhood != "Harbor Island"]
locations = locations[locations.neighborhood.str.strip().astype(bool)]

cols_to_save =['neighborhood', 'latitude', 'longitude', 'zipcode', 'dist_seattle','dist_transit', 'cluster_id_0.0', 'cluster_id_1.0', 'cluster_id_2.0', 'cluster_id_3.0', 'cluster_id_4.0', 'cluster_id_5.0']

locations[cols_to_save].to_csv('.../data/processed/seattle_prediction_location.csv', index=False)