In [3]:
import pandas as pd
import numpy as np
import folium
import geopandas as gpd
from shapely.geometry import Point, MultiPolygon, Polygon
from feature_engine.categorical_encoders import OneHotCategoricalEncoder

### Add the neighborhood information for the houses

In [4]:
# Read the house records from disk and preview the first two rows
houses = pd.read_csv(filepath_or_buffer='Data/houses.csv')
houses.iloc[:2]

Unnamed: 0,latitude,longitude,home_size,lot_size,sex_offenders,crime_index,enviornmental_hazards,school_quality,bedrooms,bathrooms,sale_price,date,sale_price_cpi,age
0,33.92957,-118.417952,1890.0,6414.0,3.0,Moderate,11.0,Excellent,3.0,2.0,1570000.0,2020-12-09,1252.447084,56.0
1,33.813798,-118.1332,1775.0,5492.0,2.0,Slightly High,4.0,Excellent,3.0,2.0,819000.0,2020-12-14,904.589434,70.0


In [86]:
def _first_polygon(geom):
    """Return the first member of a multi-part geometry, or `geom` itself if it has no parts."""
    # Multi-part geometries must be indexed through `.geoms`; subscripting the
    # geometry directly (`geom[0]`) is no longer supported by Shapely 2.0.
    parts = getattr(geom, 'geoms', None)
    return geom if parts is None else parts[0]


# Load the neighborhood boundaries and label the name column 'neighborhood'
hoods = gpd.read_file('Los Angeles Neighborhood Map.geojson')
hoods = hoods.rename(columns={'name': 'neighborhood'})

# The coordinate columns are cast to float before building points from them
hoods[['longitude', 'latitude']] = hoods[['longitude', 'latitude']].astype(float)

# NOTE(review): in the rows displayed below this cell the 'latitude' column holds
# values near -118 and 'longitude' values near 34, i.e. the labels look swapped in
# the source file, so 'location' ends up as POINT(lat lon) while 'geometry' is
# (lon lat). The columns are left untouched here because later cells consume
# them as-is -- confirm before relying on 'location' for distance calculations.
hoods['location'] = [Point(xy) for xy in zip(hoods.longitude, hoods.latitude)]

# Only the first polygon of each neighborhood geometry is kept for containment tests
hoods['polygon'] = hoods['geometry'].apply(_first_polygon)
hoods.head(2)

Unnamed: 0,external_i,neighborhood,location,latitude,slug_1,sqmi,display_na,set,slug,longitude,name_1,kind,type,geometry,polygon
0,acton,Acton,POINT (34.49735523924085 -118.1698101922935),-118.16981,,39.3391089485,Acton L.A. County Neighborhood (Current),L.A. County Neighborhoods (Current),acton,34.497355,,L.A. County Neighborhood (Current),unincorporated-area,"MULTIPOLYGON (((-118.20262 34.53899, -118.1894...","POLYGON ((-118.20262 34.53899, -118.18947 34.5..."
1,adams-normandie,Adams-Normandie,POINT (34.03146149912416 -118.3002080000001),-118.300208,,0.805350187789,Adams-Normandie L.A. County Neighborhood (Curr...,L.A. County Neighborhoods (Current),adams-normandie,34.031461,,L.A. County Neighborhood (Current),segment-of-a-city,"MULTIPOLYGON (((-118.30901 34.03741, -118.3004...","POLYGON ((-118.30901 34.03741, -118.30041 34.0..."


In [6]:
# Build one shapely Point per house (x = longitude, y = latitude) and attach
# the points to the houses table as its geometry column
house_points = [Point(lon, lat) for lon, lat in zip(houses['longitude'], houses['latitude'])]
gdf = gpd.GeoDataFrame(houses, geometry=house_points)
gdf.head(n=2)

Unnamed: 0,latitude,longitude,home_size,lot_size,sex_offenders,crime_index,enviornmental_hazards,school_quality,bedrooms,bathrooms,sale_price,date,sale_price_cpi,age,geometry
0,33.92957,-118.417952,1890.0,6414.0,3.0,Moderate,11.0,Excellent,3.0,2.0,1570000.0,2020-12-09,1252.447084,56.0,POINT (-118.41795 33.92957)
1,33.813798,-118.1332,1775.0,5492.0,2.0,Slightly High,4.0,Excellent,3.0,2.0,819000.0,2020-12-14,904.589434,70.0,POINT (-118.13320 33.81380)


In [7]:
def find_neighborhood(coordinates):
    """Look up which neighborhood polygon contains a point.

    Scans the module-level ``hoods`` table in row order and returns the
    'neighborhood' value of the first row whose 'polygon' contains
    ``coordinates``. Returns None when no polygon contains the point.
    """
    containing = (
        name
        for name, shape in zip(hoods['neighborhood'], hoods['polygon'])
        if shape.contains(coordinates)
    )
    return next(containing, None)

# Label every house with the neighborhood whose polygon contains its point;
# houses that fall inside no polygon get None
neighborhood_labels = gdf['geometry'].apply(find_neighborhood)
gdf['neighborhood'] = neighborhood_labels
gdf.head()

Unnamed: 0,latitude,longitude,home_size,lot_size,sex_offenders,crime_index,enviornmental_hazards,school_quality,bedrooms,bathrooms,sale_price,date,sale_price_cpi,age,geometry,neighborhood
0,33.92957,-118.417952,1890.0,6414.0,3.0,Moderate,11.0,Excellent,3.0,2.0,1570000.0,2020-12-09,1252.447084,56.0,POINT (-118.41795 33.92957),El Segundo
1,33.813798,-118.1332,1775.0,5492.0,2.0,Slightly High,4.0,Excellent,3.0,2.0,819000.0,2020-12-14,904.589434,70.0,POINT (-118.13320 33.81380),Long Beach
2,34.162508,-118.403515,2272.0,6297.0,5.0,Low,3.0,Excellent,3.0,3.0,1160000.0,2020-12-14,1076.560781,73.0,POINT (-118.40351 34.16251),Valley Village
3,34.161052,-118.619714,1862.0,5319.0,0.0,Very Low,2.0,Excellent,3.0,2.0,855000.0,2020-12-14,924.25672,64.0,POINT (-118.61971 34.16105),Woodland Hills
4,33.801546,-118.139969,1174.0,6552.0,3.0,Slightly High,9.0,Excellent,3.0,1.0,735000.0,2020-12-14,856.945553,72.0,POINT (-118.13997 33.80155),Long Beach


In [98]:
def find_neighborhood2(coordinates, hoods_gdf=None):
    """Return the name of the neighborhood whose reference point is nearest to `coordinates`.

    Parameters
    ----------
    coordinates : shapely.geometry.Point (or any object exposing ``x`` and ``y``)
        Query location in the same axis order as the house points
        (x = longitude, y = latitude).
    hoods_gdf : DataFrame, optional
        Neighborhood table with 'longitude' and 'latitude' columns that can be
        cast to float and a 'name' (or 'neighborhood') column. When omitted the
        GeoJSON file is read from disk on every call, so pass a pre-loaded
        table when classifying many points.

    Returns
    -------
    The name of the nearest neighborhood, or None when the table is empty or
    has no usable coordinates. Distances are plain Euclidean distances in
    coordinate units (degrees), not geodesic distances.
    """
    if hoods_gdf is None:
        hoods_gdf = gpd.read_file('Los Angeles Neighborhood Map.geojson')
    if len(hoods_gdf) == 0:
        return None

    xs = hoods_gdf['longitude'].astype(float).to_numpy()
    ys = hoods_gdf['latitude'].astype(float).to_numpy()
    # A latitude can never exceed 90 degrees in magnitude. If the 'latitude'
    # column does, the two columns are mislabeled (the LA file's displayed rows
    # show about -118 under 'latitude'), so swap them to match the query's axis order.
    if np.nanmax(np.abs(ys)) > 90:
        xs, ys = ys, xs

    # Vectorised distance to every neighborhood. Writing into the rows yielded
    # by iterrows() would only touch temporary copies and leave the table unchanged.
    distances = np.hypot(xs - coordinates.x, ys - coordinates.y)
    if np.isnan(distances).all():
        return None

    # Select by position: a label lookup such as series[0] returns the row
    # labelled 0 no matter how the rows are ordered.
    nearest = int(np.nanargmin(distances))
    name_col = 'name' if 'name' in hoods_gdf.columns else 'neighborhood'
    return hoods_gdf[name_col].iloc[nearest]

In [8]:
# Report how many houses were not matched to any neighborhood polygon
print(f'There are  {gdf.neighborhood.isnull().sum()} out of {gdf.shape[0]} missing classifications.')

# Drop the point geometry before saving. Rebinding (instead of inplace=True) and
# errors='ignore' keep this cell re-runnable after the column is already gone.
gdf = gdf.drop(columns='geometry', errors='ignore')

# Assign the filled column back: calling fillna(..., inplace=True) on the
# selected column is a chained in-place update, which does not modify `gdf`
# when pandas Copy-on-Write is active.
gdf['neighborhood'] = gdf['neighborhood'].fillna('Missing')

gdf.to_csv('Data/houses_neighborhood_info.csv', index=False)
gdf.head()

There are  656 out of 6736 missing classifications.


Unnamed: 0,latitude,longitude,home_size,lot_size,sex_offenders,crime_index,enviornmental_hazards,school_quality,bedrooms,bathrooms,sale_price,date,sale_price_cpi,age,neighborhood
0,33.92957,-118.417952,1890.0,6414.0,3.0,Moderate,11.0,Excellent,3.0,2.0,1570000.0,2020-12-09,1252.447084,56.0,El Segundo
1,33.813798,-118.1332,1775.0,5492.0,2.0,Slightly High,4.0,Excellent,3.0,2.0,819000.0,2020-12-14,904.589434,70.0,Long Beach
2,34.162508,-118.403515,2272.0,6297.0,5.0,Low,3.0,Excellent,3.0,3.0,1160000.0,2020-12-14,1076.560781,73.0,Valley Village
3,34.161052,-118.619714,1862.0,5319.0,0.0,Very Low,2.0,Excellent,3.0,2.0,855000.0,2020-12-14,924.25672,64.0,Woodland Hills
4,33.801546,-118.139969,1174.0,6552.0,3.0,Slightly High,9.0,Excellent,3.0,1.0,735000.0,2020-12-14,856.945553,72.0,Long Beach


In [13]:
# Remove the row limit so displayed DataFrames are never truncated
pd.options.display.max_rows = None

### Create a DataFrame for the Basic Search component in the app

In [9]:
# Build the per-neighborhood summary table that will be used by the app.

def _most_common(values):
    """Return the most frequent non-null entry of `values`, or NaN if there is none."""
    counts = values.value_counts()
    # value_counts() skips nulls, so an all-null group yields an empty result;
    # indexing it with [0] would raise IndexError.
    return counts.index[0] if len(counts) else np.nan


# Mean of each numeric feature (including sale price) per neighborhood
avg = gdf.groupby('neighborhood')[['home_size', 'lot_size', 'bedrooms', 'bathrooms', 'sex_offenders',
                                   'enviornmental_hazards', 'age', 'sale_price']].mean()

# Most frequent school-quality and crime-index label per neighborhood
cat_avg = gdf.groupby('neighborhood')[['school_quality', 'crime_index']].agg(_most_common)

# From here on the average sale price is display text such as '$1,234.56'
avg['sale_price'] = avg['sale_price'].apply(lambda x: '${:,.2f}'.format(x))

# Attach the neighborhood coordinates, then the categorical summaries.
# `avg` and `cat_avg` carry 'neighborhood' as their index name, which
# on='neighborhood' matches against the column of the other table.
new_avg = pd.merge(avg, hoods[['neighborhood', 'latitude', 'longitude']], on='neighborhood', how='left')
new_avg = pd.merge(new_avg, cat_avg, on='neighborhood', how='left')

# Drop rows with any missing value, e.g. names with no coordinates in `hoods`
# (presumably including the 'Missing' placeholder -- verify against the data).
new_avg = new_avg.dropna()

# One-hot encode the two categorical summaries
ohe_encoder = OneHotCategoricalEncoder(variables=['crime_index', 'school_quality'])
ohe_encoder.fit(new_avg)
new_avg = ohe_encoder.transform(new_avg)

# Guarantee this column exists even when the encoder did not produce it
if 'crime_index_Very High' not in new_avg.columns:
    new_avg['crime_index_Very High'] = 0
new_avg.to_csv('Data/Neighborhoods_final.csv', index=False)