In [4]:
import pandas as pd
import numpy as np
import folium
import geopandas as gpd
from shapely.geometry import Point, MultiPolygon, Polygon
from feature_engine.categorical_encoders import OneHotCategoricalEncoder

### Add the neighborhood information for the houses

In [9]:
# Read the per-house records from disk and preview the first two rows
# as a quick sanity check of the columns that were loaded.
houses = pd.read_csv(filepath_or_buffer='Data/houses.csv')
houses.head(n=2)

Unnamed: 0,latitude,longitude,home_size,lot_size,sex_offenders,crime_index,enviornmental_hazards,school_quality,bedrooms,bathrooms,geometry,neighborhood,sale_price,date,sale_price_cpi,age,neighborhood_name
0,33.803187,-118.201013,2079.0,6223.0,25,Slightly High,6,Average,2.0,2.0,POINT (-118.201013 33.803187),790.807319,785000.0,2020-12-09,885.613826,81.0,Long Beach
1,34.247827,-118.428183,1338.0,7250.0,13,Low,6,Average,3.0,2.0,POINT (-118.428183 34.247827),666.220923,769000.0,2020-12-14,876.541998,72.0,Arleta


In [24]:
# Load the neighborhood polygons and rename `name` so the frame shares the
# `neighborhood_name` key used by `houses`.
hoods = gpd.read_file('Los Angeles Neighborhood Map.geojson').rename(
    columns={'name': 'neighborhood_name'}
)

# Cast the coordinate columns to float.
# NOTE(review): in the preview below `latitude` holds values near -118 and
# `longitude` values near 34, i.e. the labels look swapped in the source
# file -- confirm how downstream consumers interpret these two columns.
hoods[['longitude', 'latitude']] = hoods[['longitude', 'latitude']].astype('float64')
hoods.head(2)

Unnamed: 0,external_i,neighborhood_name,location,latitude,slug_1,sqmi,display_na,set,slug,longitude,name_1,kind,type,geometry
0,acton,Acton,POINT(34.497355239240846 -118.16981019229348),-118.16981,,39.3391089485,Acton L.A. County Neighborhood (Current),L.A. County Neighborhoods (Current),acton,34.497355,,L.A. County Neighborhood (Current),unincorporated-area,"MULTIPOLYGON (((-118.20262 34.53899, -118.1894..."
1,adams-normandie,Adams-Normandie,POINT(34.031461499124156 -118.30020800000011),-118.300208,,0.805350187789,Adams-Normandie L.A. County Neighborhood (Curr...,L.A. County Neighborhoods (Current),adams-normandie,34.031461,,L.A. County Neighborhood (Current),segment-of-a-city,"MULTIPOLYGON (((-118.30901 34.03741, -118.3004..."


### Create a DataFrame for the Basic Search component in the app

In [29]:
# Build the per-neighborhood summary table and write it to
# Data/Neighborhoods_final.csv (one row per neighborhood that has complete data).

def _most_common(values):
    """Return the most frequent non-null value of a Series, or NaN if it has none."""
    counts = values.value_counts()  # sorted by frequency; nulls are excluded by default
    # An all-null group yields an empty result; return NaN so the row is
    # removed by the dropna() below instead of raising IndexError here.
    return counts.index[0] if len(counts) else np.nan


# Mean of each numeric feature per neighborhood.
avg = houses.groupby('neighborhood_name')[['home_size', 'lot_size', 'bedrooms', 'bathrooms', 'sex_offenders',
                                   'enviornmental_hazards', 'age', 'sale_price', 'neighborhood']].mean()

# Most frequent label per neighborhood for the categorical features.
cat_avg = houses.groupby('neighborhood_name')[['school_quality', 'crime_index']].agg(_most_common)

# Attach the neighborhood coordinates, then the categorical labels.
new_avg = pd.merge(avg, hoods[['neighborhood_name', 'latitude', 'longitude']],
                   on='neighborhood_name', how='left')
new_avg = pd.merge(new_avg, cat_avg, on='neighborhood_name', how='left')

# Drop incomplete neighborhoods while sale_price is still numeric: formatting
# first would turn a missing mean into the string '$nan', which dropna() cannot see.
new_avg = new_avg.dropna()
new_avg = new_avg.assign(sale_price=new_avg['sale_price'].map('${:,.2f}'.format))

# One-hot encode the two categorical columns.
ohe_encoder = OneHotCategoricalEncoder(variables=['crime_index', 'school_quality'])
ohe_encoder.fit(new_avg)
new_avg = ohe_encoder.transform(new_avg)

# Accounting for a rare label: make sure the 'crime_index_Very High' column
# exists even when the encoder did not produce it (presumably so the app can
# rely on a fixed set of columns -- confirm against the consumer).
if 'crime_index_Very High' not in new_avg.columns:
    new_avg['crime_index_Very High'] = 0

new_avg.to_csv('Data/Neighborhoods_final.csv', index=False)