In [1]:
import pandas as pd
import numpy as np
import folium
import geopandas as gpd
from shapely.geometry import Point, MultiPolygon, Polygon
from feature_engine.categorical_encoders import OneHotCategoricalEncoder

### Add the neighborhood information for the houses

In [2]:
#Load the data
houses = pd.read_csv('Data/houses.csv')
houses.head(2)

Unnamed: 0,latitude,longitude,home_size,lot_size,zoning,sex_offenders,crime_index,enviornmental_hazards,school_quality,bedrooms,bathrooms,sale_price,date,sale_price_cpi
0,34.012375,-118.407173,3247.0,5011.0,CCR1,1.0,Moderate,6.0,Excellent,4.0,4.0,1700000.0,2020-12-01,1303.268866
1,33.796401,-118.280405,812.0,4991.0,LAR1,13.0,Moderate,15.0,Average,2.0,1.0,325000.0,2020-11-25,570.931609


In [3]:
hoods = gpd.read_file('Los Angeles Neighborhood Map.geojson')
hoods = hoods.rename(columns={'name': 'neighborhood'})
hoods['polygon'] = hoods['geometry'].apply(lambda x: Polygon(x[0]))
hoods.head(2)

Unnamed: 0,external_i,neighborhood,location,latitude,slug_1,sqmi,display_na,set,slug,longitude,name_1,kind,type,geometry,polygon
0,acton,Acton,POINT(34.497355239240846 -118.16981019229348),-118.16981019229348,,39.3391089485,Acton L.A. County Neighborhood (Current),L.A. County Neighborhoods (Current),acton,34.49735523924085,,L.A. County Neighborhood (Current),unincorporated-area,"MULTIPOLYGON (((-118.20262 34.53899, -118.1894...","POLYGON ((-118.20262 34.53899, -118.18947 34.5..."
1,adams-normandie,Adams-Normandie,POINT(34.031461499124156 -118.30020800000011),-118.30020800000013,,0.805350187789,Adams-Normandie L.A. County Neighborhood (Curr...,L.A. County Neighborhoods (Current),adams-normandie,34.03146149912416,,L.A. County Neighborhood (Current),segment-of-a-city,"MULTIPOLYGON (((-118.30901 34.03741, -118.3004...","POLYGON ((-118.30901 34.03741, -118.30041 34.0..."


In [4]:
#Turn the lat, long info into a geometry point
gdf = gpd.GeoDataFrame(houses, geometry=[Point(xy) for xy in zip(houses.longitude, houses.latitude)])
gdf.head(2)

Unnamed: 0,latitude,longitude,home_size,lot_size,zoning,sex_offenders,crime_index,enviornmental_hazards,school_quality,bedrooms,bathrooms,sale_price,date,sale_price_cpi,geometry
0,34.012375,-118.407173,3247.0,5011.0,CCR1,1.0,Moderate,6.0,Excellent,4.0,4.0,1700000.0,2020-12-01,1303.268866,POINT (-118.40717 34.01237)
1,33.796401,-118.280405,812.0,4991.0,LAR1,13.0,Moderate,15.0,Average,2.0,1.0,325000.0,2020-11-25,570.931609,POINT (-118.28041 33.79640)


In [17]:
#This function will classify every house and label it an appropriate neighborhood
def find_neighborhood(coordinates):
    for index, hood in hoods.iterrows():
        if hood['polygon'].contains(coordinates):
            return hood['neighborhood']
        else:
            continue 

gdf['neighborhood'] = gdf['geometry'].apply(find_neighborhood)
gdf.head()

Unnamed: 0,latitude,longitude,home_size,lot_size,zoning,sex_offenders,crime_index,enviornmental_hazards,school_quality,bedrooms,bathrooms,sale_price,date,sale_price_cpi,age,geometry,neighborhood
0,34.012375,-118.407173,3247.0,5011.0,CCR1,1.0,Moderate,6.0,Excellent,4.0,4.0,1700000.0,2020-12-01,1303.268866,41.0,POINT (-118.40717 34.01237),Culver City
1,34.242494,-118.26143,986.0,6180.0,GLR1YY,5.0,Low,2.0,Excellent,2.0,1.0,767500.0,2020-11-30,877.367614,72.0,POINT (-118.26143 34.24249),Glendale
2,34.11711,-118.370951,3243.0,8441.0,LARE11,1.0,Slightly High,2.0,Excellent,3.0,4.0,700000.0,2020-11-25,837.898528,20.0,POINT (-118.37095 34.11711),Hollywood Hills West
3,33.863577,-118.048971,1733.0,5002.0,CERS5000,5.0,Low,7.0,Excellent,4.0,2.0,845000.0,2020-11-25,920.599557,48.0,POINT (-118.04897 33.86358),Cerritos
4,33.892628,-118.090063,728.0,5088.0,NOR1,22.0,Moderate,5.0,Above Average,2.0,1.0,535000.0,2020-11-25,732.519682,70.0,POINT (-118.09006 33.89263),Norwalk


In [18]:
print(f'There are  {gdf.neighborhood.isnull().sum()} out of {gdf.shape[0]} missing classifications.')
gdf.drop('geometry', axis=1, inplace=True)
gdf['neighborhood'].fillna('Missing', inplace=True)
gdf.to_csv('Data/houses_neighborhood_info.csv', index=False)
gdf.head()

There are  574 out of 6068 missing classifications.


Unnamed: 0,latitude,longitude,home_size,lot_size,zoning,sex_offenders,crime_index,enviornmental_hazards,school_quality,bedrooms,bathrooms,sale_price,date,sale_price_cpi,age,neighborhood
0,34.012375,-118.407173,3247.0,5011.0,CCR1,1.0,Moderate,6.0,Excellent,4.0,4.0,1700000.0,2020-12-01,1303.268866,41.0,Culver City
1,34.242494,-118.26143,986.0,6180.0,GLR1YY,5.0,Low,2.0,Excellent,2.0,1.0,767500.0,2020-11-30,877.367614,72.0,Glendale
2,34.11711,-118.370951,3243.0,8441.0,LARE11,1.0,Slightly High,2.0,Excellent,3.0,4.0,700000.0,2020-11-25,837.898528,20.0,Hollywood Hills West
3,33.863577,-118.048971,1733.0,5002.0,CERS5000,5.0,Low,7.0,Excellent,4.0,2.0,845000.0,2020-11-25,920.599557,48.0,Cerritos
4,33.892628,-118.090063,728.0,5088.0,NOR1,22.0,Moderate,5.0,Above Average,2.0,1.0,535000.0,2020-11-25,732.519682,70.0,Norwalk


### Create a DataFrame for the Basic Search component in the app

In [20]:
#Create a dataframe that will be used by the app
#Sale Price per neighborhood

avg = gdf.groupby('neighborhood')[['home_size', 'lot_size', 'bedrooms', 'bathrooms', 'sex_offenders', 
                                   'enviornmental_hazards', 'age', 'sale_price']].mean()

cat_avg = gdf.groupby('neighborhood')[['school_quality', 'crime_index']].agg(lambda x:x.value_counts().index[0])
avg['sale_price'] = avg['sale_price'].apply(lambda x: '${:,.2f}'.format(x))
new_avg = pd.merge(avg, hoods[['neighborhood', 'latitude', 'longitude']], on='neighborhood', how='left')
new_avg = pd.merge(new_avg, cat_avg, on='neighborhood', how='left')
new_avg = new_avg.dropna()

ohe_encoder = OneHotCategoricalEncoder(variables=['crime_index', 'school_quality'])
ohe_encoder.fit(new_avg)
new_avg = ohe_encoder.transform(new_avg)
if 'crime_index_Very High' not in new_avg.columns:
    new_avg['crime_index_Very High'] = 0
new_avg.to_csv('Data/Neighborhoods_final.csv', index=False)