## Imports

In [31]:
import pandas as pd
import numpy as np
import geopandas as gpd
import os
from shapely import wkt
from shapely.geometry import Point
import googlemaps

## Read in data

In [19]:
df = pd.read_csv('CHI_agents_data-Table 1.csv',low_memory=False)

In [17]:
df['geometry'].isna().value_counts()

geometry
False    107944
True        271
Name: count, dtype: int64

In [22]:
def to_point(coord):
    """Parse a WKT string into a shapely geometry.

    Only string values that contain the substring 'POINT' are handed to
    ``wkt.loads``. Anything else (non-strings, strings without 'POINT')
    yields ``None``, and so does any value whose parsing raises.
    """
    looks_like_wkt_point = isinstance(coord, str) and 'POINT' in coord
    if not looks_like_wkt_point:
        return None  # Missing or unrecognised coordinate value

    try:
        return wkt.loads(coord)
    except Exception:
        return None  # Unparseable WKT is treated the same as a missing value

# Parse the WKT strings into a separate Series instead of overwriting df['geometry'].
# Overwriting made this cell non-idempotent: on a second run the column already holds
# geometry objects, which `to_point` rejects as non-strings, so every geometry became None.
parsed_geometry = df['geometry'].apply(to_point)

# Build the GeoDataFrame from a copy of `df` that carries the parsed geometries.
# `df` itself keeps the raw WKT strings, so this cell can be re-run safely.
df_geo = gpd.GeoDataFrame(df.assign(geometry=parsed_geometry), geometry='geometry')

In [25]:
df_geo['geometry'].isna().value_counts()

geometry
False    107944
True        271
Name: count, dtype: int64

In [27]:
# Build one address string per row, "<Address> <Location> Chicago, IL"; it is the
# input passed to the geocoder for rows that have no geometry.
# NOTE(review): assuming both columns hold strings, a missing Address or Location
# makes the whole concatenation NaN for that row — confirm this is acceptable.
df_geo['Full_Address'] = df_geo['Address'] + ' ' + df_geo['Location'] + ' Chicago, IL'

In [32]:
# Retrieve the API key: `%store -r` restores a variable that was persisted earlier
# with `%store google_maps_API_Key`, so the key is not hard-coded in this notebook.
%store -r google_maps_API_Key
# Despite its name, `gmaps_key` is the googlemaps.Client instance, not the key string.
gmaps_key = googlemaps.Client(key=google_maps_API_Key)

In [33]:
def geocode_address_gmaps(address):
    """Geocode an address with the module-level ``gmaps_key`` googlemaps client.

    Parameters
    ----------
    address : str
        Free-form address to look up.

    Returns
    -------
    shapely.geometry.Point or None
        ``Point(lng, lat)`` built from the first geocoding result, or ``None``
        when the address is missing/blank, the service returns no result, or
        the lookup raises.
    """
    # A missing address arrives as None/NaN rather than a string. Skip it (and
    # blank strings) instead of spending an API request on a lookup that cannot
    # succeed and then logging it as an error.
    if not isinstance(address, str) or not address.strip():
        return None

    try:
        geocode_result = gmaps_key.geocode(address)
        if not geocode_result:
            return None
        # Only the first (best) match is used
        location = geocode_result[0]['geometry']['location']
        return Point(location['lng'], location['lat'])
    except Exception as e:
        # Best-effort: report the failure and leave this row without a geometry
        print(f"Error geocoding {address}: {e}")
        return None

def _geometry_or_geocode(row):
    """Return the row's geometry, geocoding its Full_Address when the geometry is None."""
    if row['geometry'] is None:
        return geocode_address_gmaps(row['Full_Address'])
    return row['geometry']

# Fill in geometry for the rows that have none; all other rows keep their value
df_geo['geometry'] = df_geo.apply(_geometry_or_geocode, axis=1)

  arr = construct_1d_object_array_from_listlike(values)


In [78]:
### Read in neighborhood boundaries
# Loads the community-area boundaries GeoJSON with geopandas.
# NOTE(review): the same file is read again two cells below, so this cell looks redundant.
neighborhood_boundaries = gpd.read_file("Boundaries - Community Areas (current).geojson")

In [79]:
# Strip the currency formatting ("$" and thousands separators) and cast to int.
# regex=False is explicit on purpose: "$" is a regex end-of-string anchor and the
# default value of `regex` differs between pandas versions, so relying on the
# default is fragile.
df_geo['Price_int'] = (
    df_geo['Price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(int)
)

In [80]:
# Read in neighborhood boundaries
neighborhood_boundaries = gpd.read_file("Boundaries - Community Areas (current).geojson")

# df_geo is constructed without a CRS, and to_crs() refuses to transform geometries
# that have none. The geocoded points are built as Point(lng, lat), i.e. lon/lat
# degrees, so declare WGS84 before comparing or transforming.
# NOTE(review): this assumes the WKT points in the source CSV are lon/lat too — confirm.
if df_geo.crs is None:
    df_geo = df_geo.set_crs("EPSG:4326")

# Ensure both GeoDataFrames (df_geo and neighborhood_boundaries) have the same CRS
if df_geo.crs != neighborhood_boundaries.crs:
    df_geo = df_geo.to_crs(neighborhood_boundaries.crs)  # Transform CRS to match

# Spatial join: keep only the points that fall within a community polygon
# (how="inner" drops points that match no community).
joined_gdf = gpd.sjoin(df_geo, neighborhood_boundaries, how="inner", predicate='within')

# Group by community and agent/agent team to get the count and sum of sale prices
grouped = joined_gdf.groupby(['community', 'Final_Agent/Team']).agg(
    count=('Final_Agent/Team', 'count'),  # Count of how many times an agent or agent team appears
    total_price=('Price_int', 'sum')         # Sum sale price
).reset_index()

# Now, `grouped` contains the count and total price grouped by community and agent/team


In [81]:
grouped

Unnamed: 0,community,Final_Agent/Team,count,total_price
0,ALBANY PARK,Abby Powell,1,220000
1,ALBANY PARK,Adam McDowell,1,299000
2,ALBANY PARK,Adam Wavrunek,1,180000
3,ALBANY PARK,Adam Zenullahi,2,533000
4,ALBANY PARK,Adrian Abonce,1,499000
...,...,...,...,...
34502,WOODLAWN,William Salamone,1,160000
34503,WOODLAWN,William Volpe,1,127500
34504,WOODLAWN,Willie Whitehead,1,450000
34505,WOODLAWN,Yannetta Alexander,1,137800


In [83]:
print(f'${grouped.total_price.sum():,}')

$26,278,358,580


In [84]:
# Row label of the entry with the highest 'count' within each community
max_count_idx = grouped.groupby("community")['count'].idxmax()

# Pull those rows out of `grouped` and give the per-agent metrics
# agent-specific column names in one chained expression
agent_result = (
    grouped
    .loc[max_count_idx, ['community', 'Final_Agent/Team', 'count', 'total_price']]
    .rename(columns={'count': 'agent_count', 'total_price': 'agent_price'})
)

In [85]:
# Community-wide totals across all agents/teams, indexed by community.
# Named aggregation produces the final column names directly.
community_result = grouped.groupby('community').agg(
    community_count=('count', 'sum'),
    community_price=('total_price', 'sum'),
)

In [86]:
# Attach the community-wide totals to each community's top agent row.
# 'community' is a column on the left and the index name on the right.
merged_df = agent_result.merge(community_result, how='inner', on='community')

In [87]:
# Drop the boundary attributes that are not needed in the final output.
# errors='ignore' keeps this cell re-runnable: without it a second run raises
# KeyError because the columns have already been removed.
neighborhood_boundaries = neighborhood_boundaries.drop(
    columns=['area', 'shape_area', 'perimeter', 'area_num_1',
             'area_numbe', 'comarea_id', 'comarea', 'shape_len'],
    errors='ignore',
)

In [101]:
# final_merge = pd.merge(left=merged_df, left_on='community',how='inner',
#                       right=neighborhood_boundaries, right_on='community')

In [88]:
final_merge = neighborhood_boundaries.merge(merged_df, on='community')

In [99]:
final_merge = final_merge.rename(columns={'Final_Agent/Team':'final_agent_team'})

In [103]:
final_merge.to_file("final_merge.geojson",driver='GeoJSON')

In [102]:
final_merge.columns

Index(['community', 'geometry', 'final_agent_team', 'agent_count',
       'agent_price', 'community_count', 'community_price'],
      dtype='object')

In [90]:
final_merge['community_price'].max()

3345481313

In [91]:
final_merge['community_price'].min()

780000

In [92]:
final_merge.head(1)

Unnamed: 0,community,geometry,Final_Agent/Team,agent_count,agent_price,community_count,community_price
0,DOUGLAS,"MULTIPOLYGON (((-87.60914 41.84469, -87.60915 ...",Xiaohuang Wu,6,1457500,231,63789252


In [95]:
final_merge.columns

Index(['community', 'geometry', 'Final_Agent/Team', 'agent_count',
       'agent_price', 'community_count', 'community_price'],
      dtype='object')

In [77]:
# Sanity check per community: the community-wide total should exceed the top agent's total.
# Column names follow the current `final_merge` schema ('final_agent_team', 'agent_count');
# 'Final_Agent/Team' has been renamed by this point and 'brokerage_count' is not a column
# of `final_merge`, so the previous lookups raised KeyError.
for _, row in final_merge.iterrows():
    if row['community_price'] > row['agent_price']:
        # Sales volume in the community not attributable to the top agent
        print(f"{row['community']} -- ${row['community_price'] - row['agent_price']:,}")
        # Average sale price in the community: a dollar amount, not a percentage
        print(f"{row['community']} -- ${row['community_price'] / row['community_count']:,}")
        # Average sale price of the top agent/team
        print(f"{row['final_agent_team']} -- ${row['agent_price'] / row['agent_count']:,}")
        print('------------')
    else:
        print('Not great!')

DOUGLAS -- $62,331,752
DOUGLAS -- 276143.94805194804%
Xiaohuang Wu -- $242,916.66666666666
------------
OAKLAND -- $25,834,000
OAKLAND -- 378097.1830985916%
Lane Chesebro -- $336,966.6666666667
------------
FULLER PARK -- $11,842,822
FULLER PARK -- 294591.0%
Roger Valdez -- $265,000.0
------------
GRAND BOULEVARD -- $168,567,482
GRAND BOULEVARD -- 442955.056122449%
Naja Morris -- $563,433.3333333334
------------
KENWOOD -- $137,259,633
KENWOOD -- 469941.3194888179%
Robert Sullivan -- $1,092,444.4444444445
------------
LINCOLN SQUARE -- $557,339,463
LINCOLN SQUARE -- 626392.6255555556%
Biazar Group -- $583,081.8181818182
------------
WASHINGTON PARK -- $25,571,700
WASHINGTON PARK -- 375021.5189873418%
Biazar Group -- $1,013,750.0
------------
HYDE PARK -- $173,981,394
HYDE PARK -- 337016.3384030418%
Jinhong Wang -- $137,050.0
------------
WOODLAWN -- $158,172,766
WOODLAWN -- 430397.56135770236%
Biazar Group -- $741,055.5555555555
------------
ROGERS PARK -- $286,136,868
ROGERS PARK -- 2

In [93]:
base_name = 'https://trd-digital.github.io/trd-news-interactive-maps/'

# Split the working directory on the platform's path separator rather than a
# hard-coded '/', so the folder name is also found on Windows. `cwd` is assigned
# once and is always the list of path components.
cwd = os.path.normpath(os.getcwd()).split(os.sep)

# The URL is the base URL followed by the last component of the working directory
final_name = base_name + cwd[-1]
print(final_name)

https://trd-digital.github.io/trd-news-interactive-maps/CookCountyBrokersNeighborhoods_12_4_2024


In [94]:
df_geo.to_csv("backup_geocoded_csv.csv")