## Imports

In [17]:
import pandas as pd
import numpy as np
import geopandas as gpd
import os
from shapely import wkt

## Read in data

In [26]:
# Load the geocoded brokerage listings; the CSV's first column becomes the row index
df = pd.read_csv(
    'CHI_brokerage_data_fully_geocoded.csv',
    index_col=0,
    low_memory=False,
)

# Function to convert a WKT string into a geometry object
def to_point(coord):
    """Parse a WKT string mentioning 'POINT' into a geometry, else return None.

    Args:
        coord: Raw cell value taken from the 'geometry' column.

    Returns:
        Whatever ``wkt.loads(coord)`` produces when ``coord`` is a string that
        contains the substring 'POINT' and parsing succeeds; ``None`` for
        non-strings, strings without 'POINT', or any parsing failure.
    """
    # Guard clause: anything that is not a WKT-looking string counts as missing
    if not (isinstance(coord, str) and 'POINT' in coord):
        return None

    try:
        return wkt.loads(coord)
    except Exception:
        # Best-effort conversion: unparsable input is treated as missing
        return None

# Replace each raw 'geometry' value with the result of to_point
# (values it cannot convert become None)
df['geometry'] = df['geometry'].map(to_point)

# Promote the plain DataFrame to a GeoDataFrame that uses the converted column
df_geo = gpd.GeoDataFrame(data=df, geometry='geometry')

In [27]:
# Load the community-area boundary polygons from the GeoJSON file
neighborhood_boundaries = gpd.read_file(
    "Boundaries - Community Areas (current).geojson"
)

In [32]:
# Give the brokerage points (df_geo) the same CRS as neighborhood_boundaries so
# the two layers can be spatially joined. set_crs only assigns the CRS label;
# it does not transform coordinates.
# NOTE(review): assumes the WKT coordinates are already expressed in that CRS — confirm.
df_geo = df_geo.set_crs(
    crs=neighborhood_boundaries.crs,
    allow_override=True,
)

In [35]:
# Perform a spatial join to find which points (brokerages) fall within which community (multipolygon).
# `predicate=` replaces the deprecated `op=` keyword, which emitted a FutureWarning
# and is no longer accepted by newer geopandas releases.
joined_gdf = gpd.sjoin(df_geo, neighborhood_boundaries, how="inner", predicate='within')

# 'joined_gdf' now holds the columns of both inputs for every point that lies within a community;
# with how="inner", points that match no community are dropped.


  if await self.run_code(code, result, async_=asy):


In [37]:
# Group by community and brokerage to get, for each (community, brokerage) pair,
# the number of rows and the total (summed) price. Note: this is a sum, not a median.
grouped = joined_gdf.groupby(['community', 'Final_Brokerage']).agg({
    'Final_Brokerage': 'count',   # Non-null row count for this brokerage in this community
    'Price': 'sum'             # Total of 'Price' over those rows
}).rename(columns={'Final_Brokerage': 'count'}).reset_index()

In [54]:
# Grand total of 'Price' across every (community, brokerage) row, with thousands separators
print(f"${grouped['Price'].sum():,}")

$23,020,396,856


In [79]:
# For every community, find the label of the `grouped` row whose 'count' is largest
# (idxmax keeps the first such row when several brokerages tie)
max_count_idx = grouped.groupby("community")['count'].idxmax()

# Pull those rows and relabel the figures as brokerage-level values
brokerage_result = (
    grouped
    .loc[max_count_idx, ['community', 'Final_Brokerage', 'count', 'Price']]
    .rename(columns={'count': 'brokerage_count', 'Price': 'brokerage_price'})
)

In [83]:
# Community-level totals: sum the counts and prices over all brokerages.
# The result is indexed by 'community' (no reset_index here).
community_result = (
    grouped
    .groupby('community')
    .agg({'count': 'sum', 'Price': 'sum'})
    .rename(columns={'count': 'community_count', 'Price': 'community_price'})
)

In [86]:
# Attach the community-level totals to each community's top-brokerage row.
# On the right-hand side 'community' is the index level name of community_result.
merged_df = brokerage_result.merge(
    community_result,
    how='inner',
    left_on='community',
    right_on='community',
)

In [92]:
# Drop boundary attributes that are not needed in the final output.
# errors='ignore' keeps this cell idempotent: it rebinds the same name, so a
# second run would otherwise raise KeyError because the columns are already gone.
neighborhood_boundaries = neighborhood_boundaries.drop(
    columns=['area', 'shape_area', 'perimeter', 'area_num_1',
             'area_numbe', 'comarea_id', 'comarea', 'shape_len'],
    errors='ignore',
)

In [101]:
# final_merge = pd.merge(left=merged_df, left_on='community',how='inner',
#                       right=neighborhood_boundaries, right_on='community')

In [104]:
# Join the per-community figures onto the boundary rows; the boundary frame is
# the left side because the result is later written with .to_file
final_merge = neighborhood_boundaries.merge(
    right=merged_df,
    how='inner',
    on='community',
)

In [106]:
# Write the merged layer to disk as GeoJSON
final_merge.to_file(
    "final_merge.geojson",
    driver="GeoJSON",
)

In [123]:
# Largest community-level price total
final_merge.community_price.max()

2932545028

In [124]:
# Smallest community-level price total
final_merge.community_price.min()

780000

In [125]:
# Width of one interval when the community_price range is split into 8 equal parts.
# Computed from the data rather than from numbers pasted out of earlier outputs,
# so it stays correct if the inputs change.
# NOTE(review): 8 is presumably the number of map classes — confirm.
(final_merge['community_price'].max() - final_merge['community_price'].min()) / 8

366470628.5

In [122]:
# Per-community report comparing the community totals with its top brokerage.
for _idx, row in final_merge.iterrows():
    if row['community_price'] > row['brokerage_price']:
        # Price total in the community that did not go through the top brokerage
        print(f"{row['community']} -- ${row['community_price'] - row['brokerage_price']:,}")
        # Average price per row in the community. This is a dollar amount, so it
        # is printed with '$' and separators (it was previously suffixed with '%').
        print(f"{row['community']} -- ${row['community_price'] / row['community_count']:,}")
        # Average price per row for the top brokerage in that community
        print(f"{row['Final_Brokerage']} -- ${row['brokerage_price'] / row['brokerage_count']:,}")
        print('------------')
    else:
        # The top brokerage's total is not below the community total
        print('Not great!')

DOUGLAS -- $51,081,602
DOUGLAS -- 269697.41509433964%
Coldwell Banker Realty -- $304,712.5
------------
OAKLAND -- $21,008,000
OAKLAND -- 371224.24242424243%
Coldwell Banker Realty -- $349,280.0
------------
FULLER PARK -- $9,304,722
FULLER PARK -- 285550.5789473684%
Realty of Chicago LLC -- $309,240.0
------------
GRAND BOULEVARD -- $122,576,132
GRAND BOULEVARD -- 438330.5515151515%
@properties Christie's International Real Estate -- $525,546.4285714285
------------
KENWOOD -- $99,075,777
KENWOOD -- 473577.8521126761%
Berkshire Hathaway HomeServices Chicago -- $805,007.5681818182
------------
LINCOLN SQUARE -- $370,601,610
LINCOLN SQUARE -- 616006.0587484036%
@properties Christie's International Real Estate -- $677,158.3878787879
------------
WASHINGTON PARK -- $21,739,800
WASHINGTON PARK -- 353805.8823529412%
@properties Christie's International Real Estate -- $331,285.71428571426
------------
HYDE PARK -- $142,589,850
HYDE PARK -- 334179.03319502075%
@properties Christie's Internati

In [None]:
# Now, we need to group again by community and aggregate the brokerages with their counts into a single string
def format_brokerages(group):
    """Render one community's brokerages as 'Name (count), Name (count), ...'.

    Args:
        group: DataFrame with 'Final_Brokerage' and 'count' columns.

    Returns:
        A single comma-separated string with one 'Name (count)' entry per row,
        in row order; an empty string when the group has no rows.
    """
    # Walk the two columns directly instead of iterrows(): iterrows builds a
    # Series per row, which is slow and can coerce integer counts to floats
    # (printing '3.0') when every column in the row is numeric.
    return ', '.join(
        f"{name} ({count})"
        for name, count in zip(group['Final_Brokerage'], group['count'])
    )

# Group by community and format the brokerages information.
# Only the two columns the formatter reads are selected, so the grouping column
# is not passed into apply (pandas >= 2.2 warns about that).
grouped_brokerages = (
    grouped
    .groupby('community')[['Final_Brokerage', 'count']]
    .apply(format_brokerages)
    .reset_index(name='brokerage_info')
)

# Merge this aggregated data onto the community boundaries.
# `gdf` was never defined earlier in the notebook, so the merge starts from
# neighborhood_boundaries; how='left' keeps communities with no brokerage rows.
# NOTE(review): assumes `gdf` was meant to be the boundary layer — confirm.
gdf = neighborhood_boundaries.merge(grouped_brokerages, on='community', how='left')