## Goal:

<insert Kelli's message from slack here>

## Imports

In [25]:
import pandas as pd
import geopandas as gpd
from shapely import wkt
from shapely.geometry import Point
import numpy as np

## Data read-in

In [2]:
df = pd.read_csv('CHI_agents_data-Table 1.csv',low_memory=False)

In [3]:
# Function to convert WKT format to a Point geometry
def to_point(coord):
    try:
        if isinstance(coord, str) and 'POINT' in coord:
            # Use WKT to convert the string into a Point object
            return wkt.loads(coord)
        else:
            return None  # Return None for invalid or missing coordinates
    except Exception as e:
        return None  # Catch any issues

# Apply the conversion to the 'geometry' column
df['geometry'] = df['geometry'].apply(to_point)

# Now convert the DataFrame to a GeoDataFrame
df_geo = gpd.GeoDataFrame(df, geometry='geometry')

In [4]:
boundaries_df = gpd.read_file('Boundaries - Community Areas (current).geojson')

In [5]:
communities_to_keep = [
    'NEAR SOUTH SIDE',
    'ARMOUR SQUARE',
    'BRIDGEPORT',
    'MCKINLEY PARK',
    'BRIGHTON PARK',
    'ARCHER HEIGHTS',
    'GARFIELD RIDGE',
    'CLEARING',
    'WEST ELSDON',
    'GAGE PARK',
    'NEW CITY',
    'FULLER PARK',
    'GRAND BOULEVARD',
    'DOUGLAS',
    'OAKLAND',
    'KENWOOD',
    'HYDE PARK',
    'WASHINGTON PARK',
    'ENGLEWOOD',
    'WEST ENGLEWOOD',
    'CHICAGO LAWN',
    'WEST LAWN',
    'WOODLAWN',
    'GREATER GRAND CROSSING',
    'SOUTH SHORE',
    'AUBURN GRESHAM',
    'ASHBURN',
    'CHATHAM',
    'AVALON PARK',
    'SOUTH CHICAGO',
    'CALUMET HEIGHTS',
    'BURNSIDE',
    'WASHINGTON HEIGHTS',
    'BEVERLY',
    'MOUNT GREENWOOD',
    'MORGAN PARK',
    'WEST PULLMAN',
    'SOUTH DEERING',
    'RIVERDALE',
    'EAST SIDE',
    'HEGEWISCH'
]

In [6]:
filtered_boundaries = boundaries_df[boundaries_df['community'].isin(communities_to_keep)]

In [7]:
df_geo = df_geo.set_crs(4326)

In [8]:
# Read in neighborhood boundaries
neighborhood_boundaries = filtered_boundaries

# Clean and convert the 'Price' column to integers
df_geo['Price_int'] = df_geo['Price'].str.replace('[\$,]', '', regex=True).astype(int)

# Ensure both GeoDataFrames (df_geo and neighborhood_boundaries) have the same CRS
if df_geo.crs != neighborhood_boundaries.crs:
    df_geo = df_geo.to_crs(neighborhood_boundaries.crs)  # Transform CRS to match

# Perform a spatial join to find which points (brokerages) fall within which community
joined_gdf = gpd.sjoin(df_geo, neighborhood_boundaries, how="inner", predicate='within')

# # Group by community and agent/team to calculate counts and total sales
# grouped = joined_gdf.groupby(['community', 'Final_Agent/Team']).agg(
#     count=('Final_Agent/Team', 'count'),
#     total_price=('Price_int', 'sum')
# ).reset_index().nlargest(20)

# # Find the agent/team with the highest count for each community
# max_count_idx = grouped.groupby("community")['count'].idxmax()
# agent_result = grouped.loc[max_count_idx, ['community', 'Final_Agent/Team', 'count', 'total_price']]
# agent_result = agent_result.rename(columns={'count': 'agent_count', 'total_price': 'agent_price'})

# # Calculate community-level totals
# community_result = grouped.groupby('community').agg(
#     community_count=('count', 'sum'),
#     community_price=('total_price', 'sum')
# ).reset_index()

# # Merge agent results with community-level totals
# merged_df = pd.merge(agent_result, community_result, on='community', how='inner')

# # Drop unnecessary columns from neighborhood boundaries
# columns_to_drop = ['area', 'shape_area', 'perimeter', 'area_num_1', 'area_numbe', 'comarea_id', 'comarea', 'shape_len']
# neighborhood_boundaries = neighborhood_boundaries.drop(columns=columns_to_drop, errors='ignore')

# # Merge with neighborhood boundaries
# final_merge = neighborhood_boundaries.merge(merged_df, on='community')
# final_merge = final_merge.rename(columns={'Final_Agent/Team': 'final_agent_team'})

# # Save the result to a GeoJSON file
# final_merge.to_file("final_merge.geojson", driver='GeoJSON')


In [9]:
joined_gdf

Unnamed: 0,mls_id,Address,Location,Zip_Code,Price,agent_no,side,agent_name,Final_Agent/Team,Brokerage,...,index_right,community,area,shape_area,perimeter,area_num_1,area_numbe,comarea_id,comarea,shape_len
6,11284349,3000 S Pitney Ct,CHI - Bridgeport,60608.0,"$8,000,000",buy_agent_1,buyer,Anne Fan,Anne Fan,Compass,...,57,BRIDGEPORT,0,58291519.2767,0,60,60,0,0,32732.7183268
7,11284349,3000 S Pitney Ct,CHI - Bridgeport,60608.0,"$8,000,000",list_agent_1,listing,George Toscas,George Toscas,ACO Commercial,...,57,BRIDGEPORT,0,58291519.2767,0,60,60,0,0,32732.7183268
8,11284349,3000 S Pitney Ct,CHI - Bridgeport,60608.0,"$8,000,000",list_agent_2,listing,Linda Hattar,Linda Hattar,ACO Commercial,...,57,BRIDGEPORT,0,58291519.2767,0,60,60,0,0,32732.7183268
126,11993747,4825 S Woodlawn Ave,CHI - Kenwood,60615.0,"$4,000,000",list_agent_1,listing,Eugene Fu,Eugene Fu,@properties Christie's International Real Estate,...,4,KENWOOD,0,29071741.9283,0,39,39,0,0,23325.1679062
135,11993747,4825 S Woodlawn Ave,CHI - Kenwood,60615.0,"$4,000,000",buy_agent_1,buyer,Susan O'Connor,Susan O'Connor,Berkshire Hathaway HomeServices Chicago,...,4,KENWOOD,0,29071741.9283,0,39,39,0,0,23325.1679062
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108203,11836116,12910 S Parnell Ave,CHI - West Pullman,60628.0,"$100,000",list_agent_1,listing,Vera Brown,Vera Brown,R G Ramsey & Associates,...,51,WEST PULLMAN,0,99365198.0822,0,53,53,0,0,50023.8430008
108204,12041412,8222 S Perry Ave,CHI - Chatham,60620.0,"$100,000",list_agent_1,listing,Vernon Lilly,Vernon Lilly,EXIT True Design Realty LLC,...,39,CHATHAM,0,82320670.3112,0,44,44,0,0,42006.9450094
108208,11887086,7558 S Marshfield Ave,CHI - Auburn Gresham,60620.0,"$100,000",list_agent_1,listing,Victor Vita,Victor Vita,Vylla Home,...,69,AUBURN GRESHAM,0,105065353.602,0,71,71,0,0,46757.7217161
108210,11876519,6950 S Vernon Ave,CHI - Greater Grand Crossing,60637.0,"$100,000",list_agent_1,listing,William Bates Jr,William Bates Jr,EXIT Strategy Realty,...,66,GREATER GRAND CROSSING,0,98853167.7093,0,69,69,0,0,54645.3302996


In [10]:
top_agents_list = joined_gdf.value_counts('Final_Agent/Team').head(31)
top_agents_list = top_agents_list.keys().to_list()

In [11]:
joined_gdf.columns

Index(['mls_id', 'Address', 'Location', 'Zip_Code', 'Price', 'agent_no',
       'side', 'agent_name', 'Final_Agent/Team', 'Brokerage', 'url',
       'TRD_Note', 'geometry', 'SqFt', 'price_per_sq_ft', 'year_built',
       'broker_email', 'data_source_name', 'within_county', 'Price_int',
       'index_right', 'community', 'area', 'shape_area', 'perimeter',
       'area_num_1', 'area_numbe', 'comarea_id', 'comarea', 'shape_len'],
      dtype='object')

In [12]:
gdf_for_export = joined_gdf[['mls_id', 'Address', 'Location', 'Zip_Code', 'Price', 'agent_no',
       'side', 'agent_name', 'Final_Agent/Team', 'Brokerage', 'url',
       'TRD_Note', 'geometry', 'SqFt', 'price_per_sq_ft', 'year_built',
       'broker_email']]

In [13]:
# Create an ExcelWriter instance
with pd.ExcelWriter('all_agents_data.xlsx') as writer:
    for agent in top_agents_list:
        # Filter for the current agent
        x = gdf_for_export[gdf_for_export['Final_Agent/Team'] == agent]
        
        # Write each agent's dataframe to a different sheet
        x.to_excel(writer, sheet_name=agent, index=False)


In [14]:
joined_gdf

Unnamed: 0,mls_id,Address,Location,Zip_Code,Price,agent_no,side,agent_name,Final_Agent/Team,Brokerage,...,index_right,community,area,shape_area,perimeter,area_num_1,area_numbe,comarea_id,comarea,shape_len
6,11284349,3000 S Pitney Ct,CHI - Bridgeport,60608.0,"$8,000,000",buy_agent_1,buyer,Anne Fan,Anne Fan,Compass,...,57,BRIDGEPORT,0,58291519.2767,0,60,60,0,0,32732.7183268
7,11284349,3000 S Pitney Ct,CHI - Bridgeport,60608.0,"$8,000,000",list_agent_1,listing,George Toscas,George Toscas,ACO Commercial,...,57,BRIDGEPORT,0,58291519.2767,0,60,60,0,0,32732.7183268
8,11284349,3000 S Pitney Ct,CHI - Bridgeport,60608.0,"$8,000,000",list_agent_2,listing,Linda Hattar,Linda Hattar,ACO Commercial,...,57,BRIDGEPORT,0,58291519.2767,0,60,60,0,0,32732.7183268
126,11993747,4825 S Woodlawn Ave,CHI - Kenwood,60615.0,"$4,000,000",list_agent_1,listing,Eugene Fu,Eugene Fu,@properties Christie's International Real Estate,...,4,KENWOOD,0,29071741.9283,0,39,39,0,0,23325.1679062
135,11993747,4825 S Woodlawn Ave,CHI - Kenwood,60615.0,"$4,000,000",buy_agent_1,buyer,Susan O'Connor,Susan O'Connor,Berkshire Hathaway HomeServices Chicago,...,4,KENWOOD,0,29071741.9283,0,39,39,0,0,23325.1679062
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108203,11836116,12910 S Parnell Ave,CHI - West Pullman,60628.0,"$100,000",list_agent_1,listing,Vera Brown,Vera Brown,R G Ramsey & Associates,...,51,WEST PULLMAN,0,99365198.0822,0,53,53,0,0,50023.8430008
108204,12041412,8222 S Perry Ave,CHI - Chatham,60620.0,"$100,000",list_agent_1,listing,Vernon Lilly,Vernon Lilly,EXIT True Design Realty LLC,...,39,CHATHAM,0,82320670.3112,0,44,44,0,0,42006.9450094
108208,11887086,7558 S Marshfield Ave,CHI - Auburn Gresham,60620.0,"$100,000",list_agent_1,listing,Victor Vita,Victor Vita,Vylla Home,...,69,AUBURN GRESHAM,0,105065353.602,0,71,71,0,0,46757.7217161
108210,11876519,6950 S Vernon Ave,CHI - Greater Grand Crossing,60637.0,"$100,000",list_agent_1,listing,William Bates Jr,William Bates Jr,EXIT Strategy Realty,...,66,GREATER GRAND CROSSING,0,98853167.7093,0,69,69,0,0,54645.3302996


In [15]:
joined_gdf['price_as_int'] = joined_gdf['Price'].str.replace("$","",regex=False)
joined_gdf['price_as_int'] = joined_gdf['price_as_int'].str.replace(",","",regex=False).astype(int)

In [16]:
agents_by_transaction = joined_gdf.groupby('Final_Agent/Team')['price_as_int'].sum()

In [17]:
joined_gdf.columns

Index(['mls_id', 'Address', 'Location', 'Zip_Code', 'Price', 'agent_no',
       'side', 'agent_name', 'Final_Agent/Team', 'Brokerage', 'url',
       'TRD_Note', 'geometry', 'SqFt', 'price_per_sq_ft', 'year_built',
       'broker_email', 'data_source_name', 'within_county', 'Price_int',
       'index_right', 'community', 'area', 'shape_area', 'perimeter',
       'area_num_1', 'area_numbe', 'comarea_id', 'comarea', 'shape_len',
       'price_as_int'],
      dtype='object')

In [19]:
joined_gdf.groupby('Final_Agent/Team')['price_as_int'].sum()

Final_Agent/Team
Aaqila Harvey       569900
Aaron Campbell      127000
Aaron Franklin      252000
Aaron Gaines       4900147
Aaron Greenberg     455000
                    ...   
Zoe Wohlsifer       135000
Zoey Zhu           2459588
Zofia Strzep       1659800
Zofia Zon-Leiva     620100
Zorica Ledic        268000
Name: price_as_int, Length: 5279, dtype: int64

In [20]:
agents_by_transaction.sort_values(ascending=False).to_csv("agents_by_dollar_volume.csv")

In [21]:
top_199_agents_by_dollar_volume = agents_by_transaction.sort_values(ascending=False).head(199).keys().to_list()

In [22]:
# Create an ExcelWriter instance
with pd.ExcelWriter('all_agents_dollar_data.xlsx') as writer:
    for agent in top_199_agents_by_dollar_volume:
        # Filter for the current agent
        x = gdf_for_export[gdf_for_export['Final_Agent/Team'] == agent]
        
        # Write each agent's dataframe to a different sheet
        x.to_excel(writer, sheet_name=agent, index=False)


## Collect property types

In [23]:
joined_gdf['api_url'] = joined_gdf['url'].str[22:]

In [24]:
joined_gdf['api_url']

6         /IL/Chicago/3000-S-Pitney-Ct-60608/home/177573811
7         /IL/Chicago/3000-S-Pitney-Ct-60608/home/177573811
8         /IL/Chicago/3000-S-Pitney-Ct-60608/home/177573811
126       /IL/Chicago/4825-S-Woodlawn-Ave-60615/home/139...
135       /IL/Chicago/4825-S-Woodlawn-Ave-60615/home/139...
                                ...                        
108203    /IL/Chicago/12910-S-Parnell-Ave-60628/home/130...
108204     /IL/Chicago/8222-S-Perry-Ave-60620/home/13225737
108208    /IL/Chicago/7558-S-Marshfield-Ave-60620/home/1...
108210    /IL/Chicago/6950-S-Vernon-Ave-60637/unit-2/hom...
108211    /IL/Chicago/9549-S-Bensley-Ave-60617/home/1304...
Name: api_url, Length: 13571, dtype: object

In [127]:
joined_gdf

Unnamed: 0,mls_id,Address,Location,Zip_Code,Price,agent_no,side,agent_name,Final_Agent/Team,Brokerage,...,area,shape_area,perimeter,area_num_1,area_numbe,comarea_id,comarea,shape_len,price_as_int,api_url
6,11284349,3000 S Pitney Ct,CHI - Bridgeport,60608.0,"$8,000,000",buy_agent_1,buyer,Anne Fan,Anne Fan,Compass,...,0,58291519.2767,0,60,60,0,0,32732.7183268,8000000,/IL/Chicago/3000-S-Pitney-Ct-60608/home/177573811
7,11284349,3000 S Pitney Ct,CHI - Bridgeport,60608.0,"$8,000,000",list_agent_1,listing,George Toscas,George Toscas,ACO Commercial,...,0,58291519.2767,0,60,60,0,0,32732.7183268,8000000,/IL/Chicago/3000-S-Pitney-Ct-60608/home/177573811
8,11284349,3000 S Pitney Ct,CHI - Bridgeport,60608.0,"$8,000,000",list_agent_2,listing,Linda Hattar,Linda Hattar,ACO Commercial,...,0,58291519.2767,0,60,60,0,0,32732.7183268,8000000,/IL/Chicago/3000-S-Pitney-Ct-60608/home/177573811
126,11993747,4825 S Woodlawn Ave,CHI - Kenwood,60615.0,"$4,000,000",list_agent_1,listing,Eugene Fu,Eugene Fu,@properties Christie's International Real Estate,...,0,29071741.9283,0,39,39,0,0,23325.1679062,4000000,/IL/Chicago/4825-S-Woodlawn-Ave-60615/home/139...
135,11993747,4825 S Woodlawn Ave,CHI - Kenwood,60615.0,"$4,000,000",buy_agent_1,buyer,Susan O'Connor,Susan O'Connor,Berkshire Hathaway HomeServices Chicago,...,0,29071741.9283,0,39,39,0,0,23325.1679062,4000000,/IL/Chicago/4825-S-Woodlawn-Ave-60615/home/139...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108203,11836116,12910 S Parnell Ave,CHI - West Pullman,60628.0,"$100,000",list_agent_1,listing,Vera Brown,Vera Brown,R G Ramsey & Associates,...,0,99365198.0822,0,53,53,0,0,50023.8430008,100000,/IL/Chicago/12910-S-Parnell-Ave-60628/home/130...
108204,12041412,8222 S Perry Ave,CHI - Chatham,60620.0,"$100,000",list_agent_1,listing,Vernon Lilly,Vernon Lilly,EXIT True Design Realty LLC,...,0,82320670.3112,0,44,44,0,0,42006.9450094,100000,/IL/Chicago/8222-S-Perry-Ave-60620/home/13225737
108208,11887086,7558 S Marshfield Ave,CHI - Auburn Gresham,60620.0,"$100,000",list_agent_1,listing,Victor Vita,Victor Vita,Vylla Home,...,0,105065353.602,0,71,71,0,0,46757.7217161,100000,/IL/Chicago/7558-S-Marshfield-Ave-60620/home/1...
108210,11876519,6950 S Vernon Ave,CHI - Greater Grand Crossing,60637.0,"$100,000",list_agent_1,listing,William Bates Jr,William Bates Jr,EXIT Strategy Realty,...,0,98853167.7093,0,69,69,0,0,54645.3302996,100000,/IL/Chicago/6950-S-Vernon-Ave-60637/unit-2/hom...


In [38]:
to_be_sliced = joined_gdf.reset_index(drop=True)

In [39]:
group_1 = to_be_sliced.iloc[0:8000]
group_2 = to_be_sliced.iloc[8000:]

In [27]:
# import requests

# url = "https://redfin-com-data.p.rapidapi.com/properties/details"

# # querystring = {"url":"/IL/Chicago/6605-S-Kimbark-Ave-60637/home/12570870"}

# querystring = {"url":"/IL/Chicago/100-E-14th-St-60605/unit-1112/home/39569782"}

# headers = {
#     "x-rapidapi-key": "00191da588msh8450293d26e3515p1bbd40jsn56510b513b61",
#     "x-rapidapi-host": "redfin-com-data.p.rapidapi.com"
# }

# response = requests.get(url, headers=headers, params=querystring)

In [28]:
# response.json()['data']['belowTheFold']['publicRecordsInfo']['basicInfo']['propertyTypeName']

In [29]:
# def get_property_type(api_url=str):
    
#     querystring = {"url":api_url}

#     headers = {
#         "x-rapidapi-key": "00191da588msh8450293d26e3515p1bbd40jsn56510b513b61",
#         "x-rapidapi-host": "redfin-com-data.p.rapidapi.com"
#     }

#     response = requests.get(url, headers=headers, params=querystring)
    
#     property_type = response.json()['data']['belowTheFold']['publicRecordsInfo']['basicInfo']['propertyTypeName']
    
#     return property_type

In [40]:
import time
import requests

def get_property_type_with_error(api_url, max_retries=3, backoff_factor=1.0):
    """
    Fetch property type from the Redfin API, with error handling and exponential backoff.

    Returns:
      - The property type (string), OR
      - An error message (string) if something goes wrong.
    """
    
    url = "https://redfin-com-data.p.rapidapi.com/properties/details"
    
    querystring = {"url": api_url}
    headers = {
        "x-rapidapi-key": "00191da588msh8450293d26e3515p1bbd40jsn56510b513b61",  
        "x-rapidapi-host": "redfin-com-data.p.rapidapi.com"
    }

    attempt = 1
    while attempt <= max_retries:
        try:
            response = requests.get(url, headers=headers, params=querystring, timeout=10)
            response.raise_for_status()

            data = response.json()
            property_type = data['data']['belowTheFold']['publicRecordsInfo']['basicInfo']['propertyTypeName']
            
            return property_type
        
        except requests.exceptions.RequestException as re:
            # Networking or HTTP error
            if attempt == max_retries:
                # Return error message as a string
                return f"ERROR: Request failed after {max_retries} retries. {re}"
            else:
                # Exponential backoff
                delay = backoff_factor * (2 ** (attempt - 1))
                time.sleep(delay)
                attempt += 1

        except KeyError as ke:
            # JSON doesn't have the fields we expect
            return f"ERROR: Unexpected JSON structure. Missing key: {ke}"

        except Exception as e:
            # Catch-all
            return f"ERROR: An unexpected error occurred: {e}"

In [43]:
from tqdm.auto import tqdm  # or from tqdm import tqdm
tqdm.pandas()              # This adds "progress_apply" to pandas DataFrames

In [None]:
test_df = joined_gdf.tail(30) 

test_df['property_type'] = test_df['api_url'].progress_apply(get_property_type_with_error)

In [47]:
group_1['property_type'] = group_1['api_url'].progress_apply(get_property_type_with_error)
group_1.to_csv("group_1.csv")


  0%|          | 0/8000 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [48]:
group_2['property_type'] = group_2['api_url'].progress_apply(get_property_type_with_error)
group_2.to_csv("group_2.csv")

  0%|          | 0/5571 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [49]:
group_1['property_type'].iloc[0]

'Vacant Land'

In [50]:
group_1['property_type'].value_counts()

property_type
Single Family Residential                                        4013
Condo/Co-op                                                      1589
Multi-Family (2-4 Unit)                                          1367
Multi-Family (5+ Unit)                                            424
Townhouse                                                         408
Vacant Land                                                       103
ERROR: Unexpected JSON structure. Missing key: 'belowTheFold'      61
Other                                                              23
Parking                                                            12
Name: count, dtype: int64

In [51]:
group_2['property_type'].value_counts()

property_type
Single Family Residential                                                      3330
Condo/Co-op                                                                    1194
Multi-Family (2-4 Unit)                                                         718
Townhouse                                                                       129
Multi-Family (5+ Unit)                                                          127
Vacant Land                                                                      44
ERROR: Unexpected JSON structure. Missing key: 'belowTheFold'                    17
Parking                                                                           6
Other                                                                             4
ERROR: An unexpected error occurred: 'NoneType' object is not subscriptable       2
Name: count, dtype: int64

In [52]:
df_concat_property_types = pd.concat([group_1,group_2])

In [53]:
df_concat_property_types['property_type'].value_counts()

property_type
Single Family Residential                                                      7343
Condo/Co-op                                                                    2783
Multi-Family (2-4 Unit)                                                        2085
Multi-Family (5+ Unit)                                                          551
Townhouse                                                                       537
Vacant Land                                                                     147
ERROR: Unexpected JSON structure. Missing key: 'belowTheFold'                    78
Other                                                                            27
Parking                                                                          18
ERROR: An unexpected error occurred: 'NoneType' object is not subscriptable       2
Name: count, dtype: int64

In [59]:
# df_concat_property_types[df_concat_property_types['property_type'] == "ERROR: Unexpected JSON structure. Missing key: 'belowTheFold'"]['url'].iloc[0]

In [60]:
keep_list = [
    "Single Family Residential",
    "Condo/Co-op",
    "Townhouse"
]

In [61]:
final_df = df_concat_property_types[df_concat_property_types['property_type'].isin(keep_list)]

In [70]:
final_agent_list = final_df.groupby('Final_Agent/Team')['price_as_int'].sum().sort_values(ascending=False)

In [71]:
final_agent_list

Final_Agent/Team
Nadine Ferrata     28709300
Robert Sullivan    20029500
Susan O'Connor     14776500
Non Member         14423336
The Laricy Team    14276500
                     ...   
Darraneika Lacy      100000
Dorothy Turner       100000
Angela Upton         100000
Willie Jones         100000
Erica Smith          100000
Name: price_as_int, Length: 4612, dtype: int64

In [72]:
final_agent_list.to_csv("final_agent_list_transaction_volume_with_property_filtered.csv")

In [100]:
team_names_df = pd.read_csv("team_names.csv",usecols=["For_team_matching"])

In [101]:
team_names_df = team_names_df.dropna()

In [113]:
matched_team_df = final_df

In [114]:
# -----------------------------------------------------------------------------
# STEP 1: CREATE LOOKUP DICTIONARIES
# -----------------------------------------------------------------------------
def normalize_name(name: str) -> str:
    """Lowercase and strip whitespace to handle case mismatch."""
    return name.strip().lower()

# name_to_team: agent_name (lowercased) -> Official Team Name (unmodified)
name_to_team = {}

# team_to_members: official_team_name (lowercased) -> list of all team members (unmodified)
team_to_members = {}

for row in team_names_df.itertuples(index=False):
    # e.g.: "Alice Smith, Bob Johnson, Charlie Brown"
    members_raw = row.For_team_matching.split(",")
    
    # Clean up whitespace in each name, but don't lose original capitalization
    members = [m.strip() for m in members_raw]
    
    # We'll treat the first name as the "official team name"
    official_team_name = members[0]            # e.g. "Alice Smith"
    official_team_key  = normalize_name(official_team_name)  # e.g. "alice smith"
    
    # Save the full list of members for this team
    team_to_members[official_team_key] = members
    
    # Map each member to the same official team name
    for member in members:
        name_to_team[normalize_name(member)] = official_team_name

# -----------------------------------------------------------------------------
# STEP 2: UPDATE THE "final_df" COLUMNS
# -----------------------------------------------------------------------------
def get_team_name_or_original(agent_name: str, current_value: str) -> str:
    """Return the official team name if the agent belongs to a team, else original value."""
    agent_key = normalize_name(agent_name)
    if agent_key in name_to_team:
        # Overwrite with the official team name
        return name_to_team[agent_key]
    else:
        # Leave the existing name alone
        return current_value

def get_team_members(agent_name: str) -> str:
    """Return a comma-separated string of all members if agent belongs to a team, else None."""
    agent_key = normalize_name(agent_name)
    if agent_key in name_to_team:
        # Get the official team name
        official_team_name = name_to_team[agent_key]
        official_team_key  = normalize_name(official_team_name)
        # Convert the list of members to a single string
        all_members = team_to_members[official_team_key]
        return ", ".join(all_members)
    return None

# Create/update columns in final_df
matched_team_df["Final_Agent/Team"] = matched_team_df.apply(
    lambda row: get_team_name_or_original(
        agent_name=row["agent_name"],
        current_value=row["Final_Agent/Team"]
    ),
    axis=1
)

# Add a new column that shows all team members for that agent (or None if not on a team)
matched_team_df["Team_Members"] = matched_team_df["agent_name"].apply(get_team_members)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [116]:
matched_final_agent_list = matched_team_df.groupby('Final_Agent/Team')['price_as_int'].sum().sort_values(ascending=False)



In [121]:
matched_final_agent_list.to_csv("matched_final_agent_list.csv")

In [128]:
matched_team_df.to_csv("matched_team_df.csv")

In [135]:
matched_team_df['mls_id'] = matched_team_df['mls_id'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [138]:
matched_team_df['mls_id'].value_counts().head(60)

mls_id
11896882    4
11893822    4
12030043    4
11956458    4
11866009    4
11828670    4
11840285    4
12051913    4
11938887    4
12045369    4
11975552    4
11846499    4
12004485    4
11828021    4
11898153    4
11874078    4
11751869    4
11816920    4
11985298    4
12028301    4
11867772    4
11828915    4
11859206    4
11829674    4
11905981    4
11857372    4
12014016    4
11855029    4
11994702    4
11831574    4
12017476    4
11881460    4
11988158    4
11940428    4
11867069    4
11889710    4
11975668    4
11915224    4
11997805    4
11937939    4
11987310    4
11958865    4
11811315    4
11794014    4
11812398    4
11962182    4
11981676    4
12013605    4
12021378    4
12059763    4
11448896    4
12039492    4
11807395    4
11938631    4
11954947    4
12030082    3
11823690    3
11819124    3
11871232    3
11741064    3
Name: count, dtype: int64

In [140]:
matched_team_df[matched_team_df['mls_id'] == "12030043"]

Unnamed: 0,mls_id,Address,Location,Zip_Code,Price,agent_no,side,agent_name,Final_Agent/Team,Brokerage,...,perimeter,area_num_1,area_numbe,comarea_id,comarea,shape_len,price_as_int,api_url,property_type,Team_Members
6884,12030043,13025 S Manistee Ave,CHI - Hegewisch,60633.0,"$274,000",list_agent_2,listing,Hugo Gracia,Hugo Gracia,Compass,...,0,55,55,0,0,73692.3821322,274000,/IL/Chicago/13025-S-Manistee-Ave-60633/home/13...,Single Family Residential,
6886,12030043,13025 S Manistee Ave,CHI - Hegewisch,60633.0,"$274,000",list_agent_1,listing,Leslie Cifuentes,Leslie Cifuentes,Compass,...,0,55,55,0,0,73692.3821322,274000,/IL/Chicago/13025-S-Manistee-Ave-60633/home/13...,Single Family Residential,
6887,12030043,13025 S Manistee Ave,CHI - Hegewisch,60633.0,"$274,000",buy_agent_2,buyer,Lourdes Fernando,Lourdes Fernando,Keller Williams ONEChicago,...,0,55,55,0,0,73692.3821322,274000,/IL/Chicago/13025-S-Manistee-Ave-60633/home/13...,Single Family Residential,
6888,12030043,13025 S Manistee Ave,CHI - Hegewisch,60633.0,"$274,000",buy_agent_1,buyer,Tiffany Bennett,Tiffany Bennett,Keller Williams ONEChicago,...,0,55,55,0,0,73692.3821322,274000,/IL/Chicago/13025-S-Manistee-Ave-60633/home/13...,Single Family Residential,


## Create separate CSVs for top 30 and write into workbook

In [2]:
import pandas as pd

In [12]:
teams = pd.read_clipboard()

In [14]:
dealsheet = pd.read_csv("ChicagoSouthSideAgentsTeamsMatched - deal_sheet.csv")

In [29]:
teams_list = teams['Final_Agent/Team'].to_list()

In [21]:
dealsheet.columns

Index(['mls_id', 'Address', 'Location', 'property_type', 'Zip_Code', 'Price',
       'agent_no', 'side', 'agent_name', 'Final_Agent/Team', 'Team_Members',
       'Brokerage', 'url', 'TRD_Note', 'geometry', 'SqFt', 'price_per_sq_ft',
       'year_built', 'broker_email', 'data_source_name', 'within_county',
       'Price_int', 'index_right', 'community', 'area', 'shape_area',
       'perimeter', 'area_num_1', 'area_numbe', 'comarea_id', 'comarea',
       'shape_len', 'price_as_int', 'api_url'],
      dtype='object')

In [30]:
# import pandas as pd

# # Create a DataFrame
# df = pd.DataFrame({
#     'Name': ['Alice', 'Bob', 'Charlie', 'David'],
#     'Age': [25, 30, 22, 28]
# })

# # List of names to filter by
# names_to_filter = ['Alice', 'Charlie']

# # Filter the DataFrame
# filtered_df = df[df['Name'].isin(names_to_filter)]

# print(filtered_df)

filtered_dealsheet = dealsheet[dealsheet['Final_Agent/Team'].isin(teams_list)]

In [32]:
filtered_dealsheet['Final_Agent/Team'].value_counts()

Final_Agent/Team
Frank Montro               55
Non Member                 51
Naja Morris                48
Nadine Ferrata             42
QianKun Chen               35
Pablo Galarza              32
Robert Sullivan            32
Salvador Gonzalez          30
Sybil Martin               30
The Laricy Team            29
Carlos Sanchez             28
Grigory Pekarsky           26
Kellye Jackson             22
Robert Fitzpatrick         20
Mary Fitzpatrick Duleba    20
Jennifer Liu               18
Jacob Reiner               16
Lane Chesebro              15
Nancy Hotchkiss            14
Robert Yoshimura           13
Ben Lalez Team             13
Lilianna Sekula-Lark       13
Susan O'Connor             11
Melanie Giglio              7
Name: count, dtype: int64

In [34]:
# Suppose your dataframe is 'filtered_dealsheet'
# and the column of interest is "Final_Agent/Team"

# 1. Get all unique agents/teams
unique_agents = filtered_dealsheet["Final_Agent/Team"].unique()

# 2. Create a new Excel file and write each subset to a different sheet
output_file = "output_file.xlsx"
with pd.ExcelWriter(output_file, engine="xlsxwriter") as writer:
    for agent in unique_agents:
        # 3. Filter rows for the current agent/team
        agent_data = filtered_dealsheet[filtered_dealsheet["Final_Agent/Team"] == agent]
        
        # 4. Write to a sheet named after the agent/team
        #    Ensure sheet_name does not exceed Excel’s 31-char limit
        sheet_name = str(agent)[:31]
        
        agent_data.to_excel(writer, sheet_name=sheet_name, index=False)

print(f"Workbook created: {output_file}")


Workbook created: output_file.xlsx
