In [1]:
import re
import numpy as np
import pandas as pd
import calendar

In [2]:
# Load the raw transactions export; per its file name it holds all
# transactions over $100K in three counties for 2025.
source_csv = 'All Transactions over $100K in 3 Counties in 2025 - ALL_Three.csv'
df = pd.read_csv(source_csv)

In [3]:
# Parse the formatted price text in SALE_PRC1 into a numeric column.
# - A single explicit regex character class strips every "$" and ","; inside
#   [...] the "$" is a literal, so the result no longer depends on the pandas
#   version's default for `regex=` in Series.str.replace.
# - pd.to_numeric(errors='coerce') turns blank, missing or otherwise
#   unparseable prices into NaN instead of raising the way .astype(int) does,
#   and still yields an integer dtype when every value is a whole number.
# NOTE(review): assumes SALE_PRC1 was read as text (e.g. "$250,000") — confirm.
df['SALE_PRC1_CLEAN'] = pd.to_numeric(
    df['SALE_PRC1'].str.replace(r'[$,]', '', regex=True).str.strip(),
    errors='coerce',
)

In [4]:
# --- Setup ---
# MICRO labels that keep a bucket of their own, in canonical spelling.
keep_labels = [
    'Single Family',
    'Condominiums',
    'Vacant Residential',
    'Cooperatives',
    'Mobile Homes',
]
# Case-insensitive exact-keep map: normalized key (lower-case, hyphens turned
# into spaces, trimmed) -> canonical label, so "single-family" resolves to
# 'Single Family'.
keep_norm = {k.lower().replace('-', ' ').strip(): k for k in keep_labels}

# Commercial patterns (covers common variants).
# The outer alternation is deliberately NON-capturing: Series.str.contains
# emits "UserWarning: This pattern is interpreted as a regular expression, and
# has match groups" whenever the pattern contains a capturing group, and this
# pattern is only used for a yes/no containment test. The set of strings it
# matches is unchanged.
commercial_pat = re.compile(
    r'(?:store|office|professional\s+service(?:s)?(?:\s+(?:building|bldg))?|'
    r'multi[\s-]?family|multifamily|hotel|restaurant)s?',
    flags=re.IGNORECASE
)

# --- Normalize MICRO ---
# Work on a nullable-string copy of MICRO: trimmed for the regex test, and
# additionally lower-cased with hyphens turned into spaces for exact lookups.
micro = df['MICRO'].astype('string')
micro_norm = micro.str.strip()
micro_exact_norm = micro_norm.str.lower().str.replace('-', ' ', regex=False)

# Row selectors, computed up front (none of them depends on the others).
unknown_mask = micro_norm.isna() | (micro_norm == '')            # missing / blank
keep_mask = micro_exact_norm.isin(list(keep_norm))               # exact keep labels
commercial_mask = micro_norm.str.contains(commercial_pat, na=False)  # regex hit

# Every row starts as 'Other'; later steps overwrite earlier ones, so the
# precedence is Other < Unknown < kept label < Commercial.
bucket = (
    pd.Series('Other', index=df.index, dtype='string')
      .mask(unknown_mask, 'Unknown')
      .mask(keep_mask, micro_exact_norm.map(keep_norm))
      .mask(commercial_mask, 'Commercial')
)

df['MICRO_bucket'] = bucket

# --- Aggregate ---
# Make sure the price column is numeric; unparseable entries become NaN.
df['SALE_PRC1_CLEAN'] = pd.to_numeric(df['SALE_PRC1_CLEAN'], errors='coerce')
groupby_cols = ['MICRO_bucket', 'SALE_MO1', 'PHY_CITY']

# Per-group statistics of the cleaned price; `n` is the group's row count.
price_stats = {
    'avg_price': pd.NamedAgg(column='SALE_PRC1_CLEAN', aggfunc='mean'),
    'median_price': pd.NamedAgg(column='SALE_PRC1_CLEAN', aggfunc='median'),
    'n': pd.NamedAgg(column='SALE_PRC1_CLEAN', aggfunc='size'),
}
grouped_stats = (
    df.groupby(groupby_cols, dropna=False)   # keep groups whose key is missing
      .agg(**price_stats)
      .reset_index()
)

# Only keep groups with 5+ rows (row labels from reset_index are retained).
agg_df = grouped_stats.loc[grouped_stats['n'] >= 5]

# Optional polish: prices to 2 decimals, rows ordered by the grouping keys.
agg_df = (
    agg_df.round({'avg_price': 2, 'median_price': 2})
          .sort_values(groupby_cols)
)


  commercial_mask = micro_norm.str.contains(commercial_pat, na=False)


In [5]:
# Number of aggregated groups (rows of agg_df) per MICRO bucket.
agg_df['MICRO_bucket'].value_counts()

MICRO_bucket
Single Family         341
Condominiums          316
Vacant Residential     84
Commercial             77
Other                  57
Cooperatives           21
Mobile Homes            9
Name: count, dtype: Int64

In [6]:
# Number of aggregated groups (rows of agg_df) per sale-month value.
agg_df['SALE_MO1'].value_counts()

SALE_MO1
4    187
3    183
5    169
2    166
1    161
6     39
Name: count, dtype: int64

In [7]:
# Use whichever month column agg_df actually has; 'SALE_M01' is the fallback
# spelling when 'SALE_MO1' is absent.
if 'SALE_MO1' in agg_df.columns:
    month_col = 'SALE_MO1'
else:
    month_col = 'SALE_M01'

# Lookup table: month number 1..12 -> calendar.month_name entry ("January", ...).
month_map = dict(zip(range(1, 13), calendar.month_name[1:13]))

# Coerce the month values to nullable integers ("01" -> 1, unparseable -> <NA>)
# and translate them through the lookup; missing months stay missing.
month_numbers = pd.to_numeric(agg_df[month_col], errors='coerce').astype('Int64')
agg_df[f'{month_col}_NAME'] = month_numbers.map(month_map)

# (Optional) To overwrite the original column instead of adding a new one,
# assign the mapped values to agg_df[month_col].


In [8]:
# Title-case the city names (e.g. "MIAMI" -> "Miami"); missing cities stay missing.
# NOTE(review): this runs after the groupby, so if the raw PHY_CITY values mix
# casings the same city would have been aggregated as separate groups — confirm.
city_titled = agg_df['PHY_CITY'].str.title()
agg_df['PHY_CITY'] = city_titled

In [9]:
# Drop the numeric month column from the aggregated frame.
# errors='ignore' makes the cell safe to re-run: once the column is gone a
# second execution is a no-op instead of raising KeyError.
agg_df = agg_df.drop(columns='SALE_MO1', errors='ignore')

In [10]:
# Spot check: aggregated rows for the city "Miami" in the month "June".
is_miami = agg_df['PHY_CITY'] == 'Miami'
is_june = agg_df['SALE_MO1_NAME'] == 'June'
agg_df[is_miami & is_june]

Unnamed: 0,MICRO_bucket,PHY_CITY,avg_price,median_price,n,SALE_MO1_NAME
689,Condominiums,Miami,995857.14,437000.0,14,June


In [11]:
# Print the column dtypes and non-null counts of the aggregated frame.
agg_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 905 entries, 2 to 1767
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MICRO_bucket   905 non-null    string 
 1   PHY_CITY       898 non-null    object 
 2   avg_price      905 non-null    float64
 3   median_price   905 non-null    float64
 4   n              905 non-null    int64  
 5   SALE_MO1_NAME  905 non-null    object 
dtypes: float64(2), int64(1), object(2), string(1)
memory usage: 49.5+ KB


In [17]:
# Export the aggregated frame for downstream use. The row index is written as
# the first, unnamed CSV column (index=True is the to_csv default).
# NOTE(review): the index holds the row labels left over from filtering, not a
# meaningful key — confirm whether consumers of this file need it.
agg_df.to_csv('agg_df_cards.csv', index=True)