In [1]:
import os
import re
from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_columns',None)

In [3]:
root_dir = Path(".")  # change if needed

def load_county_df(root: Path, county_keyword: str) -> pd.DataFrame:
    """
    Recursively finds all CSVs that include the county name in the filename,
    reads them, adds useful metadata, and concatenates to one DataFrame.
    """
    # Match files like ./2024S/NAL_2024_23Dade_S.csv, etc.
    files = sorted(root.rglob(f"*{county_keyword}*.csv"))
    if not files:
        print(f"No files found for {county_keyword}")
        return pd.DataFrame()

    frames = []
    for f in files:
        df = pd.read_csv(f,low_memory=False)
        # Add metadata columns from filename
        m = re.search(r"NAL_(\d{4})_", f.name)
        year = int(m.group(1)) if m else None
        df["source_file"] = str(f)
        df["year"] = year
        df["county"] = county_keyword
        frames.append(df)

    # Union the columns across years (handles schema drift)
    out = pd.concat(frames, ignore_index=True, sort=True)
    return out

# Build the three county DataFrames
df_dade = load_county_df(root_dir, "Dade")
df_palm_beach = load_county_df(root_dir, "Palm Beach")
df_broward = load_county_df(root_dir, "Broward")

# (Optional) quick sanity checks
for name, df in {
    "Dade": df_dade, "Palm Beach": df_palm_beach, "Broward": df_broward
}.items():
    print(f"{name}: {len(df):,} rows from {df['year'].nunique() if not df.empty else 0} years")

# Now you have:
#   df_dade, df_palm_beach, df_broward


Dade: 4,606,257 rows from 5 years
Palm Beach: 3,224,566 rows from 5 years
Broward: 3,750,098 rows from 5 years


In [4]:
test_pbc_df = pd.read_csv('./2020S/NAL_2020_60Palm Beach_S.csv',low_memory=False)

### Column Check to see if data is consistent

In [5]:
cols1 = df_broward.columns
cols2 = df_dade.columns

common        = cols1.intersection(cols2)
only_in_1     = cols1.difference(cols2)
only_in_2     = cols2.difference(cols1)
sym_difference = cols1.symmetric_difference(cols2)  # in either, not both

print(f"Common: {len(common)}\nOnly in main df: {len(only_in_1)}\nOnly in test df: {len(only_in_2)}")
print(f"Only in 1: {only_in_1}")
print(f"Only in 2: {only_in_2}")
print(f"Sym difference: {sym_difference}")

Common: 120
Only in main df: 0
Only in test df: 0
Only in 1: Index([], dtype='object')
Only in 2: Index([], dtype='object')
Sym difference: Index([], dtype='object')


In [6]:
concat_df = pd.concat([df_dade,df_broward,df_palm_beach])

In [7]:
for col in concat_df.columns:
    print(f"{col}\n-----")

ACT_YR_BLT
-----
ALT_KEY
-----
APP_STAT
-----
ASMNT_YR
-----
ASS_DIF_TRNS
-----
ASS_TRNSFR_FG
-----
ATV_STRT
-----
AV_CLASS_USE
-----
AV_CONSRV_LND
-----
AV_H2O_RECHRGE
-----
AV_HIST_COM_PROP
-----
AV_HIST_SIGNF
-----
AV_HMSTD
-----
AV_NON_HMSTD_RESD
-----
AV_NSD
-----
AV_RESD_NON_RESD
-----
AV_SD
-----
AV_WRKNG_WTRFNT
-----
BAS_STRT
-----
CENSUS_BK
-----
CLERK_NO1
-----
CLERK_NO2
-----
CONO_PRV_HM
-----
CONST_CLASS
-----
CO_APP_STAT
-----
CO_NO
-----
DEL_VAL
-----
DISTR_CD
-----
DISTR_YR
-----
DOR_UC
-----
DT_LAST_INSPT
-----
EFF_YR_BLT
-----
EXEMPTIONS
-----
FIDU_ADDR1
-----
FIDU_ADDR2
-----
FIDU_CD
-----
FIDU_CITY
-----
FIDU_NAME
-----
FIDU_STATE
-----
FIDU_ZIPCD
-----
FILE_T
-----
GRP_NO
-----
IMP_QUAL
-----
JV
-----
JV_CHNG
-----
JV_CHNG_CD
-----
JV_CLASS_USE
-----
JV_CONSRV_LND
-----
JV_H2O_RECHRGE
-----
JV_HIST_COM_PROP
-----
JV_HIST_SIGNF
-----
JV_HMSTD
-----
JV_NON_HMSTD_RESD
-----
JV_RESD_NON_RESD
-----
JV_WRKNG_WTRFNT
-----
LND_SQFOOT
-----
LND_UNTS_CD
-----
LND_VAL
-----
MK

In [8]:
keep_cols = [
    "PHY_CITY",
    "DOR_UC",
    "SALE_PRC1",
    "SALE_MO1",
    "SALE_YR1",
    "RS_ID",
    "county"
]

In [9]:
filtered_df = concat_df[keep_cols]

In [10]:
filtered_df[(filtered_df['PHY_CITY'] == 'Aventura') & 
    (filtered_df['SALE_MO1'] == 3) & 
    (filtered_df['SALE_YR1'] == 2024) & 
    (filtered_df['DOR_UC'] == 1)]

Unnamed: 0,PHY_CITY,DOR_UC,SALE_PRC1,SALE_MO1,SALE_YR1,RS_ID,county
4072860,Aventura,1,1239000.0,3.0,2024.0,276B,Dade


In [11]:
filtered_df['SALE_PRC1'].isna().value_counts()

SALE_PRC1
True     10373028
False     1207893
Name: count, dtype: int64

In [12]:
nona_df = filtered_df.dropna(subset='SALE_PRC1')

In [13]:
nona_df['county'].value_counts()

county
Dade          432046
Broward       405218
Palm Beach    370629
Name: count, dtype: int64

In [14]:
nona_df.assign?

[0;31mSignature:[0m [0mnona_df[0m[0;34m.[0m[0massign[0m[0;34m([0m[0;34m**[0m[0mkwargs[0m[0;34m)[0m [0;34m->[0m [0;34m'DataFrame'[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Assign new columns to a DataFrame.

Returns a new object with all original columns in addition to new ones.
Existing columns that are re-assigned will be overwritten.

Parameters
----------
**kwargs : dict of {str: callable or Series}
    The column names are keywords. If the values are
    callable, they are computed on the DataFrame and
    assigned to the new columns. The callable must not
    change input DataFrame (though pandas doesn't check it).
    If the values are not callable, (e.g. a Series, scalar, or array),
    they are simply assigned.

Returns
-------
DataFrame
    A new DataFrame with the new columns in addition to
    all the existing columns.

Notes
-----
Assigning multiple columns within the same ``assign`` is possible.
Later items in '\*\*kwargs' may refer to newly cr

In [15]:
nona_df = nona_df.assign(
    county=lambda d: d["county"].replace({"Dade": "Miami-Dade"}),  # exact-match replace
    PHY_CITY=lambda d: d["PHY_CITY"].str.strip().str.title()       # trim + title-case
)

In [16]:
use_codes_df = pd.read_csv("DOR_FLA_USE_CODES.csv",index_col=0)

In [17]:
use_codes_df

Unnamed: 0,Use Code,Definition
1,0.0,Vacant Residential
2,1.0,Single Family
3,2.0,Mobile Homes
4,4.0,Condominiums
5,5.0,Cooperatives
...,...,...
102,95.0,"Rivers and lakes, submerged lands"
103,96.0,"Sewage disposal, solid waste, borrow pits, dra..."
104,97.0,"Outdoor recreational or parkland, or high-wate..."
106,98.0,Centrally assessed


In [18]:
merged_df = pd.merge(how='left', left=nona_df, left_on='DOR_UC', right=use_codes_df, right_on='Use Code')

In [19]:
len(merged_df) == len(nona_df)

True

In [20]:
merged_df['Definition'].isna().value_counts()

Definition
False    1207893
Name: count, dtype: int64

In [21]:
merged_df

Unnamed: 0,PHY_CITY,DOR_UC,SALE_PRC1,SALE_MO1,SALE_YR1,RS_ID,county,Use Code,Definition
0,Miami,3,4949600.0,8.0,2019.0,1D37,Miami-Dade,3.0,Multi-family - 10 units or more
1,Miami,8,4949600.0,8.0,2019.0,1D37,Miami-Dade,8.0,Multi-family - fewer than 10 units
2,Miami,8,4949600.0,8.0,2019.0,1D37,Miami-Dade,8.0,Multi-family - fewer than 10 units
3,Miami,8,4949600.0,8.0,2019.0,1D37,Miami-Dade,8.0,Multi-family - fewer than 10 units
4,Miami,3,4146000.0,10.0,2019.0,1D37,Miami-Dade,3.0,Multi-family - 10 units or more
...,...,...,...,...,...,...,...,...,...
1207888,Westlake,1,942051.0,11.0,2023.0,2742,Palm Beach,1.0,Single Family
1207889,Westlake,1,964460.0,11.0,2023.0,2742,Palm Beach,1.0,Single Family
1207890,Westlake,1,1004284.0,12.0,2023.0,2742,Palm Beach,1.0,Single Family
1207891,Westlake,1,1103067.0,9.0,2023.0,2742,Palm Beach,1.0,Single Family


In [22]:
import numpy as np
import pandas as pd

# ---- CONFIG ----
DEF_COL = "Definition"

# normalize
s = merged_df[DEF_COL].fillna("").str.strip().str.lower()

# helper to build masks
def has(pat):
    return s.str.contains(pat, regex=True, na=False)

# FIRST-MATCH-WINS order (more specific → more general)
rules = [
    # Mixed-Use
    (has(r"\bmixed use\b"), "Mixed-Use"),

    # Cooperatives / Condos
    (has(r"\bcooperatives?\b|\bco-?ops?\b"), "Cooperatives"),
    (has(r"condominiums?|\bresidential common elements/areas\b|\bcommon elements\b"),
     "Condo"),

    # Multi-Family
    (has(r"mobile home parks?"), "Multi-Family Dwelling"),  # handled before parking lots
    (has(r"multi[- ]family"), "Multi-Family Dwelling"),
    (has(r"retirement homes?"), "Multi-Family Dwelling"),
    (has(r"miscellaneous residential|boarding homes|migrant camps"), "Multi-Family Dwelling"),

    # Single-Family
    (has(r"\bsingle family\b"), "Single-Family Home"),
    (has(r"\bmobile homes?\b"), "Single-Family Home"),

    # Office
    (has(r"office buildings?|professional service buildings?|insurance company offices?"),
     "Office"),
    (has(r"financial institutions?"), "Office"),

    # Retail (entertainment, food, auto sales, shops, centers, service stations, florists)
    (has(r"stores?\b|shopping centers?|department stores?|supermarkets|restaurants|"
         r"drive-?in restaurants|nightclubs|cocktail lounges|bars|theaters|auditoriums|"
         r"tourist attractions|exhibits|fairgrounds|bowling alleys|skating rinks|pool halls|"
         r"arenas|drive-?in theaters|open stadiums|auto sales|auto rental|mobile home sales|"
         r"motorcycles|construction vehicle sales|service stations|florists|greenhouses"),
     "Retail"),

    # Industrial (production/wholesale/repair/infrastructure)
    (has(r"wholesale outlets|manufacturing outlets|produce houses"), "Industrial"),
    (has(r"repair service shops .*radio.*t\.?v\.?|refrigeration service|electric repair|laundries|laundromats"),
     "Industrial"),
    (has(r"airports|bus terminals|marine terminals|piers|marinas"), "Industrial"),

    # Land / open-use
    (has(r"vacant residential|vacant commercial|parking lots|golf courses|driving ranges|camps"),
     "Land"),
]

# apply precedence
simple = pd.Series(pd.NA, index=merged_df.index, dtype="object")
for mask, label in rules:
    simple = simple.mask(mask & simple.isna(), label)

merged_df = (
    merged_df.assign(simple_definition=simple)
             .loc[lambda d: d["simple_definition"].notna()]  # drop everything else
             .copy()
)

# make categorical (ordered: residential first, then commercial)
order = [
    "Condo", "Single-Family Home", "Multi-Family Dwelling", "Cooperatives",
    "Industrial", "Land", "Mixed-Use", "Office", "Retail",
]
merged_df["simple_definition"] = pd.Categorical(merged_df["simple_definition"], categories=order, ordered=True)

# audit coverage
print("Mapped rows:", len(merged_df))
unmapped = (simple.isna()).sum()
print("Unmapped (dropped):", unmapped)

# (Optional) see what we missed, to refine rules
missed = merged_df.loc[simple.isna(), DEF_COL] if unmapped else pd.Series([], dtype="object")
print(missed.value_counts().head(20))


Mapped rows: 1176904
Unmapped (dropped): 30989
Series([], Name: count, dtype: int64)


In [23]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1176904 entries, 0 to 1207892
Data columns (total 10 columns):
 #   Column             Non-Null Count    Dtype   
---  ------             --------------    -----   
 0   PHY_CITY           1173264 non-null  object  
 1   DOR_UC             1176904 non-null  int64   
 2   SALE_PRC1          1176904 non-null  float64 
 3   SALE_MO1           1176904 non-null  float64 
 4   SALE_YR1           1176904 non-null  float64 
 5   RS_ID              1176904 non-null  object  
 6   county             1176904 non-null  object  
 7   Use Code           1176904 non-null  float64 
 8   Definition         1176904 non-null  object  
 9   simple_definition  1176904 non-null  category
dtypes: category(1), float64(4), int64(1), object(4)
memory usage: 90.9+ MB


In [24]:
merged_df

Unnamed: 0,PHY_CITY,DOR_UC,SALE_PRC1,SALE_MO1,SALE_YR1,RS_ID,county,Use Code,Definition,simple_definition
0,Miami,3,4949600.0,8.0,2019.0,1D37,Miami-Dade,3.0,Multi-family - 10 units or more,Multi-Family Dwelling
1,Miami,8,4949600.0,8.0,2019.0,1D37,Miami-Dade,8.0,Multi-family - fewer than 10 units,Multi-Family Dwelling
2,Miami,8,4949600.0,8.0,2019.0,1D37,Miami-Dade,8.0,Multi-family - fewer than 10 units,Multi-Family Dwelling
3,Miami,8,4949600.0,8.0,2019.0,1D37,Miami-Dade,8.0,Multi-family - fewer than 10 units,Multi-Family Dwelling
4,Miami,3,4146000.0,10.0,2019.0,1D37,Miami-Dade,3.0,Multi-family - 10 units or more,Multi-Family Dwelling
...,...,...,...,...,...,...,...,...,...,...
1207888,Westlake,1,942051.0,11.0,2023.0,2742,Palm Beach,1.0,Single Family,Single-Family Home
1207889,Westlake,1,964460.0,11.0,2023.0,2742,Palm Beach,1.0,Single Family,Single-Family Home
1207890,Westlake,1,1004284.0,12.0,2023.0,2742,Palm Beach,1.0,Single Family,Single-Family Home
1207891,Westlake,1,1103067.0,9.0,2023.0,2742,Palm Beach,1.0,Single Family,Single-Family Home


In [25]:
import pandas as pd
from io import StringIO

# Start from your prepared merged_df (already has 'simple_definition')
df = merged_df.copy()

# Keep only the six columns you care about (and coerce types sensibly)
keep = ["PHY_CITY", "SALE_PRC1", "SALE_MO1", "SALE_YR1", "county", "simple_definition"]
df = df[keep].copy()

# Clean month/year for grouping and display
df["SALE_MO1"] = pd.to_numeric(df["SALE_MO1"], errors="coerce").astype("Int64")
df["SALE_YR1"] = pd.to_numeric(df["SALE_YR1"], errors="coerce").astype("Int64")
df["SALE_PRC1"] = pd.to_numeric(df["SALE_PRC1"], errors="coerce")

# Drop rows missing month/year
df = df.loc[df["SALE_MO1"].notna() & df["SALE_YR1"].notna()].copy()

# ----- DETAILS CSV (rows) -----
details = (
    df.sort_values(
        ["county", "PHY_CITY", "simple_definition", "SALE_YR1", "SALE_MO1"]
    ).reset_index(drop=True)
)
details.to_csv("deal_groups_details.csv", index=False)

# ----- SUMMARY CSV (counts per combo) -----
keys = ["county", "PHY_CITY", "simple_definition", "SALE_YR1", "SALE_MO1"]
summary = (
    df.groupby(keys, observed=True)
      .size().reset_index(name="number_of_deals")
      .sort_values(keys).reset_index(drop=True)
)
summary.to_csv("deal_groups_summary.csv", index=False)

# ----- TXT REPORT (sections + per-row table with only your six cols) -----
def ym_str(y, m):
    return f"{int(y):04d}-{int(m):02d}"

report = StringIO()
for _, s in summary.iterrows():
    mask = (
        (details["county"] == s["county"]) &
        (details["PHY_CITY"] == s["PHY_CITY"]) &
        (details["simple_definition"] == s["simple_definition"]) &
        (details["SALE_YR1"] == s["SALE_YR1"]) &
        (details["SALE_MO1"] == s["SALE_MO1"])
    )
    g = details.loc[mask, keep]
    report.write(
        f"\n{'='*100}\n"
        f"{s['PHY_CITY']} | {s['county']} | {s['simple_definition']} | {ym_str(s['SALE_YR1'], s['SALE_MO1'])}\n"
        f"Deals: {int(s['number_of_deals'])}\n"
        f"{'-'*100}\n"
    )
    report.write(g.to_string(index=False))
    report.write("\n")

with open("deal_groups_report.txt", "w", encoding="utf-8") as f:
    f.write(report.getvalue())

# ----- RAW ROWS JSON (only your six columns) -----
details.to_json("fl_market_rows.json", orient="records", indent=2)

# ----- (UNCHANGED) PAGE JSON for the UI (aggregated monthly metrics) -----
# Build year_month, then aggregate to the schema your page expects.
out_for_page = (
    details.assign(
        year_month=lambda d: d["SALE_YR1"].astype(str).str.zfill(4)
                           + "-" + d["SALE_MO1"].astype(str).str.zfill(2),
        borough=lambda d: d["county"],
        neighborhood=lambda d: d["PHY_CITY"],
        property_group=lambda d: d["simple_definition"],
    )
    .groupby(["borough", "neighborhood", "property_group", "year_month"], observed=True)
    .agg(
        sales_volume=("SALE_PRC1", "sum"),
        avg_price=("SALE_PRC1", "mean"),
        median_price=("SALE_PRC1", "median"),
        number_of_deals=("SALE_PRC1", "size"),
    )
    .reset_index()
    .sort_values(["borough", "neighborhood", "property_group", "year_month"])
    .reset_index(drop=True)
)

out_for_page.to_json("fl_market.json", orient="records", indent=2)

print(
    "Wrote:\n"
    " - deal_groups_details.csv (six columns only)\n"
    " - deal_groups_summary.csv (six columns + number_of_deals)\n"
    " - deal_groups_report.txt (six-column tables)\n"
    " - fl_market_rows.json (six columns only)\n"
    " - fl_market.json (aggregated for the webpage)"
)


Wrote:
 - deal_groups_details.csv (six columns only)
 - deal_groups_summary.csv (six columns + number_of_deals)
 - deal_groups_report.txt (six-column tables)
 - fl_market_rows.json (six columns only)
 - fl_market.json (aggregated for the webpage)


In [26]:
df[(df['PHY_CITY'] == 'Aventura') & 
    (df['SALE_MO1'] == 3) & 
    (df['SALE_YR1'] == 2024) & 
    (df['simple_definition'] == 'Single-Family Home')]

Unnamed: 0,PHY_CITY,SALE_PRC1,SALE_MO1,SALE_YR1,county,simple_definition
388627,Aventura,1239000.0,3,2024,Miami-Dade,Single-Family Home


In [27]:
df[(df['PHY_CITY'] == 'Aventura') & 
    (df['SALE_MO1'] == 2) & 
    (df['SALE_YR1'] == 2022) & 
    (df['simple_definition'] == 'Single-Family Home')]['SALE_PRC1'].median()

25384800.0

In [28]:
# df = merged_df.copy()

# # 1) Clean month/year and build YYYY-MM
# m = pd.to_numeric(df["SALE_MO1"], errors="coerce").astype("Int64")
# y = pd.to_numeric(df["SALE_YR1"], errors="coerce").astype("Int64")

# df = df.loc[m.notna() & y.notna()].copy()
# df["SALE_MO1"] = m
# df["SALE_YR1"] = y
# df["year_month"] = y.astype(str).str.zfill(4) + "-" + m.astype(str).str.zfill(2)

# # 2) Group columns (your deal identity)
# group_cols = ["county", "PHY_CITY", "simple_definition", "year_month"]

# # 3) Aggregate — number_of_deals = group size
# out = (
#     df.groupby(group_cols, observed=True)
#       .agg(
#           sales_volume=("SALE_PRC1", "sum"),
#           avg_price=("SALE_PRC1", "mean"),
#           median_price=("SALE_PRC1", "median"),
#           number_of_deals=("SALE_PRC1", "size"),   # 👈 count rows in the combo
#       )
#       .reset_index()
#       .rename(columns={
#           "county": "borough",
#           "PHY_CITY": "neighborhood",
#           "simple_definition": "property_group",
#       })
#       .sort_values(["borough", "neighborhood", "year_month"])
#       .reset_index(drop=True)
# )

# # (Optional) if you don't want these at all, you can skip adding them
# # out["avg_asking_rent"] = pd.NA
# # out["PPSF"] = pd.NA

# # Quick sanity check: group sizes equal number_of_deals
# assert (
#     df.groupby(group_cols, observed=True).size().reset_index(name="n")["n"].sum()
#     == out["number_of_deals"].sum()
# )

# # Save JSON for the page
# out.to_json("fl_market.json", orient="records", indent=2)
# print("Wrote fl_market.json with number_of_deals = group size for the combo.")


In [29]:
df.groupby?

[0;31mSignature:[0m
[0mdf[0m[0;34m.[0m[0mgroupby[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mby[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0maxis[0m[0;34m:[0m [0;34m'Axis | lib.NoDefault'[0m [0;34m=[0m [0;34m<[0m[0mno_default[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlevel[0m[0;34m:[0m [0;34m'IndexLabel | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mas_index[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msort[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mgroup_keys[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mobserved[0m[0;34m:[0m [0;34m'bool | lib.NoDefault'[0m [0;34m=[0m [0;34m<[0m[0mno_default[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdropna[0m[0;34m:[0m [0;34m'bool'[0m [0;34m