In [1]:
import pandas as pd
from difflib import get_close_matches
import os

In [2]:
def load_and_copy_data(filepath: str) -> pd.DataFrame | None:
    """
    Loads data from a CSV file into a pandas DataFrame and returns a copy.
    
    Args:
        filepath (str): The path to the CSV file.
        
    Returns:
        pd.DataFrame | None: A copy of the DataFrame if the file exists, otherwise None.
    """
    if not os.path.exists(filepath):
        print(f"❌ Error: File not found at '{filepath}'")
        return None
    
    print(f"✅ Loading data from '{filepath}'...")
    df = pd.read_csv(filepath)
    # Fill NaN values to ensure they are handled consistently
    for col in ['Submarket', 'Secondary Submarket']:
        df[col] = df[col].fillna('N/A')
    return df.copy()

In [3]:
file = r"csv_data\combined.csv"

df = load_and_copy_data(file)
df

✅ Loading data from 'csv_data\combined.csv'...


Unnamed: 0,Broker,Year,Period_Type,Period_Number,Market,Submarket,Secondary Submarket,Property Type,Inventory SF,Vacancy Q,Net Absorption Q,Under Construction Q,Asking Rent Q,Delivered Q,Leasing Activity Q
0,CBRE,2021,Q,1,Boston,Urban,Urbon,Industrial,14512313.0,0.036,-85337.0,,21.93,,
1,CBRE,2021,Q,1,Boston,Metro North,Close-In Suburbs North,Industrial,17910543.0,0.019,-2600.0,,17.68,,
2,CBRE,2021,Q,1,Boston,Metro North,Route 128 - North,Industrial,32260569.0,0.024,286431.0,,14.21,,
3,CBRE,2021,Q,1,Boston,Metro North,Route 495- Northeast,Industrial,19244348.0,0.011,-3205.0,,11.66,,
4,CBRE,2021,Q,1,Boston,Metro North,Route 3 - North,Industrial,17726435.0,0.042,36732.0,,11.87,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10288,CBRE,2025,Q,2,Tri Valley,San Ramon,,Warehouse,983969.0,0.007,-2834.0,,2.19,,
10289,CBRE,2025,Q,2,Tri Valley,San Ramon,,Manufacturing,,,,,,,
10290,CBRE,2025,Q,2,Tri Valley,Tri-Valley Ind. Market Total,,Industrial,20204357.0,0.075,-117168.0,760243.0,1.17,,
10291,CBRE,2025,Q,2,Tri Valley,Tri-Valley Ind. Market Total,,Warehouse,17342161.0,0.072,-94837.0,760243.0,1.18,,


In [4]:
def summarize_submarket_combinations(df: pd.DataFrame) -> pd.DataFrame:
    """
    Groups data by market and time period to show the unique combinations
    of Submarket and Secondary Submarket for each period.

    Args:
        df (pd.DataFrame): The input DataFrame.

    Returns:
        pd.DataFrame: A DataFrame summarizing the unique submarket combinations.
    """
    print("\n" + "="*50)
    print("📋 Summarizing unique Submarket and Secondary Submarket combinations by period...")
    
    # Define the columns to group by and display
    grouping_cols = ['Market', 'Year', 'Period_Type', 'Period_Number', 'Submarket', 'Secondary Submarket']
    
    # Use only the columns that actually exist in the DataFrame
    display_cols = [col for col in grouping_cols if col in df.columns]
    
    if 'Submarket' not in display_cols and 'Secondary Submarket' not in display_cols:
        print("❌ Cannot summarize because 'Submarket' and 'Secondary Submarket' columns are missing.")
        return pd.DataFrame()

    # Get the unique combinations of submarkets for each market and period
    summary_df = df[display_cols].drop_duplicates().reset_index(drop=True)
    
    # Sort for better readability
    sort_cols = [col for col in ['Market', 'Year', 'Period_Number'] if col in display_cols]
    summary_df = summary_df.sort_values(by=sort_cols)
    
    return summary_df
    """
    Identifies markets with inconsistent naming schemes and adds a boolean
    'Is_Inconsistent' column to the DataFrame.

    Args:
        df (pd.DataFrame): The input DataFrame with market data.

    Returns:
        pd.DataFrame: The original DataFrame with the new 'Is_Inconsistent' flag.
    """
    print("🔎 Identifying inconsistent markets to flag...")
    df_flagged = df.copy()
    inconsistent_markets = set()
    
    # Define the columns to group by, checking if 'Period_Type' exists
    grouping_cols = ['Market', 'Year', 'Period_Number']
    if 'Period_Type' in df.columns:
        grouping_cols.insert(2, 'Period_Type')

    for column_to_check in ['Submarket', 'Secondary Submarket']:
        if column_to_check not in df.columns:
            continue
            
        # FIX: Convert all items to str before sorting to avoid TypeError
        agg_function = lambda x: tuple(sorted(str(item) for item in x.unique()))

        # Group by periods to get the naming scheme for each
        schemes = df.groupby(grouping_cols)[column_to_check].apply(agg_function)
        
        # Now, group by market to see if the schemes are consistent over time
        market_schemes = schemes.groupby('Market').unique()
        
        # Identify markets with more than one unique scheme
        for market, unique_schemes in market_schemes.items():
            if len(unique_schemes) > 1:
                inconsistent_markets.add(market)

    if not inconsistent_markets:
        print("✅ All markets have consistent naming schemes!")
        df_flagged['Is_Inconsistent'] = False
    else:
        print(f"⚠️ Found {len(inconsistent_markets)} inconsistent markets: {list(inconsistent_markets)}")
        # Add the flag column based on whether the market is in our set
        df_flagged['Is_Inconsistent'] = df_flagged['Market'].isin(inconsistent_markets)

    return df_flagged

In [5]:
df = summarize_submarket_combinations(df)
print(df)
df


📋 Summarizing unique Submarket and Secondary Submarket combinations by period...
           Market  Year Period_Type  Period_Number             Submarket  \
1225  Albuquerque  2023           Q              2               Airport   
1226  Albuquerque  2023           Q              2              Downtown   
1227  Albuquerque  2023           Q              2          Mesa del Sol   
1228  Albuquerque  2023           Q              2            NE Heights   
1229  Albuquerque  2023           Q              2            North I-25   
...           ...   ...         ...            ...                   ...   
81          tulsa  2022           H              1         South Central   
82          tulsa  2022           H              1             Southeast   
83          tulsa  2022           H              1             Southwest   
84          tulsa  2022           H              1  Outlying Market Area   
85          tulsa  2022           H              1                 Tulsa   

     

Unnamed: 0,Market,Year,Period_Type,Period_Number,Submarket,Secondary Submarket
1225,Albuquerque,2023,Q,2,Airport,
1226,Albuquerque,2023,Q,2,Downtown,
1227,Albuquerque,2023,Q,2,Mesa del Sol,
1228,Albuquerque,2023,Q,2,NE Heights,
1229,Albuquerque,2023,Q,2,North I-25,
...,...,...,...,...,...,...
81,tulsa,2022,H,1,South Central,
82,tulsa,2022,H,1,Southeast,
83,tulsa,2022,H,1,Southwest,
84,tulsa,2022,H,1,Outlying Market Area,


In [6]:
df.to_csv("test_df.csv", index=False)