In [3]:
import pandas as pd

def standardize_market_data_final(df):
    """
    Cleans and standardizes only the 'Market' and 'Submarket' columns.
    This version correctly handles interstates even when not preceded by a space.
    """
    # Create copies to work with
    market_col = df['Market'].astype(str).copy()
    submarket_col = df['Submarket'].astype(str).copy()

    # Comprehensive dictionary for standardizing abbreviations
    abbreviations = {
        # Directions
        r'\bN\b': 'North',
        r'\bS\b': 'South',
        r'\bE\b': 'East',
        r'\bW\b': 'West',
        r'\bNe\b': 'Northeast',
        r'\bNw\b': 'Northwest',
        r'\bSe\b': 'Southeast',
        r'\bSw\b': 'Southwest',
        # Common terms
        r'\bSt\b': 'Saint',
        r'\bFt\b': 'Fort',
        r'\bBalt\b': 'Baltimore',
        r'&': 'and'
    }

    # Apply standardization steps to only the Market and Submarket columns
    for col in [market_col, submarket_col]:
        # 1. Start with lowercase and strip whitespace
        col = col.str.lower().str.strip()

        # 2. NEW: Convert '1-' to 'i-' at the start. Handles "Atlanta1-" -> "atlantai-"
        col = col.str.replace('1-', 'i-', regex=False)
        
        # 3. Remove periods that are not part of a specific acronym (like d.c.)
        col = col.str.replace(r'\.(?!\s*c\.)', '', regex=True)

        # 4. Standardize punctuation - NOTE: We now leave hyphens alone.
        col = col.str.replace(r'[/–]', ' ', regex=True)

        # 5. Apply title case. This correctly converts "i-" to "I-"
        col = col.str.title()
        
        # 6. Replace known abbreviations from the dictionary
        for abbr, full_word in abbreviations.items():
            col = col.str.replace(abbr, full_word, regex=True)
        
        # 7. Final cleanup of extra spaces
        col = col.str.replace(r'\s+', ' ', regex=True).str.strip()

    # Add the cleaned columns back to the original DataFrame
    df['Market_Cleaned'] = market_col
    df['Submarket_Cleaned'] = submarket_col
    df['Market_Submarket_Cleaned'] = df['Market_Cleaned'] + '_' + df['Submarket_Cleaned']

    return df

In [4]:
df = pd.read_csv('combined.csv')

In [5]:
print(df)

     Broker  Year Period_Type  Period_Number      Market  \
0      CBRE  2021           Q              1    Savannah   
1      CBRE  2021           Q              2    Savannah   
2      CBRE  2021           Q              2  Tri Valley   
3      CBRE  2021           Q              2  Tri Valley   
4      CBRE  2021           Q              2  Tri Valley   
...     ...   ...         ...            ...         ...   
8764   CBRE  2025           Q              2  Tri Valley   
8765   CBRE  2025           Q              2  Tri Valley   
8766   CBRE  2025           Q              2  Tri Valley   
8767   CBRE  2025           Q              2  Tri Valley   
8768   CBRE  2025           Q              2  Tri Valley   

                         Submarket Secondary Submarket  Property Type  \
0                  Savannah Market                 NaN        Overall   
1                  Savannah Market                 NaN        Overall   
2                           Dublin                 NaN     I

In [6]:
standardize_market_data_final(df)

Unnamed: 0,Broker,Year,Period_Type,Period_Number,Market,Submarket,Secondary Submarket,Property Type,Inventory SF,Vacancy Q,Net Absorption Q,Under Construction Q,Asking Rent Q,Delivered Q,Leasing Activity Q,Market_Cleaned,Submarket_Cleaned,Market_Submarket_Cleaned
0,CBRE,2021,Q,1,Savannah,Savannah Market,,Overall,77228420.0,0.033,762012.0,8510808.0,5.39,,,Savannah,Savannah Market,Savannah_Savannah Market
1,CBRE,2021,Q,2,Savannah,Savannah Market,,Overall,79163460.0,0.035,2679277.0,9137976.0,5.40,1935040.0,,Savannah,Savannah Market,Savannah_Savannah Market
2,CBRE,2021,Q,2,Tri Valley,Dublin,,Industrial,1316832.0,0.023,0.0,,0.90,,,Tri Valley,Dublin,Tri Valley_Dublin
3,CBRE,2021,Q,2,Tri Valley,Dublin,,Warehouse,1266192.0,0.024,0.0,,0.90,,,Tri Valley,Dublin,Tri Valley_Dublin
4,CBRE,2021,Q,2,Tri Valley,Dublin,,Manufacturing,50640.0,0.000,0.0,,0.95,,,Tri Valley,Dublin,Tri Valley_Dublin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8764,CBRE,2025,Q,2,Tri Valley,San Ramon,,Warehouse,983969.0,0.007,-2834.0,,2.19,,,Tri Valley,San Ramon,Tri Valley_San Ramon
8765,CBRE,2025,Q,2,Tri Valley,San Ramon,,Manufacturing,,,,,,,,Tri Valley,San Ramon,Tri Valley_San Ramon
8766,CBRE,2025,Q,2,Tri Valley,Tri-Valley Ind. Market Total,,Industrial,20204357.0,0.075,-117168.0,760243.0,1.17,,,Tri Valley,Tri-Valley Ind. Market Total,Tri Valley_Tri-Valley Ind. Market Total
8767,CBRE,2025,Q,2,Tri Valley,Tri-Valley Ind. Market Total,,Warehouse,17342161.0,0.072,-94837.0,760243.0,1.18,,,Tri Valley,Tri-Valley Ind. Market Total,Tri Valley_Tri-Valley Ind. Market Total


In [7]:
print(df)

     Broker  Year Period_Type  Period_Number      Market  \
0      CBRE  2021           Q              1    Savannah   
1      CBRE  2021           Q              2    Savannah   
2      CBRE  2021           Q              2  Tri Valley   
3      CBRE  2021           Q              2  Tri Valley   
4      CBRE  2021           Q              2  Tri Valley   
...     ...   ...         ...            ...         ...   
8764   CBRE  2025           Q              2  Tri Valley   
8765   CBRE  2025           Q              2  Tri Valley   
8766   CBRE  2025           Q              2  Tri Valley   
8767   CBRE  2025           Q              2  Tri Valley   
8768   CBRE  2025           Q              2  Tri Valley   

                         Submarket Secondary Submarket  Property Type  \
0                  Savannah Market                 NaN        Overall   
1                  Savannah Market                 NaN        Overall   
2                           Dublin                 NaN     I

In [None]:
# df.to_csv('fixed_combined.csv')

In [9]:
new_df = pd.read_csv("CBRE_ArcGIS_data.csv")

In [10]:
standardize_market_data_final(new_df)

Unnamed: 0,FID,Submarket,Market,Market_Cleaned,Submarket_Cleaned,Market_Submarket_Cleaned
0,0,Albany,Albany,Albany,Albany,Albany_Albany
1,1,Columbia,Albany,Albany,Columbia,Albany_Columbia
2,2,Fulton,Albany,Albany,Fulton,Albany_Fulton
3,3,Greene,Albany,Albany,Greene,Albany_Greene
4,4,Montgomery,Albany,Albany,Montgomery,Albany_Montgomery
...,...,...,...,...,...,...
814,814,,,,,_
815,815,Frederick,Suburban Maryland,Suburban Maryland,Frederick,Suburban Maryland_Frederick
816,816,,,,,_
817,817,Harrisburg/York,PA I-78/I-81 Corridor,PA I-78/I-81 Corridor,Harrisburg/York,PA I-78/I-81 Corridor_Harrisburg/York


In [11]:
new_df.to_csv("fixed_CBRE_ArcGIS_data.csv")