In [1]:
import pandas as pd
from pathlib import Path
import sys
import os

In [2]:
# Metric columns that must hold numeric data in every input CSV.
NUMERIC_COLUMNS = [
    "total_inventory_q",
    "vacancy_q",
    "net_absorption_q",
    "under_construction_q",
    "rent_q",
    "delivered_q",
    "leasing_activity_q",
]

# Every column an input CSV is required to contain: the three descriptive
# columns followed by the numeric metric columns.
EXPECTED_COLUMNS = [
    "primary_submarket",
    "secondary_submarket",
    "property_type",
    *NUMERIC_COLUMNS,
]

# Raw CSV header -> column header used in the combined master file.
RENAMING_COLUMNS = dict(
    primary_submarket="Submarket",
    secondary_submarket="Secondary Submarket",
    property_type="Property Type",
    total_inventory_q="Inventory SF",
    vacancy_q="Vacancy Q",
    net_absorption_q="Net Absorption Q",
    under_construction_q="Under Construction Q",
    rent_q="Asking Rent Q",
    delivered_q="Delivered Q",
    leasing_activity_q="Leasing Activity Q",
)

In [3]:
def clean_rent_range(value):
    """
    Collapse a textual rent range such as '10 to 20' into its midpoint.

    Args:
        value: A single cell value from the rent column.

    Returns:
        The average of the two bounds as a float when `value` is a string of
        the form '<number> to <number>'; otherwise `value` is returned as-is
        (non-strings, strings without ' to ', and ranges whose parts do not
        parse as exactly two floats).
    """
    if not isinstance(value, str) or ' to ' not in value:
        return value

    try:
        # Unpacking fails with ValueError unless there are exactly two parts.
        lower, upper = (float(part) for part in value.split(' to '))
    except (ValueError, IndexError):
        return value

    return (lower + upper) / 2

In [4]:
def validate_csv_file(file_path, root_dir):
    """
    Check one CSV file for structural and numeric problems.

    Args:
        file_path: Path of the CSV file to check.
        root_dir: Directory that `file_path` is reported relative to.

    Returns:
        A list of human-readable error strings; empty when no problem was
        found. A file that cannot be read, or that lacks any column named in
        EXPECTED_COLUMNS, yields a single error and no further checks run.
    """
    shown_path = file_path.relative_to(root_dir)
    prefix = f"Path '{shown_path}'"

    try:
        frame = pd.read_csv(file_path, index_col=False)
    except Exception as exc:
        return [f"{prefix}: Failed to read or parse. Error: {exc}"]

    # Rent ranges like '10 to 20' are legal input, so collapse them before
    # the numeric check below sees them.
    if 'rent_q' in frame.columns:
        frame['rent_q'] = frame['rent_q'].apply(clean_rent_range)

    absent = [name for name in EXPECTED_COLUMNS if name not in frame.columns]
    if absent:
        return [f"{prefix}: Missing required columns: {', '.join(absent)}"]

    problems = []
    for name in NUMERIC_COLUMNS:
        raw = frame[name]
        coerced = pd.to_numeric(raw, errors='coerce')

        # A cell that held something but became NaN could not be parsed.
        unparsable = raw.notna() & coerced.isna()
        for row in frame.index[unparsable.to_numpy()]:
            # +2: convert the 0-based row label to a 1-based line number that
            # also counts the header line.
            problems.append(
                f"{prefix}: Column '{name}', row {row+2} has non-numeric value: '{raw.loc[row]}'"
            )

        # Net absorption is the only metric exempt from the negative check.
        if name == 'net_absorption_q':
            continue

        below_zero = coerced < 0
        for row in frame.index[below_zero.to_numpy()]:
            problems.append(
                f"{prefix}: Column '{name}', row {row+2} has a negative value: {coerced.loc[row]}"
            )

    return problems

In [5]:
def normalize_percentage(df, column_name):
    """
    Convert a percentage-like column to a fraction rounded to 4 decimals.

    The column is coerced to numeric (unparsable cells become NaN). If any
    value exceeds 1 the whole column is treated as being on a 0-100 scale and
    divided by 100; otherwise it is left on its current scale.

    Args:
        df: DataFrame to update; it is modified in place.
        column_name: Name of the column to normalize.

    Returns:
        The same `df` object. When `column_name` is not a column of `df`,
        nothing is changed.
    """
    if column_name not in df.columns:
        return df

    values = pd.to_numeric(df[column_name], errors='coerce')
    if values.gt(1).any():
        values = values / 100

    df[column_name] = values.round(4)
    return df

In [6]:
def process_period_folder(directory_path, period):
    """
    Load every CSV in one period folder and stack them into one DataFrame.
    (Formerly process_quarterly_folder)

    For each '*.csv' file directly inside `directory_path`:
      * 'rent_q' (when present) has textual ranges collapsed by
        clean_rent_range and is then coerced to numeric,
      * 'vacancy_q' is passed through normalize_percentage,
      * a 'Period' column (the `period` argument) and a 'Market' column
        (the file name without extension) are inserted as the first two
        columns.

    Args:
        directory_path: pathlib.Path of the folder holding the CSV files.
        period: Label written to the 'Period' column of every row.

    Returns:
        The concatenation of all processed files with a fresh index, or an
        empty DataFrame when the folder contains no CSV files.
    """
    period_frames = []

    # Path.glob yields entries in arbitrary filesystem order; sorting makes
    # the row order of the output reproducible from run to run.
    for file_path in sorted(directory_path.glob('*.csv')):
        market_name = file_path.stem
        df = pd.read_csv(file_path, index_col=False)

        # Apply the cleaning logic
        if 'rent_q' in df.columns:
            df['rent_q'] = df['rent_q'].apply(clean_rent_range)
            # Immediately convert the cleaned column to a numeric type.
            df['rent_q'] = pd.to_numeric(df['rent_q'], errors='coerce')

        df = normalize_percentage(df, 'vacancy_q')
        df.insert(0, "Period", period)
        df.insert(1, "Market", market_name)
        period_frames.append(df)

    if not period_frames:
        return pd.DataFrame()
    return pd.concat(period_frames, ignore_index=True)


In [7]:
def _parse_period_name(period_name):
    """Split a label like '2021 Q1' into (year, type letter, number); None if it does not fit."""
    tokens = period_name.split()
    if len(tokens) != 2:
        return None
    year, label = tokens
    if not (year.isdecimal() and label[:1].upper() in ('Q', 'H') and label[1:].isdecimal()):
        return None
    return int(year), label[0], int(label[1:])


def run_data_pipeline(root_directory_str, output_directory_str, broker="CBRE"):
    """
    Validate, clean and combine all period CSV folders under a root directory.

    Phase 1 validates every CSV in every '<year>/<year> Q<n> csvs' and
    '<year>/<year> H<n> csvs' folder (and the folder names themselves). If
    anything fails, all errors are printed and nothing is written.

    Phase 2 writes one '<period>.csv' per folder into the output directory and
    a 'combined.csv' master file with 'Broker', 'Year', 'Period_Type' and
    'Period_Number' leading columns and headers renamed via RENAMING_COLUMNS.

    Args:
        root_directory_str: Directory containing the per-year folders.
        output_directory_str: Directory for the generated CSVs; created if
            it does not exist.
        broker: Value written to the 'Broker' column of the master file.

    Returns:
        None. Progress and errors are reported with print().
    """
    root_dir = Path(root_directory_str)
    output_dir = Path(output_directory_str)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Find both quarterly and half-yearly data folders. A set is used because
    # one folder name can satisfy both patterns; only directories qualify.
    all_data_folders = sorted({
        folder_path
        for pattern in ('202*/202* Q* csvs', '202*/202* H* csvs')
        for folder_path in root_dir.glob(pattern)
        if folder_path.is_dir()
    })

    if not all_data_folders:
        print(f"Error: No data folders found matching the required patterns in '{root_dir}'.")
        return

    print("--- Starting Phase 1: Global Validation ---")

    global_errors = []
    period_parts = {}  # period label -> (year, type letter, number)
    for folder_path in all_data_folders:
        relative_folder = folder_path.relative_to(root_dir)
        print(f"  Validating files in '{relative_folder}'...")

        # Check the folder name now so a malformed period label is reported
        # before any output file is written, instead of failing at the end.
        period_name = folder_path.name.replace(' csvs', '')
        parsed = _parse_period_name(period_name)
        if parsed is None:
            global_errors.append(
                f"Path '{relative_folder}': Folder name must look like "
                f"'<year> Q<n> csvs' or '<year> H<n> csvs'."
            )
        else:
            period_parts[period_name] = parsed

        for file_path in folder_path.glob('*.csv'):
            global_errors.extend(validate_csv_file(file_path, root_dir))

    if global_errors:
        print("\n" + "="*80)
        print("VALIDATION FAILED. No files were created.")
        print("Please fix the following errors before running the script again:")
        print("="*80)
        for error in global_errors:
            print(f"- {error}")
        print("="*80)
        return

    print("\n--- Validation successful. All files are clean. ---")
    print("--- Starting Phase 2: Processing and File Generation ---\n")

    all_period_dfs = []
    for folder_path in all_data_folders:
        period_name = folder_path.name.replace(' csvs', '')
        print(f"Processing and combining '{period_name}'...")
        combined_period_df = process_period_folder(folder_path, period_name)

        if not combined_period_df.empty:
            output_path = output_dir / f"{period_name}.csv"
            combined_period_df.to_csv(output_path, index=False)
            print(f"  -> Successfully saved to '{output_path}'")
            all_period_dfs.append(combined_period_df)

    if not all_period_dfs:
        print("\nNo data was processed to create a final master file.")
        return

    print("\nCombining all period data into the master file...")
    master_df = pd.concat(all_period_dfs, ignore_index=True)

    master_df['rent_q'] = master_df['rent_q'].round(2)

    # Expand 'Period' into its components using the labels parsed in Phase 1.
    for position, column in enumerate(['Year', 'Period_Type', 'Period_Number']):
        lookup = {name: parts[position] for name, parts in period_parts.items()}
        master_df[column] = master_df['Period'].map(lookup)
    master_df = master_df.drop(columns=['Period'])
    master_df.insert(0, 'Broker', broker)

    # Put the identifying columns first, keep everything else in place.
    start_order = ['Broker', 'Year', 'Period_Type', 'Period_Number']
    end_order = [col for col in master_df.columns if col not in start_order]
    master_df = master_df[start_order + end_order]

    master_df = master_df.rename(columns=RENAMING_COLUMNS)

    master_output_path = output_dir / "combined.csv"
    master_df.to_csv(master_output_path, index=False)

    print("\n" + "*"*80)
    print("PIPELINE COMPLETE!")
    print(f"Final master file created at: '{master_output_path}'")
    print("*"*80)

In [8]:
# Folder searched for the '<year>/<year> Q<n> csvs' and '<year>/<year> H<n> csvs' inputs.
root_directory = r"pdf_data"

# Folder that receives the per-period CSVs and 'combined.csv'; created if missing.
output_directory = r"csv_data"

# Validate every input first, then write the outputs only if validation passes.
run_data_pipeline(root_directory, output_directory)

--- Starting Phase 1: Global Validation ---
  Validating files in '2021\2021 Q1 csvs'...
  Validating files in '2021\2021 Q2 csvs'...
  Validating files in '2021\2021 Q3 csvs'...
  Validating files in '2021\2021 Q4 csvs'...
  Validating files in '2022\2022 H1 csvs'...
  Validating files in '2022\2022 H2 csvs'...
  Validating files in '2022\2022 Q1 csvs'...
  Validating files in '2022\2022 Q2 csvs'...
  Validating files in '2022\2022 Q3 csvs'...
  Validating files in '2022\2022 Q4 csvs'...


  df = pd.read_csv(file_path, index_col=False)


  Validating files in '2023\2023 H1 csvs'...
  Validating files in '2023\2023 H2 csvs'...
  Validating files in '2023\2023 Q1 csvs'...
  Validating files in '2023\2023 Q2 csvs'...
  Validating files in '2023\2023 Q3 csvs'...
  Validating files in '2023\2023 Q4 csvs'...
  Validating files in '2024\2024 H1 csvs'...
  Validating files in '2024\2024 H2 csvs'...
  Validating files in '2024\2024 Q1 csvs'...
  Validating files in '2024\2024 Q2 csvs'...
  Validating files in '2024\2024 Q3 csvs'...
  Validating files in '2024\2024 Q4 csvs'...
  Validating files in '2025\2025 Q1 csvs'...
  Validating files in '2025\2025 Q2 csvs'...

--- Validation successful. All files are clean. ---
--- Starting Phase 2: Processing and File Generation ---

Processing and combining '2021 Q1'...
  -> Successfully saved to 'csv_data\2021 Q1.csv'
Processing and combining '2021 Q2'...
  -> Successfully saved to 'csv_data\2021 Q2.csv'
Processing and combining '2021 Q3'...
  -> Successfully saved to 'csv_data\2021 Q3.

  df = pd.read_csv(file_path, index_col=False)


  -> Successfully saved to 'csv_data\2022 Q4.csv'
Processing and combining '2023 H1'...
  -> Successfully saved to 'csv_data\2023 H1.csv'
Processing and combining '2023 H2'...
  -> Successfully saved to 'csv_data\2023 H2.csv'
Processing and combining '2023 Q1'...
  -> Successfully saved to 'csv_data\2023 Q1.csv'
Processing and combining '2023 Q2'...
  -> Successfully saved to 'csv_data\2023 Q2.csv'
Processing and combining '2023 Q3'...
  -> Successfully saved to 'csv_data\2023 Q3.csv'
Processing and combining '2023 Q4'...
  -> Successfully saved to 'csv_data\2023 Q4.csv'
Processing and combining '2024 H1'...
  -> Successfully saved to 'csv_data\2024 H1.csv'
Processing and combining '2024 H2'...
  -> Successfully saved to 'csv_data\2024 H2.csv'
Processing and combining '2024 Q1'...
  -> Successfully saved to 'csv_data\2024 Q1.csv'
Processing and combining '2024 Q2'...
  -> Successfully saved to 'csv_data\2024 Q2.csv'
Processing and combining '2024 Q3'...
  -> Successfully saved to 'csv_