In [2]:
import json
import pandas as pd
import ast

In [3]:
business_sample = pd.read_csv('business_sample.csv')
OUTPUT_CSV_PATH = 'business_features_processed.csv'
print(business_sample.head()) 

              business_id                             name  \
0  Vvq9QucD0IokLBlkbttd3Q                    Brioche Dorée   
1  qL4Ya3cBmPLIUlXdx0aEsw              Blue Taj Restaurant   
2  e0iCFeakO_y2Jwjm8stW8w                Taste of Szechuan   
3  J6KMedR-L-tP4wKIAsj7tw            Ricky's All Day Grill   
4  rPCdcIzXOeeAFfaUhdJJ6A  Original Joe's Restaurant & Bar   

               address      city state postal_code   latitude   longitude  \
0    1000 Airport Road  Edmonton    AB     T5G 3G2  53.570859 -113.522812   
1    1523 50 Street NW  Edmonton    AB     T6L 7C9  53.444849 -113.418532   
2    3855 99 Street NW  Edmonton    AB     T6E 6H6  53.473223 -113.485271   
3  10140 109 Street NW  Edmonton    AB     T5J 1M7  53.541207 -113.509633   
4   12520 102nd Avenue  Edmonton    AB     T5N 0M3  53.543495 -113.538717   

   stars  review_count  is_open  \
0    3.5             5        0   
1    2.5             7        0   
2    4.5            12        1   
3    2.5            23  

In [4]:
def load_and_process_business_data_from_csv(csv_path):
    core_features = ['business_id', 'name', 'state', 'stars', 'review_count']
    business_sample_core_features = business_sample[core_features].copy()
    # 3. Handle the 'attributes' column
    
    # Convert the string representation of the dictionary into a proper dictionary object
    # We use a custom function with ast.literal_eval for safety and to handle different quote types
    def parse_attributes(attr_str):
        if pd.isna(attr_str):
            return {}
        try:
            # Safely evaluate the string as a Python literal (dictionary)
            # Replace single quotes inside the string with double quotes for JSON compatibility if needed,
            # but ast.literal_eval often handles it.
            # Handle the 'u' prefix for unicode strings that sometimes appears in the sample data
            attr_str = attr_str.replace("u'", "'")
            return ast.literal_eval(attr_str)
        except (ValueError, SyntaxError) as e:
            # Fallback for malformed strings
            print(f"Warning: Could not parse attribute string: {attr_str[:50]}... Error: {e}")
            return {}

    business_sample['attributes_dict'] = business_sample['attributes'].apply(parse_attributes) 

    # Flatten the 'attributes' dictionary into new columns
    attribute_data = []
    for index, row in business_sample.iterrows():
        attributes = row['attributes_dict']
        flat_attrs = {'business_id': row['business_id']}
        
        for key, value in attributes.items():
            # Handle nested objects like BusinessParking
            if isinstance(value, dict):
                for sub_key, sub_value in value.items():
                    # Column name: e.g., 'BusinessParking.garage'
                    flat_attrs[f'{key}.{sub_key}'] = sub_value
            # Handle non-nested attributes
            else:
                # Column name: e.g., 'RestaurantsTakeOut'
                flat_attrs[key] = value
                
        attribute_data.append(flat_attrs)

    df_attributes = pd.DataFrame(attribute_data)
    
    # Merge the core DataFrame with the flattened attributes DataFrame
    df_processed = pd.merge(business_sample_core_features, df_attributes, on='business_id', how='left')

    # 4. Handle the 'categories' column
    # The categories column is already a comma-separated string in your sample,
    # but we'll normalize it to a pipe-separated string for consistency, which 
    # may or may not be necessary depending on your end goal.
    
    def normalize_categories(cat_str):
        if pd.isna(cat_str):
            return None
        # Assuming the CSV has a comma-separated string and we want to normalize the separator
        return '|'.join([c.strip() for c in cat_str.split(',')])

    df_processed['categories'] = business_sample['categories'].apply(normalize_categories)

    # 5. Final Feature Selection (implicitly done)
    # The resulting DataFrame contains:
    # 'business_id', 'name', 'state', 'stars', 'review_count', 
    # all flattened attribute columns (e.g., 'RestaurantsTakeOut', 'BusinessParking.garage', etc.),
    # and 'categories'.

    return df_processed

# --- Execution ---
try:
    # Load and process the data
    processed_df = load_and_process_business_data_from_csv(business_sample)

    # Save the processed DataFrame to a CSV file
    processed_df.to_csv(OUTPUT_CSV_PATH, index=False)
    
    print("\n--- Processing Complete ---")
    print(f"Data successfully saved to {OUTPUT_CSV_PATH}")
    print(f"Total businesses processed: {len(processed_df)}")
    print("\nFirst 5 rows of the processed data (showing key columns):")
    # Show core features and some new flattened attribute columns
    display_cols = ['business_id', 'name', 'stars', 'RestaurantsTakeOut', 
                    'BusinessParking.garage', 'categories']
    print(processed_df.filter(items=display_cols).head())
    print("\nProcessed columns (Total):")
    print(processed_df.columns.tolist())

except FileNotFoundError:
    print(f"ERROR: The file '{business_sample}' was not found. Please ensure it is in the correct directory.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


--- Processing Complete ---
Data successfully saved to business_features_processed.csv
Total businesses processed: 3541

First 5 rows of the processed data (showing key columns):
              business_id                             name  stars  \
0  Vvq9QucD0IokLBlkbttd3Q                    Brioche Dorée    3.5   
1  qL4Ya3cBmPLIUlXdx0aEsw              Blue Taj Restaurant    2.5   
2  e0iCFeakO_y2Jwjm8stW8w                Taste of Szechuan    4.5   
3  J6KMedR-L-tP4wKIAsj7tw            Ricky's All Day Grill    2.5   
4  rPCdcIzXOeeAFfaUhdJJ6A  Original Joe's Restaurant & Bar    4.0   

  RestaurantsTakeOut                                         categories  
0                NaN                         Bakeries|Food|Coffee & Tea  
1               True                                 Restaurants|Indian  
2              False               Chinese|Szechuan|Restaurants|Hot Pot  
3               True          Restaurants|Canadian (New)|American (New)  
4               True  American (New