In [64]:
import json
import pandas as pd
import ast

In [110]:
import pandas as pd
import ast
import numpy as np
from io import StringIO

# --- Configuration ---
INPUT_CSV_PATH = 'business_sample.csv'
OUTPUT_CSV_PATH = 'business_features_processed.csv'
ROW_TO_CHECK = 1 # Index for the Blue Taj Restaurant
# --- End Configuration ---


# 1. Load the data (This time it won't fail!)
print(f"Loading data from: {INPUT_CSV_PATH}")
business_df = pd.read_csv(INPUT_CSV_PATH)

# Function to safely convert string dictionary from CSV to Python dict.
def safe_literal_eval(x):
    """Safely converts string dictionary from CSV to Python dict, handling complex escaping."""
    if pd.isna(x):
        return {}
    try:
        x = x.replace("u'", "'").replace("'", '"').replace('""', '"')
        return ast.literal_eval(x.replace('"', "'"))
    except (ValueError, SyntaxError, TypeError):
        return {}

# Function to safely convert the comma-separated category string to a Python list
def safe_categories_eval(x):
    try:
        if isinstance(x, str):
            return [c.strip() for c in x.split(',')]
        return []
    except Exception:
        return []

# --- Comprehensive Attribute Flattener Function (FINALIZED) ---
def flatten_and_extract_attributes(row):
    """
    Extracts and flattens required attributes, simplifying BusinessParking
    to a single boolean and keeping only the Ambience complexity flag.
    """
    
    flat_attrs = {'business_id': row['business_id']}
    attributes = row['attributes_dict']
    
    # 1. Simple Boolean Attributes (Default is False)
    SIMPLE_ATTRIBUTES = [
        'OutdoorSeating', 'RestaurantsDelivery', 'RestaurantsReservations', 'GoodForKids', 
        'RestaurantsGoodForGroups', 'RestaurantsTakeOut', 'WheelchairAccessible', 
        'RestaurantsTableService', 'GoodForDancing', 'HappyHour', 'CoatCheck', 
        'BusinessAcceptsCreditCards', 'DriveThr', 'ByAppointmentOnly', 'Smoking'
    ]
    
    for key in SIMPLE_ATTRIBUTES:
        value = attributes.get(key, False)
        flat_attrs[key] = bool(value) 

    # 2. Complex/Nested Attribute Extraction
    
    # 2.1. BusinessParking (Simplified to a single 'HasParking' flag)
    parking_dict = attributes.get('BusinessParking', {})
    if not isinstance(parking_dict, dict):
        parking_dict = {}
        
    parking_options = ['garage', 'street', 'validated', 'lot', 'valet']
    
    has_parking = False
    for prefix in parking_options:
        if bool(parking_dict.get(prefix, False)):
            has_parking = True
            break
            
    flat_attrs['HasParking'] = has_parking 
    
    # 2.2. Ambience (Simplified presence flag)
    ambience_dict = attributes.get('Ambience', {})
    if isinstance(ambience_dict, dict) and any(bool(v) for v in ambience_dict.values()):
        flat_attrs['HasAmbienceDetails'] = True
    else:
        flat_attrs['HasAmbienceDetails'] = False
        
    return flat_attrs

# 2. Select the base features and process attributes
core_features = ['business_id', 'name', 'state', 'stars', 'review_count', 'categories', 'attributes']
features_df = business_df[core_features].copy()
features_df['attributes_dict'] = features_df['attributes'].apply(safe_literal_eval)

# 3. Process the 'attributes' column (Extraction)
print("\nProcessing 'attributes' column (Final Extraction)...")

attribute_data_list = features_df.apply(flatten_and_extract_attributes, axis=1).tolist()
df_attributes = pd.DataFrame(attribute_data_list)

# Merge the attributes back into the main DataFrame
features_df = pd.merge(features_df.drop(columns=['attributes', 'attributes_dict']), df_attributes, on='business_id', how='left')


# 4. Process the 'categories' column (Optimized One-Hot Encoding)
print("Processing 'categories' column (Optimized)...")

features_df['category_list'] = features_df['categories'].apply(safe_categories_eval)
all_categories = sorted(list(set(
    cat for sublist in features_df['category_list'] for cat in sublist
)))

category_columns = {}
for category in all_categories:
    col_name = f'Category_{category.replace(" ", "_").replace("&", "and")}'
    category_columns[col_name] = features_df['category_list'].apply(lambda x: 1 if category in x else 0)

categories_df_ohe = pd.DataFrame(category_columns)
features_df = pd.concat([features_df.reset_index(drop=True), categories_df_ohe.reset_index(drop=True)], axis=1)

# --- 5. Final cleanup and Conversion ---
features_df = features_df.drop(columns=['categories', 'category_list'])

# Convert all Boolean attributes to 1/0
bool_cols = features_df.select_dtypes(include='bool').columns
print(f"\nConverting {len(bool_cols)} Boolean attributes to 1/0.")
for col in bool_cols:
    features_df[col] = features_df[col].astype(int)

# 6. Save the final resulting DataFrame
print(f"\nSaving final processed features to: {OUTPUT_CSV_PATH}")
features_df.to_csv(OUTPUT_CSV_PATH, index=False)

# 7. Display final result and column check
print("\n--- Processed DataFrame Head ---")

display_attributes = [
    'RestaurantsTakeOut', 'OutdoorSeating', 'RestaurantsDelivery', 
    'HasParking', 'HasAmbienceDetails'
]
category_display_cols = [col for col in features_df.columns if col.startswith('Category_')][:3]

final_cols = ['business_id', 'name', 'stars', 'review_count'] + display_attributes + category_display_cols
print(features_df[final_cols].head())

print("\n--- Final Column Check ---")
print("Total Features (Categories + Attributes + Core):", len(features_df.columns))

# 8. DETAILED INSPECTION (Final Verification)
print("\n\n#####################################################")
print("### DETAILED INSPECTION: SECOND BUSINESS (INDEX 1) ###")
print("#####################################################")

ROW_TO_CHECK = 1
single_business_row = features_df.iloc[[ROW_TO_CHECK]]

def filter_attributes_for_display(index):
    value = single_business_row.iloc[0].loc[index]
    
    # 1. Keep core identifiers
    if index in ['business_id', 'name', 'state', 'stars', 'review_count']:
        return True
        
    # 2. Keep only attributes/categories that are TRUE (1)
    if (isinstance(value, (int, np.int64)) and value == 1):
        return True
        
    return False

readable_output = single_business_row.T.loc[single_business_row.T.index.to_series().apply(filter_attributes_for_display)]

print(readable_output)

Loading data from: business_sample.csv

Processing 'attributes' column (Final Extraction)...
Processing 'categories' column (Optimized)...

Converting 17 Boolean attributes to 1/0.

Saving final processed features to: business_features_processed.csv

--- Processed DataFrame Head ---
              business_id                             name  stars  \
0  Vvq9QucD0IokLBlkbttd3Q                    Brioche Dor√©e    3.5   
1  qL4Ya3cBmPLIUlXdx0aEsw              Blue Taj Restaurant    2.5   
2  e0iCFeakO_y2Jwjm8stW8w                Taste of Szechuan    4.5   
3  J6KMedR-L-tP4wKIAsj7tw            Ricky's All Day Grill    2.5   
4  rPCdcIzXOeeAFfaUhdJJ6A  Original Joe's Restaurant & Bar    4.0   

   review_count  RestaurantsTakeOut  OutdoorSeating  RestaurantsDelivery  \
0             5                   0               0                    0   
1             7                   0               0                    0   
2            12                   0               0                    0