## 2. Data Loading and Initial Overview 

Our first step in the EDA process is to load the `AB_NYC_2019.csv` dataset. While loading, we handle missing values, create `log_price`, and derive `host_type_category` from `calculated_host_listings_count`, based on findings from the EDA process.

### 2.1 Load Data

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler 
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns


# Host portfolio-size buckets shared by the real-data and dummy-data paths.
# The last edge is open-ended: using ``max() + 1`` produces duplicate or
# decreasing edges (and a ValueError from ``pd.cut``) whenever no host has
# more than 49 listings.
bins_host = [0, 1, 2, 5, 10, 50, np.inf]
labels_host = ['1', '2', '3-5', '6-10', '11-50', '51+']


def _add_host_type_category(frame):
    """Add ``host_type_category`` by bucketing ``calculated_host_listings_count``.

    Buckets are right-closed, e.g. a host with 5 listings falls in '3-5'.
    The frame is modified in place and also returned.
    """
    frame['host_type_category'] = pd.cut(
        frame['calculated_host_listings_count'],
        bins=bins_host,
        labels=labels_host,
        right=True,
    )
    return frame


def _make_dummy_listings(n_rows=5000):
    """Build a random stand-in frame with the columns the later cells read.

    ``log_price`` is computed from the generated ``price`` column so the two
    stay consistent (they were previously two unrelated random draws).
    """
    boroughs = ['Manhattan', 'Brooklyn', 'Queens', 'Bronx', 'Staten Island']
    price = np.random.exponential(150, n_rows)
    # One vectorised draw instead of one ``np.random.choice`` call per row.
    hood_suffix = np.random.choice(boroughs, n_rows)
    return pd.DataFrame({
        'price': price,
        'log_price': np.log1p(price),
        'room_type': np.random.choice(['Entire home/apt', 'Private room', 'Shared room'], n_rows, p=[0.5, 0.45, 0.05]),
        'minimum_nights': np.random.choice([1, 2, 3, 7, 30, 90], n_rows, p=[0.5, 0.2, 0.1, 0.1, 0.05, 0.05]),
        'calculated_host_listings_count': np.random.choice([1, 2, 3, 10, 50, 100], n_rows, p=[0.6, 0.15, 0.1, 0.05, 0.05, 0.05]),
        'number_of_reviews': np.random.randint(0, 100, n_rows),
        'availability_365': np.random.randint(0, 366, n_rows),
        'reviews_per_month': np.random.rand(n_rows) * 5,
        'neighbourhood_group': np.random.choice(boroughs, n_rows),
        'neighbourhood': [f"Hood_{i % 20}_{suffix}" for i, suffix in enumerate(hood_suffix)],
        'id': range(n_rows),
    })


try:
    # Forward slashes resolve on Windows as well as macOS/Linux; the previous
    # backslash-separated path could only ever be found on Windows.
    df = pd.read_csv("../data/AB_NYC_2019.csv")
except FileNotFoundError:
    # No dataset on disk: fall back to synthetic data so later cells still run.
    print("Ensure df is loaded and preprocessed in notebook.")
    df = _make_dummy_listings()
    _add_host_type_category(df)
    print("Using dummy data for script execution.")
else:
    # Listings without reviews have no monthly rate; treat that as zero.
    df['reviews_per_month'] = df['reviews_per_month'].fillna(0)
    df['name'] = df['name'].fillna('Unknown')
    df['host_name'] = df['host_name'].fillna('Unknown')
    # log1p keeps $0 listings finite; only valid for non-negative prices.
    if 'price' in df.columns and df['price'].min() >= 0:
        df['log_price'] = np.log1p(df['price'])
    else:
        print("Warning: 'log_price' not created ('price' missing or contains negative values).")
    _add_host_type_category(df)
    print("DataFrame loaded and initial cleaning/preparation assumed complete.")


# Global plot appearance for the figures drawn after this point:
# matplotlib's 'ggplot' style sheet plus seaborn's "muted" colour palette.
plt.style.use('ggplot')
sns.set_palette("muted")

DataFrame loaded and initial cleaning/preparation assumed complete.


### 2.2 Define Price Tiers (for Neighborhood Profiling)

In [3]:

print("--- Defining Price Tiers ---")

if 'price' in df.columns:
    # Tier boundaries come from the empirical price distribution.
    price_q1 = df['price'].quantile(0.25)    # 25th percentile
    price_q3 = df['price'].quantile(0.75)    # 75th percentile
    price_q95 = df['price'].quantile(0.95)   # 95th percentile
    max_price = df['price'].max()

    print(f"Percentiles used for tiers: Q1=${price_q1:.2f}, Q3=${price_q3:.2f}, Q95=${price_q95:.2f}, Max=${max_price:.2f}")

    # Four tiers need five edges. The lowest edge sits just below zero so a
    # $0 listing lands in the first right-closed bin; the top edge sits just
    # above the maximum so the most expensive listing is included.
    price_labels = ['Budget', 'Mid-Range', 'Premium', 'Upper Premium']
    price_bins = [-0.01, price_q1, price_q3, price_q95, max_price + 1]

    # pd.cut requires strictly increasing edges. Equal quantiles (e.g. Q3 ==
    # Q95 on a very concentrated distribution) violate that.
    is_monotonic = all(lo < hi for lo, hi in zip(price_bins, price_bins[1:]))

    if not is_monotonic:
        print("\nWarning: Bin edges are not strictly monotonic or contain duplicates.")
        print(f"Original calculated bins: {price_bins}")
        # De-duplicate and sort; this shrinks the edge list when quantiles coincide.
        price_bins = sorted(set(price_bins))
        print(f"Adjusted unique sorted bins: {price_bins}")

        if len(price_bins) < 5:  # Need 5 edges for 4 labels
            print("Could not form 4 distinct tiers with current quantiles. Consider reviewing thresholds or using fewer tiers.")
        else:
            print("Proceeding with adjusted unique bins for 4 tiers.")

    if len(price_bins) == len(price_labels) + 1:
        df['price_tier'] = pd.cut(
            df['price'],
            bins=price_bins,
            labels=price_labels,
            right=True,           # (lower_bound, upper_bound]
            include_lowest=True,  # Ensures min value is included
        )

        print("\nValue counts for 'price_tier' (%):")
        print(df['price_tier'].value_counts(normalize=True).sort_index() * 100)

        print("\nPrice Tier Definitions Used:")
        last_tier = len(price_labels) - 1
        for i, label in enumerate(price_labels):
            # Show the artificial outer edges as the real data range: $0 at the
            # bottom and the observed maximum at the top.
            lower_b = 0.0 if np.isclose(price_bins[i], -0.01) else price_bins[i]
            upper_b = max_price if i == last_tier else price_bins[i + 1]
            print(f"  {label}: ${lower_b:.2f} to ${upper_b:.2f}")

    else:
        print("\nError: Could not create price tiers. Number of unique bin edges is insufficient for the desired number of labels.")
        print(f"Final bins considered: {price_bins}")
        print(f"Labels: {price_labels}")
else:
    print("Error: 'price' column not found. Cannot create price tiers.")

--- Defining Price Tiers ---
Percentiles used for tiers: Q1=$69.00, Q3=$175.00, Q95=$355.00, Max=$10000.00

Value counts for 'price_tier' (%):
price_tier
Budget           25.301156
Mid-Range        49.794458
Premium          19.912056
Upper Premium     4.992331
Name: proportion, dtype: float64

Price Tier Definitions Used:
  Budget: $0.00 to $69.00
  Mid-Range: $69.00 to $175.00
  Premium: $175.00 to $355.00
  Upper Premium: $355.00 to $10000.00


### 2.3 Aggregate Features to Create Neighborhood Profiles (for Similarity)

In [4]:
print("--- Creating Neighbourhood Profiles for Similarity ---")

# A neighbourhood name is only meaningful together with its borough.
neighbourhood_keys = ['neighbourhood_group', 'neighbourhood']


def _group_proportions(frame, column):
    """Return, per neighbourhood, the share of listings in each ``column`` category.

    The result has one row per neighbourhood and one column per category;
    categories absent from a neighbourhood get 0.
    """
    return (
        frame.groupby(neighbourhood_keys)[column]
        .value_counts(normalize=True)
        .unstack(fill_value=0)
    )


# Fallback only: derive host_type_category when the loading cell did not.
# Labels match the ones used by the data-loading cell so the resulting
# 'host_type_prop_*' column names do not depend on which cell created the
# category. The open-ended last edge avoids a pd.cut error when no host has
# more than 49 listings.
if 'host_type_category' not in df.columns and 'calculated_host_listings_count' in df.columns:
    bins_host = [0, 1, 2, 5, 10, 50, np.inf]
    labels_host = ['1', '2', '3-5', '6-10', '11-50', '51+']
    df['host_type_category'] = pd.cut(df['calculated_host_listings_count'], bins=bins_host, labels=labels_host, right=True)

# Features to aggregate for neighbourhood character
agg_functions = {
    'log_price': ['median', 'std'],           # Typical log price and its spread
    'price': ['median'],                      # Median raw price for easier interpretation
    'minimum_nights': ['median'],
    'number_of_reviews': ['median', 'mean'],  # Review volume as a quality/establishment proxy
    'id': 'count',                            # Listing density
}

neighbourhood_profiles = df.groupby(neighbourhood_keys).agg(agg_functions)

# agg() with a dict of lists yields two-level column labels such as
# ('log_price', 'median'); flatten them to 'log_price_median'.
neighbourhood_profiles.columns = [
    '_'.join(col).strip() if isinstance(col, tuple) else col
    for col in neighbourhood_profiles.columns
]
neighbourhood_profiles = neighbourhood_profiles.rename(columns={'id_count': 'listing_count'})

# Add room_type proportions (spaces and slashes replaced for cleaner column names)
room_type_props = _group_proportions(df, 'room_type')
room_type_props.columns = [f'room_type_prop_{col.replace(" ", "_").replace("/", "_")}' for col in room_type_props.columns]
neighbourhood_profiles = neighbourhood_profiles.join(room_type_props)

# Add price_tier proportions
if 'price_tier' in df.columns:
    price_tier_props = _group_proportions(df, 'price_tier')
    price_tier_props.columns = [f'price_tier_prop_{col}' for col in price_tier_props.columns]
    neighbourhood_profiles = neighbourhood_profiles.join(price_tier_props)

# Add host_type_category proportions
if 'host_type_category' in df.columns:
    host_type_props = _group_proportions(df, 'host_type_category')
    host_type_props.columns = [f'host_type_prop_{col}' for col in host_type_props.columns]
    neighbourhood_profiles = neighbourhood_profiles.join(host_type_props)

# Remaining NaNs come from categories missing after the joins and from
# log_price_std, which is undefined for a neighbourhood with a single listing.
neighbourhood_profiles = neighbourhood_profiles.fillna(0)
# Make neighbourhood_group and neighbourhood ordinary columns again.
neighbourhood_profiles = neighbourhood_profiles.reset_index()

print("\nNeighbourhood Profiles for Similarity (first 5 rows):")
print(neighbourhood_profiles.head())
print(f"\nShape of neighbourhood_profiles: {neighbourhood_profiles.shape}")

--- Creating Neighbourhood Profiles for Similarity ---

Neighbourhood Profiles for Similarity (first 5 rows):
  neighbourhood_group neighbourhood  log_price_median  log_price_std  \
0               Bronx      Allerton          4.210781       0.585361   
1               Bronx    Baychester          4.330733       0.227715   
2               Bronx       Belmont          3.978589       0.702946   
3               Bronx     Bronxdale          3.931826       0.359313   
4               Bronx   Castle Hill          3.688879       0.477558   

   price_median  minimum_nights_median  number_of_reviews_median  \
0          66.5                    2.0                      27.0   
1          75.0                    3.0                      11.0   
2          52.5                    2.0                       4.5   
3          50.0                    2.0                      14.0   
4          39.0                    2.0                       0.0   

   number_of_reviews_mean  listing_count  room_t

### 2.4 Define "Busyness" for Neighborhoods

In [6]:
print("\n--- Defining Neighbourhood 'Busyness' Tiers using Clustering ---")

# 1. Raw per-neighbourhood metrics: mean availability, mean monthly review
#    rate, and number of listings.
busyness_aggregations = {
    'avg_availability_365': ('availability_365', 'mean'),
    'avg_reviews_per_month': ('reviews_per_month', 'mean'),
    'listing_density': ('id', 'count'),  # Same as listing_count in profiles
}
neighbourhood_busyness_metrics = (
    df.groupby(['neighbourhood_group', 'neighbourhood'])
    .agg(**busyness_aggregations)
    .reset_index()
)

print("Raw busyness metrics for neighbourhoods (first 5):")
print(neighbourhood_busyness_metrics.head())

# 2. Prepare features for clustering.
#    "Busy" means low availability, many reviews per month and many listings,
#    so availability is inverted to make larger values mean busier.
has_availability = 'avg_availability_365' in neighbourhood_busyness_metrics.columns
if not has_availability:
    # Fallback if the availability metric is missing for some reason.
    busyness_features_for_clustering = ['avg_reviews_per_month', 'listing_density']
    print("Warning: 'avg_availability_365' not found, proceeding with fewer features for busyness clustering.")
else:
    # The small constant keeps the reciprocal finite when availability is 0.
    shifted_availability = neighbourhood_busyness_metrics['avg_availability_365'] + 0.01
    neighbourhood_busyness_metrics['inverse_avg_availability'] = 1 / shifted_availability
    busyness_features_for_clustering = ['inverse_avg_availability', 'avg_reviews_per_month', 'listing_density']

# Keep only the features that actually exist as columns.
available_columns = neighbourhood_busyness_metrics.columns
busyness_features_for_clustering = [
    feature for feature in busyness_features_for_clustering if feature in available_columns
]

if busyness_features_for_clustering:
    X_busyness = neighbourhood_busyness_metrics[busyness_features_for_clustering].copy()

    # 3. Standardise so no single feature dominates the distance metric.
    scaler_busyness = StandardScaler()
    X_busyness_scaled = scaler_busyness.fit_transform(X_busyness)

    # 4. Cluster. The cluster count is fixed at 3 here (Busy, Moderate,
    #    Less Busy); no elbow analysis is performed in this cell.
    n_busyness_clusters = 3
    kmeans_busyness = KMeans(n_clusters=n_busyness_clusters, random_state=42, n_init='auto')
    cluster_ids = kmeans_busyness.fit_predict(X_busyness_scaled)
    neighbourhood_busyness_metrics['busyness_tier_cluster'] = cluster_ids

    print(f"\nValue counts for 'busyness_tier_cluster' (0 to {n_busyness_clusters-1}):")
    print(neighbourhood_busyness_metrics['busyness_tier_cluster'].value_counts())

    # 5. Interpret the clusters through their mean feature values, densest first.
    print("\nMean feature values for each busyness_tier_cluster:")
    cluster_means = neighbourhood_busyness_metrics.groupby('busyness_tier_cluster')[busyness_features_for_clustering].mean()
    cluster_analysis_df = cluster_means.sort_values(by='listing_density', ascending=False)
    print(cluster_analysis_df)

    # NOTE(review): KMeans cluster ids carry no inherent order. This mapping was
    # presumably chosen by reading the printed cluster means for this dataset;
    # re-check it whenever the data, features or scikit-learn version change.
    busyness_map = {
        0: 'Active & Available/Less Busy',
        1: 'Supply Constrained/Busy',
        2: 'High Density/Saturated',
    }
    neighbourhood_busyness_metrics['busyness_label'] = neighbourhood_busyness_metrics['busyness_tier_cluster'].map(busyness_map)
    preview_columns = ['neighbourhood_group', 'neighbourhood', 'busyness_tier_cluster', 'busyness_label']
    print(neighbourhood_busyness_metrics[preview_columns].head())
else:
    print("Error: No valid features found for busyness clustering. Please check column names.")


--- Defining Neighbourhood 'Busyness' Tiers using Clustering ---
Raw busyness metrics for neighbourhoods (first 5):
  neighbourhood_group neighbourhood  avg_availability_365  \
0               Bronx      Allerton            163.666667   
1               Bronx    Baychester            157.857143   
2               Bronx       Belmont            187.666667   
3               Bronx     Bronxdale            145.421053   
4               Bronx   Castle Hill            159.333333   

   avg_reviews_per_month  listing_density  
0               1.615714               42  
1               1.891429                7  
2               1.573333               24  
3               1.614211               19  
4               0.616667                9  

Value counts for 'busyness_tier_cluster' (0 to 2):
busyness_tier_cluster
1    156
0     52
2     13
Name: count, dtype: int64

Mean feature values for each busyness_tier_cluster:
                       inverse_avg_availability  avg_reviews_per_month  

### 2.5 Combine Similarity Profiles with Busyness Information

In [7]:
print("--- Combining Neighbourhood Similarity Profiles with Busyness Information ---")

# Both inputs are created by earlier cells; report which one is missing
# instead of failing with a NameError.
if 'neighbourhood_profiles' not in locals() and 'neighbourhood_profiles' not in globals():
    print("Error: 'neighbourhood_profiles' DataFrame not found. Please ensure it's created and named correctly.")
elif 'neighbourhood_busyness_metrics' not in locals() and 'neighbourhood_busyness_metrics' not in globals():
    print("Error: 'neighbourhood_busyness_metrics' DataFrame not found. Please ensure it's created and named correctly.")
else:
    # Bring over only the neighbourhood identifiers and the busyness
    # classification, so raw metrics are not duplicated in the merged frame.
    columns_to_merge = ['neighbourhood_group', 'neighbourhood', 'busyness_tier_cluster', 'busyness_label']

    # Tolerate missing classification columns, but the join keys are mandatory.
    actual_cols_to_merge = [col for col in columns_to_merge if col in neighbourhood_busyness_metrics.columns]
    keys_present = 'neighbourhood' in actual_cols_to_merge and 'neighbourhood_group' in actual_cols_to_merge

    if keys_present:
        busyness_subset = neighbourhood_busyness_metrics[actual_cols_to_merge]
        neighbourhood_busyness_to_merge = busyness_subset.drop_duplicates()

        # Left join: keep every similarity profile even if it has no busyness row.
        final_neighbourhood_profiles_df = neighbourhood_profiles.merge(
            neighbourhood_busyness_to_merge,
            on=['neighbourhood_group', 'neighbourhood'],
            how='left',
        )

        print("\nFinal Combined Neighbourhood Profiles (first 5 rows):")
        print(final_neighbourhood_profiles_df.head())
        print(f"\nShape of final_neighbourhood_profiles_df: {final_neighbourhood_profiles_df.shape}")

        print("\nCheck for any missing busyness labels after merge (should be 0 if all neighborhoods were clustered):")
        if 'busyness_label' not in final_neighbourhood_profiles_df.columns:
            print("'busyness_label' not found in the merged DataFrame, check merging step.")
        else:
            print(final_neighbourhood_profiles_df['busyness_label'].isnull().sum())
    else:
        print("Error: 'neighbourhood' or 'neighbourhood_group' missing in neighbourhood_busyness_metrics for merging.")


--- Combining Neighbourhood Similarity Profiles with Busyness Information ---

Final Combined Neighbourhood Profiles (first 5 rows):
  neighbourhood_group neighbourhood  log_price_median  log_price_std  \
0               Bronx      Allerton          4.210781       0.585361   
1               Bronx    Baychester          4.330733       0.227715   
2               Bronx       Belmont          3.978589       0.702946   
3               Bronx     Bronxdale          3.931826       0.359313   
4               Bronx   Castle Hill          3.688879       0.477558   

   price_median  minimum_nights_median  number_of_reviews_median  \
0          66.5                    2.0                      27.0   
1          75.0                    3.0                      11.0   
2          52.5                    2.0                       4.5   
3          50.0                    2.0                      14.0   
4          39.0                    2.0                       0.0   

   number_of_reviews_mean 