## 2. Data Loading, Re-Pre-Processing and Feature Engineering

Our first step in the EDA process is to load the `AB_NYC_2019.csv` dataset. Missing values handled, log_price created, host_type_category potentially created based on findings in the EDA process.

### 2.1. Load Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler 
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors 
import matplotlib.pyplot as plt
import seaborn as sns


try:
    df = pd.read_csv("..\\data\\AB_NYC_2019.csv")
    df['reviews_per_month'] = df['reviews_per_month'].fillna(0)
    df['name'] = df['name'].fillna('Unknown')
    df['host_name'] = df['host_name'].fillna('Unknown')
    if 'price' in df.columns and df['price'].min() >= 0:
        df['log_price'] = np.log1p(df['price'])
    bins_host = [0, 1, 2, 5, 10, 50, df['calculated_host_listings_count'].max() + 1]
    labels_host = ['1', '2', '3-5', '6-10', '11-50', '51+']
    df['host_type_category'] = pd.cut(df['calculated_host_listings_count'], bins=bins_host, labels=labels_host, right=True)
    print("DataFrame loaded and initial cleaning/preparation assumed complete.")
except FileNotFoundError:
    print("Ensure df is loaded and preprocessed in notebook.")
    df = pd.DataFrame({
        'price': np.random.exponential(150, 5000),
        'log_price': np.log1p(np.random.exponential(150, 5000)),
        'room_type': np.random.choice(['Entire home/apt', 'Private room', 'Shared room'], 5000, p=[0.5, 0.45, 0.05]),
        'minimum_nights': np.random.choice([1,2,3,7,30,90], 5000, p=[0.5,0.2,0.1,0.1,0.05,0.05]),
        'calculated_host_listings_count': np.random.choice([1,2,3,10,50,100], 5000, p=[0.6,0.15,0.1,0.05,0.05,0.05]),
        'number_of_reviews': np.random.randint(0,100,5000),
        'availability_365': np.random.randint(0, 366, 5000),
        'reviews_per_month': np.random.rand(5000) * 5,
        'neighbourhood_group': np.random.choice(['Manhattan', 'Brooklyn', 'Queens', 'Bronx', 'Staten Island'], 5000),
        'neighbourhood': [f"Hood_{i%20}_{np.random.choice(['Manhattan', 'Brooklyn', 'Queens', 'Bronx', 'Staten Island'])}" for i in range(5000)],
        'id': range(5000)
    })
    bins_host = [0, 1, 2, 5, 10, 50, df['calculated_host_listings_count'].max() + 1]
    labels_host = ['1', '2', '3-5', '6-10', '11-50', '51+']
    df['host_type_category'] = pd.cut(df['calculated_host_listings_count'], bins=bins_host, labels=labels_host, right=True)
    print("Using dummy data for script execution.")


plt.style.use('ggplot')
sns.set_palette("muted")

DataFrame loaded and initial cleaning/preparation assumed complete.


### 2.1 Define Price Tiers (for Neighborhood Profiling)

In [2]:

print("--- Defining Price Tiers ---")

if 'price' in df.columns:
    # Calculate the required percentiles
    price_q1 = df['price'].quantile(0.25)    # 25th percentile
    price_q3 = df['price'].quantile(0.75)    # 75th percentile
    price_q95 = df['price'].quantile(0.95)   # 95th percentile
    max_price = df['price'].max()

    print(f"Percentiles used for tiers: Q1=${price_q1:.2f}, Q3=${price_q3:.2f}, Q95=${price_q95:.2f}, Max=${max_price:.2f}")

    # Define bins and labels for 4 tiers
    price_bins = [-0.01, price_q1, price_q3, price_q95, max_price + 1]
    price_labels = ['Budget', 'Mid-Range', 'Premium', 'Upper Premium']
    
    # Check for non-monotonic bins which can happen if quantiles are equal
    # (e.g., if Q3 and Q95 are the same due to data distribution)
    is_monotonic = all(price_bins[i] < price_bins[i+1] for i in range(len(price_bins)-2)) # Check up to Q95
    if price_bins[len(price_bins)-2] > price_bins[len(price_bins)-1]: # Check last bin edge with max_price + 1
        is_monotonic = False # max_price + 1 should always be greater unless max_price itself is problematic

    if not is_monotonic or len(set(price_bins)) < len(price_bins):
        print("\nWarning: Bin edges are not strictly monotonic or contain duplicates.")
        print(f"Original calculated bins: {[-0.01, price_q1, price_q3, price_q95, max_price + 1]}")
        # Attempt to create unique sorted bins. This might reduce the number of bins if quantiles are equal.
        price_bins = sorted(list(set([-0.01, price_q1, price_q3, price_q95, max_price + 1])))
        print(f"Adjusted unique sorted bins: {price_bins}")
        
        # If after adjustment we don't have enough bins for 4 labels, we might need to simplify
        if len(price_bins) < 5: # Need 5 edges for 4 labels
            print("Could not form 4 distinct tiers with current quantiles. Consider reviewing thresholds or using fewer tiers.")
        else:
             # If we still have enough bins for 4 labels after sorting unique
             print("Proceeding with adjusted unique bins for 4 tiers.")


    if len(price_bins) == len(price_labels) + 1: # Ensure we have the right number of bins for labels
        df['price_tier'] = pd.cut(df['price'],
                                  bins=price_bins,
                                  labels=price_labels,
                                  right=True,        # (lower_bound, upper_bound]
                                  include_lowest=True) # Ensures min value is included

        print("\nValue counts for 'price_tier' (%):")
        print(df['price_tier'].value_counts(normalize=True).sort_index() * 100)
        
        print("\nPrice Tier Definitions Used:")
        for i in range(len(price_labels)):
            lower_b = price_bins[i]
            if np.isclose(lower_b, -0.01): lower_b = 0.0 # For cleaner display
            upper_b = price_bins[i+1]
            # Adjust display for the last bin to show it includes the max
            inclusive_char = "<=" if i < len(price_labels) -1 else "<=" 
            print(f"  {price_labels[i]}: ${lower_b:.2f} to ${upper_b if i < len(price_labels)-1 else max_price:.2f}")

    else:
        print("\nError: Could not create price tiers. Number of unique bin edges is insufficient for the desired number of labels.")
        print(f"Final bins considered: {price_bins}")
        print(f"Labels: {price_labels}")
else:
    print("Error: 'price' column not found. Cannot create price tiers.")

--- Defining Price Tiers ---
Percentiles used for tiers: Q1=$69.00, Q3=$175.00, Q95=$355.00, Max=$10000.00

Value counts for 'price_tier' (%):
price_tier
Budget           25.301156
Mid-Range        49.794458
Premium          19.912056
Upper Premium     4.992331
Name: proportion, dtype: float64

Price Tier Definitions Used:
  Budget: $0.00 to $69.00
  Mid-Range: $69.00 to $175.00
  Premium: $175.00 to $355.00
  Upper Premium: $355.00 to $10000.00


### 2.3 Aggregate Features to Create Neighborhood Profiles (for Similarity)

In [3]:
print("--- Creating Neighbourhood Profiles for Similarity ---")

if 'host_type_category' not in df.columns and 'calculated_host_listings_count' in df.columns:
    bins_host = [0, 1, 2, 5, 10, 50, df['calculated_host_listings_count'].max() + 1]
    labels_host = ['Host_1', 'Host_2', 'Host_3-5', 'Host_6-10', 'Host_11-50', 'Host_51+'] # Renamed for easier column names
    df['host_type_category'] = pd.cut(df['calculated_host_listings_count'], bins=bins_host, labels=labels_host, right=True)

# Features to aggregate for neighbourhood character
agg_functions = {
    'log_price': ['median', 'std'], # Median log_price, spread of log_price
    'price': ['median'], # Median raw price for easier interpretation
    'minimum_nights': ['median'],
    'number_of_reviews': ['median', 'mean'], # Median total reviews as quality/establishment proxy
    'id': 'count' # Listing density
}

neighbourhood_profiles = df.groupby(['neighbourhood_group', 'neighbourhood']).agg(agg_functions)

# Flatten multi-index columns if any were created by agg
neighbourhood_profiles.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in neighbourhood_profiles.columns.values]
neighbourhood_profiles.rename(columns={'id_count': 'listing_count'}, inplace=True)


# Add room_type proportions
room_type_props = df.groupby(['neighbourhood_group', 'neighbourhood'])['room_type'].value_counts(normalize=True).unstack(fill_value=0)
room_type_props.columns = [f'room_type_prop_{col.replace(" ", "_").replace("/", "_")}' for col in room_type_props.columns] # Clean column names
neighbourhood_profiles = neighbourhood_profiles.join(room_type_props)

# Add price_tier proportions
if 'price_tier' in df.columns:
    price_tier_props = df.groupby(['neighbourhood_group', 'neighbourhood'])['price_tier'].value_counts(normalize=True).unstack(fill_value=0)
    price_tier_props.columns = [f'price_tier_prop_{col}' for col in price_tier_props.columns]
    neighbourhood_profiles = neighbourhood_profiles.join(price_tier_props)

# Add host_type_category proportions (optional, but based on your EDA)
if 'host_type_category' in df.columns:
    host_type_props = df.groupby(['neighbourhood_group', 'neighbourhood'])['host_type_category'].value_counts(normalize=True).unstack(fill_value=0)
    host_type_props.columns = [f'host_type_prop_{col}' for col in host_type_props.columns]
    neighbourhood_profiles = neighbourhood_profiles.join(host_type_props)


neighbourhood_profiles.fillna(0, inplace=True) # Fill any NaNs that might result from unstacking if a category isn't present
neighbourhood_profiles = neighbourhood_profiles.reset_index() # Make neighbourhood_group and neighbourhood actual columns

print("\nNeighbourhood Profiles for Similarity (first 5 rows):")
print(neighbourhood_profiles.head())
print(f"\nShape of neighbourhood_profiles: {neighbourhood_profiles.shape}")
# neighbourhood_profiles.info() # To check dtypes and non-nulls

--- Creating Neighbourhood Profiles for Similarity ---

Neighbourhood Profiles for Similarity (first 5 rows):
  neighbourhood_group neighbourhood  log_price_median  log_price_std  \
0               Bronx      Allerton          4.210781       0.585361   
1               Bronx    Baychester          4.330733       0.227715   
2               Bronx       Belmont          3.978589       0.702946   
3               Bronx     Bronxdale          3.931826       0.359313   
4               Bronx   Castle Hill          3.688879       0.477558   

   price_median  minimum_nights_median  number_of_reviews_median  \
0          66.5                    2.0                      27.0   
1          75.0                    3.0                      11.0   
2          52.5                    2.0                       4.5   
3          50.0                    2.0                      14.0   
4          39.0                    2.0                       0.0   

   number_of_reviews_mean  listing_count  room_t

### 2.4 Define "Busyness" for Neighborhoods

In [4]:
print("\n--- Defining Neighbourhood 'Busyness' Tiers using Clustering ---")

neighbourhood_busyness_metrics = df.groupby(['neighbourhood_group', 'neighbourhood']).agg(
    avg_availability_365=('availability_365', 'mean'),
    avg_reviews_per_month=('reviews_per_month', 'mean'),
    listing_density=('id', 'count') # Same as listing_count in profiles
).reset_index()

print("Raw busyness metrics for neighbourhoods (first 5):")
print(neighbourhood_busyness_metrics.head())

# 2. Prepare features for clustering
#    We want low availability, high reviews/month, and high density to indicate "busyness".
#    Let's invert availability so higher is "busier".
if 'avg_availability_365' in neighbourhood_busyness_metrics.columns: # Check if column exists
    neighbourhood_busyness_metrics['inverse_avg_availability'] = 1 / (neighbourhood_busyness_metrics['avg_availability_365'] + 0.01) # Add small constant
    busyness_features_for_clustering = ['inverse_avg_availability', 'avg_reviews_per_month', 'listing_density']
else: # Fallback if avg_availability_365 is missing for some reason (e.g. dummy data issue)
    busyness_features_for_clustering = ['avg_reviews_per_month', 'listing_density'] 
    print("Warning: 'avg_availability_365' not found, proceeding with fewer features for busyness clustering.")


# Ensure selected features for clustering are present
busyness_features_for_clustering = [f for f in busyness_features_for_clustering if f in neighbourhood_busyness_metrics.columns]
if not busyness_features_for_clustering:
    print("Error: No valid features found for busyness clustering. Please check column names.")
else:
    X_busyness = neighbourhood_busyness_metrics[busyness_features_for_clustering].copy()

    # 3. Scale the features
    scaler_busyness = StandardScaler()
    X_busyness_scaled = scaler_busyness.fit_transform(X_busyness)

    # 4. Determine number of clusters
    n_busyness_clusters = 3 # Busy, Moderate, Less Busy
    kmeans_busyness = KMeans(n_clusters=n_busyness_clusters, random_state=42, n_init='auto')
    neighbourhood_busyness_metrics['busyness_tier_cluster'] = kmeans_busyness.fit_predict(X_busyness_scaled)

    print(f"\nValue counts for 'busyness_tier_cluster' (0 to {n_busyness_clusters-1}):")
    print(neighbourhood_busyness_metrics['busyness_tier_cluster'].value_counts())

    # 5. Interpret the busyness clusters
    print("\nMean feature values for each busyness_tier_cluster:")
    cluster_analysis_df = neighbourhood_busyness_metrics.groupby('busyness_tier_cluster')[busyness_features_for_clustering].mean().sort_values(by='listing_density', ascending=False)
    print(cluster_analysis_df)
    busyness_map = {
        1: 'Supply Constrained/Busy', 
        2: 'High Density/Saturated', 
        0: 'Active & Available/Less Busy' 
    }
    neighbourhood_busyness_metrics['busyness_label'] = neighbourhood_busyness_metrics['busyness_tier_cluster'].map(busyness_map)
    print(neighbourhood_busyness_metrics[['neighbourhood_group', 'neighbourhood', 'busyness_tier_cluster', 'busyness_label']].head())


--- Defining Neighbourhood 'Busyness' Tiers using Clustering ---
Raw busyness metrics for neighbourhoods (first 5):
  neighbourhood_group neighbourhood  avg_availability_365  \
0               Bronx      Allerton            163.666667   
1               Bronx    Baychester            157.857143   
2               Bronx       Belmont            187.666667   
3               Bronx     Bronxdale            145.421053   
4               Bronx   Castle Hill            159.333333   

   avg_reviews_per_month  listing_density  
0               1.615714               42  
1               1.891429                7  
2               1.573333               24  
3               1.614211               19  
4               0.616667                9  

Value counts for 'busyness_tier_cluster' (0 to 2):
busyness_tier_cluster
1    156
0     52
2     13
Name: count, dtype: int64

Mean feature values for each busyness_tier_cluster:
                       inverse_avg_availability  avg_reviews_per_month  

### 2.5 Combine Similarity Profiles with Busyness Information

In [5]:
print("--- Combining Neighbourhood Similarity Profiles with Busyness Information ---")

if 'neighbourhood_profiles' in locals() or 'neighbourhood_profiles' in globals():
    if 'neighbourhood_busyness_metrics' in locals() or 'neighbourhood_busyness_metrics' in globals():
        # Select only the necessary columns from busyness_metrics to avoid duplicate data columns
        # if some raw metrics were also in the similarity profile.
        # Key columns are neighbourhood identifiers and the busyness classification.
        columns_to_merge = ['neighbourhood_group', 'neighbourhood', 'busyness_tier_cluster', 'busyness_label']
        
        # Ensure the columns exist in neighbourhood_busyness_metrics
        actual_cols_to_merge = [col for col in columns_to_merge if col in neighbourhood_busyness_metrics.columns]
        if 'neighbourhood' not in actual_cols_to_merge or 'neighbourhood_group' not in actual_cols_to_merge:
            print("Error: 'neighbourhood' or 'neighbourhood_group' missing in neighbourhood_busyness_metrics for merging.")
        else:
            neighbourhood_busyness_to_merge = neighbourhood_busyness_metrics[actual_cols_to_merge].drop_duplicates()

            # Perform the merge
            final_neighbourhood_profiles_df = pd.merge(
                neighbourhood_profiles,
                neighbourhood_busyness_to_merge,
                on=['neighbourhood_group', 'neighbourhood'],
                how='left' # Use 'left' to keep all characteristic profiles; 'inner' if only those with busyness info
            )

            print("\nFinal Combined Neighbourhood Profiles (first 5 rows):")
            print(final_neighbourhood_profiles_df.head())
            print(f"\nShape of final_neighbourhood_profiles_df: {final_neighbourhood_profiles_df.shape}")
            
            print("\nCheck for any missing busyness labels after merge (should be 0 if all neighborhoods were clustered):")
            if 'busyness_label' in final_neighbourhood_profiles_df.columns:
                print(final_neighbourhood_profiles_df['busyness_label'].isnull().sum())
            else:
                print("'busyness_label' not found in the merged DataFrame, check merging step.")

    else:
        print("Error: 'neighbourhood_busyness_metrics' DataFrame not found. Please ensure it's created and named correctly.")
else:
    print("Error: 'neighbourhood_profiles' DataFrame not found. Please ensure it's created and named correctly.")

--- Combining Neighbourhood Similarity Profiles with Busyness Information ---

Final Combined Neighbourhood Profiles (first 5 rows):
  neighbourhood_group neighbourhood  log_price_median  log_price_std  \
0               Bronx      Allerton          4.210781       0.585361   
1               Bronx    Baychester          4.330733       0.227715   
2               Bronx       Belmont          3.978589       0.702946   
3               Bronx     Bronxdale          3.931826       0.359313   
4               Bronx   Castle Hill          3.688879       0.477558   

   price_median  minimum_nights_median  number_of_reviews_median  \
0          66.5                    2.0                      27.0   
1          75.0                    3.0                      11.0   
2          52.5                    2.0                       4.5   
3          50.0                    2.0                      14.0   
4          39.0                    2.0                       0.0   

   number_of_reviews_mean

## 3. Modelling

### 3.1 Feature Selection and Scaling for KNN

In [6]:
print("--- Selecting and Scaling Features for KNN Similarity ---")

# Define columns from final_neighbourhood_profiles_df to be used for calculating similarity.
# These should be features describing the neighborhood's character.
# Exclude identifiers (neighbourhood, neighbourhood_group) and the busyness label/cluster itself.

# Recommended similarity features based on EDA and profile creation:
similarity_feature_columns = [
    'log_price_median',         # Typical price level (log-transformed)
    'log_price_std',            # Price variability within the neighborhood
    'minimum_nights_median',    # Typical minimum stay
    'number_of_reviews_median', # Proxy for typical listing establishment/perceived quality
    'number_of_reviews_mean',
    'listing_count',            # Can also be part of neighborhood character (small vs. large inventory)
    
    # Room type proportions
    'room_type_prop_Entire_home_apt',
    'room_type_prop_Private_room',
    'room_type_prop_Shared_room',
    
    # Price tier proportions
    'price_tier_prop_Budget',
    'price_tier_prop_Mid-Range',
    'price_tier_prop_Premium',
    'price_tier_prop_Upper_Premium',
    
    # Host type proportions
    'host_type_prop_1',
    'host_type_prop_2',
    'host_type_prop_3-5',
    'host_type_prop_6-10',
    'host_type_prop_11-50',
    'host_type_prop_51+'
]
similarity_feature_columns.extend([col for col in final_neighbourhood_profiles_df.columns if 'host_type_prop_' in col])
similarity_feature_columns = sorted(list(set(similarity_feature_columns))) # Ensure uniqueness and order

# Verify that these columns exist in your DataFrame
existing_similarity_features = [col for col in similarity_feature_columns if col in final_neighbourhood_profiles_df.columns]

if not existing_similarity_features:
    print("Error: No similarity feature columns found in 'final_neighbourhood_profiles_df'. Please check column names.")
    features_scaled = None # Ensure this variable is defined for later checks
else:
    print(f"Using these {len(existing_similarity_features)} features for KNN similarity: {existing_similarity_features}")
    
    features_for_knn = final_neighbourhood_profiles_df[existing_similarity_features].copy()
    
    # Fill any potential NaNs in these selected features (e.g., if a proportion was all NaN and then 0)
    # A good practice before scaling, though your aggregation should handle most.
    features_for_knn.fillna(features_for_knn.mean(), inplace=True) # Impute with mean if any NaNs persist
    
    # Scale the features
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features_for_knn)
    
    print("\nShape of scaled features for KNN:", features_scaled.shape)
    print("Scaled features (first row example):", features_scaled[0])

--- Selecting and Scaling Features for KNN Similarity ---
Using these 18 features for KNN similarity: ['host_type_prop_1', 'host_type_prop_11-50', 'host_type_prop_2', 'host_type_prop_3-5', 'host_type_prop_51+', 'host_type_prop_6-10', 'listing_count', 'log_price_median', 'log_price_std', 'minimum_nights_median', 'number_of_reviews_mean', 'number_of_reviews_median', 'price_tier_prop_Budget', 'price_tier_prop_Mid-Range', 'price_tier_prop_Premium', 'room_type_prop_Entire_home_apt', 'room_type_prop_Private_room', 'room_type_prop_Shared_room']

Shape of scaled features for KNN: (221, 18)
Scaled features (first row example): [-0.82373718 -0.25850706 -0.72302177  2.12973271 -0.23393207 -0.40643778
 -0.33506963 -0.6290947   0.1013179  -0.30285419  1.28085333  1.29846227
  0.59921022 -0.04477275 -0.67913045 -0.51235558  0.69540609 -0.50440547]


## 3.2 Fit KNN Model

In [7]:
print("\n--- Fitting KNN Model ---")

knn_model = None # Initialize
if 'features_scaled' in locals() and features_scaled is not None and features_scaled.shape[0] > 0:
    # Number of neighbors: target itself + up to 10 others. Max N-1 if N < 11.
    n_neighbors_knn = min(11, features_scaled.shape[0]) 

    knn_model = NearestNeighbors(n_neighbors=n_neighbors_knn, metric='euclidean') # 'cosine' is also a good option
    knn_model.fit(features_scaled)
    
    print(f"KNN model fitted with k={n_neighbors_knn} and '{knn_model.metric}' metric.")
else:
    print("Error: Scaled features for KNN ('features_scaled') not available or empty. KNN model not fitted.")


--- Fitting KNN Model ---
KNN model fitted with k=11 and 'euclidean' metric.


### 3.3 Develop Recommendation Logic Function

In [8]:
print("--- Defining Recommendation Logic Function ---")

def get_neighbourhood_recommendations(target_neighbourhood_name, 
                                      target_group_name,
                                      df_profiles, # This is your final_neighbourhd_profiles_df
                                      model_knn, 
                                      all_scaled_features, # This is your features_scaled numpy array
                                      num_recommendations=3,
                                      busyness_column='busyness_label', 
                                      less_busy_values=['Active & Available/Less Busy']): 
    
    recommendations_df = pd.DataFrame() # Initialize an empty DataFrame for results
    
    # Find the index and profile of the target neighborhood in the original df_profiles
    # This ensures we get the correct index for 'all_scaled_features'
    target_profile_matches = df_profiles[
        (df_profiles['neighbourhood'] == target_neighbourhood_name) &
        (df_profiles['neighbourhood_group'] == target_group_name)
    ]

    if target_profile_matches.empty:
        print(f"Error: Target neighbourhood '{target_neighbourhood_name}' in `neighbourhood_group` '{target_group_name}' not found in profiles.")
        return recommendations_df # Return empty DataFrame

    target_index = target_profile_matches.index[0] # Get the actual index from df_profiles
    target_busyness_value = target_profile_matches[busyness_column].iloc[0]
    
    print(f"\nTarget: {target_neighbourhood_name} ({target_group_name}), Original Busyness: {target_busyness_value}")

    # Proceed to find alternatives if the target is NOT already considered "less busy"
    # (or adjust this logic if you always want to show alternatives for comparison)
    if target_busyness_value not in less_busy_values:
        print(f"'{target_neighbourhood_name}' is considered busy/constrained. Searching for alternatives...")
        
        # Get its scaled feature vector using the correct index
        target_features_scaled_for_knn = all_scaled_features[target_index].reshape(1, -1)
        
        # Find K nearest neighbors using the fitted KNN model
        distances, indices = model_knn.kneighbors(target_features_scaled_for_knn)
        
        # Get the profiles of these neighbors from df_profiles using the returned indices
        neighbor_profiles_df = df_profiles.iloc[indices[0]].copy() # .iloc uses integer position of rows
        neighbor_profiles_df['similarity_distance'] = distances[0] # Add distance for context
        
        # Filter for recommendations:
        # 1. Exclude the target neighborhood itself
        # 2. Must be in the same neighbourhood_group
        # 3. Must have a "less busy" label (as defined by less_busy_values)
        
        recommendations_df = neighbor_profiles_df[
            (neighbor_profiles_df['neighbourhood'] != target_neighbourhood_name) & 
            (neighbor_profiles_df['neighbourhood_group'] == target_group_name) &
            (neighbor_profiles_df[busyness_column].isin(less_busy_values))
        ].sort_values(by='similarity_distance', ascending=True).head(num_recommendations) # Closest first
        
        if recommendations_df.empty:
            print(f"No suitable less busy, similar neighbourhoods found in '{target_group_name}' for '{target_neighbourhood_name}'.")
        
    else:
        print(f"'{target_neighbourhood_name}' is already in a preferred 'less busy' category: {target_busyness_value}.")

    return recommendations_df

# Test the function definition
print("\nRecommendation function defined: get_neighbourhood_recommendations(target_neighbourhood_name, target_group_name, ...)")
print("You can now call this function with specific inputs in the next cell to get recommendations.")

--- Defining Recommendation Logic Function ---

Recommendation function defined: get_neighbourhood_recommendations(target_neighbourhood_name, target_group_name, ...)
You can now call this function with specific inputs in the next cell to get recommendations.


## Testing

In [9]:
print("\n--- Testing Recommendation Logic with Dynamic 'Less Busy' Criteria ---")

if not all(var in locals() or var in globals() for var in ['final_neighbourhood_profiles_df', 'knn_model', 'features_scaled', 'get_neighbourhood_recommendations']):
    print("ERROR: One or more required variables not found. Run previous cells.")
else:
    # --- Test Case 1: Target 'Midtown' in 'Manhattan' ---
    target_n1 = 'Allerton'
    target_g1 = 'Bronx'
    print(f"\n--- Test Case 1: Target: {target_n1}, {target_g1} ---")

    # Step 1: Get the busyness label of the target neighborhood
    target_n1_profile = final_neighbourhood_profiles_df[
        (final_neighbourhood_profiles_df['neighbourhood'] == target_n1) &
        (final_neighbourhood_profiles_df['neighbourhood_group'] == target_g1)
    ]

    if target_n1_profile.empty:
        print(f"Target neighborhood {target_n1} in {target_g1} not found in profiles.")
    else:
        target_n1_busyness_label = target_n1_profile['busyness_label'].iloc[0]
        print(f"Busyness label for {target_n1}: {target_n1_busyness_label}")

        # Step 2: Define 'less_busy_values' dynamically based on the target's busyness
        # These are example rules - you should refine them based on your project goals
        if target_n1_busyness_label == 'Supply Constrained/Busy':
            # If target is very constrained, maybe accept 'Active & Available' OR 'High Density' (which has high availability)
            dynamic_less_busy_labels = ['Active & Available/Less Busy', 'High Density/Saturated']
            print(f"Target is '{target_n1_busyness_label}'. Relaxed 'less busy' criteria: {dynamic_less_busy_labels}")
        elif target_n1_busyness_label == 'High Density/Saturated':
            # If target is very dense, primary goal might be to find less dense areas.
            # 'Active & Available/Less Busy' is ideal (low density).
            # 'Supply Constrained/Busy' has moderate density, so could be an option if less dense than target.
            dynamic_less_busy_labels = ['Active & Available/Less Busy'] # Start with strictest
            # You could add more sophisticated logic here to compare actual density values if needed
            print(f"Target is '{target_n1_busyness_label}'. Using 'less busy' criteria: {dynamic_less_busy_labels}")
        else: # Default for targets that are already 'Active & Available/Less Busy' or other categories
            dynamic_less_busy_labels = ['Active & Available/Less Busy'] # Should match the function's default if no specific rule
            print(f"Target is '{target_n1_busyness_label}'. Default 'less busy' criteria: {dynamic_less_busy_labels}")

        # Step 3: Call the recommendation function with the dynamic list
        recommendations1 = get_neighbourhood_recommendations(
            target_neighbourhood_name=target_n1,
            target_group_name=target_g1,
            df_profiles=final_neighbourhood_profiles_df,
            model_knn=knn_model,
            all_scaled_features=features_scaled,
            num_recommendations=3,
            busyness_column='busyness_label',
            less_busy_values=dynamic_less_busy_labels # Use the dynamically set list
        )

        if not recommendations1.empty:
            print(f"\nRecommended less busy, similar neighbourhoods in {target_g1} for {target_n1}:")
            display(recommendations1[['neighbourhood', 'busyness_label', 'similarity_distance', 'price_median']]) # Customize displayed columns
        elif not (target_n1_busyness_label in dynamic_less_busy_labels and target_n1_busyness_label == 'Active & Available/Less Busy'): # Avoid double messaging if target was already perfectly fine
            print(f"No suitable recommendations found for {target_n1} in {target_g1} with the dynamic criteria.")
    
    # --- You would then repeat for Test Case 2 (e.g., Williamsburg) with similar logic ---
    # ...


--- Testing Recommendation Logic with Dynamic 'Less Busy' Criteria ---

--- Test Case 1: Target: Allerton, Bronx ---
Busyness label for Allerton: Supply Constrained/Busy
Target is 'Supply Constrained/Busy'. Relaxed 'less busy' criteria: ['Active & Available/Less Busy', 'High Density/Saturated']

Target: Allerton (Bronx), Original Busyness: Supply Constrained/Busy
'Allerton' is considered busy/constrained. Searching for alternatives...

Recommended less busy, similar neighbourhoods in Bronx for Allerton:


Unnamed: 0,neighbourhood,busyness_label,similarity_distance,price_median
16,Highbridge,Active & Available/Less Busy,1.199075,70.0
35,Schuylerville,Active & Available/Less Busy,1.966639,65.0
38,Throgs Neck,Active & Available/Less Busy,2.032911,74.5


In [10]:
manhattan_profiles = final_neighbourhood_profiles_df[final_neighbourhood_profiles_df['neighbourhood_group'] == 'Manhattan']
print(manhattan_profiles['busyness_label'].value_counts())

print(manhattan_profiles[manhattan_profiles['busyness_label'] == 'Active & Available/Less Busy']['neighbourhood'])


busyness_label
Supply Constrained/Busy         23
High Density/Saturated           8
Active & Available/Less Busy     1
Name: count, dtype: int64
122    Two Bridges
Name: neighbourhood, dtype: object


## 4. Solution


### 4.1 Imports for Interactive Demo (and PCA)

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
from IPython.display import display, Markdown, clear_output
from sklearn.decomposition import PCA # For the PCA plot
from sklearn.preprocessing import StandardScaler # If you need to re-scale just for PCA for some reason, but ideally use existing features_scaled

# For the script to proceed if these are not in the immediate environment (in your notebook they would be):
if 'final_neighbourhood_profiles_df' not in locals() or 'knn_model' not in locals() or 'features_scaled' not in locals() or 'get_neighbourhood_recommendations' not in locals():
    print("WARNING: Key components (DataFrame, KNN model, scaled features, or recommendation function) not found.")
    print("This script might not run correctly without them. Ensure they are loaded/defined from your previous work.")
    # Create minimal dummies for script validation if run by the tool without prior state
    final_neighbourhood_profiles_df = pd.DataFrame({
        'neighbourhood_group': ['Manhattan', 'Manhattan', 'Manhattan', 'Brooklyn', 'Brooklyn'],
        'neighbourhood': ['Midtown', 'Harlem', 'Chelsea', 'Williamsburg', 'Bushwick'],
        'log_price_median': np.random.rand(5) * 2 + 4,
        'busyness_label': ['High Density/Saturated', 'Active & Available/Less Busy', 'Supply Constrained/Busy', 'Supply Constrained/Busy', 'Active & Available/Less Busy']
    })
    # Dummy features_scaled for PCA plotting (rows must match df)
    if 'features_scaled' not in locals() or features_scaled is None or features_scaled.shape[0] != len(final_neighbourhood_profiles_df) :
      features_scaled = StandardScaler().fit_transform(np.random.rand(len(final_neighbourhood_profiles_df), 5)) # Dummy with 5 features
    # Dummy knn_model
    if 'knn_model' not in locals() or knn_model is None:
      from sklearn.neighbors import NearestNeighbors
      knn_model = NearestNeighbors(n_neighbors=3).fit(features_scaled)
    # Dummy recommendation function
    if 'get_neighbourhood_recommendations' not in locals():
        def get_neighbourhood_recommendations(target_neighbourhood_name, target_group_name, df_profiles, model_knn, all_scaled_features, **kwargs):
            print(f"Dummy call for {target_neighbourhood_name}, {target_group_name}. In real use, this would return recommendations.")
            return pd.DataFrame({'neighbourhood': ['Dummy Rec 1', 'Dummy Rec 2'], 'busyness_label': ['Active & Available/Less Busy']*2, 'similarity_distance': [0.1,0.2]})


plt.style.use('ggplot')
sns.set_palette("muted")

### 4.2 Prepare Data for PCA Plot

In [12]:
print("--- Preparing 2D coordinates for PCA plot ---")
pca_plot_coords = None # Initialized to None
if 'features_scaled' in locals() and features_scaled is not None and features_scaled.shape[0] > 0 :
    pca = PCA(n_components=2, random_state=42)
    pca_plot_coords = pca.fit_transform(features_scaled) # This is where it's assigned
    print("PCA transformation complete for plotting.")
    # Add these 2D coordinates to your final_neighbourhood_profiles_df for easier plotting
    final_neighbourhood_profiles_df['pca1'] = pca_plot_coords[:, 0]
    final_neighbourhood_profiles_df['pca2'] = pca_plot_coords[:, 1]
else:
    print("Error: 'features_scaled' not available or empty. Cannot prepare PCA plot coordinates.")

--- Preparing 2D coordinates for PCA plot ---
PCA transformation complete for plotting.


### 4.3 Setup ipywidgets for Interaction

In [13]:
print("--- Setting up Interactive Widgets with Dependent Dropdown ---")

# 1. Create the widgets
# Ensure a placeholder/prompt option is the first in the list and set as default.
group_options = ['-- Select a N\'hood Group --'] + sorted(final_neighbourhood_profiles_df["neighbourhood_group"].unique())
group_dropdown = widgets.Dropdown(
    options=group_options,
    value=group_options[0], # Default to the prompt
    description="N'hood Group:",
    layout=widgets.Layout(width="auto"),
    style={'description_width': 'initial'}
)

neighbourhood_dropdown = widgets.Dropdown(
    options=['-- Select a Neighbourhood --'], # Initial prompt
    value='-- Select a Neighbourhood --',
    description="Neighbourhood:",
    disabled=True, # Start disabled
    layout=widgets.Layout(width="auto"),
    style={'description_width': 'initial'}
)

run_button = widgets.Button(
    description="🔍 Find Alternatives",
    button_style='primary', # Using 'primary' for a distinct look
    tooltip='Click to get recommendations',
    icon='search',
    layout=widgets.Layout(width="auto", margin='10px 0 0 0')
)

output_area = widgets.Output() # For displaying results

# 2. Define the single function to update the neighbourhood_dropdown
def on_group_select_update_neighbourhood_options(change):
    """
    This function is called when the value of group_dropdown changes.
    It updates the options in neighbourhood_dropdown based on the selected group.
    """
    selected_group = change.new # The new value of the group_dropdown

    # Always reset neighbourhood_dropdown to its initial prompt state first
    current_neighbourhood_options = ['-- Select a Neighbourhood --']
    neighbourhood_dropdown.options = current_neighbourhood_options
    neighbourhood_dropdown.value = current_neighbourhood_options[0] # Set to prompt
    
    if selected_group and selected_group != '-- Select a N\'hood Group --': # If a valid group is selected
        # Filter neighbourhoods for the selected group
        neighbourhoods_in_group = sorted(
            final_neighbourhood_profiles_df[final_neighbourhood_profiles_df["neighbourhood_group"] == selected_group]["neighbourhood"].unique()
        )
        
        if neighbourhoods_in_group:
            # Update options and enable the dropdown
            neighbourhood_dropdown.options = current_neighbourhood_options + neighbourhoods_in_group
            neighbourhood_dropdown.disabled = False
        else:
            # No neighbourhoods found for this group (should ideally not happen with real data)
            neighbourhood_dropdown.disabled = True
            # print(f"Debug: No neighbourhoods found for group: {selected_group}") # Optional debug
    else:
        # If the prompt "-- Select a N'hood Group --" is re-selected, disable neighbourhood_dropdown
        neighbourhood_dropdown.disabled = True
    
    # Clear any previous results shown in the output_area when the group changes
    output_area.clear_output()

# 3. Link the group_dropdown's 'value' change event to the update function
group_dropdown.observe(on_group_select_update_neighbourhood_options, names='value')

print("Widgets created and dependent dropdown logic is set up.")
print("Please select a Neighbourhood Group; the Neighbourhood dropdown will then populate.")

--- Setting up Interactive Widgets with Dependent Dropdown ---
Widgets created and dependent dropdown logic is set up.
Please select a Neighbourhood Group; the Neighbourhood dropdown will then populate.


### 4.4 Define the Main Action Function (triggered by button)

In [14]:
print("--- Defining Main Action Function for Button Click (Revised for Output & PCA Check) ---")

def on_find_alternatives_clicked(b): # b is the button event
    with output_area: # Ensure output_area is defined globally or passed
        output_area.clear_output(wait=True) 
        
        try: 
            selected_group = group_dropdown.value
            selected_neighbourhood = neighbourhood_dropdown.value

            if not selected_group or selected_group == '-- Select a N\'hood Group --' or \
               not selected_neighbourhood or selected_neighbourhood == '-- Select a Neighbourhood --':
                display(Markdown("<font color='red'>**Error:** Please select a valid Neighbourhood Group and Neighbourhood.</font>"))
                return

            display(Markdown(f"### Processing: Recommending for **{selected_neighbourhood}** in **{selected_group}**..."))
            
            target_profile = final_neighbourhood_profiles_df[
                (final_neighbourhood_profiles_df['neighbourhood_group'] == selected_group) &
                (final_neighbourhood_profiles_df['neighbourhood'] == selected_neighbourhood)
            ]
            
            if target_profile.empty:
                display(Markdown(f"<font color='red'>Error: Profile for {selected_neighbourhood} in {selected_group} not found.</font>"))
                return
                
            target_busyness_label = target_profile['busyness_label'].iloc[0]
            
            dynamic_less_busy_labels = ['Active & Available/Less Busy'] # Default

            if target_busyness_label == 'Supply Constrained/Busy':
                dynamic_less_busy_labels = ['Active & Available/Less Busy', 'High Density/Saturated']
                # print(f"Debug: Target is '{target_busyness_label}'. Relaxed 'less busy' to: {dynamic_less_busy_labels}")
            elif target_busyness_label == 'High Density/Saturated':
                if selected_group == 'Manhattan': 
                     dynamic_less_busy_labels = ['Active & Available/Less Busy', 'Supply Constrained/Busy']
                     # print(f"Debug: Target is '{target_busyness_label}' in Manhattan. Relaxed 'less busy' to: {dynamic_less_busy_labels}")
                else: 
                     dynamic_less_busy_labels = ['Active & Available/Less Busy']
                     # print(f"Debug: Target is '{target_busyness_label}'. Using 'less busy': {dynamic_less_busy_labels}")
            # else: 
                 # print(f"Debug: Target is '{target_busyness_label}'. Default 'less busy': {dynamic_less_busy_labels}")
                         
            recommendations = get_neighbourhood_recommendations(
                target_neighbourhood_name=selected_neighbourhood,
                target_group_name=selected_group,
                df_profiles=final_neighbourhood_profiles_df,
                model_knn=knn_model,
                all_scaled_features=features_scaled,
                num_recommendations=3,
                busyness_column='busyness_label',
                less_busy_values=dynamic_less_busy_labels
            )

            display(Markdown(f"#### Selected: **{selected_neighbourhood}** ({selected_group}) - Busyness: *{target_busyness_label}*"))

            if not recommendations.empty:
                display(Markdown("#### ✅ Recommended Less Busy, Similar Alternatives:"))
                display(recommendations[['neighbourhood', 'busyness_label', 'similarity_distance', 'price_median', 'room_type_prop_Entire_home_apt']])
            elif target_busyness_label in dynamic_less_busy_labels:
                 display(Markdown(f"<font color='green'>'{selected_neighbourhood}' is already considered 'less busy' by the current criteria: {dynamic_less_busy_labels}.</font>"))
            else:
                display(Markdown(f"<font color='orange'>⚠️ No suitable less busy, similar neighbourhoods found in {selected_group} for {selected_neighbourhood} with the current criteria: {dynamic_less_busy_labels}.</font>"))

            # --- Display PCA Plot ---
            # Check if the PCA columns exist in the DataFrame directly
            if 'pca1' in final_neighbourhood_profiles_df.columns and 'pca2' in final_neighbourhood_profiles_df.columns:
                plt.figure(figsize=(12, 8))
                
                sns.scatterplot(
                    x=final_neighbourhood_profiles_df['pca1'], 
                    y=final_neighbourhood_profiles_df['pca2'],
                    hue=final_neighbourhood_profiles_df['busyness_label'],
                    style=final_neighbourhood_profiles_df['neighbourhood_group'],
                    palette={'Active & Available/Less Busy': 'green', 'Supply Constrained/Busy': 'orange', 'High Density/Saturated': 'red'}, # Adjust palette to your labels
                    alpha=0.6, s=50
                )
                
                selected_coords_df = final_neighbourhood_profiles_df[
                    (final_neighbourhood_profiles_df['neighbourhood'] == selected_neighbourhood) &
                    (final_neighbourhood_profiles_df['neighbourhood_group'] == selected_group)
                ]
                if not selected_coords_df.empty:
                    plt.scatter(selected_coords_df['pca1'].iloc[0], selected_coords_df['pca2'].iloc[0], 
                                s=250, edgecolor='black', facecolor='yellow', label=f"Selected: {selected_neighbourhood}", zorder=5)

                if not recommendations.empty:
                    # Use .loc with original DataFrame to get PCA coords for recommended items
                    recommended_pca_coords = final_neighbourhood_profiles_df.loc[recommendations.index]
                    plt.scatter(recommended_pca_coords['pca1'], 
                                recommended_pca_coords['pca2'], 
                                s=180, edgecolor='black', facecolor='blue', marker='*', label="Recommended", zorder=5)
                    for idx, row in recommended_pca_coords.iterrows():
                        plt.text(row['pca1'] + 0.05, row['pca2'] + 0.05, row['neighbourhood'], fontsize=9, zorder=5)

                plt.title(f"Neighbourhood Similarity Map (PCA Projection)\nTarget: {selected_neighbourhood}")
                plt.xlabel("PCA Component 1")
                plt.ylabel("PCA Component 2")
                plt.legend(title="Legend", bbox_to_anchor=(1.05, 1), loc='upper left')
                plt.grid(True)
                plt.tight_layout(rect=[0, 0, 0.85, 1])
                plt.show()
            else:
                display(Markdown("<font color='brown'>PCA columns ('pca1', 'pca2') not found in final_neighbourhood_profiles_df. Plot cannot be generated.</font>"))
        
        except Exception as e:
            import traceback
            display(Markdown(f"<font color='red'>**An error occurred in on_find_alternatives_clicked:**\n```\n{traceback.format_exc()}\n```</font>"))

# Ensure this line is run after the function definition and after run_button is defined.
# It's best to have this in the same cell as the on_find_alternatives_clicked definition
# or in the cell immediately following it, ensuring all names are in scope.
run_button.on_click(on_find_alternatives_clicked)
print("Action function 'on_find_alternatives_clicked' (Corrected for PCA and Output) defined and linked to button.")

SyntaxError: invalid syntax (2516239575.py, line 89)

### 4.5 Display the Interface

In [15]:
print("--- Displaying Interactive Recommendation Interface ---")

# Initialize neighbourhood options for the first selected group
if group_dropdown.options and group_dropdown.options[0] != '': # Check if options exist and first is not blank
    initial_group = group_dropdown.options[0] if group_dropdown.value == '' else group_dropdown.value
    if initial_group: # if a valid group is selected or defaulted
        initial_neighbourhoods = sorted(
            final_neighbourhood_profiles_df[final_neighbourhood_profiles_df["neighbourhood_group"] == initial_group]["neighbourhood"].unique()
        )
        if initial_neighbourhoods:
             neighbourhood_dropdown.options = [''] + initial_neighbourhoods
             # neighbourhood_dropdown.value = '' # Start with blank selection
             neighbourhood_dropdown.disabled = False


# Display the widgets
display(Markdown("## 🗽 NYC Airbnb Neighbourhood Recommender"))
display(Markdown("Select a `Neighbourhood Group` and then a `Neighbourhood` you are interested in:"))
display(widgets.VBox([
    widgets.HBox([group_dropdown, neighbourhood_dropdown]),
    run_button
]))
display(output_area)

print("\nInterface is now active. Interact with the dropdowns and button above.")

--- Displaying Interactive Recommendation Interface ---


## 🗽 NYC Airbnb Neighbourhood Recommender

Select a `Neighbourhood Group` and then a `Neighbourhood` you are interested in:

VBox(children=(HBox(children=(Dropdown(description="N'hood Group:", layout=Layout(width='auto'), options=("-- …

Output()


Interface is now active. Interact with the dropdowns and button above.
