# load data

In [1]:
import pandas as pd
from google.cloud import bigquery
import plotly.express as px

# 1. Configuration (Uses the full View ID you provided)
PROJECT_ID = 'lap-coffee-476107'
DATASET_ID = 'lap_locations_final_merged'
VIEW_ID = 'lap_data_imputed'
FULL_VIEW_PATH = f"{PROJECT_ID}.{DATASET_ID}.{VIEW_ID}"

# 2. Initialize BigQuery client
# This assumes your environment is authenticated (e.g., via gcloud)
client = bigquery.Client(project=PROJECT_ID)

# 3. SQL Query to load all data from the imputed view
query = f"""
    SELECT *
    FROM `{FULL_VIEW_PATH}`
"""

print(f"Loading data from BigQuery view: {FULL_VIEW_PATH}")

# 4. Load data into a Pandas DataFrame
df = client.query(query).to_dataframe()

print("\n--- Data Loaded Successfully ---")
print(f"DataFrame shape: {df.shape}")
print("First 5 rows of imputed data:")
print(df.head())

Loading data from BigQuery view: lap-coffee-476107.lap_locations_final_merged.lap_data_imputed





--- Data Loaded Successfully ---
DataFrame shape: (3920, 19)
First 5 rows of imputed data:
         date        name        lat       lon  \
0  2024-09-01  LAP COFFEE  52.486768  13.35549   
1  2024-09-02  LAP COFFEE  52.486768  13.35549   
2  2024-09-03  LAP COFFEE  52.486768  13.35549   
3  2024-09-04  LAP COFFEE  52.486768  13.35549   
4  2024-09-05  LAP COFFEE  52.486768  13.35549   

                                   address  cafe_rating  \
0  Akazienstraße 3A, 10823 Berlin, Germany          4.7   
1  Akazienstraße 3A, 10823 Berlin, Germany          4.7   
2  Akazienstraße 3A, 10823 Berlin, Germany          4.7   
3  Akazienstraße 3A, 10823 Berlin, Germany          4.7   
4  Akazienstraße 3A, 10823 Berlin, Germany          4.7   

   cafe_user_ratings_total                cafe_place_id weather_date  season  \
0                    151.0  ChIJGzWI8E5RqEcRYF2oui3pSKc   2024-09-01  Autumn   
1                    151.0  ChIJGzWI8E5RqEcRYF2oui3pSKc   2024-09-01  Autumn   
2           

# data str

In [8]:
df

Unnamed: 0,date,name,lat,lon,address,cafe_rating,cafe_user_ratings_total,season,parks_count_1km,open_bars_count_500m,lst_celsius_1km,geometry,temp_max,temp_min,precip_mm,ndvi,nightlight,name_updated
0,2024-09-01,LAP COFFEE,52.486768,13.355490,"Akazienstraße 3A, 10823 Berlin, Germany",4.7,151.0,Autumn,11.0,12.0,30.890000,POINT (13.35549 52.4867684),25.20000,13.900000,0.0000,0.254739,0.000000,LAP COFFEE_Akazienstraße
1,2024-09-02,LAP COFFEE,52.486768,13.355490,"Akazienstraße 3A, 10823 Berlin, Germany",4.7,151.0,Autumn,11.0,12.0,30.890000,POINT (13.35549 52.4867684),26.70000,12.700000,0.0000,0.254739,0.000000,LAP COFFEE_Akazienstraße
2,2024-09-03,LAP COFFEE,52.486768,13.355490,"Akazienstraße 3A, 10823 Berlin, Germany",4.7,151.0,Autumn,11.0,12.0,30.890000,POINT (13.35549 52.4867684),32.60000,18.000000,0.2000,0.254739,0.000000,LAP COFFEE_Akazienstraße
3,2024-09-04,LAP COFFEE,52.486768,13.355490,"Akazienstraße 3A, 10823 Berlin, Germany",4.7,151.0,Autumn,11.0,12.0,30.890000,POINT (13.35549 52.4867684),33.90000,20.200000,0.0000,0.254739,0.000000,LAP COFFEE_Akazienstraße
4,2024-09-05,LAP COFFEE,52.486768,13.355490,"Akazienstraße 3A, 10823 Berlin, Germany",4.7,151.0,Autumn,11.0,12.0,31.150000,POINT (13.35549 52.4867684),31.10000,21.500000,0.0000,0.273590,0.000000,LAP COFFEE_Akazienstraße
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3915,2025-10-29,LAP COFFEE,52.501178,13.418971,"Adalbertstraße 91, 10999 Berlin, Germany",4.4,41.0,Autumn,13.0,14.0,16.456979,POINT (13.35549 52.4867684),16.59888,8.994582,1.6977,0.192969,23.742678,LAP COFFEE_Adalbertstraße
3916,2025-10-30,LAP COFFEE,52.501178,13.418971,"Adalbertstraße 91, 10999 Berlin, Germany",4.4,41.0,Autumn,13.0,14.0,16.456979,POINT (13.35549 52.4867684),16.59888,8.994582,1.6977,0.192969,23.742678,LAP COFFEE_Adalbertstraße
3917,2025-10-31,LAP COFFEE,52.501178,13.418971,"Adalbertstraße 91, 10999 Berlin, Germany",4.4,41.0,Autumn,13.0,14.0,16.456979,POINT (13.35549 52.4867684),16.59888,8.994582,1.6977,0.192969,23.742678,LAP COFFEE_Adalbertstraße
3918,2025-11-01,LAP COFFEE,52.501178,13.418971,"Adalbertstraße 91, 10999 Berlin, Germany",4.4,41.0,Autumn,13.0,14.0,16.456979,POINT (13.35549 52.4867684),16.59888,8.994582,1.6977,0.192969,23.742678,LAP COFFEE_Adalbertstraße


In [2]:
df.describe()

Unnamed: 0,lat,lon,cafe_rating,cafe_user_ratings_total,parks_count_1km,open_bars_count_500m,lst_celsius_1km,temp_max,temp_min,precip_mm,ndvi,nightlight
count,3920.0,3920.0,3920.0,3920.0,3920.0,3920.0,3920.0,3920.0,3920.0,3920.0,3920.0,3920.0
mean,52.515041,13.399718,4.54375,129.1875,6.375,11.8125,16.456979,16.59888,8.994582,1.6977,0.192969,23.742678
std,0.021107,0.041309,0.264535,152.021949,2.997777,4.419539,8.016358,6.487201,4.731907,3.029786,0.088971,13.945256
min,52.479598,13.320827,4.0,18.0,1.0,6.0,-4.75,-1.3,-8.1,0.0,-0.020354,0.0
25%,52.50042,13.391518,4.375,47.0,4.75,8.0,12.35,12.9,6.5,0.0,0.16405,20.309999
50%,52.50821,13.412825,4.7,97.0,7.0,10.0,16.456979,16.59888,8.994582,0.4,0.192969,23.742678
75%,52.534887,13.420839,4.725,139.0,7.0,14.0,22.83,20.3,12.2,1.6977,0.192969,25.75
max,52.54932,13.459298,4.9,689.0,13.0,22.0,31.85,33.9,21.5,26.5,0.757688,86.209999


# feature space

In [5]:
import pandas as pd
import plotly.express as px

# 1️⃣ Update df: add first word of address to name
df['name_updated'] = df['name'] + "_" + df['address'].str.split().str[0]

# 2️⃣ Ensure season is categorical
df['season'] = pd.Categorical(
    df['season'],
    categories=['Winter', 'Spring', 'Summer', 'Autumn'],
    ordered=True
)

# 3️⃣ Drop unnecessary columns if they exist
cols_to_drop = ['first_word', 'month', 'weather_date', 'cafe_place_id']
df = df.drop(columns=[col for col in cols_to_drop if col in df.columns])

# 4️⃣ Define features for plotting
features = {
    'ndvi': 'NDVI',
    'temp_max': 'Temperature (°C)',
    'lst_celsius_1km': 'land surface temperature',
    'nightlight': 'Nightlight Intensity'
}

# 5️⃣ Generate separate plots for each feature
for var, label in features.items():
    fig = px.scatter(
        df,
        x=var,
        y='name_updated',
        color='season',
        hover_data=['date', 'temp_max', 'lst_celsius_1km', 'nightlight'],
        title=f'{label} by Season and Location',
    )

    fig.update_layout(
        height=600,
        legend_title_text='Season',
        title_font_size=18,
        margin=dict(t=60, l=80, r=40, b=40),
        xaxis_title=label,
        yaxis_title='Location (Name)',
    )

    fig.show()


# bars, parks

In [6]:
import plotly.express as px

# Scatter map for parks within 1 km
fig_parks = px.scatter_mapbox(
    df,
    lat="lat",
    lon="lon",
    color="parks_count_1km",
    size="parks_count_1km",
    hover_name="name_updated",
    hover_data={"lat": True, "lon": True, "season": True},
    color_continuous_scale="Greens",
    size_max=20,
    zoom=11,
    title="Nearby Parks within 500m"
)

fig_parks.update_layout(
    mapbox_style="carto-positron",
    margin=dict(t=60, l=10, r=10, b=10)
)
fig_parks.show()

# Scatter map for open bars within 500 m
fig_bars = px.scatter_mapbox(
    df,
    lat="lat",
    lon="lon",
    color="open_bars_count_500m",
    size="open_bars_count_500m",
    hover_name="name_updated",
    hover_data={"lat": True, "lon": True, "season": True},
    color_continuous_scale="Reds",
    size_max=20,
    zoom=11,
    title="Open Bars within 500 m"
)

fig_bars.update_layout(
    mapbox_style="carto-positron",
    margin=dict(t=60, l=10, r=10, b=10)
)
fig_bars.show()



*scatter_mapbox* is deprecated! Use *scatter_map* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/




*scatter_mapbox* is deprecated! Use *scatter_map* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/



# rfc

In [17]:
# Define target and features
target_variable = 'name_updated'
feature_variables = [
    'parks_count_1km',
    'open_bars_count_500m', 'lst_celsius_1km', 'temp_max', 'temp_min',
    'precip_mm', 'ndvi', 'nightlight'
]

In [18]:
# --- 2.1 Encode Categorical Features (X) ---
# Use One-Hot Encoding for 'season'
X = df[feature_variables].copy()
# --- 2.2 Encode Target Variable (y) ---
# Use Label Encoding for the classification target
le = LabelEncoder()
y = le.fit_transform(df[target_variable])

# Identify the final list of features after encoding
final_features = X.columns.tolist()

print(f"Total unique classes (cafés) found: {len(le.classes_)}")
print(f"Total features used: {len(final_features)}")
# Since your sample data only has one unique café, the model will train on only one class.
# We will proceed, but note that a real classification requires multiple unique target classes.

Total unique classes (cafés) found: 16
Total features used: 8


In [19]:

import pandas as pd
from io import StringIO
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
# --- 3.1 Split Data ---
# Note: With only 2 records of 1 class, this split is illustrative. 
# A real dataset would need hundreds of records and multiple classes.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# --- 3.2 Train Random Forest Classifier (RFC) ---
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)

# --- 3.3 Predict and Evaluate ---
y_pred = rfc.predict(X_test)

In [20]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=le.classes_, zero_division=0)

print("## 📊 Model Results (Random Forest Classifier) 📊")
print("------------------------------------------------")
print(f"Overall Accuracy: {accuracy:.2f}\n")
print("### Classification Report:")
print(report)

## 📊 Model Results (Random Forest Classifier) 📊
------------------------------------------------
Overall Accuracy: 0.96

### Classification Report:
                               precision    recall  f1-score   support

    LAP COFFEE_Adalbertstraße       1.00      1.00      1.00        73
     LAP COFFEE_Akazienstraße       1.00      1.00      1.00        73
LAP COFFEE_Falckensteinstraße       1.00      1.00      1.00        74
      LAP COFFEE_Graefestraße       1.00      1.00      1.00        74
        LAP COFFEE_Kantstraße       0.73      0.96      0.83        74
  LAP COFFEE_Karl-Marx-Straße       0.94      0.81      0.87        73
    LAP COFFEE_Kastanienallee       1.00      1.00      1.00        74
         LAP COFFEE_Krossener       1.00      1.00      1.00        74
    LAP COFFEE_Kurfürstendamm       0.88      0.97      0.92        73
        LAP COFFEE_Oderberger       1.00      1.00      1.00        73
      LAP COFFEE_Raumerstraße       0.97      0.86      0.91        74

### 🧠 Tricky Locations: Model Classification Errors

| Café Location | Precision | Recall | F1-Score | Interpretation |
| :--- | :--- | :--- | :--- | :--- |
| **LAP COFFEE\_Kantstraße** | **0.73** | 0.96 | 0.83 | **Low Precision:** When the model predicted Kantstraße, it was only right **73%** of the time. This means the model frequently mistook *other* cafés for Kantstraße (high **False Positives**). |
| **LAP COFFEE\_Karl-Marx-Straße** | 0.94 | **0.81** | 0.87 | **Low Recall:** The model missed **19%** of the actual Karl-Marx-Straße instances, classifying them as something else (high **False Negatives**). |
| **LAP COFFEE\_Kurfürstendamm** | 0.88 | 0.97 | 0.92 | **Moderate Precision:** **12%** of the time the model predicted Kurfürstendamm, it was wrong (some **False Positives**). |
| **LAP COFFEE\_Rykestraße** | 0.95 | **0.78** | 0.86 | **Low Recall:** The model missed **22%** of the actual Rykestraße instances, classifying them as another café (high **False Negatives**). |

---

### Key Takeaway:

The model suggests that **Kantstraße** has features highly similar to other locations, while **Karl-Marx-Straße** and **Rykestraße** have unique feature profiles that are often missed by the model when they appear in the test set.

In [22]:
import pandas as pd
# Assuming 'rfc' is your trained RandomForestClassifier object
# Assuming 'X' is the DataFrame used for training, and 'final_features' are the column names

# Get feature importances from the trained RFC model
importances = rfc.feature_importances_

# Create a Series for easy viewing and sorting
feature_series = pd.Series(importances, index=X.columns).sort_values(ascending=False)

print("### 📈 Top Feature Importances (Model's Focus):")
print(feature_series)

### 📈 Top Feature Importances (Model's Focus):
parks_count_1km         0.347558
open_bars_count_500m    0.343886
ndvi                    0.113012
nightlight              0.099530
lst_celsius_1km         0.043215
temp_max                0.021603
temp_min                0.018649
precip_mm               0.012546
dtype: float64


| Feature | Importance Score | Type | Model's Rationale |
| :--- | :--- | :--- | :--- |
| **`parks_count_1km`** | $\mathbf{0.3476}$ | Static | **Primary Identifier:** The number of nearby parks is the single most unique feature for distinguishing café locations. |
| **`open_bars_count_500m`** | $\mathbf{0.3439}$ | Static | **Secondary Identifier:** The density of open bars is the second most powerful feature, confirming the model uses the static neighborhood profile. |
| **`ndvi`** | $0.1130$ | Dynamic | **Most Relevant Environmental Feature:** The variation in greenness is the best non-static differentiator between locations. |
| **`nightlight`** | $0.0995$ | Dynamic | **Second Most Relevant Environmental Feature:** Urban light density helps distinguish environments. |
| **`lst_celsius_1km`** | $0.0432$ | Dynamic | **Low Relevance:** Land Surface Temperature is not a strong location identifier. |
| **`temp_max`** | $0.0216$ | Dynamic | **Very Low Relevance:** Ambient max temperature is not unique enough to distinguish locations. |
| **`temp_min`** | $0.0186$ | Dynamic | **Very Low Relevance:** Ambient min temperature is not unique enough to distinguish locations. |
| **`precip_mm`** | $0.0125$ | Dynamic | **Least Relevant:** Precipitation provides virtually no unique information about the café's location. |

# feature range for prediction set

In [26]:
# Assuming your DataFrame is named 'df'
import pandas as pd
from IPython.display import display
# import numpy # (Only needed if creating the placeholder df)

# Features of interest for defining mood thresholds
features_to_analyze = [
    'parks_count_1km', 'open_bars_count_500m', 'lst_celsius_1km', 
    'temp_max', 'temp_min', 'precip_mm', 'ndvi', 'nightlight'
]

# Calculate descriptive statistics
feature_ranges = df[features_to_analyze].agg(
    ['min', 'max', lambda x: x.quantile(0.25), lambda x: x.quantile(0.75)]
).T
# Rename the percentile columns for clarity
feature_ranges.columns = ['Min', 'Max', '25th Percentile (Low)', '75th Percentile (High)']

# Round the values for clean presentation
feature_ranges = feature_ranges.round(2)

print("### 📊 Feature Ranges for Mood Thresholds")

# Display the Pandas DataFrame as an HTML table
display(feature_ranges) 
# OR, simply ensure 'feature_ranges' is the last line of the code cell.

### 📊 Feature Ranges for Mood Thresholds


Unnamed: 0,Min,Max,25th Percentile (Low),75th Percentile (High)
parks_count_1km,1.0,13.0,4.75,7.0
open_bars_count_500m,6.0,22.0,8.0,14.0
lst_celsius_1km,-4.75,31.85,12.35,22.83
temp_max,-1.3,33.9,12.9,20.3
temp_min,-8.1,21.5,6.5,12.2
precip_mm,0.0,26.5,0.0,1.7
ndvi,-0.02,0.76,0.16,0.19
nightlight,0.0,86.21,20.31,25.75


In [32]:
import pandas as pd
import numpy as np
from IPython.display import display, Markdown

# NOTE: This script assumes:
# 1. Your DataFrame is loaded and named 'df'.
# 2. Your trained Random Forest model is loaded and named 'rfc'. (The rfc variable name is used for prediction)

# --- 1. Feature Range Calculation and Display (Now includes 50th percentile) ---

# Features of interest (MUST match the features and order used to train 'rfc')
features_to_analyze = [
    'parks_count_1km', 'open_bars_count_500m', 'lst_celsius_1km', 
    'temp_max', 'temp_min', 'precip_mm', 'ndvi', 'nightlight'
]

# Calculate descriptive statistics (Min, Max, P25, P50, P75)
feature_ranges = df[features_to_analyze].agg(
    ['min', 'max', lambda x: x.quantile(0.25), lambda x: x.quantile(0.50), lambda x: x.quantile(0.75)]
).T
# Rename the percentile columns for clarity
feature_ranges.columns = ['Min', 'Max', '25th Percentile (Low)', '50th Percentile (Mid)', '75th Percentile (High)']

# Round the values for clean presentation
feature_ranges = feature_ranges.round(2)

print("### 📊 Feature Ranges for Mood Thresholds (P50 Included)\n")
display(feature_ranges) 




### 📊 Feature Ranges for Mood Thresholds (P50 Included)



Unnamed: 0,Min,Max,25th Percentile (Low),50th Percentile (Mid),75th Percentile (High)
parks_count_1km,1.0,13.0,4.75,7.0,7.0
open_bars_count_500m,6.0,22.0,8.0,10.0,14.0
lst_celsius_1km,-4.75,31.85,12.35,16.46,22.83
temp_max,-1.3,33.9,12.9,16.6,20.3
temp_min,-8.1,21.5,6.5,8.99,12.2
precip_mm,0.0,26.5,0.0,0.4,1.7
ndvi,-0.02,0.76,0.16,0.19,0.19
nightlight,0.0,86.21,20.31,23.74,25.75


In [33]:
# --- 2. Dynamic Threshold Assignment (Automatic Ranges) ---

# STATIC FEATURES
P_75_PARKS = feature_ranges.loc['parks_count_1km', '75th Percentile (High)']
P_25_BARS = feature_ranges.loc['open_bars_count_500m', '25th Percentile (Low)']
P_75_BARS = feature_ranges.loc['open_bars_count_500m', '75th Percentile (High)']
P_50_PARKS = feature_ranges.loc['parks_count_1km', '50th Percentile (Mid)']

# DYNAMIC FEATURES
P_75_NDVI = feature_ranges.loc['ndvi', '75th Percentile (High)']
P_25_NDVI = feature_ranges.loc['ndvi', '25th Percentile (Low)']
P_75_NIGHTLIGHT = feature_ranges.loc['nightlight', '75th Percentile (High)']
P_25_NIGHTLIGHT = feature_ranges.loc['nightlight', '25th Percentile (Low)']
P_25_TEMP_MAX = feature_ranges.loc['temp_max', '25th Percentile (Low)']
P_75_TEMP_MAX = feature_ranges.loc['temp_max', '75th Percentile (High)']
P_50_TEMP_MAX = feature_ranges.loc['temp_max', '50th Percentile (Mid)']
P_25_TEMP_MIN = feature_ranges.loc['temp_min', '25th Percentile (Low)']
P_75_TEMP_MIN = feature_ranges.loc['temp_min', '75th Percentile (High)']
P_50_TEMP_MIN = feature_ranges.loc['temp_min', '50th Percentile (Mid)']
P_50_LST = feature_ranges.loc['lst_celsius_1km', '50th Percentile (Mid)']
P_75_LST = feature_ranges.loc['lst_celsius_1km', '75th Percentile (High)']
P_MIN_PRECIP = feature_ranges.loc['precip_mm', 'Min']
# The following values are not automatically calculated in the range, 
# so they are set to P75 and P25 of their respective feature for consistency in synthetic data creation.
# NOTE: If these features are NOT in features_to_analyze, they must be set to a representative value.
try:
    P_75_PRECIP = feature_ranges.loc['precip_mm', '75th Percentile (High)']
except KeyError:
    # Fallback in case columns are missing (though they are in features_to_analyze)
    P_75_PRECIP = 1.0 # Placeholder if 75th percentile cannot be found dynamically
try:
    P_25_LST = feature_ranges.loc['lst_celsius_1km', '25th Percentile (Low)']
except KeyError:
    P_25_LST = 20.0 # Placeholder
try:
    P_50_BARS = feature_ranges.loc['open_bars_count_500m', '50th Percentile (Mid)']
except KeyError:
    P_50_BARS = 10.0 # Placeholder
try:
    P_50_NDVI = feature_ranges.loc['ndvi', '50th Percentile (Mid)']
except KeyError:
    P_50_NDVI = 0.5 # Placeholder
try:
    P_50_NIGHTLIGHT = feature_ranges.loc['nightlight', '50th Percentile (Mid)']
except KeyError:
    P_50_NIGHTLIGHT = 5.0 # Placeholder
try:
    P_25_PARKS = feature_ranges.loc['parks_count_1km', '25th Percentile (Low)']
except KeyError:
    P_25_PARKS = 3.0 # Placeholder


# --- 3. Synthetic Data Creation (Ideal Mood Profiles) ---

def create_mood_profile(mood_name, park, bar, lst, tmax, tmin, precip, ndvi, nightlight):
    """Creates a single-row DataFrame for a synthetic ideal mood profile."""
    # The feature list and order MUST match the training data for rfc.predict()
    data = [[park, bar, lst, tmax, tmin, precip, ndvi, nightlight]]
    profile_df = pd.DataFrame(data, columns=features_to_analyze)
    return profile_df

def get_mood_dataframes(df):
    
    # ☕ COZY: Deep Cold (P25 TMin) + High Shelter (P75 Parks)
    # We set non-critical features to P25 to represent low outdoor desire.
    cozy_profile_df = create_mood_profile('Cozy',
        park=P_75_PARKS, bar=P_25_BARS, lst=P_25_LST, tmax=P_25_TEMP_MAX, 
        tmin=P_25_TEMP_MIN, precip=P_MIN_PRECIP, ndvi=P_25_NDVI, nightlight=P_25_NIGHTLIGHT
    )

    # 🌿 GREEN: Max Greenness (P75 NDVI) + Low Urban Noise (P25 Bars/Light)
    green_profile_df = create_mood_profile('Green',
        park=P_75_PARKS, bar=P_25_BARS, lst=P_75_LST, tmax=P_75_TEMP_MAX, 
        tmin=P_75_TEMP_MIN, precip=P_MIN_PRECIP, ndvi=P_75_NDVI, nightlight=P_25_NIGHTLIGHT
    )

    # 🥂 BUZZ: Highest Activity (P75 Bars/Light) + Warm (P75 Temps)
    # We set Park count to P25 as urban areas often have less park space.
    buzz_profile_df = create_mood_profile('Buzz',
        park=P_25_PARKS, bar=P_75_BARS, lst=P_75_LST, tmax=P_75_TEMP_MAX, 
        tmin=P_75_TEMP_MIN, precip=P_75_PRECIP, ndvi=P_25_NDVI, nightlight=P_75_NIGHTLIGHT
    )
    
    # 🛋️ LAZY: Mild Weather (P50 TMax) + Low Effort (No Rain)
    # We set non-critical features to P50 (median) for a balanced, average day.
    lazy_profile_df = create_mood_profile('Lazy',
        park=P_50_PARKS, bar=P_50_BARS, lst=P_50_LST, tmax=P_50_TEMP_MAX, 
        tmin=P_50_TEMP_MIN, precip=P_MIN_PRECIP, ndvi=P_50_NDVI, nightlight=P_50_NIGHTLIGHT
    )

    # 🎲 RANDOM: Average Profile
    # Use the mean of each feature as a balanced, neutral point.
    random_data = df[features_to_analyze].mean().round(2).to_list()
    random_profile_df = pd.DataFrame([random_data], columns=features_to_analyze)

    return {
        '☕ Cozy (Cold & Sheltered)': cozy_profile_df,
        '🌿 Green (Nature Escape)': green_profile_df,
        '🥂 Buzz (Urban Activity)': buzz_profile_df,
        '🛋️ Lazy (High Comfort)': lazy_profile_df,
        '🎲 Random (Average Profile)': random_profile_df
    }




In [None]:
import pandas as pd
import numpy as np
from IPython.display import display, Markdown

# NOTE: This script assumes:
# 1. Your DataFrame is loaded and named 'df'.
# 2. Your trained Random Forest model is loaded and named 'rfc'. 

# --- CRITICAL: Data Cleaning Step (Fixes Lookup Mismatch) ---
# Ensure the columns used for prediction classes and lookups are clean strings.
# This prepares the lookup table index and address columns.
for col in ['name_updated', 'address']:
    if col in df.columns:
        df[col] = df[col].astype(str).str.strip()


# --- 1. Feature Range Calculation and Display (Now includes 50th percentile) ---

# Features of interest (MUST match the features and order used to train 'rfc')
features_to_analyze = [
    'parks_count_1km', 'open_bars_count_500m', 'lst_celsius_1km', 
    'temp_max', 'temp_min', 'precip_mm', 'ndvi', 'nightlight'
]

# Calculate descriptive statistics (Min, Max, P25, P50, P75)
feature_ranges = df[features_to_analyze].agg(
    ['min', 'max', lambda x: x.quantile(0.25), lambda x: x.quantile(0.50), lambda x: x.quantile(0.75)]
).T
# Rename the percentile columns for clarity
feature_ranges.columns = ['Min', 'Max', '25th Percentile (Low)', '50th Percentile (Mid)', '75th Percentile (High)']

# Round the values for clean presentation
feature_ranges = feature_ranges.round(2)

print("### 📊 Feature Ranges for Mood Thresholds (P50 Included)\n")
display(feature_ranges) 


# --- 2. Dynamic Threshold Assignment (Automatic Ranges) ---
# Values are pulled directly from the calculated feature_ranges table.

# General Min/Max for establishing dynamic range bounds
F_MIN = feature_ranges['Min']
F_MAX = feature_ranges['Max']
F_P25 = feature_ranges['25th Percentile (Low)']
F_P50 = feature_ranges['50th Percentile (Mid)']
F_P75 = feature_ranges['75th Percentile (High)']


# --- 2.5 Cafe Name and Address Lookup Table ---
# Create a unique mapping from the model's prediction output (name_updated) 
# to the user-friendly name and address.

# Attempt to find the best friendly name column (prioritizing 'name' based on common convention)
FRIENDLY_NAME_COL = None
# Added 'date' as a possible prefix to 'name' based on your snippet
for col in ['name', 'datenamelatlon', 'date_name', 'name_updated']: 
    if col in df.columns:
        FRIENDLY_NAME_COL = col
        break

if not FRIENDLY_NAME_COL:
    FRIENDLY_NAME_COL = 'name_updated' 
    print(f"\n⚠️ WARNING: Could not find a friendly name column. Falling back to 'name_updated'.")
elif FRIENDLY_NAME_COL != 'name_updated':
    print(f"\nℹ️ INFO: Using column '{FRIENDLY_NAME_COL}' for friendly cafe names.")


# Create the lookup table
# Ensure the index is composed of strings for consistent matching
cafe_lookup_df = df[['name_updated', FRIENDLY_NAME_COL, 'address']].drop_duplicates(subset=['name_updated'])
cafe_lookup_df['name_updated'] = cafe_lookup_df['name_updated'].astype(str)
cafe_lookup_df = cafe_lookup_df.set_index('name_updated')

# --- 0. DIAGNOSTICS ---
print("\n### 🔍 Lookup Diagnostics (Crucial for Debugging N/A values):")
try:
    print(f"RFC Class Type: {type(rfc.classes_[0])}")
    # Show the first 5 predicted classes (what the model outputs)
    print(f"First 5 RFC Classes (Predicted): {rfc.classes_[:5]}")
except Exception:
    print("Could not access rfc.classes_ for diagnostics.")
try:
    # Show the first 5 keys in the lookup table (what we need to match)
    print(f"First 5 Lookup Keys (Expected): {cafe_lookup_df.index[:5].to_list()}")
except Exception:
    print("Could not access cafe_lookup_df index for diagnostics.")
print("----------------------------------------------------\n")


# --- 3. Dynamic Synthetic Data Creation (Ideal Mood Profiles) ---

def create_dynamic_profile(park_range, bar_range, lst_range, tmax_range, tmin_range, precip_range, ndvi_range, nightlight_range):
    """Creates a single-row DataFrame with features sampled randomly within the given ranges."""
    
    # Use np.random.uniform(low, high) to generate random values within the specified bounds
    data = [[
        np.random.uniform(*park_range),
        np.random.uniform(*bar_range),
        np.random.uniform(*lst_range),
        np.random.uniform(*tmax_range),
        np.random.uniform(*tmin_range),
        np.random.uniform(*precip_range),
        np.random.uniform(*ndvi_range),
        np.random.uniform(*nightlight_range),
    ]]
    
    # The feature list and order MUST match the training data for rfc.predict()
    profile_df = pd.DataFrame(data, columns=features_to_analyze)
    return profile_df

def get_dynamic_mood_dataframes():
    
    # Define ranges for each mood using the F_PXX variables

    # ☕ COZY: Deep Cold (TMin: Min-P25) + High Shelter (Parks: P75-Max) + Low Activity (Bars: Min-P25)
    cozy_profile_df = create_dynamic_profile(
        park_range=[F_P75['parks_count_1km'], F_MAX['parks_count_1km']], 
        bar_range=[F_MIN['open_bars_count_500m'], F_P25['open_bars_count_500m']], 
        lst_range=[F_P25['lst_celsius_1km'], F_P50['lst_celsius_1km']],
        tmax_range=[F_P25['temp_max'], F_P50['temp_max']], 
        tmin_range=[F_MIN['temp_min'], F_P25['temp_min']], 
        precip_range=[F_MIN['precip_mm'], F_MIN['precip_mm'] + 0.1], # Almost no rain
        ndvi_range=[F_P25['ndvi'], F_P50['ndvi']], 
        nightlight_range=[F_MIN['nightlight'], F_P25['nightlight']]
    )

    # 🌿 GREEN: Max Greenness (NDVI: P75-Max) + Low Urban Noise (Bars/Light: Min-P25)
    green_profile_df = create_dynamic_profile(
        park_range=[F_P75['parks_count_1km'], F_MAX['parks_count_1km']], 
        bar_range=[F_MIN['open_bars_count_500m'], F_P25['open_bars_count_500m']], 
        lst_range=[F_P50['lst_celsius_1km'], F_P75['lst_celsius_1km']],
        tmax_range=[F_P50['temp_max'], F_P75['temp_max']], 
        tmin_range=[F_P50['temp_min'], F_P75['temp_min']], 
        precip_range=[F_MIN['precip_mm'], F_P25['precip_mm']],
        ndvi_range=[F_P75['ndvi'], F_MAX['ndvi']], 
        nightlight_range=[F_MIN['nightlight'], F_P25['nightlight']]
    )

    # 🥂 BUZZ: Highest Activity (Bars/Light: P75-Max) + Warm (Temps: P75-Max)
    buzz_profile_df = create_dynamic_profile(
        park_range=[F_MIN['parks_count_1km'], F_P25['parks_count_1km']], 
        bar_range=[F_P75['open_bars_count_500m'], F_MAX['open_bars_count_500m']], 
        lst_range=[F_P75['lst_celsius_1km'], F_MAX['lst_celsius_1km']],
        tmax_range=[F_P75['temp_max'], F_MAX['temp_max']], 
        tmin_range=[F_P75['temp_min'], F_MAX['temp_min']], 
        precip_range=[F_P50['precip_mm'], F_MAX['precip_mm']], # Rain is ok for urban life
        ndvi_range=[F_MIN['ndvi'], F_P25['ndvi']], 
        nightlight_range=[F_P75['nightlight'], F_MAX['nightlight']]
    )
    
    # 🛋️ LAZY: Mild Weather (TMax: P25-P75) + Low Effort (No Rain/Low LST)
    lazy_profile_df = create_dynamic_profile(
        park_range=[F_P25['parks_count_1km'], F_P75['parks_count_1km']], 
        bar_range=[F_P25['open_bars_count_500m'], F_P75['open_bars_count_500m']], 
        lst_range=[F_P25['lst_celsius_1km'], F_P75['lst_celsius_1km']],
        tmax_range=[F_P25['temp_max'], F_P75['temp_max']], 
        tmin_range=[F_P25['temp_min'], F_P75['temp_min']], 
        precip_range=[F_MIN['precip_mm'], F_MIN['precip_mm'] + 0.1], # Almost no rain
        ndvi_range=[F_P25['ndvi'], F_P75['ndvi']], 
        nightlight_range=[F_P25['nightlight'], F_P75['nightlight']]
    )

    # 🎲 RANDOM: Balanced Profile (Mean +/- 10% for dynamic range)
    # Mean of each feature as a balanced, neutral point, with a small random deviation
    random_profile_df = create_dynamic_profile(
        park_range=[F_P50['parks_count_1km'] * 0.9, F_P50['parks_count_1km'] * 1.1], 
        bar_range=[F_P50['open_bars_count_500m'] * 0.9, F_P50['open_bars_count_500m'] * 1.1], 
        lst_range=[F_P50['lst_celsius_1km'] * 0.9, F_P50['lst_celsius_1km'] * 1.1],
        tmax_range=[F_P50['temp_max'] * 0.9, F_P50['temp_max'] * 1.1], 
        tmin_range=[F_P50['temp_min'] * 0.9, F_P50['temp_min'] * 1.1], 
        precip_range=[F_P50['precip_mm'] * 0.9, F_P50['precip_mm'] * 1.1],
        ndvi_range=[F_P50['ndvi'] * 0.9, F_P50['ndvi'] * 1.1], 
        nightlight_range=[F_P50['nightlight'] * 0.9, F_P50['nightlight'] * 1.1]
    )


    return {
        '☕ Cozy (Cold & Sheltered)': cozy_profile_df,
        '🌿 Green (Nature Escape)': green_profile_df,
        '🥂 Buzz (Urban Activity)': buzz_profile_df,
        '🛋️ Lazy (High Comfort)': lazy_profile_df,
        '🎲 Random (Balanced Profile)': random_profile_df
    }

# --- 4. Execute and Summarize Predictions (Top 2) ---

# WARNING: This section requires a trained model named 'rfc'
if 'rfc' not in locals() and 'rfc' not in globals():
    raise NameError(
        "\n\n🛑 ERROR: The 'rfc' (Random Forest Classifier) is not defined.\n"
        "Please ensure you run your model training cell and assign the trained model "
        "to a variable named 'rfc' before running this script."
    )

# Seed numpy for reproducibility of the random profiles within a single run
np.random.seed()
mood_dataframes = get_dynamic_mood_dataframes()

# New summary structure for cleaner columnar output
summary_data = {
    'Mood': [], 'Ideal Profile (Key Features)': [], 
    'Cafe 1 Name': [], 'Cafe 1 Address': [], 'Cafe 1 Prob (%)': [], 
    'Cafe 2 Name': [], 'Cafe 2 Address': [], 'Cafe 2 Prob (%)': []
}

for mood, profile_df in mood_dataframes.items():
    
    # 1. Get probability scores for all classes
    probabilities = rfc.predict_proba(profile_df)[0]
    
    # 2. Get the indices of the top 2 highest probabilities
    top_2_indices = np.argsort(probabilities)[::-1][:2]
    
    # 3. Map indices back to café names (class labels)
    top_2_cafes_updated_name = rfc.classes_[top_2_indices]
    
    # Extract the feature profile string for the summary table
    profile_str = (
        f"Park:{profile_df['parks_count_1km'].iloc[0]:.2f}, "
        f"Bar:{profile_df['open_bars_count_500m'].iloc[0]:.2f}, "
        f"TMin:{profile_df['temp_min'].iloc[0]:.2f}, "
        f"NDVI:{profile_df['ndvi'].iloc[0]:.2f}, "
        f"Light:{profile_df['nightlight'].iloc[0]:.2f}"
    )
    
    summary_data['Mood'].append(mood)
    summary_data['Ideal Profile (Key Features)'].append(profile_str)

    # 4. Populate the new columnar structure
    for i in range(2):
        
        # --- FINAL FIX: Force predicted class to string and strip whitespace ---
        cafe_name_updated = str(top_2_cafes_updated_name[i]).strip() 
        # ----------------------------------------------------------------------
        
        friendly_name = "N/A"
        full_address = "N/A"
        probability = 0.0
        
        # Check if the predicted class exists in the lookup index
        if cafe_name_updated in cafe_lookup_df.index:
            friendly_name = cafe_lookup_df.loc[cafe_name_updated, FRIENDLY_NAME_COL]
            full_address = cafe_lookup_df.loc[cafe_name_updated, 'address']
            probability = probabilities[top_2_indices[i]] * 100
        
        summary_data[f'Cafe {i+1} Name'].append(friendly_name)
        summary_data[f'Cafe {i+1} Address'].append(full_address)
        summary_data[f'Cafe {i+1} Prob (%)'].append(f"{probability:.1f}%")


recommendations_summary = pd.DataFrame(summary_data)

# Print the Final Summary
print("\n## ✅ Predictive Recommendation System Results (Top 2 Cafés)\n")
display(recommendations_summary)

# Optional: Display the actual synthetic data used for prediction
print("\n--- Synthetic Data Used for Prediction (Unique per run) ---")
# Concatenate the dynamic profiles and round to 2 decimals for clean display
dynamic_profiles_display = pd.concat(mood_dataframes.values(), keys=mood_dataframes.keys())
display(dynamic_profiles_display.round(2))


### 📊 Feature Ranges for Mood Thresholds (P50 Included)



Unnamed: 0,Min,Max,25th Percentile (Low),50th Percentile (Mid),75th Percentile (High)
parks_count_1km,1.0,13.0,4.75,7.0,7.0
open_bars_count_500m,6.0,22.0,8.0,10.0,14.0
lst_celsius_1km,-4.75,31.85,12.35,16.46,22.83
temp_max,-1.3,33.9,12.9,16.6,20.3
temp_min,-8.1,21.5,6.5,8.99,12.2
precip_mm,0.0,26.5,0.0,0.4,1.7
ndvi,-0.02,0.76,0.16,0.19,0.19
nightlight,0.0,86.21,20.31,23.74,25.75



ℹ️ INFO: Using column 'name' for friendly cafe names.

## ✅ Predictive Recommendation System Results (Top 2 Cafés)



Unnamed: 0,Mood,Ideal Profile (Key Features),Cafe 1 Name,Cafe 1 Address,Cafe 1 Prob (%),Cafe 2 Name,Cafe 2 Address,Cafe 2 Prob (%)
0,☕ Cozy (Cold & Sheltered),"Park:11.64, Bar:7.50, TMin:0.05, NDVI:0.17, Li...",,,0.0%,,,0.0%
1,🌿 Green (Nature Escape),"Park:8.58, Bar:6.22, TMin:9.34, NDVI:0.64, Lig...",,,0.0%,,,0.0%
2,🥂 Buzz (Urban Activity),"Park:1.55, Bar:16.50, TMin:17.86, NDVI:0.14, L...",,,0.0%,,,0.0%
3,🛋️ Lazy (High Comfort),"Park:6.99, Bar:11.84, TMin:6.52, NDVI:0.17, Li...",,,0.0%,,,0.0%
4,🎲 Random (Balanced Profile),"Park:6.50, Bar:9.44, TMin:8.80, NDVI:0.20, Lig...",,,0.0%,,,0.0%



--- Synthetic Data Used for Prediction (Unique per run) ---


Unnamed: 0,Unnamed: 1,parks_count_1km,open_bars_count_500m,lst_celsius_1km,temp_max,temp_min,precip_mm,ndvi,nightlight
☕ Cozy (Cold & Sheltered),0,11.64,7.5,15.09,16.07,0.05,0.02,0.17,17.58
🌿 Green (Nature Escape),0,8.58,6.22,21.7,19.29,9.34,0.0,0.64,12.82
🥂 Buzz (Urban Activity),0,1.55,16.5,25.5,20.51,17.86,15.39,0.14,59.3
🛋️ Lazy (High Comfort),0,6.99,11.84,16.81,15.51,6.52,0.0,0.17,25.21
🎲 Random (Balanced Profile),0,6.5,9.44,14.91,18.17,8.8,0.44,0.2,23.36
