In [1]:

!pip install tensorflow scikit-learn pandas numpy matplotlib scikit-fuzzy
!pip install --quiet joblib
!pip install scikit-fuzzy
!pip install osmnx geopandas shapely


Collecting scikit-fuzzy
  Downloading scikit_fuzzy-0.5.0-py2.py3-none-any.whl.metadata (2.6 kB)
Downloading scikit_fuzzy-0.5.0-py2.py3-none-any.whl (920 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m920.8/920.8 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-fuzzy
Successfully installed scikit-fuzzy-0.5.0
Collecting osmnx
  Downloading osmnx-2.0.7-py3-none-any.whl.metadata (4.9 kB)
Downloading osmnx-2.0.7-py3-none-any.whl (101 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.5/101.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: osmnx
Successfully installed osmnx-2.0.7


In [2]:


import osmnx as ox
import geopandas as gpd
import pandas as pd
import numpy as np


place = "Stanford University, California"
campus = ox.geocode_to_gdf(place)


buildings = ox.features_from_polygon(
    campus.geometry.iloc[0],
    tags={"building": True}
)


zones = buildings.reset_index()
zones["zone_id"] = zones.index + 1

zones["raw_name"] = zones["name"].fillna("").astype(str)
zones.loc[zones["raw_name"] == "", "raw_name"] = "Building " + zones.loc[zones["raw_name"] == "", "zone_id"].astype(str)

zones["lat"] = zones.geometry.centroid.y
zones["lon"] = zones.geometry.centroid.x
zones["area_m2"] = zones.geometry.area


def classify_zone(row):
    """Classify zone based on building tags and name"""
    name = str(row.get('name', '')).lower()
    building = str(row.get('building', '')).lower()
    amenity = str(row.get('amenity', '')).lower()


    if any(word in name for word in ['dining', 'cafe', 'cafeteria', 'restaurant', 'food', 'stern']):
        return 'Dining'
    if amenity in ['restaurant', 'cafe', 'fast_food']:
        return 'Dining'


    if any(word in name for word in ['library', 'green', 'meyer']):
        return 'Library'
    if amenity == 'library':
        return 'Library'


    if any(word in name for word in ['student center', 'tresidder', 'union', 'gym', 'athletic', 'recreation']):
        return 'StudentCenter'
    if amenity in ['community_centre', 'social_facility']:
        return 'StudentCenter'


    if any(word in name for word in ['engineering', 'gates', 'packard', 'huang', 'computer', 'science']):
        return 'Engineering'
    if any(word in name for word in ['business', 'gsb', 'knight', 'management']):
        return 'Business'
    if any(word in name for word in ['law', 'neukom']):
        return 'Law'
    if any(word in name for word in ['medical', 'medicine', 'hospital', 'clinic', 'health']):
        return 'Medicine'
    if any(word in name for word in ['art', 'museum', 'cantor', 'anderson', 'theater', 'theatre']):
        return 'Arts'
    if any(word in name for word in ['humanities', 'history', 'language', 'literature']):
        return 'Humanities'
    if building in ['university', 'college', 'school']:
        return 'General'


    if any(word in name for word in ['hall', 'residence', 'dorm', 'house', 'housing']):
        return 'Residential'
    if building == 'residential':
        return 'Residential'


    if any(word in name for word in ['gate', 'parking', 'garage', 'station', 'transit']):
        return 'Transit'
    if amenity in ['parking', 'bicycle_parking']:
        return 'Transit'

    return 'General'

zones["school_type"] = zones.apply(classify_zone, axis=1)


def create_display_name(row):
    """Create user-friendly zone names"""
    name = str(row.get('name', ''))
    zone_type = row['school_type']
    zone_id = row['zone_id']

    if name and name != 'nan' and name != '':

        if len(name) > 35:
            name = name[:32] + "..."
        return name
    else:
        return f"{zone_type} Zone {zone_id}"

zones["zone_name"] = zones.apply(create_display_name, axis=1)

try:
    trees = ox.features_from_polygon(
        campus.geometry.iloc[0],
        {"natural": "tree"}
    )

    def compute_shade(geom):
        nearby = trees[trees.distance(geom.centroid) < 30]
        return len(nearby)

    zones["shade_raw"] = zones.geometry.apply(compute_shade)
    tree_data_available = True
    print("Tree data found and processed")

except Exception as e:
    print(f"No tree data available: {e}")
    zones["shade_raw"] = 0
    tree_data_available = False

def assign_shade(row):
    """Combine tree count with zone type assumptions"""
    zone_type = row['school_type']

    if tree_data_available and zones['shade_raw'].max() > 0:
        tree_shade = row['shade_raw'] / zones['shade_raw'].max()
    else:
        tree_shade = 0

    if zone_type == 'Residential':
        base_shade = 0.4
    elif zone_type == 'Library':
        base_shade = 0.2
    elif zone_type == 'Dining':
        base_shade = 0.3
    elif zone_type == 'Engineering':
        base_shade = 0.1
    elif zone_type == 'Transit':
        base_shade = 0.05
    elif zone_type == 'StudentCenter':
        base_shade = 0.5
    elif zone_type == 'Arts':
        base_shade = 0.4
    else:
        base_shade = 0.3

    variation = np.random.uniform(-0.05, 0.05)
    final_shade = 0.7 * base_shade + 0.3 * tree_shade + variation
    return np.clip(final_shade, 0, 1)

np.random.seed(42)
zones["shade_score"] = zones.apply(assign_shade, axis=1)

df_zones = zones[[
    "zone_id", "zone_name", "school_type", "shade_score", "lat", "lon", "area_m2"
]]

df_zones.to_csv("zones.csv", index=False)

print("="*60)
print("🏗️  ZONE CLASSIFICATION RESULTS")
print("="*60)
print("\nZone Type Distribution:")
print(df_zones['school_type'].value_counts())
print(f"\nTotal zones: {len(df_zones)}")
print(f"\nShade Score Statistics:")
print(df_zones['shade_score'].describe())

print("\n📋 Sample Zones by Type:")
for zone_type in df_zones['school_type'].unique()[:5]:
    sample = df_zones[df_zones['school_type'] == zone_type].head(2)
    print(f"\n{zone_type}:")
    for _, row in sample.iterrows():
        print(f"  - {row['zone_name']}")

print("\nZones saved to zones.csv")

df_zones.head(10)


  zones["lat"] = zones.geometry.centroid.y

  zones["lon"] = zones.geometry.centroid.x

  zones["area_m2"] = zones.geometry.area

  nearby = trees[trees.distance(geom.centroid) < 30]

  nearby = trees[trees.distance(geom.centroid) < 30]

  nearby = trees[trees.distance(geom.centroid) < 30]

  nearby = trees[trees.distance(geom.centroid) < 30]

  nearby = trees[trees.distance(geom.centroid) < 30]

  nearby = trees[trees.distance(geom.centroid) < 30]

  nearby = trees[trees.distance(geom.centroid) < 30]

  nearby = trees[trees.distance(geom.centroid) < 30]

  nearby = trees[trees.distance(geom.centroid) < 30]

  nearby = trees[trees.distance(geom.centroid) < 30]

  nearby = trees[trees.distance(geom.centroid) < 30]

  nearby = trees[trees.distance(geom.centroid) < 30]

  nearby = trees[trees.distance(geom.centroid) < 30]

  nearby = trees[trees.distance(geom.centroid) < 30]

  nearby = trees[trees.distance(geom.centroid) < 30]

  nearby = trees[trees.distance(geom.centroid) < 30]

  nea

Tree data found and processed
🏗️  ZONE CLASSIFICATION RESULTS

Zone Type Distribution:
school_type
General          433
Residential       40
Engineering       19
Dining            11
Medicine          10
StudentCenter      8
Transit            7
Arts               5
Library            5
Humanities         2
Law                2
Name: count, dtype: int64

Total zones: 542

Shade Score Statistics:
count    542.000000
mean       0.510828
std        0.051218
min        0.286822
25%        0.482483
50%        0.514913
75%        0.543366
max        0.668423
Name: shade_score, dtype: float64

📋 Sample Zones by Type:

Humanities:
  - Center for the Study of Language...
  - Stanford Humanities Center

General:
  - Galvez Help Center
  - Sterling Quadrangle

Residential:
  - Spruce Hall
  - Turing Hall

Dining:
  - Lucie Stern Hall
  - Faculty Club

Medicine:
  - Stanford Hospital
  - Beckman Center for Molecular and...

Zones saved to zones.csv


Unnamed: 0,zone_id,zone_name,school_type,shade_score,lat,lon,area_m2
0,1,Center for the Study of Language...,Humanities,0.497454,37.430423,-122.178758,0.0
1,2,Galvez Help Center,General,0.555071,37.430773,-122.164964,0.0
2,3,Sterling Quadrangle,General,0.533199,37.425793,-122.180278,0.0
3,4,Lagunita Court,General,0.519866,37.424889,-122.176277,0.0
4,5,Polya Data Center,General,0.475602,37.428828,-122.177743,0.0
5,6,Spruce Hall,Residential,0.545599,37.42855,-122.176909,0.0
6,7,Turing Hall,Residential,0.535808,37.428851,-122.177552,0.0
7,8,Keck Hall,Residential,0.616618,37.431658,-122.171922,0.0
8,9,Crothers Memorial Hall,Residential,0.590112,37.42577,-122.164747,0.0
9,10,Lucie Stern Hall,Dining,0.530807,37.424461,-122.165677,1e-06


In [3]:
df_zones

Unnamed: 0,zone_id,zone_name,school_type,shade_score,lat,lon,area_m2
0,1,Center for the Study of Language...,Humanities,0.497454,37.430423,-122.178758,0.000000e+00
1,2,Galvez Help Center,General,0.555071,37.430773,-122.164964,0.000000e+00
2,3,Sterling Quadrangle,General,0.533199,37.425793,-122.180278,0.000000e+00
3,4,Lagunita Court,General,0.519866,37.424889,-122.176277,0.000000e+00
4,5,Polya Data Center,General,0.475602,37.428828,-122.177743,0.000000e+00
...,...,...,...,...,...,...,...
537,538,General Zone 538,General,0.484735,37.435299,-122.161989,4.380760e-09
538,539,General Zone 539,General,0.505054,37.433815,-122.161846,1.101385e-08
539,540,General Zone 540,General,0.472916,37.426480,-122.184199,1.381646e-08
540,541,General Zone 541,General,0.555405,37.426186,-122.184257,4.776729e-08


In [4]:

import numpy as np
from datetime import datetime, timedelta
import pandas as pd

HOURS = 72
start = datetime(2024, 11, 1, 0, 0, 0)

np.random.seed(42)
zone_personalities = {}
for zone_name in df_zones['zone_name']:
    zone_personalities[zone_name] = {
        'base_popularity': np.random.uniform(0.1, 1.8),
        'morning_boost': np.random.uniform(0.3, 2.0),
        'lunch_boost': np.random.uniform(0.4, 3.0),
        'evening_boost': np.random.uniform(0.2, 1.8),
        'weekend_reduction': np.random.uniform(0.1, 0.7),
        'volatility': np.random.uniform(0.2, 0.6)
    }

school_bias = {
    "Humanities": 0.3,
    "Arts": 0.2,
    "Engineering": 1.6,
    "Business": 1.0,
    "Law": 0.2,
    "Medicine": 0.9,
    "Library": 1.2,
    "Rec": 0.8,
    "StudentCenter": 1.4,
    "Dining": 2.0,
    "Residential": 0.15,
    "Transit": 1.3,
    "General": 0.4
}

rows = []
for h in range(HOURS):
    ts = start + timedelta(hours=h)
    hour = ts.hour
    is_weekend = 1 if ts.weekday() >= 5 else 0

    if 6 <= hour < 8:
        base_schedule = 0.5
        time_phase = 'morning'
    elif 8 <= hour < 10:
        base_schedule = 1.2
        time_phase = 'morning'
    elif 10 <= hour < 12:
        base_schedule = 1.4
        time_phase = 'morning'
    elif 12 <= hour < 14:
        base_schedule = 1.6
        time_phase = 'lunch'
    elif 14 <= hour < 17:
        base_schedule = 1.1
        time_phase = 'afternoon'
    elif 17 <= hour < 19:
        base_schedule = 0.7
        time_phase = 'evening'
    elif 19 <= hour < 22:
        base_schedule = 0.5
        time_phase = 'evening'
    else:
        base_schedule = 0.08
        time_phase = 'night'

    if is_weekend:
        base_schedule *= 0.2

    for _, z in df_zones.iterrows():
        zone_name = z['zone_name']
        zone_type = z['school_type']
        personality = zone_personalities[zone_name]

        type_bias = school_bias.get(zone_type, 0.5)
        popularity = personality['base_popularity']
        volatility = personality['volatility']

        if time_phase == 'morning':
            time_modifier = personality['morning_boost']
        elif time_phase == 'lunch':
            time_modifier = personality['lunch_boost']
        elif time_phase == 'evening':
            time_modifier = personality['evening_boost']
        else:
            time_modifier = 0.5

        class_density = type_bias * base_schedule * popularity * time_modifier

        dining_peak = 0.0
        library_surge = 0.0
        transit_rush = 0.0
        residential_night = 0.0

        if zone_type == "Dining":
            if 7 <= hour <= 9:
                dining_peak = 3.5 * np.random.uniform(0.8, 1.3)
            elif 11 <= hour <= 14:
                dining_peak = 4.0 * np.random.uniform(0.9, 1.4)
            elif 17 <= hour <= 19:
                dining_peak = 3.8 * np.random.uniform(0.8, 1.3)
            else:
                dining_peak = 0.02

        if zone_type == "Library":
            if 18 <= hour <= 23:
                library_surge = 2.0 * np.random.uniform(0.8, 1.3)
            elif 14 <= hour <= 18:
                library_surge = 1.2

        if zone_type == "Transit":
            if hour in [8, 9, 17, 18]:
                transit_rush = 2.5 * np.random.uniform(0.8, 1.3)

        if zone_type == "Residential":
            if 20 <= hour or hour <= 7:
                residential_night = 1.5
            else:
                residential_night = 0.1

        random_component = np.random.normal(0.5, volatility)

        sim_crowd = (
            0.20 * class_density +
            0.40 * dining_peak +
            0.15 * library_surge +
            0.15 * transit_rush +
            0.10 * residential_night +
            0.50 * random_component
        )

        if is_weekend and zone_type not in ["Residential", "Rec"]:
            sim_crowd *= personality['weekend_reduction']

        sim_crowd = np.clip(sim_crowd, 0, 1.2)
        sim_crowd = np.clip(sim_crowd + np.random.normal(0, 0.1), 0, 1)

        rows.append({
            'zone_id': int(z['zone_id']),
            'zone_name': zone_name,
            'timestamp': ts.isoformat(),
            'is_weekend': is_weekend,
            'hour_of_day': hour,
            'class_density': class_density,
            'shade_score': z['shade_score'],
            'sim_crowd': sim_crowd
        })

df_feat = pd.DataFrame(rows)
df_feat.to_csv("features_simulated.csv", index=False)

zone_stats = df_feat.groupby('zone_name')['sim_crowd'].agg(['mean', 'std', 'min', 'max'])
print("\n🏆 TOP 10 BUSIEST ZONES:")
print(zone_stats.sort_values('mean', ascending=False).head(10)[['mean', 'max']])

print("\n🏞️  TOP 10 EMPTIEST ZONES:")
print(zone_stats.sort_values('mean').head(10)[['mean', 'min']])

print("\n📈 ZONES WITH MOST VARIATION (high std):")
print(zone_stats.sort_values('std', ascending=False).head(10)[['std']])

df_feat.head()



🏆 TOP 10 BUSIEST ZONES:
                                         mean       max
zone_name                                              
Ricker Dining Hall                   0.554038  1.000000
Beefeaters                           0.553213  1.000000
Arrillaga Family Dining Commons      0.538891  1.000000
Annette's Cafe                       0.535236  1.000000
Lakeside Dining                      0.514855  1.000000
Avanti                               0.510279  1.000000
Coupa Café                           0.483017  1.000000
Warehouse, Maps & Records Office...  0.397757  0.963567
Memorial Hall                        0.393182  1.000000
Middle Earth                         0.392991  1.000000

🏞️  TOP 10 EMPTIEST ZONES:
                                         mean  min
zone_name                                         
General Zone 122                     0.117896  0.0
Schwab Residential Center            0.121813  0.0
Center for the Study of Language...  0.121872  0.0
Bleeker Hall        

Unnamed: 0,zone_id,zone_name,timestamp,is_weekend,hour_of_day,class_density,shade_score,sim_crowd
0,1,Center for the Study of Language...,2024-11-01T00:00:00,0,0,0.008841,0.497454,0.155892
1,2,Galvez Help Center,2024-11-01T00:00:00,0,0,0.00318,0.555071,0.0
2,3,Sterling Quadrangle,2024-11-01T00:00:00,0,0,0.024242,0.533199,0.115868
3,4,Lagunita Court,2024-11-01T00:00:00,0,0,0.013349,0.519866,0.377954
4,5,Polya Data Center,2024-11-01T00:00:00,0,0,0.014005,0.475602,0.112121


In [5]:

import numpy as np
import pandas as pd
from datetime import datetime, timedelta

start_date = datetime(2024, 11, 1, 0, 0, 0)
hours = 72

print("="*60)
print("🌤️ GENERATING REALISTIC WEATHER DATA")
print("="*60)
print(f"Location: Stanford, CA (37.4275, -122.1697)")
print(f"Period: {start_date} for {hours} hours")

month = start_date.month

if month in [12, 1, 2]:
    temp_min, temp_max = 8, 16
    season = "Winter"
elif month in [3, 4, 5]:
    temp_min, temp_max = 12, 22
    season = "Spring"
elif month in [6, 7, 8]:
    temp_min, temp_max = 18, 32
    season = "Summer"
else:  #fall (9, 10, 11)
    temp_min, temp_max = 10, 20
    season = "Fall"

print(f"Season: {season}")
print(f"Expected temperature range: {temp_min}°C - {temp_max}°C")

np.random.seed(42)
timestamps = [start_date + timedelta(hours=h) for h in range(hours)]
weather_data = []

temp_range = temp_max - temp_min
base_temp = (temp_min + temp_max) / 2

for i, ts in enumerate(timestamps):
    hour = ts.hour
    day = ts.day


    time_factor = np.sin((hour - 6) * np.pi / 12)  # Peak at 2 PM
    daily_variation = (temp_range / 2) * time_factor

    day_variation = np.sin(day * np.pi / 5) * (temp_range * 0.15)

    random_noise = np.random.normal(0, 0.8)

    temp = base_temp + daily_variation + day_variation + random_noise
    temp = np.clip(temp, temp_min - 2, temp_max + 2)

    humidity = 70 - (temp - base_temp) * 1.2 + np.random.normal(0, 5)
    humidity = np.clip(humidity, 35, 85)

    wind = 8 + 4 * (time_factor + 1) / 2 + np.random.normal(0, 2)
    wind = np.clip(wind, 3, 20)

    if 6 <= hour <= 18:
        uv_base = 2 if season == "Winter" else (5 if season in ["Spring", "Fall"] else 8)
        uv = uv_base + 2 * time_factor + np.random.normal(0, 0.5)
        uv = np.clip(uv, 0, 11)
    else:
        uv = 0


    cloud = 30 + 20 * np.sin(day * np.pi / 3) + np.random.normal(0, 10)
    cloud = np.clip(cloud, 10, 80)

    weather_data.append({
        'timestamp': ts.strftime('%Y-%m-%d %H:%M'),
        'temperature_C': round(temp, 1),
        'humidity_pct': round(humidity, 0),
        'wind_kph': round(wind, 1),
        'uv_index': round(uv, 1),
        'cloud_pct': round(cloud, 0)
    })

df_weather = pd.DataFrame(weather_data)
df_weather.to_csv("weather_hourly.csv", index=False)

print(f"\n Generated {len(df_weather)} hours of realistic {season} weather")
print(f"\nWeather Statistics:")
print(f"   Temperature: {df_weather['temperature_C'].min():.1f}°C to {df_weather['temperature_C'].max():.1f}°C")
print(f"   Humidity: {df_weather['humidity_pct'].min():.0f}% to {df_weather['humidity_pct'].max():.0f}%")
print(f"   Wind: {df_weather['wind_kph'].min():.1f} to {df_weather['wind_kph'].max():.1f} km/h")

print("\n📊 Sample (first 24 hours):")
print(df_weather.head(24)[['timestamp', 'temperature_C', 'humidity_pct', 'wind_kph', 'uv_index']])

df_weather.head()

🌤️ GENERATING REALISTIC WEATHER DATA
Location: Stanford, CA (37.4275, -122.1697)
Period: 2024-11-01 00:00:00 for 72 hours
Season: Fall
Expected temperature range: 10°C - 20°C

 Generated 72 hours of realistic Fall weather

Weather Statistics:
   Temperature: 10.9°C to 22.0°C
   Humidity: 55% to 84%
   Wind: 4.9 to 16.0 km/h

📊 Sample (first 24 hours):
           timestamp  temperature_C  humidity_pct  wind_kph  uv_index
0   2024-11-01 00:00           11.3          74.0       9.3       0.0
1   2024-11-01 01:00           10.9          74.0      11.2       0.0
2   2024-11-01 02:00           11.2          77.0       7.3       0.0
3   2024-11-01 03:00           12.5          63.0       5.1       0.0
4   2024-11-01 04:00           12.6          74.0       7.2       0.0
5   2024-11-01 05:00           15.8          68.0       9.6       0.0
6   2024-11-01 06:00           15.4          70.0       7.7       5.2
7   2024-11-01 07:00           16.9          65.0      14.2       5.5
8   2024-11-01 0

Unnamed: 0,timestamp,temperature_C,humidity_pct,wind_kph,uv_index,cloud_pct
0,2024-11-01 00:00,11.3,74.0,9.3,0.0,63.0
1,2024-11-01 01:00,10.9,74.0,11.2,0.0,55.0
2,2024-11-01 02:00,11.2,77.0,7.3,0.0,43.0
3,2024-11-01 03:00,12.5,63.0,5.1,0.0,42.0
4,2024-11-01 04:00,12.6,74.0,7.2,0.0,33.0


In [6]:

import pandas as pd

df_feat = pd.read_csv("features_simulated.csv", parse_dates=['timestamp'])
df_weather = pd.read_csv("weather_hourly.csv", parse_dates=['timestamp'])

df_feat['timestamp_hour'] = pd.to_datetime(df_feat['timestamp']).dt.floor('H')
df_weather['timestamp_hour'] = pd.to_datetime(df_weather['timestamp']).dt.floor('H')

print(f"Features: {len(df_feat)} rows, {df_feat['timestamp_hour'].nunique()} unique hours")
print(f"Weather: {len(df_weather)} rows, {df_weather['timestamp_hour'].nunique()} unique hours")

df = pd.merge(
    df_feat,
    df_weather[['timestamp_hour','temperature_C','humidity_pct','wind_kph','uv_index']],
    on='timestamp_hour',
    how='left'
)

print(f"\nAfter merge: {len(df)} rows")

missing = df[['temperature_C','humidity_pct','wind_kph','uv_index']].isnull().sum()
print(f"\nMissing values:\n{missing}")

df[['temperature_C','humidity_pct','wind_kph','uv_index']] = (
    df[['temperature_C','humidity_pct','wind_kph','uv_index']]
    .fillna(method='ffill')
    .fillna(method='bfill')
)

missing_after = df[['temperature_C','humidity_pct','wind_kph','uv_index']].isnull().sum()
print(f"\nMissing values after fill:\n{missing_after}")

print(f"\nTemperature range: {df['temperature_C'].min():.1f}°C to {df['temperature_C'].max():.1f}°C")

sample_time = df['timestamp_hour'].iloc[0]
same_hour = df[df['timestamp_hour'] == sample_time]
temp_check = same_hour['temperature_C'].nunique()
print(f"\nAt {sample_time}:")
print(f"   Temperature: {same_hour['temperature_C'].iloc[0]:.1f}°C")
print(f"   Unique temps across zones: {temp_check} (should be 1)")


if temp_check == 1:
    print("Weather properly shared across all zones")
else:
    print("ERROR: Different weather per zone!")

df.to_csv("features_with_weather.csv", index=False)
print("\nSaved: features_with_weather.csv")
df.head()

  df_feat['timestamp_hour'] = pd.to_datetime(df_feat['timestamp']).dt.floor('H')
  df_weather['timestamp_hour'] = pd.to_datetime(df_weather['timestamp']).dt.floor('H')
  .fillna(method='ffill')
  .fillna(method='bfill')


Features: 39024 rows, 72 unique hours
Weather: 72 rows, 72 unique hours

After merge: 39024 rows

Missing values:
temperature_C    0
humidity_pct     0
wind_kph         0
uv_index         0
dtype: int64

Missing values after fill:
temperature_C    0
humidity_pct     0
wind_kph         0
uv_index         0
dtype: int64

Temperature range: 10.9°C to 22.0°C

At 2024-11-01 00:00:00:
   Temperature: 11.3°C
   Unique temps across zones: 1 (should be 1)
Weather properly shared across all zones

Saved: features_with_weather.csv


Unnamed: 0,zone_id,zone_name,timestamp,is_weekend,hour_of_day,class_density,shade_score,sim_crowd,timestamp_hour,temperature_C,humidity_pct,wind_kph,uv_index
0,1,Center for the Study of Language...,2024-11-01,0,0,0.008841,0.497454,0.155892,2024-11-01,11.3,74.0,9.3,0.0
1,2,Galvez Help Center,2024-11-01,0,0,0.00318,0.555071,0.0,2024-11-01,11.3,74.0,9.3,0.0
2,3,Sterling Quadrangle,2024-11-01,0,0,0.024242,0.533199,0.115868,2024-11-01,11.3,74.0,9.3,0.0
3,4,Lagunita Court,2024-11-01,0,0,0.013349,0.519866,0.377954,2024-11-01,11.3,74.0,9.3,0.0
4,5,Polya Data Center,2024-11-01,0,0,0.014005,0.475602,0.112121,2024-11-01,11.3,74.0,9.3,0.0


In [7]:


import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras import layers, Model
import joblib

df = pd.read_csv("features_with_weather.csv", parse_dates=['timestamp','timestamp_hour'])

print(f"Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")

le_zone = LabelEncoder()
df['zone_encoded'] = le_zone.fit_transform(df['zone_name'])

df['hour_sin'] = np.sin(2 * np.pi * df['hour_of_day'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour_of_day'] / 24)

df['day_of_week'] = pd.to_datetime(df['timestamp']).dt.dayofweek
df['day_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
df['day_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

df['temp_humidity'] = df['temperature_C'] * df['humidity_pct'] / 100
df['heat_discomfort'] = df['temperature_C'] * (1 / (1 + df['wind_kph']))

numeric_feature_cols = [
    'hour_sin', 'hour_cos',
    'day_sin', 'day_cos',
    'is_weekend',
    'class_density',
    'shade_score',
    'temperature_C',
    'humidity_pct',
    'wind_kph',
    'uv_index',
    'temp_humidity',
    'heat_discomfort'
]

X_numeric = df[numeric_feature_cols].values
X_zone = df['zone_encoded'].values.reshape(-1, 1)
y = df['sim_crowd'].values.reshape(-1, 1)

print(f"\nNumeric features shape: {X_numeric.shape}")
print(f"Zone IDs shape: {X_zone.shape}")
print(f"Target shape: {y.shape}")

num_zones = len(le_zone.classes_)
embedding_dim = min(50, max(10, num_zones // 4))

print(f"\nNumber of zones: {num_zones}")
print(f"Embedding dimension: {embedding_dim}")

zone_input = layers.Input(shape=(1,), name='zone_input')
numeric_input = layers.Input(shape=(X_numeric.shape[1],), name='numeric_input')

zone_embedding = layers.Embedding(
    input_dim=num_zones,
    output_dim=embedding_dim,
    name='zone_embedding'
)(zone_input)
zone_flat = layers.Flatten()(zone_embedding)

concat = layers.Concatenate()([zone_flat, numeric_input])

x = layers.Dense(128, activation='relu')(concat)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.3)(x)

x = layers.Dense(64, activation='relu')(x)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.2)(x)

x = layers.Dense(32, activation='relu')(x)
x = layers.Dropout(0.1)(x)

outputs = layers.Dense(1, activation='sigmoid')(x)

model = Model(inputs=[zone_input, numeric_input], outputs=outputs)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='mse',
    metrics=['mae']
)

model.summary()

print("\nTraining model...")
history = model.fit(
    [X_zone, X_numeric], y,
    epochs=50,
    batch_size=64,
    validation_split=0.15,
    verbose=1
)

df['predicted_crowd'] = model.predict([X_zone, X_numeric], verbose=0)

print("\n" + "="*60)
print("🔍 PREDICTION QUALITY CHECK")
print("="*60)

print("\n1. Overall Crowd Statistics:")
print(df['predicted_crowd'].describe())

pred_std = df['predicted_crowd'].std()
pred_range = df['predicted_crowd'].max() - df['predicted_crowd'].min()

print(f"\n   Standard Deviation: {pred_std:.4f}")
print(f"   Range: {pred_range:.4f}")

if pred_std < 0.15:
    print("FAIL - Predictions too uniform")
elif pred_std < 0.20:
    print("BORDERLINE - Limited variation")
else:
    print("PASS - Good diversity")

print("\n2. Crowd by Hour of Day:")
hourly = df.groupby('hour_of_day')['predicted_crowd'].agg(['mean', 'std'])
print(hourly)

print("\n3. Top 10 Busiest Zones (Average Predicted Crowd):")
zone_preds = df.groupby('zone_name')['predicted_crowd'].mean().sort_values(ascending=False)
print(zone_preds.head(10))

print("\n4. Top 10 Emptiest Zones (Average Predicted Crowd):")
print(zone_preds.tail(10))

print("\n5. Prediction vs Reality (Sample):")
sample = df[['zone_name', 'hour_of_day', 'sim_crowd', 'predicted_crowd']].sample(10)
print(sample)

model.save("crowd_model.keras")
joblib.dump(le_zone, "label_encoder_zone.joblib")

df.to_csv("features_with_predictions.csv", index=False)

print("\n Model saved: crowd_model.keras")
print(" Encoder saved: label_encoder_zone.joblib")
print(" Predictions saved: features_with_predictions.csv")

Dataset loaded: 39024 rows, 13 columns

Numeric features shape: (39024, 13)
Zone IDs shape: (39024, 1)
Target shape: (39024, 1)

Number of zones: 534
Embedding dimension: 50



Training model...
Epoch 1/50
[1m519/519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - loss: 0.0541 - mae: 0.1754 - val_loss: 0.0227 - val_mae: 0.1143
Epoch 2/50
[1m519/519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.0332 - mae: 0.1398 - val_loss: 0.0199 - val_mae: 0.1109
Epoch 3/50
[1m519/519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0309 - mae: 0.1347 - val_loss: 0.0195 - val_mae: 0.1023
Epoch 4/50
[1m519/519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0289 - mae: 0.1310 - val_loss: 0.0192 - val_mae: 0.1088
Epoch 5/50
[1m519/519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0285 - mae: 0.1300 - val_loss: 0.0181 - val_mae: 0.1013
Epoch 6/50
[1m519/519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.0281 - mae: 0.1290 - val_loss: 0.0189 - val_mae: 0.1061
Epoch 7/50
[1m519/519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

In [8]:


df.columns = df.columns.str.lower().str.strip()

hum_col = None
for c in df.columns:
    if "humid" in c:
        hum_col = c
        break

temp_col = None
for c in df.columns:
    if "temp" in c:
        temp_col = c
        break

wind_col = None
for c in df.columns:
    if "wind" in c:
        wind_col = c
        break

shade_col = None
for c in df.columns:
    if "shade" in c:
        shade_col = c
        break

print("Temperature column:", temp_col)
print("Humidity column:", hum_col)
print("Wind column:", wind_col)
print("Shade column:", shade_col)


Temperature column: temperature_c
Humidity column: humidity_pct
Wind column: wind_kph
Shade column: shade_score


In [None]:

import numpy as np
import pandas as pd
import skfuzzy as fuzz
from skfuzzy import control as ctrl

df = pd.read_csv("features_with_predictions.csv", parse_dates=['timestamp','timestamp_hour'])
print(f"Loaded {df.shape[0]} rows")

df.columns = df.columns.str.lower().str.strip()

temp_col = [c for c in df.columns if "temp" in c][0]
hum_col = [c for c in df.columns if "humid" in c][0]
wind_col = [c for c in df.columns if "wind" in c][0]
print(f"Using columns: {temp_col}, {hum_col}, {wind_col}")

df['month'] = pd.to_datetime(df['timestamp']).dt.month

def get_season(month):
    """Return season based on month."""
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Fall'

df['season'] = df['month'].apply(get_season)

def compute_seasonal_heat_stress(T, RH, season):
    """
    Compute heat stress relative to seasonal expectations.
    LOWER is better (less stress).

    Winter: Comfortable = 8-16°C → stress 0-15
    Fall: Comfortable = 12-20°C → stress 0-20
    Spring: Comfortable = 14-22°C → stress 0-25
    Summer: Comfortable up to 25°C, hot above 28°C → stress 0-100
    """

    if T >= 27:
        Tf = T * 9/5 + 32
        HI = (-42.379 + 2.04901523*Tf + 10.14333127*RH
              - 0.22475541*Tf*RH - 6.83783e-3*Tf**2
              - 5.481717e-2*RH**2 + 1.22874e-3*Tf**2*RH
              + 8.5282e-4*Tf*RH**2 - 1.99e-6*Tf**2*RH**2)
        heat_index = (HI - 32) * 5/9
    else:
        heat_index = T

    if season == 'Winter':
        if heat_index < 0:
            stress = 30 + abs(heat_index) * 2
        elif heat_index < 8:
            stress = (8 - heat_index) * 2
        elif heat_index <= 16:
            stress = 0
        elif heat_index <= 20:
            stress = (heat_index - 16) * 3
        else:
            stress = 12 + (heat_index - 20) * 2
        max_stress = 30

    elif season == 'Summer':
        if heat_index < 18:
            stress = (18 - heat_index) * 1.5
        elif heat_index <= 25:
            stress = 0
        elif heat_index <= 30:
            stress = (heat_index - 25) * 8
        else:
            stress = 40 + (heat_index - 30) * 15
        max_stress = 100

    elif season == 'Spring':
        if heat_index < 10:
            stress = (10 - heat_index) * 2
        elif heat_index <= 14:
            stress = (14 - heat_index) * 1
        elif heat_index <= 22:
            stress = 0
        elif heat_index <= 26:
            stress = (heat_index - 22) * 4
        else:
            stress = 16 + (heat_index - 26) * 6
        max_stress = 50

    else:
        if heat_index < 8:
            stress = (8 - heat_index) * 2
        elif heat_index <= 12:
            stress = (12 - heat_index) * 1
        elif heat_index <= 20:
            stress = 0
        elif heat_index <= 24:
            stress = (heat_index - 20) * 3
        else:
            stress = 12 + (heat_index - 24) * 5
        max_stress = 30

    return np.clip(stress, 0, max_stress)

print("\nComputing season-aware heat stress...")
df['heat_stress'] = df.apply(
    lambda row: compute_seasonal_heat_stress(
        row[temp_col],
        row[hum_col],
        row['season']
    ),
    axis=1
)
print("Heat stress computed")
print(f"\nHeat stress by season:")
for season in ['Winter', 'Spring', 'Summer', 'Fall']:
    season_data = df[df['season'] == season]['heat_stress']
    if len(season_data) > 0:
        print(f"   {season}: {season_data.min():.1f} to {season_data.max():.1f} (mean: {season_data.mean():.1f})")

print("\n🌡️ Temperature → Heat Stress Examples:")
sample_hours = df.groupby('hour_of_day').agg({
    'temperature_c': 'mean',
    'heat_stress': 'mean',
    'season': 'first'
}).round(1).head(24)
print(sample_hours)

print("\nBuilding STRICTER fuzzy logic system...")

crowd = ctrl.Antecedent(np.linspace(0, 1, 201), 'crowd')
heat = ctrl.Antecedent(np.linspace(0, 100, 201), 'heat')
shade = ctrl.Antecedent(np.linspace(0, 1, 201), 'shade')
comfort = ctrl.Consequent(np.linspace(0, 100, 201), 'comfort')

crowd['low'] = fuzz.trapmf(crowd.universe, [0, 0, 0.10, 0.20])
crowd['med'] = fuzz.trapmf(crowd.universe, [0.15, 0.30, 0.50, 0.65])
crowd['high'] = fuzz.trapmf(crowd.universe, [0.60, 0.75, 1, 1])

heat['low'] = fuzz.trapmf(heat.universe, [0, 0, 20, 35])
heat['med'] = fuzz.trapmf(heat.universe, [30, 45, 60, 70])
heat['high'] = fuzz.trapmf(heat.universe, [65, 80, 100, 100])

shade['none'] = fuzz.trapmf(shade.universe, [0, 0, 0.20, 0.35])
shade['partial'] = fuzz.trapmf(shade.universe, [0.30, 0.45, 0.55, 0.65])
shade['full'] = fuzz.trapmf(shade.universe, [0.60, 0.75, 1, 1])

comfort['verylow'] = fuzz.trapmf(comfort.universe, [0, 0, 15, 25])
comfort['low'] = fuzz.trapmf(comfort.universe, [20, 30, 40, 50])
comfort['medium'] = fuzz.trapmf(comfort.universe, [45, 55, 65, 75])
comfort['high'] = fuzz.trapmf(comfort.universe, [70, 80, 90, 95])
comfort['veryhigh'] = fuzz.trapmf(comfort.universe, [92, 97, 100, 100])

rules = [
    ctrl.Rule(crowd['high'] & heat['high'], comfort['verylow']),
    ctrl.Rule(crowd['high'] & heat['high'] & shade['none'], comfort['verylow']),
    ctrl.Rule(crowd['high'] & heat['med'] & shade['none'], comfort['verylow']),
    ctrl.Rule(crowd['med'] & heat['high'] & shade['none'], comfort['verylow']),

    ctrl.Rule(crowd['high'] & heat['med'], comfort['low']),
    ctrl.Rule(crowd['med'] & heat['high'], comfort['low']),
    ctrl.Rule(crowd['high'] & shade['none'], comfort['low']),
    ctrl.Rule(heat['high'] & shade['partial'], comfort['low']),
    ctrl.Rule(crowd['med'] & heat['med'] & shade['none'], comfort['low']),

    ctrl.Rule(crowd['med'] & heat['med'], comfort['medium']),
    ctrl.Rule(crowd['low'] & heat['high'], comfort['medium']),
    ctrl.Rule(crowd['high'] & heat['low'] & shade['full'], comfort['medium']),
    ctrl.Rule(crowd['med'] & shade['partial'], comfort['medium']),
    ctrl.Rule(heat['med'] & shade['partial'], comfort['medium']),

    ctrl.Rule(crowd['low'] & heat['low'], comfort['high']),
    ctrl.Rule(crowd['low'] & heat['med'] & shade['full'], comfort['high']),
    ctrl.Rule(crowd['med'] & heat['low'] & shade['full'], comfort['high']),
    ctrl.Rule(shade['full'] & heat['low'], comfort['high']),

    ctrl.Rule(crowd['low'] & heat['low'] & shade['full'], comfort['veryhigh']),
    ctrl.Rule(crowd['low'] & heat['low'] & shade['partial'], comfort['veryhigh']),
]

comfort_ctrl = ctrl.ControlSystem(rules)
comfort_sim = ctrl.ControlSystemSimulation(comfort_ctrl)
print(f"Fuzzy system built with {len(rules)} STRICTER rules")

def compute_comfort_strict(row):
    """Compute comfort score with BALANCED time-aware fuzzy logic."""
    crowd_val = np.clip(float(row['predicted_crowd']), 0.01, 0.99)
    heat_val = np.clip(float(row['heat_stress']), 0.01, 99.99)
    shade_val = np.clip(float(row['shade_score']), 0.01, 0.99)

    hour = pd.to_datetime(row['timestamp']).hour
    is_night = (hour >= 22 or hour <= 6)

    try:
        comfort_sim.input['crowd'] = crowd_val
        comfort_sim.input['heat'] = heat_val
        comfort_sim.input['shade'] = shade_val
        comfort_sim.compute()
        score = comfort_sim.output.get('comfort', None)

        if score is None:
            raise ValueError("No fuzzy output")

        if is_night:
            if crowd_val < 0.2 and heat_val < 30:
                score *= 1.10
            elif crowd_val < 0.4 and heat_val < 40:
                score *= 1.05
        else:
            if crowd_val > 0.7:
                score *= 0.75
            if heat_val > 70:
                score *= 0.80
            if heat_val > 50 and shade_val < 0.3:
                score *= 0.85

        return np.clip(score, 0, 100)

    except Exception:
        if is_night and crowd_val < 0.3:
            base = 75 - (crowd_val * 35)
        else:
            base = 60 - (crowd_val * 50 + heat_val * 0.35)

        shade_bonus = shade_val * 10 if not is_night else 0
        interaction_penalty = (crowd_val * heat_val * 0.25) if not is_night else 0
        fallback = base + shade_bonus - interaction_penalty

        return np.clip(fallback, 20, 90)

print("\nComputing STRICTER comfort scores...")
df['comfort_score'] = df.apply(compute_comfort_strict, axis=1)

df.to_csv("features_with_comfort_final.csv", index=False)

print("\n" + "="*60)
print("🔍 SEASON-AWARE HEAT STRESS DIAGNOSTICS")
print("="*60)

print("\nComfort Score Statistics:")
print(df['comfort_score'].describe())

comfort_std = df['comfort_score'].std()
comfort_range = df['comfort_score'].max() - df['comfort_score'].min()

print(f"\nStandard Deviation: {comfort_std:.2f}")
print(f"Range: {comfort_range:.2f}")

if comfort_std >= 15:
    print(" EXCELLENT variation")
elif comfort_std >= 10:
    print("Good variation")
else:
    print("Needs more variation")

print("\n Saved: features_with_comfort_final.csv")

print("\n📊 Comfort Distribution:")
bins = [0, 20, 40, 60, 80, 100]
labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
df['comfort_cat'] = pd.cut(df['comfort_score'], bins=bins, labels=labels)
print(df['comfort_cat'].value_counts().sort_index())

Loaded 39024 rows
Using columns: temperature_c, humidity_pct, wind_kph

Computing season-aware heat stress...
Heat stress computed

Heat stress by season:
   Fall: 0.0 to 6.0 (mean: 0.7)

🌡️ Temperature → Heat Stress Examples:
             temperature_c  heat_stress season
hour_of_day                                   
0                     11.2          0.8   Fall
1                     12.2          0.4   Fall
2                     11.5          0.5   Fall
3                     12.3          0.0   Fall
4                     14.6          0.0   Fall
5                     15.0          0.0   Fall
6                     16.3          0.0   Fall
7                     17.5          0.0   Fall
8                     19.6          0.6   Fall
9                     19.8          0.4   Fall
10                    19.4          0.9   Fall
11                    19.9          0.4   Fall
12                    21.5          4.5   Fall
13                    21.1          3.4   Fall
14                   

In [None]:

import pandas as pd
import numpy as np

df = pd.read_csv("features_with_comfort_final.csv")

print("=" * 60)
print("🔍 MODEL QUALITY DIAGNOSTICS")
print("=" * 60)

# Print stats and variation for predicted crowd values
print("\nCROWD PREDICTION QUALITY")
print(df['predicted_crowd'].describe())
crowd_std = df['predicted_crowd'].std()
crowd_range = df['predicted_crowd'].max() - df['predicted_crowd'].min()

print(f"\nStandard Deviation: {crowd_std:.4f}")
print(f"Range: {crowd_range:.4f}")

if crowd_std < 0.10:
    print("FAIL - Predictions too similar")
elif crowd_std < 0.15:
    print("BORDERLINE - Limited diversity")
else:
    print("PASS - Good prediction diversity")

#comfort score distribution
print("\nCOMFORT SCORE QUALITY")
print(df['comfort_score'].describe())
comfort_std = df['comfort_score'].std()
comfort_range = df['comfort_score'].max() - df['comfort_score'].min()

print(f"\n   Standard Deviation: {comfort_std:.2f}")
print(f"   Range: {comfort_range:.2f}")

if comfort_std < 8:
    print("FAIL - Scores too clustered")
elif comfort_std < 12:
    print("BORDERLINE - Acceptable for demo")
else:
    print("PASS - Good score diversity")

if comfort_range < 35:
    print("FAIL - Range too narrow")
elif comfort_range < 50:
    print("BORDERLINE - Acceptable for demo")
else:
    print("PASS - Good score range")

fallback_count = (df['comfort_score'] == 50.0).sum()
fallback_rate = fallback_count / len(df) * 100

print(f"\nFUZZY LOGIC ERROR RATE")
print(f"Fallback values (50.0): {fallback_count} ({fallback_rate:.1f}%)")

if fallback_rate > 10:
    print("FAIL - Too many fuzzy errors")
elif fallback_rate > 5:
    print("WARN - Some fuzzy errors")
else:
    print("PASS - Fuzzy logic working well")

print("\nCOMFORT SCORE DISTRIBUTION")
bins = [0, 20, 40, 60, 80, 100]
labels = ['Very Low (0-20)', 'Low (20-40)', 'Med (40-60)', 'High (60-80)', 'Very High (80-100)']
df['comfort_category'] = pd.cut(df['comfort_score'], bins=bins, labels=labels)
distribution = df['comfort_category'].value_counts().sort_index()
print(distribution)

high_zone_count = distribution['High (60-80)']
total_zones = len(df)
concentration = high_zone_count / total_zones * 100

if concentration > 70:
    print(f"WARNING: {concentration:.1f}% of scores in High range (too concentrated)")
else:
    print(f"Distribution spread: {concentration:.1f}% in High range")

print("\nTOP 5 BEST ZONES (Average Comfort)")
zone_avg = df.groupby('zone_name')['comfort_score'].mean().sort_values(ascending=False)
print(zone_avg.head(5))

print("\nTOP 5 WORST ZONES (Average Comfort)")
print(zone_avg.tail(5))

best_zone_score = zone_avg.iloc[0]
worst_zone_score = zone_avg.iloc[-1]
zone_spread = best_zone_score - worst_zone_score

print(f"\nBest zone: {best_zone_score:.1f}")
print(f"Worst zone: {worst_zone_score:.1f}")
print(f"Spread: {zone_spread:.1f} points")

if zone_spread < 10:
    print("FAIL - Zones too similar")
elif zone_spread < 20:
    print("BORDERLINE - Some differentiation")
else:
    print("PASS - Clear zone differences")

print("\nCOMFORT BY HOUR OF DAY")
hourly = df.groupby('hour_of_day')['comfort_score'].agg(['mean', 'std'])
print(hourly)

print("\nCORRELATION WITH INPUTS")
print(f"Comfort vs Crowd: {df['comfort_score'].corr(df['predicted_crowd']):.3f}")
print(f"Comfort vs Heat: {df['comfort_score'].corr(df['heat_stress']):.3f}")
print(f"Comfort vs Shade: {df['comfort_score'].corr(df['shade_score']):.3f}")

print("\n" + "=" * 60)
print("OVERALL VERDICT")
print("=" * 60)

passes = 0
warnings = 0

if crowd_std >= 0.10: passes += 1
elif crowd_std >= 0.08: warnings += 1

if comfort_std >= 8: passes += 1
elif comfort_std >= 6: warnings += 1

if comfort_range >= 35: passes += 1
elif comfort_range >= 30: warnings += 1

if fallback_rate <= 5: passes += 1
elif fallback_rate <= 10: warnings += 1

if zone_spread >= 10: passes += 1
elif zone_spread >= 7: warnings += 1

print(f"Passes: {passes}/5")
print(f"Warnings: {warnings}/5")

if passes >= 4:
    print("\nREADY FOR STREAMLIT - Model quality is good!")
elif passes + warnings >= 4:
    print("\nACCEPTABLE FOR DEMO - Model works but could be better")
    print("→ Streamlit will function, but recommendations may be similar")
else:
    print("\n NOT READY - Model needs improvement")
    print("→ Consider re-running Cell 3 with different random seed")

print("=" * 60)

🔍 MODEL QUALITY DIAGNOSTICS

1️⃣ CROWD PREDICTION QUALITY
count    39024.000000
mean         0.197758
std          0.131720
min          0.045970
25%          0.099219
50%          0.160188
75%          0.270512
max          0.995757
Name: predicted_crowd, dtype: float64

   Standard Deviation: 0.1317
   Range: 0.9498
   ⚠️  BORDERLINE - Limited diversity

2️⃣ COMFORT SCORE QUALITY
count    39024.000000
mean        76.727453
std         15.089519
min         20.000000
25%         60.000000
50%         84.967169
75%         86.663341
max         95.329675
Name: comfort_score, dtype: float64

   Standard Deviation: 15.09
   Range: 75.33
   ✅ PASS - Good score diversity
   ✅ PASS - Good score range

3️⃣ FUZZY LOGIC ERROR RATE
   Fallback values (50.0): 0 (0.0%)
   ✅ PASS - Fuzzy logic working well

4️⃣ COMFORT SCORE DISTRIBUTION
comfort_category
Very Low (0-20)         106
Low (20-40)             388
Med (40-60)            7490
High (60-80)           9868
Very High (80-100)    21172
Name:

In [None]:


import pandas as pd
import numpy as np
from datetime import timedelta

df = pd.read_csv("features_with_comfort_final.csv", parse_dates=['timestamp','timestamp_hour'])

print(f"Loaded {df.shape[0]} rows for recommendations")

def best_zone_now(df_current):
    """
    Find the zone with highest comfort score at current time.

    Returns:
        dict with zone info, or error dict if no data
    """
    if df_current.empty:
        return {
            'zone_name': 'No data available',
            'comfort_score': 0,
            'predicted_crowd': 0,
            'heat_stress': 0,
            'shade_score': 0,
            'error': True
        }

    best_idx = df_current['comfort_score'].idxmax()
    best_row = df_current.loc[best_idx]

    return {
        'zone_name': best_row['zone_name'],
        'comfort_score': float(best_row['comfort_score']),
        'predicted_crowd': float(best_row['predicted_crowd']),
        'heat_stress': float(best_row['heat_stress']),
        'shade_score': float(best_row['shade_score']),
        'error': False
    }


def worst_zones_now(df_current, n=3):
    """
    Find the N worst zones to avoid right now.
    """
    if df_current.empty:
        return []

    worst = df_current.nsmallest(n, 'comfort_score')
    return worst[['zone_name', 'comfort_score', 'predicted_crowd', 'heat_stress']].to_dict('records')


def best_time_to_walk(df_all, origin_zone, dest_zone, start_time, window_hours=3):
    """
    Find best time in next N hours to walk between two zones.

    Returns:
        dict with timing info, or None if no valid times
    """
    start = pd.to_datetime(start_time)

    times = sorted(df_all['timestamp_hour'].unique())
    candidates = [t for t in times if start <= t <= start + timedelta(hours=window_hours)]

    if not candidates:
        return None

    best_time = None
    best_score = -1

    for t in candidates:
        subset = df_all[df_all['timestamp_hour'] == t]

        origin_data = subset[subset['zone_name'] == origin_zone]
        dest_data = subset[subset['zone_name'] == dest_zone]

        if origin_data.empty or dest_data.empty:
            continue

        origin_comfort = float(origin_data['comfort_score'].iloc[0])
        dest_comfort = float(dest_data['comfort_score'].iloc[0])
        route_comfort = min(origin_comfort, dest_comfort)

        if route_comfort > best_score:
            best_score = route_comfort
            best_time = t

    if best_time is None:
        return None

    minutes_from_now = int((best_time - start).total_seconds() / 60)

    return {
        'best_time': best_time,
        'comfort_score': best_score,
        'minutes_from_now': minutes_from_now
    }


def zone_forecast(df_all, zone_name, start_time, hours=6):
    """
    Get comfort forecast for a specific zone over next N hours.
    """
    start = pd.to_datetime(start_time)
    end = start + timedelta(hours=hours)

    forecast = df_all[
        (df_all['zone_name'] == zone_name) &
        (df_all['timestamp_hour'] >= start) &
        (df_all['timestamp_hour'] <= end)
    ].sort_values('timestamp_hour')

    return forecast[['timestamp_hour', 'predicted_crowd', 'heat_stress', 'comfort_score']]


current_time = df['timestamp_hour'].min()
print(f"\n🕐 Current time: {current_time}")

df_now = df[df['timestamp_hour'] == current_time]

print("\n" + "="*60)
print(" BEST ZONE RIGHT NOW")
print("="*60)
best_zone = best_zone_now(df_now)
if not best_zone['error']:
    print(f"Zone: {best_zone['zone_name']}")
    print(f"Comfort Score: {best_zone['comfort_score']:.1f}/100")
    print(f"Predicted Crowd: {best_zone['predicted_crowd']:.2f} (0=empty, 1=packed)")
    print(f"Heat Stress: {best_zone['heat_stress']:.1f}/100")
    print(f"Shade: {best_zone['shade_score']:.2f}")
else:
    print(" No data available")

print("\n" + "="*60)
print("ZONES TO AVOID RIGHT NOW")
print("="*60)
avoid = worst_zones_now(df_now, n=3)
for i, z in enumerate(avoid, 1):
    print(f"{i}. {z['zone_name']}: Comfort={z['comfort_score']:.1f}, Crowd={z['predicted_crowd']:.2f}")

print("\n" + "="*60)
print("🚶 BEST TIME TO WALK")
print("="*60)

zones = df['zone_name'].unique()
if len(zones) >= 2:
    origin = zones[0]
    dest = zones[1]

    walk_result = best_time_to_walk(df, origin, dest, current_time, window_hours=4)

    if walk_result:
        print(f"From: {origin}")
        print(f"To: {dest}")
        print(f"Best time: {walk_result['best_time']} ({walk_result['minutes_from_now']} minutes from now)")
        print(f"Expected comfort: {walk_result['comfort_score']:.1f}/100")
    else:
        print("No valid walking times found in next 4 hours")

print("\n" + "="*60)
print("6-HOUR FORECAST FOR SAMPLE ZONE")
print("="*60)
sample_zone = zones[0]
forecast = zone_forecast(df, sample_zone, current_time, hours=6)
print(f"\nZone: {sample_zone}\n")
print(forecast.to_string(index=False))

print("\nRecommendation engine tested successfully!")

Loaded 39024 rows for recommendations

🕐 Current time: 2024-11-01 00:00:00

🏆 BEST ZONE RIGHT NOW
Zone: Arrillaga Center for Sports and ...
Comfort Score: 92.7/100
Predicted Crowd: 0.18 (0=empty, 1=packed)
Heat Stress: 0.7/100
Shade: 0.66

⚠️  ZONES TO AVOID RIGHT NOW
1. Crothers Memorial Hall: Comfort=60.0, Crowd=0.42
2. Cedar Hall: Comfort=60.0, Crowd=0.43
3. Potter House: Comfort=60.0, Crowd=0.41

🚶 BEST TIME TO WALK
From: Center for the Study of Language...
To: Galvez Help Center
Best time: 2024-11-01 03:00:00 (180 minutes from now)
Expected comfort: 79.5/100

📈 6-HOUR FORECAST FOR SAMPLE ZONE

Zone: Center for the Study of Language...

     timestamp_hour  predicted_crowd  heat_stress  comfort_score
2024-11-01 00:00:00         0.202585          0.7      63.000000
2024-11-01 01:00:00         0.201942          1.1      63.000000
2024-11-01 02:00:00         0.204711          0.8      63.000000
2024-11-01 03:00:00         0.181259          0.0      79.532522
2024-11-01 04:00:00       

In [None]:


import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

df = pd.read_csv("features_with_comfort_final.csv", parse_dates=['timestamp','timestamp_hour'])

print(f"Dataset loaded: {df.shape[0]} rows")
print(f"Unique zones: {df['zone_name'].nunique()}")
print(f"Time range: {df['timestamp_hour'].min()} to {df['timestamp_hour'].max()}")

ts0 = df['timestamp_hour'].unique()[0]
df_ts0 = df[df['timestamp_hour'] == ts0].copy()

df_ts0 = df_ts0.sort_values('comfort_score', ascending=False)

df_ts0_top = df_ts0.head(30)

fig1 = px.bar(
    df_ts0_top,
    x='zone_name',
    y='comfort_score',
    title=f'Top 30 Comfort Scores at {ts0}',
    labels={'zone_name': 'Zone', 'comfort_score': 'Comfort Score'},
    color='comfort_score',
    color_continuous_scale='RdYlGn',
    range_color=[0, 100]
)

fig1.update_layout(
    xaxis_tickangle=45,
    height=600,
    showlegend=False
)

fig1.show()


print("\n=== COMFORT SCORE STATISTICS ===")
print(df_ts0['comfort_score'].describe())
print(f"\nUnique comfort scores: {df_ts0['comfort_score'].nunique()}")
print(f"Zones with score = 50.0: {(df_ts0['comfort_score'] == 50.0).sum()}")
print(f"Zones with score ~82.64: {((df_ts0['comfort_score'] > 82) & (df_ts0['comfort_score'] < 83)).sum()}")

available_zones = df['zone_name'].value_counts().head(10).index.tolist()

print("\n=== AVAILABLE ZONES (Top 10 by data points) ===")
for i, zone in enumerate(available_zones, 1):
    print(f"{i}. {zone}")

selected_zone = available_zones[0]
print(f"\nPlotting trends for: {selected_zone}")

df_zone = df[df['zone_name'] == selected_zone].sort_values('timestamp_hour')

print(f"Data points: {len(df_zone)}")
print(f"Crowd range: {df_zone['predicted_crowd'].min():.3f} to {df_zone['predicted_crowd'].max():.3f}")
print(f"Heat range: {df_zone['heat_stress'].min():.1f} to {df_zone['heat_stress'].max():.1f}")
print(f"Comfort range: {df_zone['comfort_score'].min():.1f} to {df_zone['comfort_score'].max():.1f}")

fig2 = go.Figure()

fig2.add_trace(go.Scatter(
    x=df_zone['timestamp_hour'],
    y=df_zone['predicted_crowd'] * 100,
    name='Crowd Level (%)',
    line=dict(color='blue', width=2)
))

fig2.add_trace(go.Scatter(
    x=df_zone['timestamp_hour'],
    y=df_zone['heat_stress'],
    name='Heat Stress',
    line=dict(color='red', width=2)
))

fig2.add_trace(go.Scatter(
    x=df_zone['timestamp_hour'],
    y=df_zone['comfort_score'],
    name='Comfort Score',
    line=dict(color='green', width=3)
))

fig2.update_layout(
    title=f'Trends for {selected_zone}',
    xaxis_title='Time',
    yaxis_title='Score (0-100)',
    height=500,
    hovermode='x unified'
)

fig2.show()


fig3 = px.histogram(
    df_ts0,
    x='comfort_score',
    nbins=20,
    title=f'Distribution of Comfort Scores at {ts0}',
    labels={'comfort_score': 'Comfort Score', 'count': 'Number of Zones'}
)

fig3.update_layout(height=400)
fig3.show()

fig4 = px.scatter(
    df_ts0.sample(min(100, len(df_ts0))),
    x='predicted_crowd',
    y='comfort_score',
    color='heat_stress',
    size='shade_score',
    hover_data=['zone_name'],
    title=f'Crowd vs Comfort (sized by shade) at {ts0}',
    labels={
        'predicted_crowd': 'Predicted Crowd (0=empty, 1=full)',
        'comfort_score': 'Comfort Score',
        'heat_stress': 'Heat Stress'
    },
    color_continuous_scale='Reds'
)

fig4.update_layout(height=500)
fig4.show()

print("\n=== DATA QUALITY CHECK ===")
print(f"Rows with comfort = 50.0 (default fallback): {(df['comfort_score'] == 50.0).sum()}")
print(f"Rows with comfort = 82.64 (repeated value): {((df['comfort_score'] > 82.6) & (df['comfort_score'] < 82.7)).sum()}")
print(f"Total rows: {len(df)}")
print(f"\nThis suggests fuzzy logic may not be working correctly if most values are the same!")

print("\n=== SAMPLE DATA ===")
print(df[['zone_name', 'predicted_crowd', 'heat_stress', 'shade_score', 'comfort_score']].head(10))

Dataset loaded: 39024 rows
Unique zones: 534
Time range: 2024-11-01 00:00:00 to 2024-11-03 23:00:00



=== COMFORT SCORE STATISTICS ===
count    542.000000
mean      65.134786
std        5.649305
min       60.000000
25%       63.000000
50%       63.000000
75%       63.000000
max       92.660373
Name: comfort_score, dtype: float64

Unique comfort scores: 137
Zones with score = 50.0: 0
Zones with score ~82.64: 2

=== AVAILABLE ZONES (Top 10 by data points) ===
1. Environmental Safety Facility Bu...
2. Spruce Hall
3. Lucile Packard Children's Hospit...
4. John A. & Cynthia Fry Gunn Building
5. Burnham Pavilion
6. Mechanical Engineering, TSG
7. Stanford Hospital
8. General Zone 357
9. Lake House
10. Acacia

Plotting trends for: Environmental Safety Facility Bu...
Data points: 216
Crowd range: 0.067 to 0.706
Heat range: 0.0 to 6.0
Comfort range: 26.9 to 95.3



=== DATA QUALITY CHECK ===
Rows with comfort = 50.0 (default fallback): 0
Rows with comfort = 82.64 (repeated value): 41
Total rows: 39024

This suggests fuzzy logic may not be working correctly if most values are the same!

=== SAMPLE DATA ===
                             zone_name  predicted_crowd  heat_stress  \
0  Center for the Study of Language...         0.202585          0.7   
1                   Galvez Help Center         0.190394          0.7   
2                  Sterling Quadrangle         0.266089          0.7   
3                       Lagunita Court         0.231090          0.7   
4                    Polya Data Center         0.260400          0.7   
5                          Spruce Hall         0.398817          0.7   
6                          Turing Hall         0.358850          0.7   
7                            Keck Hall         0.423567          0.7   
8               Crothers Memorial Hall         0.424318          0.7   
9                     Lucie Stern 

In [None]:


import os
from google.colab import files

print("="*60)
print("DOWNLOADING FILES FROM COLAB")
print("="*60)

required_files = [
    'crowd_model.keras',
    'label_encoder_zone.joblib',
    'features_with_comfort_final.csv',
    'zones.csv'
]

print("\n📁 Checking files...")
existing_files = []
missing_files = []

for filename in required_files:
    if os.path.exists(filename):
        size = os.path.getsize(filename) / (1024 * 1024)
        print(f"{filename} ({size:.2f} MB)")
        existing_files.append(filename)
    else:
        print(f"{filename} - NOT FOUND")
        missing_files.append(filename)

if missing_files:
    print(f"\nWARNING: {len(missing_files)} files are missing!")
    print("Missing files:", missing_files)
    print("\nPossible issues:")
    print("1. Cell 7 didn't run successfully (model not saved)")
    print("2. Cell 8 didn't run successfully (comfort scores not computed)")
    print("\nPlease re-run the cells that create these files.")
else:
    print(f"\nAll {len(existing_files)} files found!")

# Download existing files
if existing_files:
    print("\nDownloading files...")
    for filename in existing_files:
        print(f"Downloading {filename}...")
        files.download(filename)

    print("\nDownload complete!")
    print("\n Next steps:")
    print("1. Place all downloaded files in the same folder as streamlit_app.py")
    print("2. Run: python check_setup.py")
    print("3. Run: streamlit run streamlit_app.py")
else:
    print("\nNo files to download. Please fix the issues above first.")

print("="*60)

📥 DOWNLOADING FILES FROM COLAB

📁 Checking files...
✅ crowd_model.keras (0.58 MB)
✅ label_encoder_zone.joblib (0.01 MB)
✅ features_with_comfort_final.csv (10.81 MB)
✅ zones.csv (0.06 MB)

✅ All 4 files found!

📥 Downloading files...
Downloading crowd_model.keras...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading label_encoder_zone.joblib...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading features_with_comfort_final.csv...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading zones.csv...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Download complete!

📋 Next steps:
1. Place all downloaded files in the same folder as streamlit_app.py
2. Run: python check_setup.py
3. Run: streamlit run streamlit_app.py
