# Import Packages

In [1]:
%load_ext autoreload
%autoreload 2

In [89]:
import sys
import requests
import pandas as pd
# import geopandas as gpd
from shapely.geometry import Point, Polygon
import folium
import json
import time
import numpy as np
import h3
from folium.plugins import HeatMap
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
import time
import json

In [90]:
sys.path.append('src')

In [91]:
from fetch_data import *
from data_io import *
from data_prep import *
from scoring import *
from threshold_clustering import *
from dbscan_clustering import *
from visualization import *
from budget_filter import *

# Inputs

In [109]:
#### FRONTEND INPUTS ####

# Preference weight per POI category.
# NOTE: the UI will eventually send ranks rather than weights, for example
#   police_station: 6, grocery_store: 1, hospital: 5, marta_stop: 2,
#   school: 0, restaurant: 3, park: 4
# and those ranks will have to be converted to weights before use.
user_weights = dict(
    restaurant=0.49,
    grocery_store=0.86,
    school=0.72,
    hospital=0.7,
    marta_stop=0.41,
    police_station=0.79,
    park=0.75,
    crime_incident=0.8,
)

# Search area: centre point (lat, lon) plus a radius.
# The radius is held in miles and then converted to kilometres for the pipeline.
center = (33.749, -84.388)
user_radius_miles = 12/1.60934
user_radius_km = user_radius_miles * 1.60934

# Remaining user choices: vehicle ownership and the upper limit later handed
# to the budget filter.
user_has_vehicle = False
budget = 1200

# Query Data

In [110]:
# all_pois = []
# all_pois = query_restaurant_data(ATLANTA_BBOX, all_pois)
# all_pois = query_park_data(ATLANTA_BBOX, all_pois)
# all_pois = query_hospital_and_clinic_data(ATLANTA_BBOX, all_pois)
# print(f"Total POIs fetched: {len(all_pois)}")

# Query Save & Load

In [111]:
name_of_the_file = "combine_datasets_v2"

In [112]:
# save_pois(all_pois, name_of_the_file)

In [113]:
df_pois = load_pois(name_of_the_file)

Loaded 170681
Summary by type:
type
school            102130
crime_incident     50202
marta_stop          9266
hospital            8013
park                 503
grocery_store        300
police_station       247
restaurant            20
Name: count, dtype: int64


# Basic EDA

In [97]:
df_pois.isnull().sum(axis=0)

type    0
name    0
lat     0
lon     0
dtype: int64

In [98]:
df_pois.groupby('type').agg({'lat':['min','max'] , 'lon':['min','max'], 'type':['count']})
#looks like the data has higher coverage than just Atlanta and Metro Atlanta

Unnamed: 0_level_0,lat,lat,lon,lon,type
Unnamed: 0_level_1,min,max,min,max,count
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
crime_incident,32.151695,34.777687,-85.711189,-82.680045,50202
grocery_store,33.293267,34.261048,-84.88864,-83.886021,300
hospital,-14.290242,71.297725,-176.640263,145.724472,8013
marta_stop,33.432372,34.105822,-84.669803,-84.083455,9266
park,33.453098,33.976909,-84.552848,-84.153698,503
police_station,32.84539,34.557793,-85.287257,-83.596301,247
restaurant,33.761162,33.859863,-84.455464,-84.32903,20
school,-14.348924,71.300337,-176.640331,145.78443,102130


In [99]:
df_pois = keep_pois_within_bbox(df_pois, user_radius_km)
df_pois.head()

Filtered POIs from 170681 to 57626 within bbox


Unnamed: 0,type,name,lat,lon
0,police_station,DEKALB TECHNICAL COLLEGE POLICE,33.789938,-84.234406
1,police_station,DEKALB COUNTY SHERIFFS OFFICE / DEKALB COUNTY ...,33.775744,-84.244527
2,police_station,DEKALB COUNTY MARSHALS OFFICE,33.77407,-84.297214
3,police_station,GEORGIA BUREAU OF INVESTIGATION,33.692882,-84.272514
4,police_station,FULTON COUNTY MARSHALS OFFICE,33.750652,-84.391145


In [100]:
df_pois.groupby('type').agg({'lat':['min','max'] , 'lon':['min','max'], 'type':['count']})

Unnamed: 0_level_0,lat,lat,lon,lon,type
Unnamed: 0_level_1,min,max,min,max,count
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
crime_incident,33.624855,33.878633,-84.543773,-84.252012,50129
grocery_store,33.620239,33.876639,-84.531024,-84.239643,62
hospital,33.680272,33.857813,-84.513144,-84.248184,25
marta_stop,33.619354,33.878604,-84.543062,-84.23217,6642
park,33.654428,33.876902,-84.538924,-84.284425,497
police_station,33.620332,33.849226,-84.540764,-84.234406,55
restaurant,33.761162,33.859863,-84.455464,-84.32903,20
school,33.6219,33.8758,-84.5389,-84.2329,196


In [101]:
df_pois['type'].unique()

array(['police_station', 'grocery_store', 'hospital', 'marta_stop',
       'school', 'restaurant', 'crime_incident', 'park'], dtype=object)

# Data Prep - Data Points

In [107]:
# Build the hexagon grid that every later step scores.
# Two helpers exist: create_hex_grids_with_boundaries(df_pois) or the radius-based
# one used here. The log below shows the radius variant generating a grid and then
# keeping only the hexagons within the radius of `center`.
# NOTE(review): size_of_grid = 8 is presumably the H3 resolution — confirm in the helper.
hexagons = create_hex_grids_with_radius(df_pois, radius_km=user_radius_km, center = center, size_of_grid = 8)



Using circular boundary: center (33.7490, -84.3880), radius 12.0 km
Generated 890 hexagons (before filtering)
Filtered to 536 hexagons within 12.0 km of center


# Data Prep - Features

In [15]:
## define config for each POI type
# Sample config: the dictionary below represents the default settings for each POI type. For custom settings, modify it and pass it to the calculate_accessibility_scores function.

# poi_types_config = {
#     'restaurant': {'types': ['restaurant'], 'decay_rate': 1.5, 'max_distance_km': 10, 'invert': False},
#     'grocery_store': {'types': ['grocery_store'], 'decay_rate': 2, 'max_distance_km': 8, 'invert': False},
#     'school': {'types': ['school'], 'decay_rate': 1, 'max_distance_km': 15, 'invert': False},
#     'hospital': {'types': ['hospital'], 'decay_rate': 0.8, 'max_distance_km': 20, 'invert': False},
#     'marta_stop': {'types': ['marta_stop'], 'decay_rate': 0.5, 'max_distance_km': 5, 'invert': False},
#     'police_station': {'types': ['police_station'], 'decay_rate': 0.5, 'max_distance_km': 10, 'invert': False},
#     'park': {'types': ['park'], 'decay_rate': 1.0, 'max_distance_km': 5, 'invert': False},
#     'crime_incident': {'types': ['crime_incident'], 'decay_rate': 2.0, 'max_distance_km': 3, 'invert': True},
# }

In [117]:
# Scores every hexagon against the POI table. This is slow: the log below reports
# progress in steps of 25 hexagons.
# user_has_vehicle is forwarded as the third argument; how the helper uses it is
# defined outside this notebook — NOTE(review): confirm.
df_hexagons = calculate_accessibility_scores(hexagons, df_pois, user_has_vehicle)

Calculating accessibility scores for each hexagon...
  Processing hexagon 0/536...
  Processing hexagon 25/536...
  Processing hexagon 50/536...
  Processing hexagon 75/536...
  Processing hexagon 100/536...
  Processing hexagon 125/536...
  Processing hexagon 150/536...
  Processing hexagon 175/536...
  Processing hexagon 200/536...
  Processing hexagon 225/536...
  Processing hexagon 250/536...
  Processing hexagon 275/536...
  Processing hexagon 300/536...
  Processing hexagon 325/536...
  Processing hexagon 350/536...
  Processing hexagon 375/536...
  Processing hexagon 400/536...
  Processing hexagon 425/536...
  Processing hexagon 450/536...
  Processing hexagon 475/536...
  Processing hexagon 500/536...
  Processing hexagon 525/536...

Calculated accessibility scores for 536 hexagons

Accessibility Score Statistics:
       restaurant_accessibility  grocery_store_accessibility  \
count                536.000000                   536.000000   
mean                   0.419494      

In [118]:
# Attach rent data to the accessibility table so hexagons can later be filtered by budget.
# The helpers are defined outside this notebook; the per-step notes below are read off
# their names and arguments — NOTE(review): confirm against their implementations.
df_budget = load_budget_data("Rent_atlanta")  # rent dataset identified as "Rent_atlanta"
df_budget_hex = convert_rent_data_to_h3(df_budget)  # presumably indexes the rent records by H3 cell
df_out = get_nearest_rent(df_budget_hex, hexagons, K=1)  # presumably takes rent from the K=1 nearest rent cell per hexagon
# df_hexagons is both input and output here, so this cell is not idempotent:
# re-running it passes the already-merged frame back into the merge.
df_hexagons = merge_budget_with_accessibility(df_hexagons, df_out)



Merging budget data with 536 with accessibility data with 536
Merge successful: all accessibility hexagons have budget data; total entries: 536


In [119]:
df_hexagons = smooth_scores_spatially(df_hexagons, neighbor_weight=0.3)


Applying spatial smoothing to 8 score columns...
  Smoothing hexagon 0/536...
  Smoothing hexagon 50/536...
  Smoothing hexagon 100/536...
  Smoothing hexagon 150/536...
  Smoothing hexagon 200/536...
  Smoothing hexagon 250/536...
  Smoothing hexagon 300/536...
  Smoothing hexagon 350/536...
  Smoothing hexagon 400/536...
  Smoothing hexagon 450/536...
  Smoothing hexagon 500/536...
Spatial smoothing complete


In [19]:
df_hexagons = filter_hexagons_by_budget(df_hexagons, max_budget=budget)


Before filtering: 536 hexagons
After filtering: 313 hexagons


In [84]:
# Score each hexagon with the weights from the Inputs cell; the log below shows the
# weights being normalised and then statistics for `user_match_score`.
# NOTE(review): the saved output below lists crime_incident: 0.85 while the Inputs cell
# sets 0.8, so that output comes from an earlier run — re-execute before relying on it.
df_hexagons = apply_user_weights(df_hexagons, user_weights)

# Plain-text preview of the first rows.
print(df_hexagons.head())

Applying user preferences: {'restaurant': 0.49, 'grocery_store': 0.86, 'school': 0.72, 'hospital': 0.7, 'marta_stop': 0.41, 'police_station': 0.79, 'park': 0.75, 'crime_incident': 0.85}
Normalized weights (exponential): {'restaurant': 0.14337991512507273, 'grocery_store': 0.11094486871232079, 'school': 0.1222507184143652, 'hospital': 0.12395727484738708, 'marta_stop': 0.1515551569413614, 'police_station': 0.11646067964969396, 'park': 0.11973483500961317, 'crime_incident': 0.11171655130018544}

User Match Score Statistics:
count    313.000000
mean       0.399157
std        0.149016
min        0.141104
25%        0.286525
50%        0.381022
75%        0.496429
max        0.794623
Name: user_match_score, dtype: float64
            hex_id        lat        lon  restaurant_accessibility  \
2  8844c1a865fffff  33.715965 -84.342377                  0.438628   
3  8844c1aa35fffff  33.705560 -84.390105                  0.413945   
4  8844c1a1abfffff  33.749898 -84.486790                  0.063

In [85]:
print(df_hexagons['user_match_score'].describe())

count    313.000000
mean       0.399157
std        0.149016
min        0.141104
25%        0.286525
50%        0.381022
75%        0.496429
max        0.794623
Name: user_match_score, dtype: float64


In [86]:
# Columns shown for both ends of the ranking
preview_cols = [
    'hex_id',
    'user_match_score',
    'restaurant_accessibility',
    'grocery_store_accessibility',
]

# Five best and five worst hexagons by match score
top_5 = df_hexagons.nlargest(5, 'user_match_score')
bottom_5 = df_hexagons.nsmallest(5, 'user_match_score')

# Best matches are printed first, then the worst
for ranked in (top_5, bottom_5):
    print(ranked[preview_cols])

              hex_id  user_match_score  restaurant_accessibility  \
481  8844c1a8e9fffff          0.794623                  2.384242   
404  8844c1a817fffff          0.774579                  1.783589   
486  8844c1a8c7fffff          0.739854                  2.127873   
238  8844c1a8c5fffff          0.738196                  1.913084   
535  8844c1a9cbfffff          0.733761                  2.070169   

     grocery_store_accessibility  
481                     1.066123  
404                     0.803686  
486                     0.957885  
238                     0.834557  
535                     1.227367  
              hex_id  user_match_score  restaurant_accessibility  \
86   8844c1ab61fffff          0.141104                  0.000000   
435  8844c1ab45fffff          0.145877                  0.000000   
192  8844c1ab67fffff          0.151111                  0.002145   
115  8844c1ab6bfffff          0.155334                  0.001647   
51   8844c1ab49fffff          0.155364   

# Percentile Based Clustering

In [23]:
# df_threshold = cluster_based_on_score(df_hexagons)
# df_threshold.head()


df_classified = cluster_based_on_score(
    df_hexagons,
    n_tiers=10
)

Classifying into 10 tiers:
  Tier 0 threshold (top 90.0%): 0.450
  Tier 1 threshold (top 80.0%): 0.355
  Tier 2 threshold (top 70.0%): 0.285
  Tier 3 threshold (top 60.0%): 0.248
  Tier 4 threshold (top 50.0%): 0.214
  Tier 5 threshold (top 40.0%): 0.189
  Tier 6 threshold (top 30.0%): 0.155
  Tier 7 threshold (top 20.0%): 0.111
  Tier 8 threshold (top 10.0%): 0.064

Suitability Distribution:
suitability_label
Less Suitable    31
Most Suitable    32
Okay             31
Name: count, dtype: int64

SUITABILITY TIER CHARACTERISTICS

Most Suitable (32 hexagons):
  Match Score Range: 0.450 - 0.830
  Avg restaurant: 1.408
  Avg grocery_store: 0.732
  Avg school: 18.166

Okay (31 hexagons):
  Match Score Range: 0.357 - 0.450
  Avg restaurant: 0.617
  Avg grocery_store: 0.397
  Avg school: 18.234

Less Suitable (31 hexagons):
  Match Score Range: 0.286 - 0.353
  Avg restaurant: 0.420
  Avg grocery_store: 0.318
  Avg school: 16.801


In [28]:
# not required, will be handled by the UI

threshold_map_name = "data/output_data/atlanta_threshold_map_v2.html"
map_threshold = create_suitability_map(df_classified, user_weights)
map_threshold.save(threshold_map_name)

Adding hexagons to map...
  Added 50/313 hexagons...
  Added 100/313 hexagons...
  Added 200/313 hexagons...
  Added 250/313 hexagons...
  Added 350/313 hexagons...
  Added 400/313 hexagons...
  Added 450/313 hexagons...
  Added 500/313 hexagons...


In [29]:
save_csv(df_classified, "combined_data_outputs")

Saved CSV: data/output_data/combined_data_outputs.csv


In [30]:
df_classified_json = convert_json(df_classified)

In [31]:
save_json(df_classified_json, "combined_data_outputs_json")

In [34]:
df_classified.head().columns.tolist()

['hex_id',
 'lat',
 'lon',
 'restaurant_accessibility',
 'grocery_store_accessibility',
 'school_accessibility',
 'hospital_accessibility',
 'marta_stop_accessibility',
 'police_station_accessibility',
 'park_accessibility',
 'crime_incident_accessibility',
 'avg_rent',
 'police_station_norm',
 'grocery_store_norm',
 'hospital_norm',
 'marta_stop_norm',
 'restaurant_norm',
 'park_norm',
 'user_match_score',
 'suitability',
 'suitability_label']

# Experiment 2: DBSCAN Clustering

In [14]:
df_dbscan = dbscan_score_clustering(df_hexagons, eps=0.1, min_samples=3)


DBSCAN CLUSTERING (Score-Based)
Parameters: eps=0.1, min_samples=3
Score range after scaling: [0.000, 1.000]

Results:
  Clusters found: 1
  Noise points: 0
  Cluster 0: 536 hexagons, avg score = 0.161


In [15]:
cluster_colors = get_cluster_colors(df_dbscan)

In [16]:
dbscan_map_name = "data/output_data/atlanta_dbscan_map.html"
map_dbscan = create_dbscan_map(df_dbscan, user_weights, cluster_colors=cluster_colors, use_heatmap=True, heatmap_radius=15)
map_dbscan.save(dbscan_map_name)

Adding hexagons to map...
  Added 0/536 hexagons...
  Added 50/536 hexagons...
  Added 100/536 hexagons...
  Added 150/536 hexagons...
  Added 200/536 hexagons...
  Added 250/536 hexagons...
  Added 300/536 hexagons...
  Added 350/536 hexagons...
  Added 400/536 hexagons...
  Added 450/536 hexagons...
  Added 500/536 hexagons...
Adding heatmap overlay for smooth visualization...


# Experiment 3 - Spatial DBSCAN

In [17]:
df_dbscan_spatial = dbscan_spatial_clustering(
    df_hexagons,
    eps=0.2,
    min_samples=3,
    spatial_weight=0.4
)


DBSCAN CLUSTERING (Spatially-Aware)
Parameters: eps=0.2, min_samples=3, spatial_weight=0.4

Results:
  Clusters found: 6
  Noise points: 1
  Noise/Uncertain: 1 hexagons
  Region 0: 502 hexagons, avg score = 0.127, extent = 33.1 km
  Region 1: 5 hexagons, avg score = 0.783, extent = 3.7 km
  Region 2: 15 hexagons, avg score = 0.590, extent = 7.3 km
  Region 3: 5 hexagons, avg score = 0.699, extent = 4.9 km
  Region 4: 5 hexagons, avg score = 0.899, extent = 2.6 km
  Region 5: 3 hexagons, avg score = 0.521, extent = 3.4 km


In [18]:
cluster_colors_spatial = get_cluster_colors(df_dbscan_spatial)

In [19]:
map_dbscan_spatial = create_dbscan_map(
    df_dbscan_spatial,
    user_weights,
    cluster_colors=cluster_colors_spatial,
    use_heatmap=True,
    heatmap_radius=15
)

spatial_dbscan_map_name = "data/output_data/atlanta_spatial_dbscan_map.html"
map_dbscan_spatial.save(spatial_dbscan_map_name)

Adding hexagons to map...
  Added 0/536 hexagons...
  Added 50/536 hexagons...
  Added 100/536 hexagons...
  Added 150/536 hexagons...
  Added 200/536 hexagons...
  Added 250/536 hexagons...
  Added 300/536 hexagons...
  Added 350/536 hexagons...
  Added 400/536 hexagons...
  Added 450/536 hexagons...
  Added 500/536 hexagons...
Adding heatmap overlay for smooth visualization...


# Collaborative Filtering

## Create Baseline Data

In [115]:
user_profiles = [
    {
        "name": "young_family_with_toddler",
        "restaurant": 0.6,
        "grocery_store": 0.9,
        "school": 0.95,
        "hospital": 0.8,
        "marta_stop": 0,
        "police_station": 0.7,
        "park": 0.95,
        "crime_incident": 0.85
    },
    {
        "name": "university_student",
        "restaurant": 0.8,
        "grocery_store": 0.6,
        "school": 0,
        "hospital": 0.3,
        "marta_stop": 0.95,
        "police_station": 0.5,
        "park": 0.4,
        "crime_incident": 0.6
    },
    {
        "name": "young_professional_no_car",
        "restaurant": 0.9,
        "grocery_store": 0.7,
        "school": 0.1,
        "hospital": 0.5,
        "marta_stop": 0.95,
        "police_station": 0.6,
        "park": 0.5,
        "crime_incident": 0.7
    },
    {
        "name": "retiree_couple",
        "restaurant": 0.5,
        "grocery_store": 0.8,
        "school": 0,
        "hospital": 0.95,
        "marta_stop": 0,
        "police_station": 0.8,
        "park": 0.9,
        "crime_incident": 0.9
    },
    {
        "name": "remote_worker_homebody",
        "restaurant": 0.4,
        "grocery_store": 0.9,
        "school": 0,
        "hospital": 0.6,
        "marta_stop": 0,
        "police_station": 0.7,
        "park": 0,
        "crime_incident": 0.85
    },
    {
        "name": "foodie_socialite",
        "restaurant": 0.95,
        "grocery_store": 0,
        "school": 0,
        "hospital": 0.4,
        "marta_stop": 0,
        "police_station": 0.5,
        "park": 0,
        "crime_incident": 0.5
    },
    {
        "name": "health_conscious_athlete",
        "restaurant": 0.6,
        "grocery_store": 0.8,
        "school": 0.1,
        "hospital": 0.7,
        "marta_stop": 0.4,
        "police_station": 0.6,
        "park": 0.95,
        "crime_incident": 0.7
    },
    {
        "name": "family_with_teenagers",
        "restaurant": 0.7,
        "grocery_store": 0.85,
        "school": 0.9,
        "hospital": 0.7,
        "marta_stop": 0.6,
        "police_station": 0.8,
        "park": 0.7,
        "crime_incident": 0.85
    },
    {
        "name": "elderly_living_alone",
        "restaurant": 0.3,
        "grocery_store": 0.95,
        "school": 0.1,
        "hospital": 0.95,
        "marta_stop": 0.5,
        "police_station": 0.9,
        "park": 0.6,
        "crime_incident": 0.95
    },
    {
        "name": "graduate_student_researcher",
        "restaurant": 0.6,
        "grocery_store": 0.7,
        "school": 0,
        "hospital": 0.4,
        "marta_stop": 0.9,
        "police_station": 0.5,
        "park": 0.5,
        "crime_incident": 0.65
    },
    {
        "name": "young_couple_no_kids",
        "restaurant": 0.85,
        "grocery_store": 0.7,
        "school": 0.1,
        "hospital": 0.5,
        "marta_stop": 0.8,
        "police_station": 0.6,
        "park": 0.7,
        "crime_incident": 0.7
    },
    {
        "name": "single_parent_two_kids",
        "restaurant": 0.5,
        "grocery_store": 0.9,
        "school": 0.95,
        "hospital": 0.85,
        "marta_stop": 0.7,
        "police_station": 0.85,
        "park": 0.8,
        "crime_incident": 0.9
    },
    {
        "name": "safety_focused_conservative",
        "restaurant": 0.4,
        "grocery_store": 0.8,
        "school": 0.6,
        "hospital": 0.8,
        "marta_stop": 0.3,
        "police_station": 0.95,
        "park": 0.6,
        "crime_incident": 0.95
    },
    {
        "name": "eco_warrior_cyclist",
        "restaurant": 0.7,
        "grocery_store": 0.8,
        "school": 0.3,
        "hospital": 0.5,
        "marta_stop": 0.9,
        "police_station": 0.5,
        "park": 0.95,
        "crime_incident": 0.6
    },
    {
        "name": "busy_executive",
        "restaurant": 0.8,
        "grocery_store": 0.6,
        "school": 0.2,
        "hospital": 0.7,
        "marta_stop": 0.85,
        "police_station": 0.6,
        "park": 0.4,
        "crime_incident": 0.7
    },
    {
        "name": "artist_creative_type",
        "restaurant": 0.75,
        "grocery_store": 0.6,
        "school": 0.2,
        "hospital": 0.4,
        "marta_stop": 0.8,
        "police_station": 0.4,
        "park": 0.8,
        "crime_incident": 0.5
    },
    {
        "name": "medical_professional",
        "restaurant": 0.6,
        "grocery_store": 0.7,
        "school": 0.5,
        "hospital": 0.95,
        "marta_stop": 0.6,
        "police_station": 0.7,
        "park": 0.5,
        "crime_incident": 0.75
    },
    {
        "name": "teacher_educator",
        "restaurant": 0.5,
        "grocery_store": 0.8,
        "school": 0.9,
        "hospital": 0.6,
        "marta_stop": 0.7,
        "police_station": 0.7,
        "park": 0.75,
        "crime_incident": 0.8
    },
    {
        "name": "budget_conscious_minimalist",
        "restaurant": 0.3,
        "grocery_store": 0.95,
        "school": 0.2,
        "hospital": 0.6,
        "marta_stop": 0.85,
        "police_station": 0.6,
        "park": 0.5,
        "crime_incident": 0.7
    },
    {
        "name": "suburban_transplant",
        "restaurant": 0.5,
        "grocery_store": 0.85,
        "school": 0.7,
        "hospital": 0.7,
        "marta_stop": 0.4,
        "police_station": 0.8,
        "park": 0.8,
        "crime_incident": 0.85
    }
]
df_user_profiles = pd.DataFrame(user_profiles)

In [None]:
# Build the baseline for collaborative filtering: run the scoring pipeline once per
# synthetic persona in df_user_profiles and keep every persona's classified hexagons.
#
# The accessibility and smoothing steps receive no persona-specific argument, so they
# are computed once up front instead of once per persona.
# NOTE(review): unlike the main pipeline above, no rent merge or budget filter is
# applied here — confirm that is intended.
df_hexagons_base = smooth_scores_spatially(
    calculate_accessibility_scores(hexagons, df_pois, user_has_vehicle),
    neighbor_weight=0.3,
)

# persona name -> classified hexagon table for that persona
classified_by_profile = {}

for _, profile_row in df_user_profiles.iterrows():
    profile_name = profile_row['name']
    # Everything except the name is a preference weight.
    profile_weights = profile_row.drop('name').to_dict()

    # Work on a copy so one persona's derived columns can never leak into the next,
    # whether or not the helpers modify their input in place.
    df_hexagons_fake = apply_user_weights(df_hexagons_base.copy(), profile_weights)
    df_classified_fake = cluster_based_on_score(df_hexagons_fake, n_tiers=10)
    classified_by_profile[profile_name] = df_classified_fake

# After the loop df_hexagons_fake / df_classified_fake hold the LAST persona's result
# (the next cell displays df_classified_fake); all personas are in classified_by_profile.



>> Using cached accessibility scores

Applying spatial smoothing to 8 score columns...
  Smoothing hexagon 0/536...
  Smoothing hexagon 50/536...
  Smoothing hexagon 100/536...
  Smoothing hexagon 150/536...
  Smoothing hexagon 200/536...
  Smoothing hexagon 250/536...
  Smoothing hexagon 300/536...
  Smoothing hexagon 350/536...
  Smoothing hexagon 400/536...
  Smoothing hexagon 450/536...
  Smoothing hexagon 500/536...
Spatial smoothing complete
Applying user preferences: {'restaurant': 0.6, 'grocery_store': 0.9, 'school': 0.95, 'hospital': 0.8, 'marta_stop': 0.0, 'police_station': 0.7, 'park': 0.95, 'crime_incident': 0.85}
Normalized weights (exponential): {'restaurant': 0.16595029899213465, 'grocery_store': 0.1347935280323951, 'school': 0.13020196563044004, 'hospital': 0.144468126266763, 'police_station': 0.15483710391505903, 'park': 0.13020196563044004, 'crime_incident': 0.13954701153276813}

User Match Score Statistics:
count    536.000000
mean       0.403130
std        0.109760


In [122]:
df_classified_fake

Unnamed: 0,hex_id,lat,lon,restaurant_accessibility,grocery_store_accessibility,school_accessibility,hospital_accessibility,marta_stop_accessibility,police_station_accessibility,park_accessibility,...,restaurant_norm,grocery_store_norm,school_norm,hospital_norm,police_station_norm,park_norm,crime_incident_norm,user_match_score,suitability,suitability_label
0,8844c1a829fffff,33.731158,-84.350962,0.305215,0.263756,13.772425,4.664697,180.461520,13.358962,9.480341,...,0.092067,0.165329,0.868521,0.471022,0.632925,0.292035,0.848846,0.473172,2,Less Suitable
1,8844c1322bfffff,33.839478,-84.349656,0.902156,0.167882,7.798623,5.633020,136.625944,4.269603,6.135270,...,0.272131,0.105233,0.278610,0.647234,0.110458,0.188084,0.829491,0.346470,6,
2,8844c1a865fffff,33.715965,-84.342377,0.015559,0.221574,12.979695,3.961777,151.779299,11.771927,4.210946,...,0.004693,0.138888,0.790239,0.343106,0.541701,0.128284,0.947552,0.404765,4,
3,8844c1aa35fffff,33.705560,-84.390105,0.000000,0.000000,14.210065,3.699478,216.079881,14.325937,12.761633,...,0.000000,0.000000,0.911738,0.295374,0.688508,0.394004,0.824272,0.434314,3,
4,8844c1a1abfffff,33.749898,-84.486790,0.000000,0.000000,10.157395,2.630510,189.300217,5.088223,9.665883,...,0.000000,0.000000,0.511538,0.100847,0.157513,0.297801,0.807604,0.257034,9,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
531,8844c1ab3bfffff,33.701572,-84.343178,0.000000,0.007742,11.892824,3.675520,112.192703,10.977200,3.377375,...,0.000000,0.004853,0.682911,0.291014,0.496019,0.102380,0.969560,0.357044,6,
532,8844c1ab17fffff,33.702371,-84.352561,0.000000,0.014819,12.854577,3.723136,132.814751,11.388887,5.858156,...,0.000000,0.009289,0.777884,0.299679,0.519683,0.179472,0.941253,0.381011,5,
533,8844c1ad29fffff,33.837507,-84.411148,0.336205,0.028501,6.477364,5.152813,36.250369,6.258129,9.352454,...,0.101415,0.017865,0.148136,0.559847,0.224761,0.288060,0.975443,0.327833,7,
534,8844c1aa31fffff,33.706356,-84.399494,0.000000,0.006174,13.773973,3.659971,235.995140,14.523476,12.008309,...,0.000000,0.003870,0.868674,0.288185,0.699863,0.370593,0.811608,0.425133,3,


In [87]:
from sklearn.neighbors import NearestNeighbors

# Minimum similarity to the closest persona for taking the collaborative-filtering path.
SIMILARITY_THRESHOLD = 0.85

# Every column except the persona name is a preference weight.
feature_cols = [col for col in df_user_profiles.columns if col != 'name']
samples = df_user_profiles[feature_cols].values

# Fail early, naming the gap, if the user's weights do not cover every persona feature.
missing_features = [col for col in feature_cols if col not in user_weights]
if missing_features:
    raise KeyError(f"user_weights is missing preference(s): {missing_features}")

neigh = NearestNeighbors(n_neighbors=1)
neigh.fit(samples)

# Order the user's weights exactly like the persona columns before querying.
fake_user_array = [user_weights[col] for col in feature_cols]
distances, indices = neigh.kneighbors([fake_user_array])

# Keep WHICH persona matched, not only how close it was; kneighbors returns
# positional indices into `samples`, hence iloc.
nearest_profile_name = df_user_profiles.iloc[indices[0][0]]['name']

# NOTE(review): the distance is Euclidean (NearestNeighbors default) over
# len(feature_cols) weights, so `1 - distance` is not bounded below by 0. With
# weights in [0, 1] and 8 features the distance can approach sqrt(8) ~= 2.83.
# The formula is left as is so the decision does not change; consider a
# normalised or cosine similarity.
similarity = 1 - distances[0][0]
print(similarity)
if similarity >= SIMILARITY_THRESHOLD:
    print("go for collaborative filtering")
    print(f"nearest profile: {nearest_profile_name}")
else:
    # No persona is close enough: classify from the user's own scores instead.
    df_classified = cluster_based_on_score(df_hexagons, n_tiers=10)



0.9425543735346197
go for collaborative filtering


In [88]:
df_user_profiles

Unnamed: 0,name,restaurant,grocery_store,school,hospital,marta_stop,police_station,park,crime_incident
0,young_family_with_toddler,0.6,0.9,0.95,0.8,0.0,0.7,0.95,0.85
1,university_student,0.8,0.6,0.0,0.3,0.95,0.5,0.4,0.6
2,young_professional_no_car,0.9,0.7,0.1,0.5,0.95,0.6,0.5,0.7
3,retiree_couple,0.5,0.8,0.0,0.95,0.0,0.8,0.9,0.9
4,remote_worker_homebody,0.4,0.9,0.0,0.6,0.0,0.7,0.0,0.85
5,foodie_socialite,0.95,0.0,0.0,0.4,0.0,0.5,0.0,0.5
6,health_conscious_athlete,0.6,0.8,0.1,0.7,0.4,0.6,0.95,0.7
7,family_with_teenagers,0.7,0.85,0.9,0.7,0.6,0.8,0.7,0.85
8,elderly_living_alone,0.3,0.95,0.1,0.95,0.5,0.9,0.6,0.95
9,graduate_student_researcher,0.6,0.7,0.0,0.4,0.9,0.5,0.5,0.65


Unnamed: 0,hex_id,lat,lon,restaurant_accessibility,grocery_store_accessibility,school_accessibility,hospital_accessibility,marta_stop_accessibility,police_station_accessibility,park_accessibility,crime_incident_accessibility,avg_rent,police_station_norm,grocery_store_norm,hospital_norm,marta_stop_norm,restaurant_norm,park_norm,user_match_score
2,8844c1a865fffff,33.715965,-84.342377,0.438628,0.377914,16.941963,3.551997,367.612584,11.771927,11.459812,-127.309229,1033.734472,0.54,0.198047,0.446968,0.439361,0.18397,0.276774,0.275874
3,8844c1aa35fffff,33.70556,-84.390105,0.413945,0.148154,18.247054,3.232534,492.821023,14.036823,22.435637,-426.491125,898.027613,0.670672,0.063914,0.376514,0.644751,0.173617,0.556719,0.276203
4,8844c1a1abfffff,33.749898,-84.48679,0.063626,0.153459,13.623664,2.332016,406.328485,5.088223,16.664738,-466.941507,769.55152,0.154386,0.067011,0.177916,0.50287,0.026686,0.409529,0.19924
5,8844c1ab2dfffff,33.677182,-84.311553,0.008696,0.04559,11.58709,2.609583,147.656074,2.412307,0.949278,-0.029979,941.854419,0.0,0.004038,0.23913,0.078548,0.003647,0.008696,0.030607
7,8844c1aabdfffff,33.751939,-84.425297,0.812334,0.366644,18.678869,3.729351,640.696173,14.223571,32.562874,-823.409142,879.421965,0.681446,0.191467,0.486081,0.887323,0.340709,0.81502,0.443865
