### New idea


1) total headcount across all sclusters
2) parameter -> #regions
3) calculate average headcount per region
4) calculate min & max headcount per region based on threshold parameter values (eg 20% width, ie. 80%-120% of average headcount is acceptable)
5) assign clusters to region iteratively based on max distance b/w clsuter centers (within acceptance criteria)

In [1]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
import os
from scipy.spatial.distance import pdist, squareform
from matplotlib import pyplot as plt
import geopandas
from geodatasets import get_path
from shapely.geometry import Point

In [None]:
def calculate_distance_miles(coord1, coord2):
    """Calculate distance between two coordinates in miles"""
    return geodesic(coord1, coord2).miles


def safe_join(series):
    """Join series values handling different data types and null values"""
    if series is None or len(series) == 0:
        return ""
    return ', '.join(str(x) for x in pd.Series(series).dropna().unique() if str(x).strip() != '')


# Analyze regions an
def analyze_regions(df):
    """Generate summary statistics and staffing analysis for each campus."""
    cluster_stats = []
    cluster_ids = df['cluster_id'].unique()
    
    # Compute quartile thresholds for OPS_H from valid (non-outlier) clusters.
    campus_sizes = []
    for cid in cluster_ids:
        cluster_data = df[df['cluster_id'] == cid]
        #if not cluster_data['is_outlier'].all():
        #    campus_sizes.append(cluster_data['OPS_H'].sum())
    if campus_sizes:
        q1_size = np.percentile(campus_sizes, 25)
        q2_size = np.percentile(campus_sizes, 50)
        q3_size = np.percentile(campus_sizes, 75)
    else:
        q1_size, q2_size, q3_size = 15000, 30000, 50000

    for cid in cluster_ids:
        cluster_data = df[df['cluster_id'] == cid]
        center_lat = cluster_data['latitude'].mean()
        center_lon = cluster_data['longitude'].mean()
        
        max_distance = 0
        sites = cluster_data[['latitude', 'longitude']].values
        if len(sites) > 1:
            for i in range(len(sites)):
                for j in range(i+1, len(sites)):
                    d = calculate_distance_miles(sites[i], sites[j])
                    if d > max_distance:
                        max_distance = d
        
        cluster_aa_hc = cluster_data['OPS_H'].sum()
        #cluster_ops_hc = cluster_data['OPS_S'].sum()
        
        stats = {
            'cluster_id': cid,
            'num_sites': len(cluster_data),
            'size_category': size_category,
            'Cluster_AA_HC': cluster_aa_hc,
            'Cluster_OPS_HC': cluster_ops_hc,
            
            'center_latitude': center_lat,
            'center_longitude': center_lon,
            'max_distance_miles': round(max_distance, 2),
            
        }
        cluster_stats.append(stats)
    
    df_stats = pd.DataFrame(cluster_stats)
    
    return df_stats

In [3]:
### parameters

num_regions = 8
threshold = 0.2

In [4]:
## load cluster files

output_clustered = '../data_files/output/after_outlier/clustered_locations_300_miles_site_count_6_10_outlier_2.xlsx'
output_analysis = '../data_files/output/after_outlier/cluster_analysis_300_miles_site_count_6_10_outlier_2.xlsx'

df_clustered = pd.read_excel(output_clustered)
cluster_analysis = pd.read_excel(output_analysis)

In [5]:
### use cluster analysis file directly
### note this is orginal cluster center (before outliers added)
rel_cols = ['cluster_id','center_latitude','center_longitude','num_sites_outlier_1','Cluster_AA_HC_outlier_1']

rel_cluster_data =  cluster_analysis[rel_cols]
rel_cluster_data.head()

Unnamed: 0,cluster_id,center_latitude,center_longitude,num_sites_outlier_1,Cluster_AA_HC_outlier_1
0,0,32.210377,-110.970245,18,9642
1,200,28.41891,-81.217432,9,6879
2,136,29.886499,-95.632105,7,3882
3,46,32.595851,-116.94916,6,5502
4,4,30.287884,-81.785339,10,9285


In [None]:
### acceptance conditions

total_AA_HC = rel_cluster_data['Cluster_AA_HC_outlier_1'].sum()
avg_AA_HC_region = total_AA_HC/num_regions

min_AA_HC_region = (1 - threshold) * avg_AA_HC_region
max_AA_HC_region = (1 + threshold) * avg_AA_HC_region

print(total_AA_HC, avg_AA_HC_region, min_AA_HC_region, max_AA_HC_region)

806840 100855.0 80684.0 121026.0


In [None]:

##how many clusters have headcount < min_AA_HC_region 
print(len(rel_cluster_data[rel_cluster_data['Cluster_AA_HC_outlier_1'] < min_AA_HC_region]))
print(len(rel_cluster_data[rel_cluster_data['Cluster_AA_HC_outlier_1'] < min_AA_HC_region])/ len(rel_cluster_data))

rel_cluster_data[['num_sites_outlier_1','Cluster_AA_HC_outlier_1']].describe()

## 100% not met requirement 

111
1.0


Unnamed: 0,num_sites_outlier_1,Cluster_AA_HC_outlier_1
count,111.0,111.0
mean,10.072072,7268.828829
std,3.141255,3784.693337
min,6.0,1730.0
25%,8.0,4291.0
50%,9.0,6889.0
75%,11.0,9698.0
max,22.0,18288.0


In [111]:
df_adjusted = rel_cluster_data.copy()
#df_adjusted.loc[:,'merged'] = np.nan

print(df_adjusted['cluster_id'].nunique())

# Merge clusters inot regions based on headcount

clusters = df_adjusted['cluster_id'].unique()
cluster_centers = {cid: (df_adjusted[df_adjusted['cluster_id'] == cid]['center_latitude'].mean(),
                         df_adjusted[df_adjusted['cluster_id'] == cid]['center_longitude'].mean())
                    for cid in clusters}
for cid in clusters:
    cluster_data = df_adjusted[df_adjusted['cluster_id'] == cid]

    ##if already merged then skip
    #print(cluster_data['merged'].to_numpy()[0])
    #if ~cluster_data['merged'].isna().any():
    #    continue

    aa_hc = cluster_data['Cluster_AA_HC_outlier_1'].sum()
    if aa_hc < min_AA_HC_region:
    #if len(cluster_data) < min_sites:
        candidate = None
        min_dist = float('inf')
        for other_cid in clusters:
            if other_cid == cid:
                continue
            other_data = df_adjusted[df_adjusted['cluster_id'] == other_cid]
            #combined_sites = len(cluster_data) + len(other_data)
            combined_aa_hc = aa_hc + other_data['Cluster_AA_HC_outlier_1'].sum()
            if combined_aa_hc <= max_AA_HC_region:
                d = calculate_distance_miles(cluster_centers[cid], cluster_centers[other_cid])
                if d < min_dist:
                    min_dist = d
                    candidate = other_cid
        #print(candidate)
        if candidate is not None:
            df_adjusted.loc[df_adjusted['cluster_id'] == cid, 'cluster_id'] = candidate
            
            

            ## retain initial cluster id 
            #df_adjusted.loc[df_adjusted['cluster_id'] == cid, 'initial_cluster_id'] = cid
            
            ##cluster id based on candidate
            #df_adjusted.loc[df_adjusted['cluster_id'] == cid, 'cluster_id'] = candidate
            


print(df_adjusted['cluster_id'].nunique())


111
45


In [112]:
## mapping to retain old cluster id

org_clusters = rel_cluster_data[['cluster_id']]
org_clusters = org_clusters.rename(columns={'cluster_id':'initial_cluster_id'})

# Concatenate without ignoring index
df_adjusted_2 = pd.concat([df_adjusted, org_clusters], axis=1)

In [115]:
## groupby cluster_id 

cluster_adj_analysis = analyze_clusters(df_adjusted_2)

KeyError: 'is_outlier'

In [None]:
#### trying a single pass to debug


df_adjusted = rel_cluster_data.copy()
clusters = df_adjusted['cluster_id'].unique()
cluster_centers = {cid: (df_adjusted[df_adjusted['cluster_id'] == cid]['center_latitude'].to_numpy()[0],
                         df_adjusted[df_adjusted['cluster_id'] == cid]['center_longitude'].to_numpy()[0])
                    for cid in clusters}


cid = clusters[0]
print(cid)

cluster_data = df_adjusted[df_adjusted['cluster_id'] == cid]
aa_hc = cluster_data['Cluster_AA_HC_outlier_1'].sum()
#print(aa_hc)
#print(min_AA_HC_region)

if aa_hc < min_AA_HC_region:
    candidate = None
    min_dist = float('inf')
    for other_cid in clusters:
        if other_cid == cid:
            continue
        #print(other_cid)
        other_data = df_adjusted[df_adjusted['cluster_id'] == other_cid]
        combined_aa_hc = aa_hc + other_data['Cluster_AA_HC_outlier_1'].sum()
        #print(combined_aa_hc)

        if combined_aa_hc <= max_AA_HC_region:
            #print(cluster_centers[cid])
            #print(cluster_centers[other_cid])
            d = calculate_distance_miles(cluster_centers[cid], cluster_centers[other_cid])
            if d < min_dist:
                min_dist = d
                candidate = other_cid
    
    print(candidate)
    if candidate is not None:
        
        ##create merged cluster id based on candidate
        df_adjusted.loc[df_adjusted['cluster_id'] == cid, 'merged_cluster_id'] = candidate
        df_adjusted.loc[df_adjusted['cluster_id'] == cid, 'merged'] = True
        


0
105


In [None]:
df_adjusted[df_adjusted['initial_cluster_id']==]

Unnamed: 0,cluster_id,center_latitude,center_longitude,num_sites_outlier_1,Cluster_AA_HC_outlier_1,merged,initial_cluster_id


In [64]:
df_adjusted[df_adjusted['cluster_id']==105]


Unnamed: 0,cluster_id,center_latitude,center_longitude,num_sites_outlier_1,Cluster_AA_HC_outlier_1,merged,initial_cluster_id
