### Running step by step of clustering with commute distance (1st)  site count (2nd) and then headcount (3rd) constraints
#### initial cluster are created with proximity, we do NOT allow min and max site count to break it

#### Outliers are currently clustered into -1, and then NO further processing

In [1]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
import os
from sklearn.cluster import DBSCAN, KMeans
from scipy.spatial.distance import pdist, squareform
from matplotlib import pyplot as plt

In [2]:
# Constants (make changes here)
#DEFAULT_COMMUTE_RADIUS = 180  # miles (approx. 3-hour commute)
#DEFAULT_COMMUTE_RADIUS = 360
#DEFAULT_COMMUTE_RADIUS = 60 ## primary constraint
DEFAULT_COMMUTE_RADIUS= 300

MIN_SITES = 6
MAX_SITES = 10      ## secondary constraint

### headcount constraints (not enforced)
hc_min = 8000      ## to be used with 180 miles
#hc_min = 15000      ## to be used with 360 miles
hc_max = 50000

### Function Used

In [3]:
def load_locations(file_path):
    """Load location data from Excel file"""
    return pd.read_excel(file_path)

def calculate_distance_miles(coord1, coord2):
    """Calculate distance between two coordinates in miles"""
    return geodesic(coord1, coord2).miles

def safe_join(series):
    """Join series values handling different data types and null values"""
    if series is None or len(series) == 0:
        return ""
    return ', '.join(str(x) for x in pd.Series(series).dropna().unique() if str(x).strip() != '')


# Campus Size Category based on quartiles of OPS_H sum
def get_size_category(ops_h_sum, q1, q2, q3):
    """Classify campus size based on OPS_H sum quartiles"""
    if ops_h_sum < q1:
        return "Small"
    elif ops_h_sum < q2:
        return "Medium"
    elif ops_h_sum < q3:
        return "Large"
    else:
        return "X-Large"


# HR Staffing Levels (base rules)
def get_hr_staffing_levels(size_category):
    """
    Returns recommended HR staffing levels based on size category.
    Base staffing:
      X-Large: Campus Leader=1, HR_L4=16, HR_L5=6, HR_L6=5, HR_L7=2
      Large:   Campus Leader=1, HR_L4=11, HR_L5=5, HR_L6=3, HR_L7=1
      Medium:  Campus Leader=1, HR_L4=8,  HR_L5=4, HR_L6=2, HR_L7=0
      Small:   Campus Leader=1, HR_L4=5,  HR_L5=3, HR_L6=2, HR_L7=0
      Multi-Site: Campus Leader=1, HR_L4=1, HR_L5=1, HR_L6=0, HR_L7=0
    """
    staffing_matrix = {
        'Small': {
            'Campus_Leader': 1,
            'HR_L4': 5,
            'HR_L5': 3,
            'HR_L6': 2,
            'HR_L7': 0
        },
        'Medium': {
            'Campus_Leader': 1,
            'HR_L4': 8,
            'HR_L5': 4,
            'HR_L6': 2,
            'HR_L7': 0
        },
        'Large': {
            'Campus_Leader': 1,
            'HR_L4': 11,
            'HR_L5': 5,
            'HR_L6': 3,
            'HR_L7': 1
        },
        'X-Large': {
            'Campus_Leader': 1,
            'HR_L4': 16,
            'HR_L5': 6,
            'HR_L6': 5,
            'HR_L7': 2
        },
        'Multi-Site': {
            'Campus_Leader': 1,
            'HR_L4': 1,
            'HR_L5': 1,
            'HR_L6': 0,
            'HR_L7': 0
        }
    }
    return staffing_matrix.get(size_category, {
        'Campus_Leader': 0,
        'HR_L4': 0,
        'HR_L5': 0,
        'HR_L6': 0,
        'HR_L7': 0
    })



# Analyze clusters and perform HR/staffing analysis.
def analyze_clusters(df):
    """Generate summary statistics and staffing analysis for each campus."""
    cluster_stats = []
    cluster_ids = df['cluster_id'].unique()
    
    # Compute quartile thresholds for OPS_H from valid (non-outlier) clusters.
    campus_sizes = []
    for cid in cluster_ids:
        cluster_data = df[df['cluster_id'] == cid]
        if not cluster_data['is_outlier'].all():
            campus_sizes.append(cluster_data['OPS_H'].sum())
    if campus_sizes:
        q1_size = np.percentile(campus_sizes, 25)
        q2_size = np.percentile(campus_sizes, 50)
        q3_size = np.percentile(campus_sizes, 75)
    else:
        q1_size, q2_size, q3_size = 15000, 30000, 50000

    for cid in cluster_ids:
        cluster_data = df[df['cluster_id'] == cid]
        center_lat = cluster_data['latitude'].mean()
        center_lon = cluster_data['longitude'].mean()
        
        max_distance = 0
        sites = cluster_data[['latitude', 'longitude']].values
        if len(sites) > 1:
            for i in range(len(sites)):
                for j in range(i+1, len(sites)):
                    d = calculate_distance_miles(sites[i], sites[j])
                    if d > max_distance:
                        max_distance = d
        
        cluster_aa_hc = cluster_data['OPS_H'].sum()
        cluster_ops_hc = cluster_data['OPS_S'].sum()
        
        # Current staffing summary (if such columns exist)
        current_staffing = {
            'Current_Campus_Leader': 1 if 'Campus_Leader' in cluster_data.columns else 0,
            'Current_HR_L4': cluster_data['HR_4'].sum() if 'HR_4' in cluster_data.columns else 0,
            'Current_HR_L5': cluster_data['HR_5'].sum() if 'HR_5' in cluster_data.columns else 0,
            'Current_HR_L6': cluster_data['HR_6'].sum() if 'HR_6' in cluster_data.columns else 0,
            'Current_HR_L7': cluster_data['HR_7'].sum() if 'HR_7' in cluster_data.columns else 0
        }
        current_hr_total = sum(current_staffing.values())
        current_gearing_ratio = (cluster_aa_hc / current_hr_total) if current_hr_total else None
        current_gearing_ratio_ops = (cluster_ops_hc / current_hr_total) if current_hr_total else None
        
        if cluster_data['multi_site'].any():
            size_category = "Multi-Site"
            new_hr_staffing = get_hr_staffing_levels("Multi-Site")
        else:
            size_category = get_size_category(cluster_aa_hc, q1_size, q2_size, q3_size)
            new_hr_staffing = get_hr_staffing_levels(size_category)
        
        new_hr_total = sum(new_hr_staffing.values())
        new_gearing_ratio = (cluster_aa_hc / new_hr_total) if new_hr_total else None
        new_gearing_ratio_ops = (cluster_ops_hc / new_hr_total) if new_hr_total else None
        
        stats = {
            'cluster_id': cid,
            'num_sites': len(cluster_data),
            'size_category': size_category,
            'Cluster_AA_HC': cluster_aa_hc,
            'Cluster_OPS_HC': cluster_ops_hc,
            'Current_Campus_Leader': current_staffing['Current_Campus_Leader'],
            'Current_HR_L4': current_staffing['Current_HR_L4'],
            'Current_HR_L5': current_staffing['Current_HR_L5'],
            'Current_HR_L6': current_staffing['Current_HR_L6'],
            'Current_HR_L7': current_staffing['Current_HR_L7'],
            'Current_HR_Total': current_hr_total,
            'Current_Gearing_Ratio_AA': current_gearing_ratio,
            'Current_Gearing_Ratio_OPS': current_gearing_ratio_ops,
            'New_Campus_Leader': new_hr_staffing['Campus_Leader'],
            'New_HR_L4': new_hr_staffing['HR_L4'],
            'New_HR_L5': new_hr_staffing['HR_L5'],
            'New_HR_L6': new_hr_staffing['HR_L6'],
            'New_HR_L7': new_hr_staffing['HR_L7'],
            'New_HR_Total': new_hr_total,
            'New_Gearing_Ratio_AA': new_gearing_ratio,
            'New_Gearing_Ratio_OPS': new_gearing_ratio_ops,
            'country': safe_join(cluster_data['country']),
            'markets': safe_join(cluster_data['market']),
            'cities': safe_join(cluster_data['city']),
            'states': safe_join(cluster_data['state']),
            'center_latitude': center_lat,
            'center_longitude': center_lon,
            'max_distance_miles': round(max_distance, 2),
            'num_reassigned_outliers': len(cluster_data[cluster_data['reassignment_type'].notnull()])
        }
        cluster_stats.append(stats)
    
    df_stats = pd.DataFrame(cluster_stats)
    # Round gearing ratios for presentation.
    for col in ['Current_Gearing_Ratio_AA', 'Current_Gearing_Ratio_OPS',
                'New_Gearing_Ratio_AA', 'New_Gearing_Ratio_OPS']:
        df_stats[col] = df_stats[col].apply(lambda x: round(x, 2) if pd.notnull(x) else x)
    
    return df_stats

### Main

In [5]:
### load file
df = pd.read_excel('../data_files/input/locations.xlsx')
print(df.shape)

## distance matrix b/w site locations

coords = df[['latitude', 'longitude']].values
dist_matrix = squareform(pdist(coords, metric=lambda u, v: calculate_distance_miles(u, v)))

print(dist_matrix.min(),dist_matrix.max(),dist_matrix.mean())
print(np.quantile(dist_matrix, 0.25), np.quantile(dist_matrix, 0.5), np.quantile(dist_matrix, 0.75))

(1118, 17)


In [6]:
### clustering 1st pass

df_clustered = df.copy()

#commute_radius = DEFAULT_COMMUTE_RADIUS/5    ## for 60
#commute_radius = DEFAULT_COMMUTE_RADIUS/4    ## for 180
#commute_radius = DEFAULT_COMMUTE_RADIUS/5     ## for 360
commute_radius = DEFAULT_COMMUTE_RADIUS/6     ## for 300

min_sites = MIN_SITES
max_sites = MAX_SITES

##### stats for 60 miles
### DEFAULT_COMMUTE_RADIUS/4 is when we get 3 cluster with max distance > commute radius; but outlier is 40% of sites
### DEFAULT_COMMUTE_RADIUS/5 is when we get 0 cluster with max distance > commute radius; but outliers is 51% of sites 

clustering = DBSCAN(eps=commute_radius, min_samples=min_sites, metric='precomputed')
cluster_labels = clustering.fit_predict(dist_matrix)
df_clustered['cluster_id'] = cluster_labels
df_clustered['initial_outlier'] = (cluster_labels == -1)
df_clustered['outlier_reassigned'] = False
df_clustered['is_outlier'] = (cluster_labels == -1)
df_clustered['multi_site'] = False
df_clustered['reassignment_type'] = None

print(df_clustered['cluster_id'].nunique())

### Analyse initial cluster
cluster_analysis = analyze_clusters(df_clustered)

36


In [7]:
### #outliers grouped into -1 cluster

print(cluster_analysis[(cluster_analysis['cluster_id'] == -1)]['num_sites'].sum())
print(cluster_analysis[(cluster_analysis['cluster_id'] == -1)]['num_sites'].sum() / cluster_analysis['num_sites'].sum())


215
0.19230769230769232


In [None]:
## except outliers, summary of remaining clusters
cluster_analysis_no_outlier = cluster_analysis[~(cluster_analysis['cluster_id'] == -1)]

##how many clusters have max distance > DEFAULT_COMMUTE_RADIUS
print(len(cluster_analysis_no_outlier[cluster_analysis_no_outlier['max_distance_miles'] > DEFAULT_COMMUTE_RADIUS]))
print(len(cluster_analysis_no_outlier[cluster_analysis_no_outlier['max_distance_miles'] > DEFAULT_COMMUTE_RADIUS])/ len(cluster_analysis_no_outlier))

cluster_analysis_no_outlier[['num_sites','Cluster_AA_HC','max_distance_miles']].describe()

1
0.02857142857142857


Unnamed: 0,num_sites,Cluster_AA_HC,max_distance_miles
count,35.0,35.0,35.0
mean,25.8,19605.371429,87.338571
std,38.896393,27566.381577,89.108605
min,6.0,3602.0,14.26
25%,11.0,7263.5,37.325
50%,15.0,12969.0,65.82
75%,23.5,20636.5,107.98
max,231.0,160341.0,525.03


In [9]:
### what are the clusters with max distance above DEFAULT_COMMUTE_RADIUS
cluster_analysis_no_outlier[cluster_analysis_no_outlier['max_distance_miles'] > DEFAULT_COMMUTE_RADIUS]

Unnamed: 0,cluster_id,num_sites,size_category,Cluster_AA_HC,Cluster_OPS_HC,Current_Campus_Leader,Current_HR_L4,Current_HR_L5,Current_HR_L6,Current_HR_L7,...,New_Gearing_Ratio_AA,New_Gearing_Ratio_OPS,country,markets,cities,states,center_latitude,center_longitude,max_distance_miles,num_reassigned_outliers
5,5,231,X-Large,160341,6369,0,222,203,113,37,...,5344.7,212.3,United States,"Virginia Beach-Norfolk-Newport News, VA-NC, Wa...","Virginia Beach, Forestville, West Deptford, Wi...",United States,40.264041,-74.875234,525.03,0


### Now adding site_count as secondary constraint

In [10]:
df_adjusted = df_clustered.copy()

# Split clusters that are too large (by site count)
clusters = df_adjusted[df_adjusted['cluster_id'] != -1]['cluster_id'].unique()
for cid in clusters:
    cluster_data = df_adjusted[df_adjusted['cluster_id'] == cid]
    if len(cluster_data) > max_sites:
        changed = True
        n_subclusters = int(np.ceil(len(cluster_data) / max_sites))
        coords = cluster_data[['latitude', 'longitude']].values
        kmeans = KMeans(n_clusters=n_subclusters, random_state=42)
        sub_labels = kmeans.fit_predict(coords)
        new_cluster_id = df_adjusted['cluster_id'].max() + 1
        for sub in np.unique(sub_labels):
            indices = cluster_data.index[sub_labels == sub]
            df_adjusted.loc[indices, 'cluster_id'] = new_cluster_id
            new_cluster_id += 1

print(df_adjusted['cluster_id'].nunique())

### Analyse adjusted clusters new constraint
cluster_adj_analysis = analyze_clusters(df_adjusted)

109


In [11]:
## except outliers, summary of remaining clusters
cluster_adj_analysis_no_outlier = cluster_adj_analysis[~(cluster_adj_analysis['cluster_id'] == -1)]

##how many clusters have num_sites > MAX_SITES
print(len(cluster_adj_analysis_no_outlier[cluster_adj_analysis_no_outlier['num_sites'] > MAX_SITES]))
print(len(cluster_adj_analysis_no_outlier[cluster_adj_analysis_no_outlier['num_sites'] > MAX_SITES])/ len(cluster_adj_analysis_no_outlier))

cluster_adj_analysis_no_outlier[['num_sites','Cluster_AA_HC','max_distance_miles']].describe()


### 32 cluster has > max site count

32
0.2962962962962963


Unnamed: 0,num_sites,Cluster_AA_HC,max_distance_miles
count,108.0,108.0,108.0
mean,8.361111,6353.592593,34.83713
std,4.508472,4265.292448,20.908976
min,1.0,120.0,0.0
25%,5.0,3572.75,20.9175
50%,7.0,5210.5,30.44
75%,11.0,9244.5,47.8875
max,25.0,25570.0,118.1


In [12]:
### repeating once more !!!

df_adjusted_2 = df_adjusted.copy()

# Split clusters that are too large (by site count)
clusters = df_adjusted_2[df_adjusted_2['cluster_id'] != -1]['cluster_id'].unique()
for cid in clusters:
    cluster_data = df_adjusted_2[df_adjusted_2['cluster_id'] == cid]
    if len(cluster_data) > max_sites:
        changed = True
        n_subclusters = int(np.ceil(len(cluster_data) / max_sites))
        coords = cluster_data[['latitude', 'longitude']].values
        kmeans = KMeans(n_clusters=n_subclusters, random_state=42)
        sub_labels = kmeans.fit_predict(coords)
        new_cluster_id = df_adjusted_2['cluster_id'].max() + 1
        for sub in np.unique(sub_labels):
            indices = cluster_data.index[sub_labels == sub]
            df_adjusted_2.loc[indices, 'cluster_id'] = new_cluster_id
            new_cluster_id += 1

print(df_adjusted_2['cluster_id'].nunique())

### Analyse adjusted clusters after ops headcount constraint
cluster_adj_analysis_2 = analyze_clusters(df_adjusted_2)

143


In [13]:
## except outliers, summary of remaining clusters
cluster_adj_analysis_2_no_outlier = cluster_adj_analysis_2[~(cluster_adj_analysis_2['cluster_id'] == -1)]

##how many clusters have num sites > max sites
print(len(cluster_adj_analysis_2_no_outlier[cluster_adj_analysis_2_no_outlier['num_sites'] > MAX_SITES]))
print(len(cluster_adj_analysis_2_no_outlier[cluster_adj_analysis_2_no_outlier['num_sites'] > MAX_SITES])/ len(cluster_adj_analysis_2_no_outlier))

cluster_adj_analysis_2_no_outlier[['num_sites','Cluster_AA_HC','max_distance_miles']].describe()


### 7 CLUSTERS has > max sites

7
0.04929577464788732


Unnamed: 0,num_sites,Cluster_AA_HC,max_distance_miles
count,142.0,142.0,142.0
mean,6.359155,4832.309859,25.055563
std,3.09046,3390.568883,18.608298
min,1.0,120.0,0.0
25%,4.0,2222.25,13.4925
50%,6.5,4092.0,22.13
75%,8.0,6908.25,32.915
max,19.0,15010.0,78.25


In [14]:
### repeating once more !!!

df_adjusted_3 = df_adjusted_2.copy()

# Split clusters that are too large (by site count)
clusters = df_adjusted_3[df_adjusted_3['cluster_id'] != -1]['cluster_id'].unique()
for cid in clusters:
    cluster_data = df_adjusted_3[df_adjusted_3['cluster_id'] == cid]
    if len(cluster_data) > max_sites:
        changed = True
        n_subclusters = int(np.ceil(len(cluster_data) / max_sites))
        coords = cluster_data[['latitude', 'longitude']].values
        kmeans = KMeans(n_clusters=n_subclusters, random_state=42)
        sub_labels = kmeans.fit_predict(coords)
        new_cluster_id = df_adjusted_3['cluster_id'].max() + 1
        for sub in np.unique(sub_labels):
            indices = cluster_data.index[sub_labels == sub]
            df_adjusted_3.loc[indices, 'cluster_id'] = new_cluster_id
            new_cluster_id += 1

print(df_adjusted_3['cluster_id'].nunique())

### Analyse adjusted clusters after ops headcount constraint
cluster_adj_analysis_3 = analyze_clusters(df_adjusted_3)

150


In [15]:
## except outliers, summary of remaining clusters
cluster_adj_analysis_3_no_outlier = cluster_adj_analysis_3[~(cluster_adj_analysis_3['cluster_id'] == -1)]

##how many clusters have num sites > max sites
print(len(cluster_adj_analysis_3_no_outlier[cluster_adj_analysis_3_no_outlier['num_sites'] > MAX_SITES]))
print(len(cluster_adj_analysis_3_no_outlier[cluster_adj_analysis_3_no_outlier['num_sites'] > MAX_SITES])/ len(cluster_adj_analysis_3_no_outlier))

cluster_adj_analysis_3_no_outlier[['num_sites','Cluster_AA_HC','max_distance_miles']].describe()


### 1 CLUSTERS has > max sites
##thats ok

1
0.006711409395973154


Unnamed: 0,num_sites,Cluster_AA_HC,max_distance_miles
count,149.0,149.0,149.0
mean,6.060403,4605.288591,23.998255
std,2.602567,3221.915467,18.052673
min,1.0,120.0,0.0
25%,4.0,1960.0,11.62
50%,6.0,3999.0,20.44
75%,8.0,6818.0,32.13
max,11.0,15010.0,74.77


#### further grouping sites below min site count

In [16]:
### for 60 miles
#print(len(cluster_adj_analysis_2_no_outlier[cluster_adj_analysis_2_no_outlier['num_sites'] < MIN_SITES]))

### for 300 miles
print(len(cluster_adj_analysis_3_no_outlier[cluster_adj_analysis_3_no_outlier['num_sites'] < MIN_SITES]))

59


In [17]:
df_adjusted_4= df_adjusted_3.copy()

# Merge clusters that are too small (by site count)

clusters = df_adjusted_4[df_adjusted_4['cluster_id'] != -1]['cluster_id'].unique()
cluster_centers = {cid: (df_adjusted_4[df_adjusted_4['cluster_id'] == cid]['latitude'].mean(),
                         df_adjusted_4[df_adjusted_4['cluster_id'] == cid]['longitude'].mean())
                    for cid in clusters}
for cid in clusters:
    cluster_data = df_adjusted_4[df_adjusted_4['cluster_id'] == cid]
    #aa_hc = cluster_data['OPS_H'].sum()
    if len(cluster_data) < min_sites:
        candidate = None
        min_dist = float('inf')
        for other_cid in clusters:
            if other_cid == cid:
                continue
            other_data = df_adjusted_4[df_adjusted_4['cluster_id'] == other_cid]
            combined_sites = len(cluster_data) + len(other_data)
            #combined_aa_hc = aa_hc + other_data['OPS_H'].sum()
            if combined_sites <= max_sites:
                d = calculate_distance_miles(cluster_centers[cid], cluster_centers[other_cid])
                if d < min_dist:
                    min_dist = d
                    candidate = other_cid
        if candidate is not None:
            df_adjusted_4.loc[df_adjusted_4['cluster_id'] == cid, 'cluster_id'] = candidate
            changed = True


print(df_adjusted_4['cluster_id'].nunique())

### Analyse adjusted clusters after ops headcount constraint
cluster_adj_analysis_4 = analyze_clusters(df_adjusted_4)

118


In [18]:
## except outliers, summary of remaining clusters
cluster_adj_analysis_4_no_outlier = cluster_adj_analysis_4[~(cluster_adj_analysis_4['cluster_id'] == -1)]

##how many clusters have num sites < min sites
print(len(cluster_adj_analysis_4_no_outlier[cluster_adj_analysis_4_no_outlier['num_sites'] < MIN_SITES]))
print(len(cluster_adj_analysis_4_no_outlier[cluster_adj_analysis_4_no_outlier['num_sites'] < MIN_SITES])/ len(cluster_adj_analysis_4_no_outlier))

cluster_adj_analysis_4_no_outlier[['num_sites','Cluster_AA_HC','max_distance_miles']].describe()


### 10 CLUSTERS has < min sites

10
0.08547008547008547


Unnamed: 0,num_sites,Cluster_AA_HC,max_distance_miles
count,117.0,117.0,117.0
mean,7.717949,5864.854701,37.41188
std,1.770672,3166.829983,28.92238
min,2.0,598.0,3.44
25%,7.0,3239.0,18.49
50%,8.0,5138.0,27.37
75%,9.0,8175.0,52.01
max,11.0,15010.0,192.15


In [19]:
### repeating once more !!!

df_adjusted_5= df_adjusted_4.copy()

# Merge clusters that are too small (by site count)

clusters = df_adjusted_5[df_adjusted_5['cluster_id'] != -1]['cluster_id'].unique()
cluster_centers = {cid: (df_adjusted_5[df_adjusted_5['cluster_id'] == cid]['latitude'].mean(),
                         df_adjusted_5[df_adjusted_5['cluster_id'] == cid]['longitude'].mean())
                    for cid in clusters}
for cid in clusters:
    cluster_data = df_adjusted_5[df_adjusted_5['cluster_id'] == cid]
    #aa_hc = cluster_data['OPS_H'].sum()
    if len(cluster_data) < min_sites:
        candidate = None
        min_dist = float('inf')
        for other_cid in clusters:
            if other_cid == cid:
                continue
            other_data = df_adjusted_5[df_adjusted_5['cluster_id'] == other_cid]
            combined_sites = len(cluster_data) + len(other_data)
            #combined_aa_hc = aa_hc + other_data['OPS_H'].sum()
            if combined_sites <= max_sites:
                d = calculate_distance_miles(cluster_centers[cid], cluster_centers[other_cid])
                if d < min_dist:
                    min_dist = d
                    candidate = other_cid
        if candidate is not None:
            df_adjusted_5.loc[df_adjusted_5['cluster_id'] == cid, 'cluster_id'] = candidate
            changed = True


print(df_adjusted_5['cluster_id'].nunique())

### Analyse adjusted clusters after ops headcount constraint
cluster_adj_analysis_5 = analyze_clusters(df_adjusted_5)

112


In [21]:
## except outliers, summary of remaining clusters
cluster_adj_analysis_5_no_outlier = cluster_adj_analysis_5[~(cluster_adj_analysis_5['cluster_id'] == -1)]

##how many clusters have num sites < min sites
print(len(cluster_adj_analysis_5_no_outlier[cluster_adj_analysis_5_no_outlier['num_sites'] < MIN_SITES]))
print(len(cluster_adj_analysis_5_no_outlier[cluster_adj_analysis_5_no_outlier['num_sites'] < MIN_SITES])/ len(cluster_adj_analysis_5_no_outlier))

cluster_adj_analysis_5_no_outlier[['num_sites','Cluster_AA_HC','max_distance_miles']].describe()


### No CLUSTERS has < min sites -> but this break proimity constraint

0
0.0


Unnamed: 0,num_sites,Cluster_AA_HC,max_distance_miles
count,111.0,111.0,111.0
mean,8.135135,6181.873874,51.603423
std,1.344933,3012.627857,67.57112
min,6.0,1730.0,3.44
25%,7.0,3670.0,18.645
50%,8.0,5548.0,31.17
75%,9.0,8284.5,58.825
max,11.0,15010.0,421.33


In [22]:
### what are the clusters with max distance above DEFAULT_COMMUTE_RADIUS
cluster_adj_analysis_5_no_outlier[cluster_adj_analysis_5_no_outlier['max_distance_miles'] > DEFAULT_COMMUTE_RADIUS]

Unnamed: 0,cluster_id,num_sites,size_category,Cluster_AA_HC,Cluster_OPS_HC,Current_Campus_Leader,Current_HR_L4,Current_HR_L5,Current_HR_L6,Current_HR_L7,...,New_Gearing_Ratio_AA,New_Gearing_Ratio_OPS,country,markets,cities,states,center_latitude,center_longitude,max_distance_miles,num_reassigned_outliers
22,12,10,Large,5958,263,0,8,11,4,1,...,283.71,12.52,United States,"Boise City, ID, Salt Lake City, UT, Provo-Orem...","Nampa, Boise, West Jordan, Meridian, American ...",United States,42.33075,-114.591921,331.7,0
44,69,8,Large,6123,230,0,6,8,4,1,...,291.57,10.95,United States,"Dayton-Kettering, OH, Wilmington, OH, Staunton...","Vandalia, Kettering, Wilmington, Fishersville,...",United States,39.232713,-82.147496,317.41,0
89,36,9,Medium,4079,213,0,7,8,3,3,...,271.93,14.2,United States,"Atlanta-Sandy Springs-Alpharetta, GA, Deltona-...","Atlanta, White, Daytona Beach, Deltona, Lithia...",United States,31.77523,-83.304336,421.33,0
110,178,9,Small,2353,171,0,5,6,2,0,...,213.91,15.55,United States,"St. Louis, MO-IL, Madison, WI","Edwardsville, Madison, Pontoon Beach, East Sai...",United States,40.661538,-89.739389,314.7,0


## DO NOT RUN

###  try 3rd constraint headcount

### no way to apply headcount constraint without breaking both of previous constraints - DO NOT RUN

In [None]:
df_adjusted_4 = df_adjusted_3.copy()

# Merge clusters that are too small (by OPS_H sum < hc_min)
clusters = df_adjusted_4[df_adjusted_4['cluster_id'] != -1]['cluster_id'].unique()
cluster_centers = {cid: (df_adjusted_4[df_adjusted_4['cluster_id'] == cid]['latitude'].mean(),
                         df_adjusted_4[df_adjusted_4['cluster_id'] == cid]['longitude'].mean())
                  for cid in clusters}
for cid in clusters:
    cluster_data = df_adjusted_4[df_adjusted_4['cluster_id'] == cid]
    aa_hc = cluster_data['OPS_H'].sum()
    if aa_hc < (hc_min):
        candidate = None
        min_dist = float('inf')
        for other_cid in clusters:
            if other_cid == cid:
                continue
            other_data = df_adjusted_4[df_adjusted_4['cluster_id'] == other_cid]
            combined_sites = len(cluster_data) + len(other_data)
            combined_aa_hc = aa_hc + other_data['OPS_H'].sum()
            ## to make sure it doesn't break max sites rule
            if combined_aa_hc <= hc_max and combined_sites < max_sites:
                d = calculate_distance_miles(cluster_centers[cid], cluster_centers[other_cid])
                #if d < min_dist and d < (DEFAULT_COMMUTE_RADIUS-5):
                if d < min_dist:
                    min_dist = d
                    candidate = other_cid
        if candidate is not None:
            df_adjusted_4.loc[df_adjusted_4['cluster_id'] == cid, 'cluster_id'] = candidate
            changed = True


print(df_adjusted_4['cluster_id'].nunique())

### Analyse adjusted clusters after ops headcount constraint
cluster_adj_analysis_4 = analyze_clusters(df_adjusted_4)

52


In [None]:
## except outliers, summary of remaining clusters
cluster_adj_analysis_4_no_outlier = cluster_adj_analysis_4[~(cluster_adj_analysis_4['cluster_id'] == -1)]

##how many clusters have headcount > hc_min
print(len(cluster_adj_analysis_4_no_outlier[cluster_adj_analysis_4_no_outlier['Cluster_AA_HC'] < hc_min]))
print(len(cluster_adj_analysis_4_no_outlier[cluster_adj_analysis_4_no_outlier['Cluster_AA_HC'] < hc_min])/ len(cluster_adj_analysis_4_no_outlier))

cluster_adj_analysis_4_no_outlier[['num_sites','Cluster_AA_HC','max_distance_miles']].describe()


### 27  cluster 

27
0.5294117647058824


Unnamed: 0,num_sites,Cluster_AA_HC,max_distance_miles
count,51.0,51.0,51.0
mean,10.588235,8413.392157,22.668627
std,4.210351,4680.199218,10.437401
min,6.0,2149.0,4.44
25%,7.0,5017.5,17.465
50%,10.0,7627.0,21.68
75%,13.0,10667.5,26.6
max,20.0,23698.0,56.2


### Resume: To save output files

In [None]:
#### for 60 miles, 6-20 sites

output_clustered = '../data_files/output/clustered_locations_60_miles_site_count_6_20.xlsx'
output_analysis = '../data_files/output/cluster_analysis_60_miles_site_count_6_20.xlsx'

df_adjusted_3.to_excel(output_clustered, index=False)
cluster_adj_analysis_3.to_excel(output_analysis, index=False)

In [24]:
#### for 300 miles, 6-10 sites

output_clustered = '../data_files/output/clustered_locations_300_miles_site_count_6_10.xlsx'
output_analysis = '../data_files/output/cluster_analysis_300_miles_site_count_6_10.xlsx'

df_adjusted_5.to_excel(output_clustered, index=False)
cluster_adj_analysis_5.to_excel(output_analysis, index=False)