### Outlier module that can be added to any clustering scenario

In [None]:
import pandas as pd
import numpy as np
from k_means_constrained import KMeansConstrained
from geopy.distance import geodesic
import os
from sklearn.cluster import DBSCAN, KMeans
from scipy.spatial.distance import pdist, squareform
from matplotlib import pyplot as plt


### Functions Used

In [None]:
def load_locations(file_path):
    """Read the Excel workbook at ``file_path`` and return it as a DataFrame."""
    locations = pd.read_excel(file_path)
    return locations

def calculate_distance_miles(coord1, coord2):
    """Return the geodesic distance between ``coord1`` and ``coord2`` in miles.

    Both arguments are handed to ``geodesic`` unchanged.
    """
    separation = geodesic(coord1, coord2)
    return separation.miles

def safe_join(series):
    """Return the distinct non-null values of ``series`` as one ", "-separated string.

    ``None`` or an empty input gives ``""``. Values are converted with
    ``str``; entries that are blank after stripping whitespace are left out.
    Values appear in the order ``unique()`` reports them.
    """
    if series is None or len(series) == 0:
        return ""
    distinct_values = pd.Series(series).dropna().unique()
    labels = [str(value) for value in distinct_values]
    return ', '.join(label for label in labels if label.strip() != '')


# Campus Size Category based on quartiles of OPS_H sum
def get_size_category(ops_h_sum, q1, q2, q3):
    """Map an OPS_H total onto a campus size label using three cut points.

    Returns the label of the first threshold that ``ops_h_sum`` is strictly
    below: ``q1`` -> "Small", ``q2`` -> "Medium", ``q3`` -> "Large".
    Anything not below ``q3`` is "X-Large".
    """
    for label, upper_bound in (("Small", q1), ("Medium", q2), ("Large", q3)):
        if ops_h_sum < upper_bound:
            return label
    return "X-Large"


# HR Staffing Levels (base rules)
def get_hr_staffing_levels(size_category):
    """
    Look up the recommended HR headcount per role for a size category.

    Headcounts (Campus_Leader, HR_L4, HR_L5, HR_L6, HR_L7):
      Small:      1,  5, 3, 2, 0
      Medium:     1,  8, 4, 2, 0
      Large:      1, 11, 5, 3, 1
      X-Large:    1, 16, 6, 5, 2
      Multi-Site: 1,  1, 1, 0, 0

    Any other category returns zero for every role. A new dict is returned
    on each call.
    """
    roles = ('Campus_Leader', 'HR_L4', 'HR_L5', 'HR_L6', 'HR_L7')
    headcounts = {
        'Small': (1, 5, 3, 2, 0),
        'Medium': (1, 8, 4, 2, 0),
        'Large': (1, 11, 5, 3, 1),
        'X-Large': (1, 16, 6, 5, 2),
        'Multi-Site': (1, 1, 1, 0, 0),
    }
    counts = headcounts.get(size_category, (0, 0, 0, 0, 0))
    return dict(zip(roles, counts))



# Analyze clusters and perform HR/staffing analysis.
def analyze_clusters(df):
    """Generate summary statistics and staffing analysis for each campus.

    Args:
        df: Site-level DataFrame. Columns read here: ``cluster_id``,
            ``is_outlier``, ``multi_site``, ``reassignment_type``, ``OPS_H``,
            ``OPS_S``, ``latitude``, ``longitude``, ``country``, ``market``,
            ``city`` and ``state``. ``HR_4``..``HR_7`` and ``Campus_Leader``
            are optional; missing ones count as 0.

    Returns:
        DataFrame with one row per distinct ``cluster_id``: site count, size
        category, OPS_H / OPS_S totals, current and recommended HR headcounts
        with their gearing ratios (headcount total divided by HR total,
        rounded to 2 decimals, ``None`` when the HR total is 0), joined
        location labels, mean latitude/longitude, largest pairwise site
        distance in miles, and the number of rows with a non-null
        ``reassignment_type``. An input without any rows yields an empty
        DataFrame.
    """
    cluster_stats = []
    cluster_ids = df['cluster_id'].unique()

    # Quartile thresholds are taken over the OPS_H totals of every cluster
    # that has at least one non-outlier row.
    campus_sizes = []
    for cid in cluster_ids:
        cluster_data = df[df['cluster_id'] == cid]
        if not cluster_data['is_outlier'].all():
            campus_sizes.append(cluster_data['OPS_H'].sum())
    if campus_sizes:
        q1_size = np.percentile(campus_sizes, 25)
        q2_size = np.percentile(campus_sizes, 50)
        q3_size = np.percentile(campus_sizes, 75)
    else:
        # Fallback thresholds when no cluster qualifies.
        q1_size, q2_size, q3_size = 15000, 30000, 50000

    for cid in cluster_ids:
        cluster_data = df[df['cluster_id'] == cid]
        center_lat = cluster_data['latitude'].mean()
        center_lon = cluster_data['longitude'].mean()

        # Largest distance between any two sites of the cluster; stays 0 for
        # a single-site cluster because the inner loop never runs.
        max_distance = 0
        sites = cluster_data[['latitude', 'longitude']].values
        for i, site_a in enumerate(sites):
            for site_b in sites[i + 1:]:
                max_distance = max(max_distance, calculate_distance_miles(site_a, site_b))

        cluster_aa_hc = cluster_data['OPS_H'].sum()
        cluster_ops_hc = cluster_data['OPS_S'].sum()

        # Current staffing summary (if such columns exist).
        # NOTE(review): Current_Campus_Leader is 1 whenever a Campus_Leader
        # column is present, regardless of its values - confirm this is intended.
        current_staffing = {
            'Current_Campus_Leader': 1 if 'Campus_Leader' in cluster_data.columns else 0,
            'Current_HR_L4': cluster_data['HR_4'].sum() if 'HR_4' in cluster_data.columns else 0,
            'Current_HR_L5': cluster_data['HR_5'].sum() if 'HR_5' in cluster_data.columns else 0,
            'Current_HR_L6': cluster_data['HR_6'].sum() if 'HR_6' in cluster_data.columns else 0,
            'Current_HR_L7': cluster_data['HR_7'].sum() if 'HR_7' in cluster_data.columns else 0
        }
        current_hr_total = sum(current_staffing.values())
        current_gearing_ratio = (cluster_aa_hc / current_hr_total) if current_hr_total else None
        current_gearing_ratio_ops = (cluster_ops_hc / current_hr_total) if current_hr_total else None

        # A cluster with any multi-site row uses the fixed Multi-Site staffing;
        # otherwise the size category follows the OPS_H quartiles.
        if cluster_data['multi_site'].any():
            size_category = "Multi-Site"
        else:
            size_category = get_size_category(cluster_aa_hc, q1_size, q2_size, q3_size)
        new_hr_staffing = get_hr_staffing_levels(size_category)

        new_hr_total = sum(new_hr_staffing.values())
        new_gearing_ratio = (cluster_aa_hc / new_hr_total) if new_hr_total else None
        new_gearing_ratio_ops = (cluster_ops_hc / new_hr_total) if new_hr_total else None

        stats = {
            'cluster_id': cid,
            'num_sites': len(cluster_data),
            'size_category': size_category,
            'Cluster_AA_HC': cluster_aa_hc,
            'Cluster_OPS_HC': cluster_ops_hc,
            'Current_Campus_Leader': current_staffing['Current_Campus_Leader'],
            'Current_HR_L4': current_staffing['Current_HR_L4'],
            'Current_HR_L5': current_staffing['Current_HR_L5'],
            'Current_HR_L6': current_staffing['Current_HR_L6'],
            'Current_HR_L7': current_staffing['Current_HR_L7'],
            'Current_HR_Total': current_hr_total,
            'Current_Gearing_Ratio_AA': current_gearing_ratio,
            'Current_Gearing_Ratio_OPS': current_gearing_ratio_ops,
            'New_Campus_Leader': new_hr_staffing['Campus_Leader'],
            'New_HR_L4': new_hr_staffing['HR_L4'],
            'New_HR_L5': new_hr_staffing['HR_L5'],
            'New_HR_L6': new_hr_staffing['HR_L6'],
            'New_HR_L7': new_hr_staffing['HR_L7'],
            'New_HR_Total': new_hr_total,
            'New_Gearing_Ratio_AA': new_gearing_ratio,
            'New_Gearing_Ratio_OPS': new_gearing_ratio_ops,
            'country': safe_join(cluster_data['country']),
            'markets': safe_join(cluster_data['market']),
            'cities': safe_join(cluster_data['city']),
            'states': safe_join(cluster_data['state']),
            'center_latitude': center_lat,
            'center_longitude': center_lon,
            'max_distance_miles': round(max_distance, 2),
            'num_reassigned_outliers': int(cluster_data['reassignment_type'].notnull().sum())
        }
        cluster_stats.append(stats)

    df_stats = pd.DataFrame(cluster_stats)
    if df_stats.empty:
        # No clusters means no columns at all; the rounding below would
        # raise KeyError, so return the empty frame as is.
        return df_stats

    # Round gearing ratios for presentation.
    for col in ['Current_Gearing_Ratio_AA', 'Current_Gearing_Ratio_OPS',
                'New_Gearing_Ratio_AA', 'New_Gearing_Ratio_OPS']:
        df_stats[col] = df_stats[col].apply(lambda x: round(x, 2) if pd.notnull(x) else x)

    return df_stats

### Outlier Logic 1: Mapping individual outliers to closest campus

In [None]:
## Load the clustering outputs that the outlier passes work on

output_clustered = '../data_files/output/clustered_locations_300_miles_site_count_6_10.xlsx'
output_analysis = '../data_files/output/cluster_analysis_300_miles_site_count_6_10.xlsx'

# Read both workbooks in the order listed: site-level rows first, then the
# per-cluster analysis.
df_clustered, cluster_analysis = (
    pd.read_excel(path) for path in (output_clustered, output_analysis)
)

df_clustered.head()

In [None]:
# assign outlier based on min distance
def find_closest_cluster(outlier, df_clustered):
    """Find the cluster whose centre is nearest to ``outlier``.

    Candidate clusters are those with at least one non-outlier row in the
    same country as ``outlier``. A cluster's centre is the mean latitude and
    longitude of every row in ``df_clustered`` carrying that ``cluster_id``.
    No distance cap is applied.

    Returns:
        ``{'cluster_id', 'distance', 'match_type'}`` for the nearest
        candidate (distance in miles), or ``None`` when there is no candidate.
    """
    origin = (outlier['latitude'], outlier['longitude'])
    is_regular_site = ~df_clustered['is_outlier']
    same_country = df_clustered['country'] == outlier['country']
    candidate_ids = df_clustered.loc[is_regular_site & same_country, 'cluster_id'].unique()

    best_match = None
    shortest = float('inf')
    for candidate_id in candidate_ids:
        members = df_clustered.loc[df_clustered['cluster_id'] == candidate_id]
        centroid = (members['latitude'].mean(), members['longitude'].mean())
        miles = calculate_distance_miles(origin, centroid)
        # Strict comparison: the first of several equally near clusters wins.
        if miles < shortest:
            shortest = miles
            best_match = {
                'cluster_id': candidate_id,
                'distance': miles,
                'match_type': 'assigned by min distance',
            }
    return best_match

In [None]:
df_adjusted = df_clustered.copy()

# First pass: reassign each individual outlier to the nearest existing cluster
# in its own country. No commute-radius or max-sites cap is applied.
outliers = df_adjusted[df_adjusted['is_outlier']].copy()
print(f"\nProcessing {len(outliers)} outliers for reassignment...")
for idx, outlier in outliers.iterrows():
    # NOTE(review): the lookup runs against df_adjusted, so outliers already
    # reassigned in this loop count towards their new cluster's centre -
    # confirm this order dependence is intended.
    match_result = find_closest_cluster(outlier, df_adjusted)
    if match_result is None:
        # No non-outlier cluster exists in this outlier's country. Leave the
        # row flagged as an outlier instead of failing on None['cluster_id'].
        continue
    cid = match_result['cluster_id']
    df_adjusted.at[idx, 'cluster_id'] = cid
    df_adjusted.at[idx, 'is_outlier'] = False
    df_adjusted.at[idx, 'outlier_reassigned'] = True
    df_adjusted.at[idx, 'reassignment_type'] = match_result['match_type']
    df_adjusted.at[idx, 'distance_from_campus'] = round(match_result['distance'], 1)

In [None]:
### Re-run the per-cluster analysis on the data after the first outlier pass
cluster_adj_analysis = analyze_clusters(df_adjusted)

In [None]:
### Combine the previous cluster analysis with selected columns of the new one

## Previous summary without the outlier pseudo-cluster (cluster_id == -1);
## its num_reassigned_outliers column is replaced by the new one below.
cluster_analysis_no_outlier = (
    cluster_analysis
    .loc[cluster_analysis['cluster_id'] != -1]
    .drop(columns='num_reassigned_outliers')
)

## Columns taken from the new analysis. The four size/distance columns get an
## "_outlier_1" suffix so they do not collide with the previous ones.
to_merge = cluster_adj_analysis[[
    'cluster_id',
    'num_sites',
    'Cluster_AA_HC',
    'Cluster_OPS_HC',
    'max_distance_miles',
    'num_reassigned_outliers',
]].rename(columns={
    'num_sites': 'num_sites_outlier_1',
    'Cluster_AA_HC': 'Cluster_AA_HC_outlier_1',
    'Cluster_OPS_HC': 'Cluster_OPS_HC_outlier_1',
    'max_distance_miles': 'max_distance_miles_outlier_1',
})

### Left join on cluster_id keeps every cluster of the previous summary
cluster_analysis_merged = cluster_analysis_no_outlier.merge(to_merge, how='left', on='cluster_id')

In [None]:
### intermediate save

#### for 300 miles, 6-10 sites with outliers (after the first outlier pass)

# The file names follow the scenario that was loaded (300 miles, 6-10 sites);
# the earlier 60-mile / 6-20 names mislabelled this data.
output_clustered = '../data_files/output/after_outlier/clustered_locations_300_miles_site_count_6_10_outlier_1.xlsx'
output_analysis = '../data_files/output/after_outlier/cluster_analysis_300_miles_site_count_6_10_outlier_1.xlsx'

df_adjusted.to_excel(output_clustered, index=False)
cluster_analysis_merged.to_excel(output_analysis, index=False)



### Outlier Logic 2: Grouping outliers within a campus into nodes

In [None]:
# Smallest number of outliers in one cluster that is grouped into a multi-site node.
outlier_node_min = 2
# Largest node size; clusters with more outliers than this are split into several nodes.
outlier_node_max = 5

In [None]:
df_adjusted_2 = df_adjusted.copy()

# Second pass: within each main cluster, group the initial outliers into
# nodes nested under that cluster. Only rows flagged initial_outlier are
# touched.

outliers = df_adjusted_2[df_adjusted_2['initial_outlier']]
new_node_id = 1
for cid in outliers['cluster_id'].unique():

    # Outliers of one main cluster at a time.
    outlier_data = outliers[outliers['cluster_id'] == cid]
    n_outliers = len(outlier_data)

    if n_outliers > outlier_node_max:
        # Too many for one node: split with size-constrained k-means on the
        # raw latitude/longitude values.
        changed = True  # not read anywhere in this cell
        n_subclusters = int(np.ceil(n_outliers / outlier_node_max))
        coords = outlier_data[['latitude', 'longitude']].values
        constr_kmeans = KMeansConstrained(
            n_clusters=n_subclusters,
            size_min=outlier_node_min,
            size_max=outlier_node_max,
            random_state=42,
        )
        sub_labels = constr_kmeans.fit_predict(coords)
        node_groups = [outlier_data.index[sub_labels == sub] for sub in np.unique(sub_labels)]
        node_is_multi_site = True
        node_label = "adjusted multi-site node"

    elif n_outliers >= outlier_node_min:
        # Already within the allowed node size: the whole group is one node.
        changed = True  # not read anywhere in this cell
        node_groups = [outlier_data.index]
        node_is_multi_site = True
        node_label = "auto compliant multi-site node"

    else:
        # Fewer than outlier_node_min: no grouping, but the rows still get a
        # node id and a reassignment label.
        changed = False  # not read anywhere in this cell
        node_groups = [outlier_data.index]
        node_is_multi_site = False
        node_label = "stay single outlier"

    # Tag every node of this cluster and hand out consecutive node ids.
    for indices in node_groups:
        df_adjusted_2.loc[indices, 'outlier_node_id'] = new_node_id
        df_adjusted_2.loc[indices, 'multi_site'] = node_is_multi_site
        df_adjusted_2.loc[indices, 'reassignment_type_2'] = node_label
        new_node_id += 1

print(df_adjusted_2['cluster_id'].nunique())
print(len(outliers))
print(df_adjusted_2['outlier_node_id'].nunique())

In [None]:
### Row counts per (cluster, outlier node, reassignment type), kept for inspection

to_check = (
    df_adjusted_2
    .groupby(['cluster_id', 'outlier_node_id', 'reassignment_type_2'])
    .size()
    .reset_index()
)


In [None]:
# Number of distinct outlier nodes in each cluster.
nodal_summary = (
    df_adjusted_2
    .groupby(['cluster_id'])['outlier_node_id']
    .nunique()
    .reset_index(name='num_outlier_nodes')
)

### Attach the node count to the cluster summary (left join on cluster_id)
cluster_analysis_merged = cluster_analysis_merged.merge(nodal_summary, how='left', on='cluster_id')

In [None]:
### Final save

#### 300 miles, 6-10 sites, after both outlier passes

output_clustered = '../data_files/output/after_outlier/clustered_locations_300_miles_site_count_6_10_outlier_2.xlsx'
output_analysis = '../data_files/output/after_outlier/cluster_analysis_300_miles_site_count_6_10_outlier_2.xlsx'

# Site-level rows first, then the merged cluster summary; no index column is written.
for frame, destination in (
    (df_adjusted_2, output_clustered),
    (cluster_analysis_merged, output_analysis),
):
    frame.to_excel(destination, index=False)

