In [1]:
import glob
import json
import logging
import os
import random
import time

import pandas as pd

# Configure logging: emit INFO and above, prefixing every record with its
# timestamp and level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


In [2]:

# Maximum number of calls a single worker may hold in one generated schedule;
# assign_calls_to_workers stops giving a worker calls once this many are queued.
CALL_LIMIT_PER_WORKER = 267



In [3]:

# Read the workers file and reshape it to one row per worker
logging.info("Loading workers data and calculating recommendation scores...")
workers_raw = pd.read_json('extracted/workers.json')
# Transposing swaps the parsed frame's rows and columns; reset_index() then
# moves the resulting index labels into a regular first column.
# NOTE(review): presumably the JSON is keyed by worker id with name and
# base_salary fields - confirm against the file.
workers_df = workers_raw.T.reset_index()
workers_df.columns = ['worker_id', 'name', 'base_salary']



2024-10-31 18:57:39,126 - INFO - Loading workers data and calculating recommendation scores...


In [4]:
# Gather every record from every previous-report file into one flat list,
# then build a single DataFrame from it
reports_records = [
    record
    for report_path in glob.glob('extracted/previous_reports/*.json')
    for record in pd.read_json(report_path).to_dict(orient='records')
]
reports_df = pd.DataFrame(reports_records)



In [5]:
# Calculate average recommendation score for each worker
worker_performance = (
    reports_df.groupby('worker_id')['likely_to_recommend']
    .mean()
    .rename('avg_recommendation_score')
    .reset_index()
)

# Merge performance data with workers.
# A score column left over from an earlier run of this cell is dropped first:
# merging onto a frame that already has 'avg_recommendation_score' would make
# pandas suffix both columns (_x / _y) and break the later lookup of that
# column. how='left' keeps workers that have no reports; their score is NaN.
workers_df = workers_df.drop(columns=['avg_recommendation_score'], errors='ignore').merge(
    worker_performance, on='worker_id', how='left'
)



In [6]:
# Map an average recommendation score onto a difficulty tier
def assign_difficulty_preference(score):
    """Return the difficulty tier for an average recommendation score.

    Scores strictly above 2.2 map to 'hard', scores from 1.8 to 2.2
    (both inclusive) map to 'medium', and anything else maps to 'easy'.
    A NaN score fails both comparisons and therefore also yields 'easy'.
    """
    if score > 2.2:
        return 'hard'
    if 1.8 <= score <= 2.2:
        return 'medium'
    return 'easy'

# Derive each worker's preferred difficulty tier from their average score
avg_scores = workers_df['avg_recommendation_score']
workers_df['preferred_difficulty'] = avg_scores.apply(assign_difficulty_preference)



In [7]:
# Assign calls to workers based on difficulty preference and performance
def assign_calls_to_workers(calls_df, workers_df, call_limit=None, seed=None):
    """Build a schedule mapping every worker id to the list of call ids assigned to them.

    Calls are processed hard first, then medium, then easy, then any call whose
    difficulty is none of those. Each call goes to the highest-scoring worker
    whose preferred difficulty matches and who is still under the call limit.
    When no such worker exists, a random worker that is still under the limit
    receives the call and a warning is logged.

    Parameters
    ----------
    calls_df : pandas.DataFrame
        One row per call with 'call_id' and 'difficulty' columns. A completely
        empty frame is accepted and produces an empty schedule.
    workers_df : pandas.DataFrame
        One row per worker with 'worker_id', 'preferred_difficulty' and
        'avg_recommendation_score' columns.
    call_limit : int, optional
        Maximum calls per worker. Defaults to CALL_LIMIT_PER_WORKER.
    seed : optional
        Seed for the fallback worker choice; pass a value to make the
        schedule reproducible. None keeps the choice unseeded.

    Returns
    -------
    dict
        {worker_id: [call_id, ...]} with an entry for every worker, including
        workers that received no calls.

    Notes
    -----
    A call is left out of the schedule only when every worker has reached the
    limit. A code comment from an earlier delivery reports that a schedule went
    out only about one third complete because such calls were merely logged, so
    this function also logs the total number of unassigned calls at the end.
    """
    limit = CALL_LIMIT_PER_WORKER if call_limit is None else call_limit
    rng = random.Random(seed)

    # Initialize the schedule dictionary
    schedule = {worker_id: [] for worker_id in workers_df['worker_id']}

    # A file without calls produces a DataFrame with no columns at all, so
    # return before touching calls_df['difficulty'].
    if calls_df.empty:
        return schedule

    difficulty_order = ['hard', 'medium', 'easy']

    # Rank the preferred workers once per difficulty instead of re-filtering
    # and re-sorting the worker table for every single call.
    preferred_ids = {
        difficulty: workers_df[workers_df['preferred_difficulty'] == difficulty]
        .sort_values(by='avg_recommendation_score', ascending=False)['worker_id']
        .tolist()
        for difficulty in difficulty_order
    }
    # Workers never lose calls, so everything before a cursor stays full and
    # the first worker with capacity is always at or after the cursor.
    cursors = dict.fromkeys(difficulty_order, 0)

    # Fallback candidates; full workers are pruned lazily when they are drawn.
    open_ids = list(schedule)

    def assign_call_to_worker(call_id, difficulty):
        # Try assigning to the best-ranked preferred worker with capacity
        ranked = preferred_ids.get(difficulty, [])
        position = cursors.get(difficulty, 0)
        while position < len(ranked) and len(schedule[ranked[position]]) >= limit:
            position += 1
        if difficulty in cursors:
            cursors[difficulty] = position
        if position < len(ranked):
            schedule[ranked[position]].append(call_id)
            return True

        # If preferred workers are full (or there are none), pick a random
        # worker that is still under the limit.
        while open_ids:
            slot = rng.randrange(len(open_ids))
            worker_id = open_ids[slot]
            if len(schedule[worker_id]) < limit:
                schedule[worker_id].append(call_id)
                logging.warning(f"Call {call_id} assigned to fallback worker {worker_id} due to all eligible workers reaching call limits.")
                return True
            # Drawn worker is full: swap-remove it so it is never drawn again
            open_ids[slot] = open_ids[-1]
            open_ids.pop()

        # Log error if all workers are at their limit
        logging.error(f"No available workers for call {call_id}. This should never happen.")
        return False

    unassigned_count = 0

    # Assign calls by difficulty level based on preference
    for difficulty in difficulty_order:
        matching_call_ids = calls_df.loc[calls_df['difficulty'] == difficulty, 'call_id']
        for call_id in matching_call_ids:
            if not assign_call_to_worker(call_id, difficulty):
                logging.warning(f"Call {call_id} could not be assigned under normal constraints.")
                unassigned_count += 1

    # Calls with a missing or unrecognized difficulty have no preferred pool;
    # route them through the fallback rather than dropping them silently.
    other_call_ids = calls_df.loc[~calls_df['difficulty'].isin(difficulty_order), 'call_id']
    if len(other_call_ids) > 0:
        logging.warning(f"{len(other_call_ids)} calls have a difficulty outside {difficulty_order}; assigning them to any available worker.")
    for call_id in other_call_ids:
        if not assign_call_to_worker(call_id, None):
            logging.warning(f"Call {call_id} could not be assigned under normal constraints.")
            unassigned_count += 1

    # Make an incomplete schedule impossible to miss in the logs
    if unassigned_count:
        logging.error(f"{unassigned_count} of {len(calls_df)} calls were left unassigned; the schedule is incomplete (capacity is {limit * len(schedule)} calls).")

    return schedule



In [22]:
# Build and save one call schedule per feature-calls file
for file_path in glob.glob('extracted/feature_calls/*.json'):
    start_time = time.time()
    base_name = os.path.basename(file_path)
    file_name = os.path.splitext(base_name)[0]

    logging.info(f"Processing feature calls file: {file_path}")

    # Flatten the nested {location: {call_id: call_info}} mapping into one
    # record per call, tagging each record with its call id and location
    feature_records = []
    with open(file_path) as f:
        data = json.load(f)
    for location, calls in data.items():
        for call_id, call_info in calls.items():
            call_info.update(call_id=call_id, location=location)
            feature_records.append(call_info)
    feature_calls_df = pd.DataFrame(feature_records)

    # Distribute this file's calls across the workers
    call_schedule = assign_calls_to_workers(feature_calls_df, workers_df)

    # Write the {worker_id: [call_id, ...]} mapping to disk as JSON.
    # NOTE(review): the 'shedule' spelling in the filename is kept as-is because
    # it is runtime behaviour - confirm what consumers of these files expect.
    output_schedule = dict(call_schedule)
    output_file = f'call_shedule_{file_name}.json'
    with open(output_file, 'w') as outfile:
        json.dump(output_schedule, outfile, indent=4)

    total_time = time.time() - start_time
    logging.info(f"Call schedule for {file_name} generated and saved to '{output_file}' in {total_time:.2f} seconds.")


Unnamed: 0,worker_id,name,base_salary,avg_recommendation_score
0,w_eb5ca7e7-197b-4128-9cdd-17b8d7d07803,Efren Selva,10119,1.816522
1,w_ad84fb4e-5229-4c19-91e7-5e5cf8d3f20c,Alan Brown,10715,1.512500
2,w_653b3a89-c5fa-466b-a477-f0c09a724cdd,Douglas Case,8259,1.486364
3,w_7fbf0deb-0c65-449a-91ee-0e9e8cff4d0c,Christa Scott,11672,1.641053
4,w_cf0ff121-da11-4b5c-b191-4d895ce97512,Christopher Greer,9408,1.622093
...,...,...,...,...
750,w_972693d7-5edb-4499-abcb-98861d5a72e8,Anthony Nagel,10873,1.524211
751,w_4d6a56fd-1d07-477f-b86a-2ced18e96076,Roger Doubet,9881,1.368132
752,w_62aff065-2e41-4d62-a64e-a6c5783ef77a,Randall Clark,9610,1.747778
753,w_46c17e47-9b03-482a-9d91-eeabb5eb1b84,Eleanor Williamson,10455,1.510465
