In [2]:
import pandas as pd
import numpy as np
import json

In [3]:
# Read the workers file, flip it so each original column becomes a row, and move the
# resulting index into a regular column before naming the three columns.
workers_raw = pd.read_json('Schedule1/extracted/workers.json')
workers_df = workers_raw.T.reset_index()
workers_df.columns = ['worker_id', 'name', 'base_salary']

In [4]:
# Preview the first rows of the workers table.
workers_df.head()

Unnamed: 0,worker_id,name,base_salary
0,w_eb5ca7e7-197b-4128-9cdd-17b8d7d07803,Efren Selva,10119
1,w_ad84fb4e-5229-4c19-91e7-5e5cf8d3f20c,Alan Brown,10715
2,w_653b3a89-c5fa-466b-a477-f0c09a724cdd,Douglas Case,8259
3,w_7fbf0deb-0c65-449a-91ee-0e9e8cff4d0c,Christa Scott,11672
4,w_cf0ff121-da11-4b5c-b191-4d895ce97512,Christopher Greer,9408


In [5]:
# Load prices.json as a dictionary and convert it to a two-column DataFrame
# (one row per dictionary item: key -> 'technical_problem', value -> 'price').
# The encoding is passed explicitly because open() otherwise falls back to the
# platform's locale encoding, which is not guaranteed to be UTF-8.
with open('Schedule1/prices.json', encoding='utf-8') as f:
    prices_data = json.load(f)
prices_df = pd.DataFrame(list(prices_data.items()), columns=['technical_problem', 'price'])

In [6]:
# Preview the first rows of the prices table.
prices_df.head()

Unnamed: 0,technical_problem,price
0,browser_and_web_based_support,650
1,teams_problems,665
2,internet_problems,680
3,cloud_and_storage_solutions,690
4,zoom_problems,695


In [12]:
# Load calls_21.json and flatten it into one row per call.
# The encoding is passed explicitly because open() otherwise falls back to the
# platform's locale encoding, which is not guaranteed to be UTF-8.
with open('ikkeimal_calls/calls/calls_21.json', encoding='utf-8') as f:
    feature_calls_data = json.load(f)

# The parsed JSON is walked as {location: {call_id: call_info}}; both keys are written
# into each call's dict so they end up as ordinary columns.
# NOTE: call_info is annotated in place, so feature_calls_data gains these keys too.
feature_records = []
for location, calls in feature_calls_data.items():
    for call_id, call_info in calls.items():
        call_info.update(call_id=call_id, location=location)
        feature_records.append(call_info)
feature_calls_df = pd.DataFrame(feature_records)

In [13]:
# Show the flattened calls table.
feature_calls_df

Unnamed: 0,date,tlf_number,technical_problem,difficulty,commission,call_id,location
0,1970-01-01 00:00:00+00:00,20962728,device_and_peripheral_setup,medium,705.0,c_0195b1db-9c43-4a1b-a0ab-241d46b65ccc,bangalore
1,1970-01-01 00:00:00+00:00,93884912,device_and_peripheral_setup,hard,846.0,c_82aa1a0f-5bf6-49a8-9a32-ceda4b0bea08,bangalore
2,1970-01-01 00:00:00+00:00,34997608,email_related_issues,medium,700.0,c_f26adcb6-1fbd-4c5d-a844-e861dd6088e7,bangalore
3,1970-01-01 00:00:00+00:00,08890010,software_installation_and_configuration,easy,600.0,c_a930dc43-76d4-43fe-a6c4-af82427c65b5,bangalore
4,1970-01-01 00:00:00+00:00,41021032,internet_problems,hard,816.0,c_ee2e7e49-799b-48c0-a0fd-f89a02d7c463,bangalore
...,...,...,...,...,...,...,...
25517,1970-01-01 00:00:00+00:00,85406010,account_and_security_issues,medium,710.0,c_d777b5cb-66d4-4e2c-aeaf-96fc075c9b9a,hyderabad
25518,1970-01-01 00:00:00+00:00,23185048,software_installation_and_configuration,medium,750.0,c_cc3c5b8a-2f51-415a-9979-a2d8db08caec,hyderabad
25519,1970-01-01 00:00:00+00:00,17929997,internet_problems,hard,816.0,c_9bcd486e-9448-4745-9ff4-cdb9ca04bcc7,hyderabad
25520,1970-01-01 00:00:00+00:00,26338837,email_related_issues,hard,840.0,c_be9b8b36-7134-4eb4-94f4-9fa42c98ccee,hyderabad


In [15]:
# Load calls_11.json and flatten it into one row per call.
# The encoding is passed explicitly because open() otherwise falls back to the
# platform's locale encoding, which is not guaranteed to be UTF-8.
with open('Schedule1/extracted/feature_calls/calls_11.json', encoding='utf-8') as f:
    previous_calls_data = json.load(f)

# The parsed JSON is walked as {location: {call_id: call_info}}; both keys are written
# into each call's dict so they end up as ordinary columns.
# NOTE: call_info is annotated in place, so previous_calls_data gains these keys too.
previous_records = []
for location, calls in previous_calls_data.items():
    for call_id, call_info in calls.items():
        call_info.update(call_id=call_id, location=location)
        previous_records.append(call_info)
previous_calls_df = pd.DataFrame(previous_records)


In [16]:
# Show the flattened calls table loaded from calls_11.json.
previous_calls_df

Unnamed: 0,date,tlf_number,technical_problem,difficulty,commission,call_id,location
0,2024-10-23 16:54:39.515508,72345741,teams_problems,medium,665.0,c_5e239e86-20f4-4053-a77b-a01dd3971aad,bangalore
1,2024-10-23 12:18:18.515647,53336166,basic_hardware_troubleshooting,easy,576.0,c_1cbe224b-804b-4b30-bf78-198feea748f8,bangalore
2,2024-10-20 01:42:41.515693,81959025,email_related_issues,medium,700.0,c_ca2f35cc-47fb-446f-a8ad-3c0360edb98c,bangalore
3,2024-10-24 12:14:40.515734,96315388,device_and_peripheral_setup,medium,705.0,c_59b1c1cc-6362-4edd-aed8-720536d733e6,bangalore
4,2024-10-24 19:24:07.515775,54054590,internet_problems,medium,680.0,c_69e87302-cad8-42a3-9fb2-e64c0a4b14e4,bangalore
...,...,...,...,...,...,...,...
25311,2024-10-22 08:22:30.583100,34729150,device_and_peripheral_setup,hard,846.0,c_7f67de4f-76c1-40de-be23-ebdd0a636019,hyderabad
25312,2024-10-21 03:25:25.583138,76118231,operating_system_support,hard,882.0,c_8c4dc8eb-161a-4c60-a68e-2f54595a2756,hyderabad
25313,2024-10-25 13:25:00.583176,17245434,account_and_security_issues,easy,568.0,c_45767486-edd2-4207-a5df-3c35e15038db,hyderabad
25314,2024-10-20 17:25:45.583213,34434026,internet_problems,easy,544.0,c_603ffae3-4d8e-4b7f-ba53-e382dfdfce32,hyderabad


In [18]:
# Load the call report file (call_report_11.json) straight into a DataFrame
# and display it.
reports_df = pd.read_json('Schedule1/ikkeheltimal_call_reports_11_20/future_call_reports/call_report_11.json')
reports_df

Unnamed: 0,call_id,worker_id,call_time,likely_to_recommend,professional_score,call_profit
0,c_670f54f4-9041-4ae2-85b9-05d30a4dbf23,w_eb5ca7e7-197b-4128-9cdd-17b8d7d07803,33.017543,1.0,,780
1,c_a4b4e43f-5939-4992-b576-c844f90a8131,w_eb5ca7e7-197b-4128-9cdd-17b8d7d07803,26.193912,1.0,,900
2,c_e2a563ff-802e-4f56-9d08-47fadc3270b6,w_eb5ca7e7-197b-4128-9cdd-17b8d7d07803,26.551302,1.0,,864
3,c_c2f70a56-60d8-45b8-b90c-73a6c24d6cc8,w_eb5ca7e7-197b-4128-9cdd-17b8d7d07803,28.051302,1.0,,864
4,c_17d86ada-c033-460e-9205-47b5c1e1f248,w_eb5ca7e7-197b-4128-9cdd-17b8d7d07803,31.855709,1.0,,846
...,...,...,...,...,...,...
3613,c_e5e5a752-2acd-4d41-be5f-46787fa2c053,w_46c17e47-9b03-482a-9d91-eeabb5eb1b84,57.421901,1.0,,780
3614,c_3d484a00-5b2d-47cc-ba02-2a4740201324,w_46c17e47-9b03-482a-9d91-eeabb5eb1b84,58.921901,1.0,,780
3615,c_9339ffa2-d827-45be-a2ad-ab5ae7febded,w_46c17e47-9b03-482a-9d91-eeabb5eb1b84,52.190790,1.0,,816
3616,c_742476ed-a390-4001-b2eb-26d4b318ee48,w_46c17e47-9b03-482a-9d91-eeabb5eb1b84,62.231301,1.0,,852


In [20]:
# Load the schedule file and flatten it into one (worker_id, call_id) row per entry.
# The encoding is passed explicitly because open() otherwise falls back to the
# platform's locale encoding, which is not guaranteed to be UTF-8.
with open('Schedule1/extracted/feature_schedules/call_shedule_11.json', encoding='utf-8') as f:
    schedules_data = json.load(f)

# The parsed JSON is walked as {worker_id: iterable of call_ids}.
schedule_records = [
    {'worker_id': worker_id, 'call_id': call_id}
    for worker_id, calls in schedules_data.items()
    for call_id in calls
]
schedules_df = pd.DataFrame(schedule_records)
schedules_df


Unnamed: 0,worker_id,call_id
0,w_eb5ca7e7-197b-4128-9cdd-17b8d7d07803,c_718780d2-05b4-45be-9685-cd37583049ab
1,w_ad84fb4e-5229-4c19-91e7-5e5cf8d3f20c,c_cd9c31f6-2c9c-4b41-95c2-0d6cfa5d7c52
2,w_ad84fb4e-5229-4c19-91e7-5e5cf8d3f20c,c_b5cce9b3-82f6-4c90-9d15-a27088eff079
3,w_ad84fb4e-5229-4c19-91e7-5e5cf8d3f20c,c_9bd813d0-005e-412d-b4aa-505d9282d6eb
4,w_ad84fb4e-5229-4c19-91e7-5e5cf8d3f20c,c_98a0c19f-aece-47a5-b8db-81914795e10f
...,...,...
25311,w_f5400776-c6cc-4563-b835-b5ee56242aa1,c_4298397e-6b42-4936-95b0-4ca36dc50ffa
25312,w_f5400776-c6cc-4563-b835-b5ee56242aa1,c_5db6c5b3-ca91-4795-8204-ceaf4bb34838
25313,w_f5400776-c6cc-4563-b835-b5ee56242aa1,c_fa0253e4-806d-4109-afeb-d7037fba67d9
25314,w_f5400776-c6cc-4563-b835-b5ee56242aa1,c_b1f83e1f-afec-4d3c-9c84-4c0dc77363e7


In [21]:
# Count the scheduled calls (non-null call_id values) for every worker and return the
# result as a flat table with columns worker_id and call_count.
calls_per_worker = schedules_df.groupby('worker_id')['call_id'].count()
worker_performance = calls_per_worker.reset_index(name='call_count')
worker_performance


Unnamed: 0,worker_id,call_count
0,w_00020787-2ccf-492e-bae4-0d04a4d7ec8a,3
1,w_006be806-a7bb-4a8e-b08b-3a00a117f15a,5
2,w_010ff3f2-78e3-41fa-95e2-5443dcc2aacd,6
3,w_018e899e-f64b-41e3-9f44-c25d14cd8660,7
4,w_01abe3b4-af73-4521-b5c8-bd582d9d2d89,267
...,...,...
744,w_fdfc2986-f7d5-45b1-8ce9-d0a68b8ae16e,5
745,w_fe2a2048-30d3-4f02-8921-069584e541c3,5
746,w_fe89bc24-7071-4e47-89cd-f4b180817d55,11
747,w_ff59e406-e6df-458b-a609-d8515eb511ef,5


In [22]:
# Attach the price from prices_df to both call tables, matching on technical_problem.
# A left join keeps every call even when its problem has no entry in prices_df.
#
# Any existing 'price' column is dropped first so the cell is idempotent: without it,
# running the cell a second time merges again and leaves 'price_x'/'price_y' in place
# of the 'price' column that later cells read.
feature_calls_df = (
    feature_calls_df
    .drop(columns='price', errors='ignore')
    .merge(prices_df, on='technical_problem', how='left')
)

previous_calls_df = (
    previous_calls_df
    .drop(columns='price', errors='ignore')
    .merge(prices_df, on='technical_problem', how='left')
)

In [23]:
# Guarantee that a call_time column exists; when it is absent, create it filled with
# NaN as a placeholder (replace with real values if they become available).
has_call_time = 'call_time' in feature_calls_df.columns
if not has_call_time:
    feature_calls_df['call_time'] = np.nan


In [24]:
# Preview the calls table after the price merge and the call_time placeholder.
feature_calls_df.head()

Unnamed: 0,date,tlf_number,technical_problem,difficulty,commission,call_id,location,price,call_time
0,1970-01-01 00:00:00+00:00,20962728,device_and_peripheral_setup,medium,705.0,c_0195b1db-9c43-4a1b-a0ab-241d46b65ccc,bangalore,705,
1,1970-01-01 00:00:00+00:00,93884912,device_and_peripheral_setup,hard,846.0,c_82aa1a0f-5bf6-49a8-9a32-ceda4b0bea08,bangalore,705,
2,1970-01-01 00:00:00+00:00,34997608,email_related_issues,medium,700.0,c_f26adcb6-1fbd-4c5d-a844-e861dd6088e7,bangalore,700,
3,1970-01-01 00:00:00+00:00,8890010,software_installation_and_configuration,easy,600.0,c_a930dc43-76d4-43fe-a6c4-af82427c65b5,bangalore,750,
4,1970-01-01 00:00:00+00:00,41021032,internet_problems,hard,816.0,c_ee2e7e49-799b-48c0-a0fd-f89a02d7c463,bangalore,680,


In [25]:
# Expected commission is the price scaled by a per-difficulty factor; the profit
# discrepancy is the recorded commission minus that expectation.
# Difficulty values missing from the map produce NaN in both derived columns.
difficulty_commission_map = {'hard': 1.2, 'medium': 1.0, 'easy': 0.8}
difficulty_factor = feature_calls_df['difficulty'].map(difficulty_commission_map)
feature_calls_df['expected_commission'] = difficulty_factor.mul(feature_calls_df['price'])
feature_calls_df['profit_discrepancy'] = feature_calls_df['commission'].sub(
    feature_calls_df['expected_commission']
)


In [26]:
# Inspect the first 100 rows of the commission columns next to the difficulty.
feature_calls_df[['commission', 'expected_commission', 'profit_discrepancy', 'difficulty']][:100]

Unnamed: 0,commission,expected_commission,profit_discrepancy,difficulty
0,705.0,705.0,0.0,medium
1,846.0,846.0,0.0,hard
2,700.0,700.0,0.0,medium
3,600.0,600.0,0.0,easy
4,816.0,816.0,0.0,hard
...,...,...,...,...
95,690.0,690.0,0.0,medium
96,828.0,828.0,0.0,hard
97,680.0,680.0,0.0,medium
98,690.0,690.0,0.0,medium


In [27]:
# Join each call report with its worker's details on worker_id.
# No `how` is given, so this is pandas' default inner join: reports whose worker_id
# is absent from workers_df are dropped.
worker_recommendation = pd.merge(reports_df, workers_df, on='worker_id')


In [28]:
# Replace the timestamps in 'date' with their calendar-date component only.
parsed_dates = pd.to_datetime(feature_calls_df['date'])
feature_calls_df['date'] = parsed_dates.dt.date

# Count calls per calendar date: one row per date with its call_volume.
call_volume_by_date_corrected = (
    feature_calls_df
    .groupby('date')
    .size()
    .reset_index(name='call_volume')
)

In [29]:
# Show the calls table after 'date' was reduced to a calendar date.
feature_calls_df

Unnamed: 0,date,tlf_number,technical_problem,difficulty,commission,call_id,location,price,call_time,expected_commission,profit_discrepancy
0,1970-01-01,20962728,device_and_peripheral_setup,medium,705.0,c_0195b1db-9c43-4a1b-a0ab-241d46b65ccc,bangalore,705,,705.0,0.0
1,1970-01-01,93884912,device_and_peripheral_setup,hard,846.0,c_82aa1a0f-5bf6-49a8-9a32-ceda4b0bea08,bangalore,705,,846.0,0.0
2,1970-01-01,34997608,email_related_issues,medium,700.0,c_f26adcb6-1fbd-4c5d-a844-e861dd6088e7,bangalore,700,,700.0,0.0
3,1970-01-01,08890010,software_installation_and_configuration,easy,600.0,c_a930dc43-76d4-43fe-a6c4-af82427c65b5,bangalore,750,,600.0,0.0
4,1970-01-01,41021032,internet_problems,hard,816.0,c_ee2e7e49-799b-48c0-a0fd-f89a02d7c463,bangalore,680,,816.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
25517,1970-01-01,85406010,account_and_security_issues,medium,710.0,c_d777b5cb-66d4-4e2c-aeaf-96fc075c9b9a,hyderabad,710,,710.0,0.0
25518,1970-01-01,23185048,software_installation_and_configuration,medium,750.0,c_cc3c5b8a-2f51-415a-9979-a2d8db08caec,hyderabad,750,,750.0,0.0
25519,1970-01-01,17929997,internet_problems,hard,816.0,c_9bcd486e-9448-4745-9ff4-cdb9ca04bcc7,hyderabad,680,,816.0,0.0
25520,1970-01-01,26338837,email_related_issues,hard,840.0,c_be9b8b36-7134-4eb4-94f4-9fa42c98ccee,hyderabad,700,,840.0,0.0


In [30]:
# Inspect the first 100 rows of likely_to_recommend alongside the worker_id.
worker_recommendation[['likely_to_recommend', 'worker_id']][:100]

Unnamed: 0,likely_to_recommend,worker_id
0,1.0,w_eb5ca7e7-197b-4128-9cdd-17b8d7d07803
1,1.0,w_eb5ca7e7-197b-4128-9cdd-17b8d7d07803
2,1.0,w_eb5ca7e7-197b-4128-9cdd-17b8d7d07803
3,1.0,w_eb5ca7e7-197b-4128-9cdd-17b8d7d07803
4,1.0,w_eb5ca7e7-197b-4128-9cdd-17b8d7d07803
...,...,...
95,1.0,w_9a21c8b0-2404-4bc4-9d12-3cb41fe1e066
96,1.0,w_9a21c8b0-2404-4bc4-9d12-3cb41fe1e066
97,1.0,w_9a21c8b0-2404-4bc4-9d12-3cb41fe1e066
98,1.0,w_8184008c-9e30-4ce8-bbf8-53b4d3dcd9f2
