In [1]:
import pandas as pd
import numpy as np

## Load global_results from disk for speedup calculations

In [2]:
# Load the previously saved results frame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
# NOTE(review): the extension is spelled ".picle"; presumably this matches the file on disk -- confirm before renaming.
df = pd.read_pickle("global_results_df_param_0.95_3_20.picle")

## Analyze

In [3]:
# Percentage of rows whose noofheartbeats equals 1 (counted via the boolean mask).
100 * int((df['noofheartbeats'] == 1).sum()) / len(df)

29.858447915692327

In [4]:
# Percentage of rows whose noofheartbeats is at most 26 (counted via the boolean mask).
100 * int((df['noofheartbeats'] <= 26).sum()) / len(df)

90.33267857699623

In [5]:
# Summary statistics of the noofheartbeats column.
df.loc[:, 'noofheartbeats'].describe()

count    320730.000000
mean         10.968700
std          11.612452
min           1.000000
25%           1.000000
50%           8.000000
75%          16.000000
max         144.000000
Name: noofheartbeats, dtype: float64

In [6]:
# Number of rows where predictedlabel differs from truelabel.
# NOTE(review): this is taken over the full `df`, so rows with predictedlabel == -1
# (no decision) presumably count as mismatches too -- confirm that is intended.
int((df['truelabel'] != df['predictedlabel']).sum())

58740

In [7]:
# Number of jobs for which no decision was made (predictedlabel == -1)
int((df['predictedlabel'] == -1).sum())

12282

In [9]:
# Verification only: mean of the non-zero totaljobduration values that lie below 1800.
# Measurements are taken every 30 min (1800 s), so this mean is expected to sit close to 1800.
# The values are iterated in ascending order (np.sort), which fixes the summation order.
getnonzeros = [
    duration
    for duration in np.sort(df.totaljobduration)
    if duration != 0 and duration < 1800
]

sum(getnonzeros) / float(len(getnonzeros))

1791.4930372600677

## Process: Replace all 0 totaljobduration with 1800 (30mins)

In [10]:
# (rows with totaljobduration == 0, rows with totaljobduration == 1800)
(
    int((df['totaljobduration'] == 0).sum()),
    int((df['totaljobduration'] == 1800).sum()),
)

(95765, 724)

In [11]:
# In the totaljobduration column only, substitute 1800.0 (30 min in seconds) for every 0.
df = df.replace(to_replace={'totaljobduration': {0: 1800.0}})

In [12]:
# (rows with totaljobduration == 0, rows with totaljobduration == 1800)
(
    int((df['totaljobduration'] == 0).sum()),
    int((df['totaljobduration'] == 1800).sum()),
)

(0, 96489)

## Process: Replace all 0 predictionduration with 1800 (30mins)

In [13]:
# (rows with predictionduration == 0, rows with predictionduration == 1800)
(
    int((df['predictionduration'] == 0).sum()),
    int((df['predictionduration'] == 1800).sum()),
)

(95765, 751)

In [14]:
# In the predictionduration column only, substitute 1800.0 (30 min in seconds) for every 0.
df = df.replace(to_replace={'predictionduration': {0: 1800.0}})

In [15]:
# (rows with predictionduration == 0, rows with predictionduration == 1800)
(
    int((df['predictionduration'] == 0).sum()),
    int((df['predictionduration'] == 1800).sum()),
)

(0, 96516)

## Get all jobs where a decision was made

In [16]:
# Keep only the rows for which a decision was made (predictedlabel other than -1).
decision_df = df.loc[df['predictedlabel'] != -1]

In [17]:
# Row count of decision_df.
decision_df.shape[0]

308448

In [18]:
# Preview the first five rows of decision_df.
decision_df.head(n=5)

Unnamed: 0,jobID,totaljobduration,predictionduration,localsamplefrac,noofheartbeats,decision_threshold,truelabel,predictedlabel
0,33751041,9008.0,3604.0,0.5,6,0.95,0,0
1,33292290,10801.0,3604.0,0.428571,7,0.95,0,0
2,32636931,1800.0,1800.0,1.0,1,0.95,0,0
3,32243716,14418.0,3606.0,0.333333,9,0.95,1,1
4,33613353,12599.0,3607.0,0.375,8,0.95,0,0


In [19]:
# Summary statistics for the columns of decision_df.
decision_df.describe()

Unnamed: 0,jobID,totaljobduration,predictionduration,localsamplefrac,noofheartbeats,decision_threshold,truelabel,predictedlabel
count,308448.0,308448.0,308448.0,308448.0,308448.0,308448.0,308448.0,308448.0
mean,40549290.0,18229.540224,2993.808337,0.54396,10.765773,0.95,0.472845,0.428902
std,4564101.0,20555.083382,977.722483,0.376345,11.609291,2.22045e-16,0.499263,0.49492
min,23141080.0,44.0,44.0,0.020833,1.0,0.95,0.0,0.0
25%,37440940.0,1800.0,1800.0,0.1875,1.0,0.95,0.0,0.0
50%,40404580.0,10887.5,3602.0,0.428571,7.0,0.95,0.0,0.0
75%,44488160.0,26999.0,3605.0,1.0,16.0,0.95,1.0,1.0
max,48306560.0,257452.0,98083.0,1.0,144.0,0.95,1.0,1.0


## New Computational Time Estimate (NCTE)
## NCTE = A(TN) + B(TP) + C(FP) + D(FN)

## A: True Negatives predictions

In [20]:
# True negatives: rows with truelabel == 0 and predictedlabel == 0.
truenegative = decision_df.loc[
    decision_df['truelabel'].eq(0) & decision_df['predictedlabel'].eq(0)
]

In [21]:
# For verification against the confusion matrix: number of true negatives
truenegative.shape[0]

146148

In [22]:
# A: summed totaljobduration of the true negatives, converted from seconds to hours.
A = sum(truenegative['totaljobduration']) / 3600.0
A

854263.3966666666

## B: True Positives predictions

In [23]:
# True positives: rows with truelabel == 1 and predictedlabel == 1.
truepositive = decision_df.loc[
    decision_df['truelabel'].eq(1) & decision_df['predictedlabel'].eq(1)
]

In [24]:
# For verification against the confusion matrix: number of true positives
truepositive.shape[0]

115842

In [25]:
# Mean totaljobduration over the rows with truelabel == 0.
# NOTE(review): this is computed on the full `df`, whereas the other terms of the
# estimate are computed on `decision_df` -- confirm that this is intended.
mean_success_jobs = df.loc[df['truelabel'] == 0, 'totaljobduration'].mean()

In [26]:
# B (hours): summed predictionduration of the true positives plus one
# mean_success_jobs per true positive (presumably the cost of the re-run).
B = (
    sum(truepositive['predictionduration'])
    + len(truepositive) * mean_success_jobs
) / 3600.0
B

802139.4144251575

## C:  "False Positives" predictions

In [27]:
# False positives: rows with truelabel == 0 but predictedlabel == 1.
falsepositives = decision_df.loc[
    decision_df['truelabel'].eq(0) & decision_df['predictedlabel'].eq(1)
]

In [28]:
# For verification against the confusion matrix: number of false positives
falsepositives.shape[0]

16452

In [29]:
# C (hours): for the false positives, summed predictionduration plus summed totaljobduration.
C = (
    sum(falsepositives['predictionduration'])
    + sum(falsepositives['totaljobduration'])
) / 3600.0
C

146622.06916666665

## D: False Negative predictions

In [30]:
# False negatives: rows with truelabel == 1 but predictedlabel == 0.
falsenegatives = decision_df.loc[
    decision_df['truelabel'].eq(1) & decision_df['predictedlabel'].eq(0)
]

In [31]:
# For verification against the confusion matrix: number of false negatives
falsenegatives.shape[0]

30006

In [32]:
# D (hours): summed totaljobduration of the false negatives plus one
# mean_success_jobs per false negative (presumably the cost of the re-run).
D = (
    sum(falsenegatives['totaljobduration'])
    + len(falsenegatives) * mean_success_jobs
) / 3600.0
D

314202.10769081686

## Original Computational Time Estimate (OCTE)
## OCTE = E(Done) + F(Failed) + G(re-run of Failed)

In [33]:
# E (hours): summed totaljobduration of the 'Done' jobs (truelabel == 0)
E = sum(decision_df.loc[decision_df['truelabel'] == 0, 'totaljobduration']) / 3600.0
E

985911.7288888888

In [34]:
# F (hours): summed totaljobduration of the 'Failed' jobs (truelabel == 1)
F = sum(decision_df.loc[decision_df['truelabel'] == 1, 'totaljobduration']) / 3600.0
F

575995.2775

In [35]:
# G (hours): re-run of the 'Failed' jobs, charged at mean_success_jobs per failed job
G = int((decision_df['truelabel'] == 1).sum()) * mean_success_jobs / 3600.0
G

901002.6396159743

## Speedup

In [36]:
# NCTE: new estimate in hours (TN + TP + FP + FN terms).
new_total_time = ((A + B) + C) + D
# OCTE: original estimate in hours (done + failed + re-run of failed).
old_total_time = (E + F) + G

In [37]:
# Speedup in years: difference between the two estimates (hours) divided by
# the number of hours in a 365-day year.
(old_total_time - new_total_time) / (365 * 24.0)

39.46149064561136

In [38]:
# Speedup as a percentage of the original estimate.
perc = (old_total_time - new_total_time) * 100.0 / old_total_time
perc

14.035539574758438