# Mech Tools Evaluation

### Create the dataset

In [32]:
# !docker build -t etl:0.0.1 . && docker run etl:0.0.1

### Load the dataset

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
# Display settings: show floats with 2 digits of precision and never hide columns of wide frames.
pd.set_option("display.precision", 2)
pd.set_option('display.max_columns', None)

In [2]:
# Columns that are cast to pandas' "string" dtype after loading.
str_cols = ("id", "currentAnswer", "title", "request_id", "prompt_request", "tool", "nonce", "vote")

# Read the CSV, keep one row per request_id, keep only rows whose `error`
# flag equals False, renumber the index, then cast the text columns.
dataset = (
    pd.read_csv("dataset.csv")
    .drop_duplicates(subset="request_id")
    .loc[lambda frame: frame["error"] == False]
    .reset_index(drop=True)
    .astype({name: "string" for name in str_cols})
)
dataset.head()



Unnamed: 0,id,currentAnswer,title,request_id,request_block,prompt_request,tool,nonce,deliver_block,error,error_message,prompt_response,p_yes,p_no,confidence,info_utility,vote,win_probability
0,0x0094fa304017d5c2b355790e2976f769ea600492,No,Will the Hisense U8K be considered a top-tier ...,5545075191266518558547029150600135891156080971...,29552624,"Given the information available, and consideri...",prediction-online,4150b2a6-a1a4-43cd-b346-8dc5c5173d0b,29560402,False,,,0.7,0.3,0.8,0.6,Yes,0.7
1,0x0094fa304017d5c2b355790e2976f769ea600492,No,Will the Hisense U8K be considered a top-tier ...,4420322125272725217473329489276855755733870871...,29552737,"With the given question ""Will the Hisense U8K ...",prediction-online,4e81f577-6063-45bb-a26e-2c939c111cf3,29560222,False,,,0.6,0.4,0.8,0.5,Yes,0.6
2,0x0094fa304017d5c2b355790e2976f769ea600492,No,Will the Hisense U8K be considered a top-tier ...,8092041810453425517708884760497898797449114282...,29553033,"With the given question ""Will the Hisense U8K ...",prediction-online,ceba7701-4a3d-4720-bcaf-659666981d51,29560062,False,,,0.6,0.4,0.8,0.9,Yes,0.6
3,0x0094fa304017d5c2b355790e2976f769ea600492,No,Will the Hisense U8K be considered a top-tier ...,1109136049191103125504131999953988293599112824...,29553373,"Given the information available, and consideri...",prediction-online,2d8d2776-2a71-4e6d-a450-a85e0e1ec64a,29559888,False,,,0.6,0.4,0.8,0.4,Yes,0.6
4,0x0094fa304017d5c2b355790e2976f769ea600492,No,Will the Hisense U8K be considered a top-tier ...,5385201172829803617221747434686259018721904083...,29553477,"With the given question ""Will the Hisense U8K ...",prediction-online,641be53c-5304-44b1-ad6f-595d3b0ce319,29559855,False,,,0.6,0.4,0.8,0.5,Yes,0.6


In [3]:
# (rows, columns) left after dropping duplicate request_ids and errored rows.
dataset.shape

(68733, 18)

In [4]:
# Summary statistics of the numeric columns.
dataset.describe()

Unnamed: 0,request_block,deliver_block,p_yes,p_no,confidence,info_utility,win_probability
count,68700.0,68700.0,68733.0,68733.0,68733.0,68733.0,68733.0
mean,30700000.0,30700000.0,0.48,0.52,0.78,0.56,0.67
std,686000.0,686000.0,0.2,0.2,0.08,0.19,0.1
min,28900000.0,28900000.0,0.0,0.0,0.0,0.0,0.5
25%,30100000.0,30100000.0,0.3,0.4,0.8,0.5,0.6
50%,30600000.0,30600000.0,0.6,0.4,0.8,0.6,0.7
75%,31300000.0,31300000.0,0.6,0.7,0.8,0.6,0.75
max,32000000.0,32000000.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# Summary of the `vote` column; `count` only includes rows where a vote is present.
dataset["vote"].describe()

count     63653
unique        2
top         Yes
freq      34992
Name: vote, dtype: object

### Normalize confidences

In [6]:
# Distinct raw confidence values present in the data.
dataset["confidence"].unique()

array([0.8 , 0.9 , 0.7 , 0.5 , 0.1 , 0.6 , 0.75, 0.85, 0.3 , 0.4 , 1.  ,
       0.2 , 0.35, 0.  , 0.65, 0.99, 0.95, 0.55, 0.62, 0.88, 0.64, 0.81,
       0.68])

In [7]:
# Report how many rows have a confidence below 0.5 ...
n_low_confidence = (dataset["confidence"] < 0.5).sum()
print(f"Number of samples with confidence below 0.5: {n_low_confidence}")

# ... then keep only the rows at or above 0.5 and renumber the index.
keep_rows = dataset["confidence"] >= 0.5
dataset = dataset.loc[keep_rows].reset_index(drop=True)

Number of samples with confidence below 0.5: 44


In [8]:
# Bucket confidence to one decimal place (0.5, 0.6, 0.7, 0.8, 0.9).
# Rounding is done element-wise with Python's built-in round(); because it
# operates on the binary float value, halves such as 0.65 and 0.85 do not
# necessarily round in the same direction.
rounded_confidence = dataset["confidence"].apply(lambda value: round(value, 1))

# A rounded confidence of exactly 1.0 is folded into the top bucket, 0.9.
dataset["confidence"] = rounded_confidence.where(rounded_confidence != 1.0, 0.9)
dataset["confidence"].unique()

array([0.8, 0.9, 0.7, 0.5, 0.6])

### Check the percentage of wins vs confidence for all the tools

In [9]:
def accuracy(data):
    """Summarise how often `vote` matches `currentAnswer` in `data`.

    Args:
        data: frame with `currentAnswer` and `vote` columns.

    Returns:
        A Series with three entries:
        `n_correct` - number of rows where the two columns are equal,
        `n_pred`    - number of non-missing comparison results,
        `accuracy`  - `n_correct / n_pred * 100`, `0` when nothing was
                      correct, or `None` when there were no predictions.
    """
    is_correct = data["currentAnswer"] == data["vote"]
    n_pred = is_correct.count()  # count() skips missing comparison results
    n_correct = is_correct.sum()

    if n_pred == 0:
        # No predictions at all: accuracy is undefined rather than zero.
        pct_correct = None
    else:
        pct_correct = n_correct / n_pred * 100 if n_correct else 0

    return pd.Series({"n_correct": n_correct, "n_pred": n_pred, "accuracy": pct_correct})

In [10]:
def acc_per_tool(group, col: str, conf: float):
    """Apply `accuracy` to each group, restricted to rows where `col == conf`.

    Args:
        group: object whose `.apply` passes each sub-frame to a callable
            (presumably a pandas groupby such as `dataset.groupby("tool")`).
        col: name of the column to filter on.
        conf: value of `col` that rows must equal to be included.

    Returns:
        Whatever `group.apply` returns for the per-group `accuracy` results.
    """
    def _accuracy_at_value(frame):
        # Keep only the rows of this group that fall in the requested bucket.
        return accuracy(frame[frame[col] == conf])

    return group.apply(_accuracy_at_value)

In [11]:
def gen_stats(group, col: str):
    """Build an accuracy table with one column group per distinct value of `col`.

    Args:
        group: grouped data to evaluate (e.g. `dataset.groupby("tool")`).
        col: column whose distinct values define the buckets.

    Returns:
        A DataFrame whose outer column level is `"{col}_{value}"` for every
        distinct value of `col`, followed by `"total"`; the inner level holds
        the entries produced by `accuracy`.
    """
    # NOTE(review): the bucket values are still read from the notebook-level
    # `dataset`, so `group` is expected to have been built from that frame.
    stats = {
        f"{col}_{prob}": acc_per_tool(group, col, prob)
        for prob in sorted(dataset[col].unique())
    }
    # Totals come from the `group` argument rather than the notebook-level
    # `tools_group`, so the function works for any grouping and does not
    # depend on a variable defined in a later cell.
    stats["total"] = group.apply(accuracy)
    return pd.concat(stats.values(), axis=1, keys=stats.keys())

In [12]:
# Group the predictions by tool and compute accuracy per confidence value, plus the overall total.
tools_group = dataset.groupby("tool")
tools_stats_per_conf = gen_stats(tools_group, "confidence")
display(tools_stats_per_conf)

Unnamed: 0_level_0,confidence_0.5,confidence_0.5,confidence_0.5,confidence_0.6,confidence_0.6,confidence_0.6,confidence_0.7,confidence_0.7,confidence_0.7,confidence_0.8,confidence_0.8,confidence_0.8,confidence_0.9,confidence_0.9,confidence_0.9,total,total,total
Unnamed: 0_level_1,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy
tool,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
claude-prediction-offline,626.0,1068.0,58.61,52.0,94.0,55.32,172.0,396.0,43.43,58.0,106.0,54.72,78.0,96.0,81.25,986.0,1760.0,56.02
claude-prediction-online,194.0,407.0,47.67,233.0,430.0,54.19,440.0,778.0,56.56,72.0,175.0,41.14,7.0,13.0,53.85,946.0,1803.0,52.47
prediction-offline,0.0,0.0,,17.0,30.0,56.67,161.0,298.0,54.03,689.0,1590.0,43.33,115.0,234.0,49.15,982.0,2152.0,45.63
prediction-offline-sme,0.0,0.0,,108.0,182.0,59.34,357.0,516.0,69.19,890.0,1875.0,47.47,99.0,210.0,47.14,1454.0,2783.0,52.25
prediction-online,0.0,0.0,,136.0,194.0,70.1,2076.0,3134.0,66.24,10287.0,22415.0,45.89,1668.0,3147.0,53.0,14167.0,28890.0,49.04
prediction-online-sme,1.0,3.0,33.33,310.0,653.0,47.47,2593.0,5381.0,48.19,9325.0,18158.0,51.35,996.0,1747.0,57.01,13225.0,25942.0,50.98
prediction-online-summarized-info,0.0,0.0,,1.0,2.0,50.0,7.0,13.0,53.85,47.0,102.0,46.08,14.0,24.0,58.33,69.0,141.0,48.94
prediction-sentence-embedding-bold,0.0,0.0,,0.0,0.0,,22.0,35.0,62.86,16.0,17.0,94.12,27.0,29.0,93.1,65.0,81.0,80.25
prediction-sentence-embedding-conservative,0.0,0.0,,0.0,0.0,,0.0,0.0,,18.0,35.0,51.43,23.0,47.0,48.94,41.0,82.0,50.0


### Check the percentage of wins vs probability for all the tools

In [13]:
# Per-tool accuracy with one column group per distinct `win_probability` value
# (the values are not bucketed, so the resulting table is wide).
tools_stats_per_prob = gen_stats(tools_group, "win_probability")
tools_stats_per_prob

Unnamed: 0_level_0,win_probability_0.5,win_probability_0.5,win_probability_0.5,win_probability_0.505,win_probability_0.505,win_probability_0.505,win_probability_0.525,win_probability_0.525,win_probability_0.525,win_probability_0.53,win_probability_0.53,win_probability_0.53,win_probability_0.54,win_probability_0.54,win_probability_0.54,win_probability_0.55,win_probability_0.55,win_probability_0.55,win_probability_0.58,win_probability_0.58,win_probability_0.58,win_probability_0.6,win_probability_0.6,win_probability_0.6,win_probability_0.62,win_probability_0.62,win_probability_0.62,win_probability_0.63,win_probability_0.63,win_probability_0.63,win_probability_0.65,win_probability_0.65,win_probability_0.65,win_probability_0.68,win_probability_0.68,win_probability_0.68,win_probability_0.7,win_probability_0.7,win_probability_0.7,win_probability_0.73,win_probability_0.73,win_probability_0.73,win_probability_0.75,win_probability_0.75,win_probability_0.75,win_probability_0.8,win_probability_0.8,win_probability_0.8,win_probability_0.83,win_probability_0.83,win_probability_0.83,win_probability_0.85,win_probability_0.85,win_probability_0.85,win_probability_0.87,win_probability_0.87,win_probability_0.87,win_probability_0.9,win_probability_0.9,win_probability_0.9,win_probability_0.91,win_probability_0.91,win_probability_0.91,win_probability_0.95,win_probability_0.95,win_probability_0.95,win_probability_0.99,win_probability_0.99,win_probability_0.99,win_probability_0.998,win_probability_0.998,win_probability_0.998,win_probability_0.999,win_probability_0.999,win_probability_0.999,win_probability_0.9999,win_probability_0.9999,win_probability_0.9999,win_probability_0.999999,win_probability_0.999999,win_probability_0.999999,win_probability_1.0,win_probability_1.0,win_probability_1.0,total,total,total
Unnamed: 0_level_1,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy
tool,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2,Unnamed: 49_level_2,Unnamed: 50_level_2,Unnamed: 51_level_2,Unnamed: 52_level_2,Unnamed: 53_level_2,Unnamed: 54_level_2,Unnamed: 55_level_2,Unnamed: 56_level_2,Unnamed: 57_level_2,Unnamed: 58_level_2,Unnamed: 59_level_2,Unnamed: 60_level_2,Unnamed: 61_level_2,Unnamed: 62_level_2,Unnamed: 63_level_2,Unnamed: 64_level_2,Unnamed: 65_level_2,Unnamed: 66_level_2,Unnamed: 67_level_2,Unnamed: 68_level_2,Unnamed: 69_level_2,Unnamed: 70_level_2,Unnamed: 71_level_2,Unnamed: 72_level_2,Unnamed: 73_level_2,Unnamed: 74_level_2,Unnamed: 75_level_2,Unnamed: 76_level_2,Unnamed: 77_level_2,Unnamed: 78_level_2,Unnamed: 79_level_2,Unnamed: 80_level_2,Unnamed: 81_level_2,Unnamed: 82_level_2,Unnamed: 83_level_2,Unnamed: 84_level_2,Unnamed: 85_level_2,Unnamed: 86_level_2,Unnamed: 87_level_2
claude-prediction-offline,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,5.0,0.0,0.0,0.0,,290.0,641.0,45.24,0.0,1.0,0.0,0.0,0.0,,4.0,4.0,100.0,0.0,0.0,,136.0,267.0,50.94,0.0,0.0,,0.0,1.0,0.0,309.0,538.0,57.43,0.0,0.0,,4.0,5.0,80.0,0.0,0.0,,145.0,174.0,83.33,0.0,0.0,,3.0,3.0,100.0,84.0,108.0,77.78,0.0,0.0,,0.0,1.0,0.0,1.0,1.0,100.0,1.0,1.0,100.0,9.0,10.0,90.0,986.0,1760.0,56.02
claude-prediction-online,0.0,0.0,,0.0,1.0,0.0,0.0,0.0,,2.0,2.0,100.0,0.0,1.0,0.0,11.0,25.0,44.0,2.0,2.0,100.0,441.0,858.0,51.4,0.0,0.0,,0.0,1.0,0.0,26.0,59.0,44.07,1.0,2.0,50.0,141.0,375.0,37.6,1.0,1.0,100.0,3.0,11.0,27.27,281.0,389.0,72.24,0.0,0.0,,6.0,14.0,42.86,0.0,0.0,,24.0,52.0,46.15,0.0,0.0,,2.0,2.0,100.0,5.0,5.0,100.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,3.0,0.0,946.0,1803.0,52.47
prediction-offline,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,3.0,6.0,50.0,0.0,0.0,,358.0,899.0,39.82,0.0,0.0,,0.0,0.0,,15.0,35.0,42.86,0.0,0.0,,287.0,599.0,47.91,0.0,0.0,,20.0,58.0,34.48,282.0,521.0,54.13,0.0,0.0,,2.0,4.0,50.0,0.0,0.0,,8.0,23.0,34.78,0.0,0.0,,1.0,1.0,100.0,1.0,1.0,100.0,1.0,1.0,100.0,1.0,1.0,100.0,2.0,2.0,100.0,0.0,0.0,,1.0,1.0,100.0,982.0,2152.0,45.63
prediction-offline-sme,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,4.0,5.0,80.0,0.0,0.0,,411.0,953.0,43.13,0.0,0.0,,0.0,0.0,,14.0,32.0,43.75,0.0,0.0,,451.0,853.0,52.87,0.0,0.0,,45.0,70.0,64.29,482.0,789.0,61.09,0.0,0.0,,2.0,5.0,40.0,0.0,0.0,,33.0,63.0,52.38,0.0,0.0,,4.0,4.0,100.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,8.0,9.0,88.89,1454.0,2783.0,52.25
prediction-online,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,74.0,141.0,52.48,0.0,0.0,,5218.0,12749.0,40.93,0.0,0.0,,0.0,0.0,,409.0,891.0,45.9,0.0,0.0,,4204.0,8232.0,51.07,0.0,0.0,,517.0,950.0,54.42,3558.0,5673.0,62.72,0.0,0.0,,32.0,46.0,69.57,0.0,1.0,0.0,130.0,175.0,74.29,0.0,0.0,,8.0,11.0,72.73,1.0,1.0,100.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,16.0,20.0,80.0,14167.0,28890.0,49.04
prediction-online-sme,0.0,0.0,,0.0,0.0,,0.0,1.0,0.0,1.0,1.0,100.0,0.0,0.0,,61.0,113.0,53.98,0.0,0.0,,5114.0,9764.0,52.38,0.0,0.0,,0.0,0.0,,284.0,524.0,54.2,0.0,0.0,,3689.0,7320.0,50.4,0.0,0.0,,346.0,674.0,51.34,3469.0,7053.0,49.18,5.0,5.0,100.0,24.0,50.0,48.0,0.0,1.0,0.0,211.0,402.0,52.49,2.0,2.0,100.0,16.0,28.0,57.14,1.0,1.0,100.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,2.0,3.0,66.67,13225.0,25942.0,50.98
prediction-online-summarized-info,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,28.0,64.0,43.75,0.0,0.0,,0.0,0.0,,1.0,2.0,50.0,0.0,0.0,,17.0,35.0,48.57,0.0,0.0,,4.0,5.0,80.0,18.0,32.0,56.25,0.0,0.0,,0.0,1.0,0.0,0.0,0.0,,1.0,2.0,50.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,69.0,141.0,48.94
prediction-sentence-embedding-bold,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,1.0,2.0,50.0,0.0,0.0,,21.0,33.0,63.64,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,4.0,5.0,80.0,0.0,0.0,,0.0,0.0,,4.0,4.0,100.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,8.0,8.0,100.0,0.0,0.0,,8.0,8.0,100.0,1.0,1.0,100.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,18.0,20.0,90.0,65.0,81.0,80.25
prediction-sentence-embedding-conservative,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,8.0,21.0,38.1,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,9.0,11.0,81.82,0.0,0.0,,0.0,0.0,,21.0,44.0,47.73,0.0,0.0,,0.0,0.0,,0.0,0.0,,1.0,3.0,33.33,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,2.0,3.0,66.67,41.0,82.0,50.0


### List the well performing tools

In [14]:
# Selection thresholds: a tool's overall accuracy must be above 55 (accuracy is
# expressed in percent) and it must have more than 100 predictions.
perf_threshold = 55
samples_threshold = 100

In [15]:
# Boolean masks over tools: overall accuracy strictly above the performance
# threshold, and strictly more predictions than the sample threshold.
well_performing = tools_stats_per_conf[("total", "accuracy")].gt(perf_threshold)
enough_samples = tools_stats_per_conf[("total", "n_pred")].gt(samples_threshold)

# Show only the overall accuracy and prediction count for tools passing both checks.
summary_cols = [("total", "accuracy"), ("total", "n_pred")]
tools_stats_per_conf.loc[well_performing & enough_samples, summary_cols]

Unnamed: 0_level_0,total,total
Unnamed: 0_level_1,accuracy,n_pred
tool,Unnamed: 1_level_2,Unnamed: 2_level_2
claude-prediction-offline,56.02,1760.0
