# Mech Tools Evaluation

### Create the dataset

In [1]:
# Uncomment to build and run the ETL Docker image.
# NOTE(review): presumably this produces the CSV loaded in the next section — confirm.
# !docker build -t etl:0.0.1 . && docker run etl:0.0.1

### Load the dataset

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
# render floats with 2 decimal places in displayed frames
pd.set_option("display.precision", 2)
# never truncate the column list when displaying wide frames
pd.set_option('display.max_columns', None)

In [3]:
# Columns holding identifiers or free text. They are declared as pandas "string"
# dtype at read time so that the very long, numeric-looking `request_id` values
# never go through numeric type inference before they are used for de-duplication.
str_cols = ("id", "currentAnswer", "title", "request_id", "prompt_request", "tool", "nonce", "vote")

dataset = (
    pd.read_csv("./data/dataset.csv", dtype={col: "string" for col in str_cols})
    # keep a single row per request (first occurrence wins)
    .drop_duplicates(subset="request_id")
    # keep only rows whose `error` flag is False; rows with a missing flag are excluded too
    .loc[lambda frame: frame["error"].eq(False)]
    .reset_index(drop=True)
)
dataset.head()



Unnamed: 0,id,currentAnswer,title,request_id,request_block,prompt_request,tool,nonce,deliver_block,error,error_message,prompt_response,p_yes,p_no,confidence,info_utility,vote,win_probability
0,0x0094fa304017d5c2b355790e2976f769ea600492,No,Will the Hisense U8K be considered a top-tier ...,2315482717523248142566209479998403158981026548...,29555552,"With the given question ""Will the Hisense U8K ...",prediction-online,2a54469c-a055-4c9c-a558-51262cd4a787,29557480,False,,,0.5,0.5,0.8,0.6,,0.5
1,0x0094fa304017d5c2b355790e2976f769ea600492,No,Will the Hisense U8K be considered a top-tier ...,3469031602277041443388090267865736162834108248...,29555700,"With the given question ""Will the Hisense U8K ...",prediction-online,01538074-5f78-4e28-8058-47b6d94b682f,29557255,False,,,0.6,0.4,0.8,0.6,Yes,0.6
2,0x0094fa304017d5c2b355790e2976f769ea600492,No,Will the Hisense U8K be considered a top-tier ...,9159320707509526863870444110528895654334237837...,29555970,"Given the information available, and consideri...",prediction-online,099e342e-a41e-4689-be76-3570d8a256cf,29556974,False,,,0.65,0.35,0.8,0.6,Yes,0.65
3,0x0094fa304017d5c2b355790e2976f769ea600492,No,Will the Hisense U8K be considered a top-tier ...,1023057355467695172592840970123356496552194251...,29556083,"With the given question ""Will the Hisense U8K ...",prediction-online,438e9446-0f55-4e5e-b879-c06e93520969,29556905,False,,,0.6,0.4,0.8,0.7,Yes,0.6
4,0x0094fa304017d5c2b355790e2976f769ea600492,No,Will the Hisense U8K be considered a top-tier ...,1049080869663393036021703581760667415975346534...,29556451,"With the given question ""Will the Hisense U8K ...",prediction-online,7209cbcf-030e-4664-956f-a1882c3d35c2,29556491,False,,,0.6,0.4,0.8,0.5,Yes,0.6


In [4]:
# (rows, columns) after de-duplication and error filtering
dataset.shape

(67917, 18)

In [5]:
# summary statistics of the numeric columns
dataset.describe()

Unnamed: 0,request_block,deliver_block,p_yes,p_no,confidence,info_utility,win_probability
count,67900.0,67900.0,67917.0,67917.0,67917.0,67917.0,67917.0
mean,30700000.0,30700000.0,0.48,0.52,0.78,0.56,0.67
std,675000.0,674000.0,0.2,0.2,0.08,0.18,0.1
min,28900000.0,28900000.0,0.0,0.0,0.0,0.0,0.5
25%,30100000.0,30100000.0,0.3,0.4,0.8,0.5,0.6
50%,30600000.0,30600000.0,0.6,0.4,0.8,0.6,0.7
75%,31200000.0,31200000.0,0.6,0.7,0.8,0.6,0.75
max,32000000.0,32000000.0,1.0,1.0,1.0,1.0,1.0


In [6]:
# non-missing count, number of distinct values and most frequent value of the votes
dataset["vote"].describe()

count     62930
unique        2
top         Yes
freq      34688
Name: vote, dtype: object

### Normalize confidences

In [7]:
# distinct raw confidence values, before any filtering or bucketing
dataset["confidence"].unique()

array([0.8 , 0.9 , 0.7 , 0.6 , 0.5 , 0.1 , 0.75, 0.85, 0.3 , 1.  , 0.35,
       0.2 , 0.4 , 0.65, 0.99, 0.95, 0.55, 0.  , 0.62, 0.88, 0.64, 0.81,
       0.68])

In [8]:
# report how many samples fall below the 0.5 confidence cut-off
below_cutoff = dataset["confidence"] < 0.5
print(f"Number of samples with confidence below 0.5: {int(below_cutoff.sum())}")

# keep only the samples at or above the cut-off and renumber the rows
at_or_above_cutoff = dataset["confidence"] >= 0.5
dataset = dataset.loc[at_or_above_cutoff].reset_index(drop=True)

Number of samples with confidence below 0.5: 38


In [9]:
def _to_confidence_bucket(value):
    """Round a confidence to one decimal and fold the 1.0 bucket into 0.9."""
    # Python's built-in round is used deliberately, element by element
    rounded = round(value, 1)
    if rounded == 1.0:
        return 0.9
    return rounded


# bucket confidence into 0.5, 0.6, 0.7, 0.8, 0.9
dataset["confidence"] = dataset["confidence"].apply(_to_confidence_bucket)
dataset["confidence"].unique()

array([0.8, 0.9, 0.7, 0.6, 0.5])

### Check the percentage of wins vs confidence for all the tools

In [10]:
def accuracy(data):
    """Summarise how often `vote` matches `currentAnswer` in `data`.

    Args:
        data: frame with a "currentAnswer" and a "vote" column.

    Returns:
        A Series with three entries:
        - "n_correct": number of rows where the two columns are equal,
        - "n_pred": number of non-missing comparison results,
        - "accuracy": n_correct / n_pred as a percentage, or None when
          n_pred is 0.
    """
    is_correct = data["currentAnswer"] == data["vote"]
    # count() ignores missing comparison results, sum() counts the True ones
    n_pred = is_correct.count()
    n_correct = is_correct.sum()

    if n_pred == 0:
        # nothing to score
        pct = None
    else:
        pct = 0 if n_correct == 0 else n_correct / n_pred * 100

    return pd.Series({"n_correct": n_correct, "n_pred": n_pred, "accuracy": pct})

In [11]:
def acc_per_tool(group, col: str, conf: float):
    """Compute `accuracy` per group, restricted to rows where `col` equals `conf`.

    Args:
        group: grouped frame supporting `.apply` (e.g. the result of `groupby`).
        col: name of the column to filter on.
        conf: value that `col` must equal for a row to be scored.

    Returns:
        Whatever `group.apply` builds from the per-group `accuracy` results.
    """
    def _accuracy_at_value(frame):
        matching_rows = frame[frame[col] == conf]
        return accuracy(matching_rows)

    return group.apply(_accuracy_at_value)

In [12]:
def gen_stats(group, col: str):
    """Build a per-group accuracy table with one column block per value of `col`.

    For every distinct value of `col` (in ascending order) the accuracy of each
    group is computed on the rows carrying that value; a final "total" block
    holds the accuracy of each group over all of its rows.

    Args:
        group: grouped frame supporting `.apply` (e.g. `dataset.groupby("tool")`).
        col: name of the column whose distinct values define the column blocks.

    Returns:
        A frame whose columns are keyed first by "<col>_<value>" / "total" and
        then by the entries produced by `accuracy`.
    """
    # NOTE: the distinct values are read from the notebook-level `dataset`,
    # so `group` is expected to be a grouping of that same frame.
    stats = {f"{col}_{prob}": acc_per_tool(group, col, prob) for prob in sorted(dataset[col].unique())}
    # use the `group` argument, not the global `tools_group`, so the totals
    # always describe the same grouping as the per-value blocks
    stats["total"] = group.apply(accuracy)
    return pd.concat(stats.values(), axis=1, keys=stats.keys())

In [13]:
# group the cleaned samples by tool and compute accuracy per confidence bucket
tools_group = dataset.groupby("tool")
tools_stats_per_conf = gen_stats(tools_group, "confidence")
display(tools_stats_per_conf)

Unnamed: 0_level_0,confidence_0.5,confidence_0.5,confidence_0.5,confidence_0.6,confidence_0.6,confidence_0.6,confidence_0.7,confidence_0.7,confidence_0.7,confidence_0.8,confidence_0.8,confidence_0.8,confidence_0.9,confidence_0.9,confidence_0.9,total,total,total
Unnamed: 0_level_1,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy
tool,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
claude-prediction-offline,597.0,1017.0,58.7,50.0,91.0,54.95,170.0,390.0,43.59,57.0,104.0,54.81,77.0,95.0,81.05,951.0,1697.0,56.04
claude-prediction-online,186.0,384.0,48.44,248.0,432.0,57.41,454.0,811.0,55.98,63.0,166.0,37.95,7.0,12.0,58.33,958.0,1805.0,53.07
prediction-offline,0.0,0.0,,19.0,32.0,59.38,162.0,291.0,55.67,689.0,1596.0,43.17,114.0,234.0,48.72,984.0,2153.0,45.7
prediction-offline-sme,0.0,0.0,,106.0,183.0,57.92,343.0,505.0,67.92,893.0,1873.0,47.68,97.0,207.0,46.86,1439.0,2768.0,51.99
prediction-online,0.0,0.0,,136.0,194.0,70.1,2075.0,3130.0,66.29,10353.0,22531.0,45.95,1676.0,3159.0,53.05,14240.0,29014.0,49.08
prediction-online-sme,1.0,3.0,33.33,313.0,647.0,48.38,2574.0,5249.0,49.04,9090.0,17569.0,51.74,981.0,1702.0,57.64,12959.0,25170.0,51.49
prediction-online-summarized-info,0.0,0.0,,1.0,2.0,50.0,7.0,13.0,53.85,47.0,103.0,45.63,14.0,24.0,58.33,69.0,142.0,48.59
prediction-sentence-embedding-bold,0.0,0.0,,0.0,0.0,,22.0,35.0,62.86,16.0,17.0,94.12,27.0,29.0,93.1,65.0,81.0,80.25
prediction-sentence-embedding-conservative,0.0,0.0,,0.0,0.0,,0.0,0.0,,18.0,35.0,51.43,23.0,47.0,48.94,41.0,82.0,50.0


### Check the percentage of wins vs probability for all the tools

In [14]:
# same per-tool breakdown, with one column block per distinct win_probability value
tools_stats_per_prob = gen_stats(tools_group, "win_probability")
tools_stats_per_prob

Unnamed: 0_level_0,win_probability_0.5,win_probability_0.5,win_probability_0.5,win_probability_0.505,win_probability_0.505,win_probability_0.505,win_probability_0.525,win_probability_0.525,win_probability_0.525,win_probability_0.53,win_probability_0.53,win_probability_0.53,win_probability_0.54,win_probability_0.54,win_probability_0.54,win_probability_0.55,win_probability_0.55,win_probability_0.55,win_probability_0.58,win_probability_0.58,win_probability_0.58,win_probability_0.6,win_probability_0.6,win_probability_0.6,win_probability_0.62,win_probability_0.62,win_probability_0.62,win_probability_0.63,win_probability_0.63,win_probability_0.63,win_probability_0.65,win_probability_0.65,win_probability_0.65,win_probability_0.68,win_probability_0.68,win_probability_0.68,win_probability_0.7,win_probability_0.7,win_probability_0.7,win_probability_0.73,win_probability_0.73,win_probability_0.73,win_probability_0.75,win_probability_0.75,win_probability_0.75,win_probability_0.8,win_probability_0.8,win_probability_0.8,win_probability_0.83,win_probability_0.83,win_probability_0.83,win_probability_0.85,win_probability_0.85,win_probability_0.85,win_probability_0.87,win_probability_0.87,win_probability_0.87,win_probability_0.9,win_probability_0.9,win_probability_0.9,win_probability_0.95,win_probability_0.95,win_probability_0.95,win_probability_0.99,win_probability_0.99,win_probability_0.99,win_probability_0.998,win_probability_0.998,win_probability_0.998,win_probability_0.999,win_probability_0.999,win_probability_0.999,win_probability_0.9999,win_probability_0.9999,win_probability_0.9999,win_probability_0.999999,win_probability_0.999999,win_probability_0.999999,win_probability_1.0,win_probability_1.0,win_probability_1.0,total,total,total
Unnamed: 0_level_1,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy
tool,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2,Unnamed: 49_level_2,Unnamed: 50_level_2,Unnamed: 51_level_2,Unnamed: 52_level_2,Unnamed: 53_level_2,Unnamed: 54_level_2,Unnamed: 55_level_2,Unnamed: 56_level_2,Unnamed: 57_level_2,Unnamed: 58_level_2,Unnamed: 59_level_2,Unnamed: 60_level_2,Unnamed: 61_level_2,Unnamed: 62_level_2,Unnamed: 63_level_2,Unnamed: 64_level_2,Unnamed: 65_level_2,Unnamed: 66_level_2,Unnamed: 67_level_2,Unnamed: 68_level_2,Unnamed: 69_level_2,Unnamed: 70_level_2,Unnamed: 71_level_2,Unnamed: 72_level_2,Unnamed: 73_level_2,Unnamed: 74_level_2,Unnamed: 75_level_2,Unnamed: 76_level_2,Unnamed: 77_level_2,Unnamed: 78_level_2,Unnamed: 79_level_2,Unnamed: 80_level_2,Unnamed: 81_level_2,Unnamed: 82_level_2,Unnamed: 83_level_2,Unnamed: 84_level_2
claude-prediction-offline,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,5.0,0.0,0.0,0.0,,287.0,627.0,45.77,0.0,1.0,0.0,0.0,0.0,,4.0,4.0,100.0,0.0,0.0,,131.0,254.0,51.57,0.0,0.0,,0.0,1.0,0.0,290.0,510.0,56.86,0.0,0.0,,3.0,4.0,75.0,0.0,0.0,,139.0,168.0,82.74,3.0,3.0,100.0,84.0,108.0,77.78,0.0,0.0,,0.0,1.0,0.0,1.0,1.0,100.0,1.0,1.0,100.0,8.0,9.0,88.89,951.0,1697.0,56.04
claude-prediction-online,0.0,0.0,,0.0,1.0,0.0,0.0,0.0,,2.0,2.0,100.0,0.0,1.0,0.0,11.0,23.0,47.83,2.0,2.0,100.0,451.0,882.0,51.13,0.0,0.0,,0.0,1.0,0.0,27.0,55.0,49.09,1.0,1.0,100.0,140.0,362.0,38.67,1.0,1.0,100.0,3.0,12.0,25.0,288.0,394.0,73.1,0.0,0.0,,5.0,11.0,45.45,0.0,0.0,,21.0,50.0,42.0,2.0,2.0,100.0,4.0,4.0,100.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,1.0,0.0,958.0,1805.0,53.07
prediction-offline,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,3.0,6.0,50.0,0.0,0.0,,358.0,904.0,39.6,0.0,0.0,,0.0,0.0,,15.0,35.0,42.86,0.0,0.0,,288.0,603.0,47.76,0.0,0.0,,21.0,54.0,38.89,283.0,517.0,54.74,0.0,0.0,,2.0,4.0,50.0,0.0,0.0,,9.0,25.0,36.0,1.0,1.0,100.0,1.0,1.0,100.0,1.0,1.0,100.0,1.0,1.0,100.0,0.0,0.0,,0.0,0.0,,1.0,1.0,100.0,984.0,2153.0,45.7
prediction-offline-sme,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,4.0,5.0,80.0,0.0,0.0,,417.0,960.0,43.44,0.0,0.0,,0.0,0.0,,13.0,32.0,40.62,0.0,0.0,,442.0,839.0,52.68,0.0,0.0,,46.0,71.0,64.79,469.0,779.0,60.21,0.0,0.0,,2.0,6.0,33.33,0.0,0.0,,32.0,61.0,52.46,5.0,5.0,100.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,9.0,10.0,90.0,1439.0,2768.0,51.99
prediction-online,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,74.0,139.0,53.24,0.0,0.0,,5238.0,12800.0,40.92,0.0,0.0,,0.0,0.0,,418.0,900.0,46.44,0.0,0.0,,4240.0,8287.0,51.16,0.0,0.0,,525.0,962.0,54.57,3560.0,5675.0,62.73,0.0,0.0,,31.0,44.0,70.45,0.0,1.0,0.0,130.0,175.0,74.29,7.0,10.0,70.0,1.0,1.0,100.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,16.0,20.0,80.0,14240.0,29014.0,49.08
prediction-online-sme,0.0,0.0,,0.0,0.0,,0.0,1.0,0.0,1.0,1.0,100.0,0.0,0.0,,56.0,104.0,53.85,0.0,0.0,,4943.0,9441.0,52.36,0.0,0.0,,0.0,0.0,,271.0,498.0,54.42,0.0,0.0,,3650.0,7115.0,51.3,0.0,0.0,,339.0,652.0,51.99,3444.0,6888.0,50.0,5.0,5.0,100.0,24.0,48.0,50.0,0.0,1.0,0.0,207.0,387.0,53.49,16.0,25.0,64.0,1.0,1.0,100.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,2.0,3.0,66.67,12959.0,25170.0,51.49
prediction-online-summarized-info,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,28.0,64.0,43.75,0.0,0.0,,0.0,0.0,,1.0,2.0,50.0,0.0,0.0,,17.0,36.0,47.22,0.0,0.0,,4.0,5.0,80.0,18.0,32.0,56.25,0.0,0.0,,0.0,1.0,0.0,0.0,0.0,,1.0,2.0,50.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,69.0,142.0,48.59
prediction-sentence-embedding-bold,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,1.0,2.0,50.0,0.0,0.0,,21.0,33.0,63.64,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,4.0,5.0,80.0,0.0,0.0,,0.0,0.0,,4.0,4.0,100.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,8.0,8.0,100.0,8.0,8.0,100.0,1.0,1.0,100.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,18.0,20.0,90.0,65.0,81.0,80.25
prediction-sentence-embedding-conservative,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,8.0,21.0,38.1,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,9.0,11.0,81.82,0.0,0.0,,0.0,0.0,,21.0,44.0,47.73,0.0,0.0,,0.0,0.0,,0.0,0.0,,1.0,3.0,33.33,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,2.0,3.0,66.67,41.0,82.0,50.0


### List the well-performing tools

In [15]:
# Selection thresholds for a "well-performing" tool. Both are applied with a
# strict ">" comparison in the next cell.
# minimum overall accuracy, in percent
perf_threshold = 55
# minimum number of predictions
samples_threshold = 100

In [16]:
# columns of the overall ("total") block used for the selection
accuracy_col = ("total", "accuracy")
n_pred_col = ("total", "n_pred")

# a tool qualifies when it is strictly above both thresholds
well_performing = tools_stats_per_conf[accuracy_col].gt(perf_threshold)
enough_samples = tools_stats_per_conf[n_pred_col].gt(samples_threshold)
tools_stats_per_conf.loc[well_performing & enough_samples, [accuracy_col, n_pred_col]]

Unnamed: 0_level_0,total,total
Unnamed: 0_level_1,accuracy,n_pred
tool,Unnamed: 1_level_2,Unnamed: 2_level_2
claude-prediction-offline,56.04,1697.0
