# Mech Tools Evaluation

### Create the dataset

In [None]:
# Build a Docker image tagged etl:0.0.1 from the current directory and, only if the build succeeds, run it.
# NOTE(review): presumably this run is what produces the dataset.csv loaded below — confirm how the file leaves the container.
!docker build -t etl:0.0.1 . && docker run etl:0.0.1

### Load the dataset

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
# Display settings: never truncate wide frames, and render floats with two decimals.
pd.set_option("display.max_columns", None)
pd.set_option("display.precision", 2)

In [2]:
# Columns cast to pandas' "string" dtype right after loading.
str_cols = ("id", "currentAnswer", "title", "request_id", "prompt", "tool", "nonce", "vote")
# Read the CSV and convert all of the columns above in a single astype call.
dataset = pd.read_csv("dataset.csv").astype({name: "string" for name in str_cols})
dataset.head()

Unnamed: 0,id,currentAnswer,title,request_id,request_block,prompt,tool,nonce,deliver_block,p_yes,p_no,confidence,info_utility,vote,win_probability
0,0x0094fa304017d5c2b355790e2976f769ea600492,No,Will the Hisense U8K be considered a top-tier ...,1429730407779530824523722231071959771311408049...,29544655,"With the given question ""Will the Hisense U8K ...",prediction-online,c6366b3f-eff5-4533-8dd9-d653b281b29d,29577379,0.6,0.4,0.8,0.5,Yes,0.6
1,0x0094fa304017d5c2b355790e2976f769ea600492,No,Will the Hisense U8K be considered a top-tier ...,1695055931594747475916883029584567955775422500...,29545478,"With the given question ""Will the Hisense U8K ...",prediction-online,1eed33a5-a3f0-41c4-beae-23e9022ffe22,29576660,0.6,0.4,0.8,0.7,Yes,0.6
2,0x0094fa304017d5c2b355790e2976f769ea600492,No,Will the Hisense U8K be considered a top-tier ...,5972945302788386668720465960403202339977906500...,29546230,"With the given question ""Will the Hisense U8K ...",prediction-online,dd376ef9-eb2c-4d9f-8a5a-cf9ae8deb0b3,29576574,0.6,0.4,0.8,0.7,Yes,0.6
3,0x0094fa304017d5c2b355790e2976f769ea600492,No,Will the Hisense U8K be considered a top-tier ...,1043402953919313937539182160739114840263108832...,29546982,"With the given question ""Will the Hisense U8K ...",prediction-online,91096f15-5e3b-4bf1-8178-f17f1efcf639,29576448,0.7,0.3,0.8,0.6,Yes,0.7
4,0x0094fa304017d5c2b355790e2976f769ea600492,No,Will the Hisense U8K be considered a top-tier ...,9433232780766388309643050548812272093999565778...,29547744,"With the given question ""Will the Hisense U8K ...",prediction-online,92321968-7888-4877-b33f-22fa4755fbc2,29576351,0.65,0.35,0.9,0.8,Yes,0.65


In [3]:
# (rows, columns) of the loaded frame.
dataset.shape

(15082, 15)

In [4]:
# Summary statistics for the numeric columns.
dataset.describe()

Unnamed: 0,request_block,deliver_block,p_yes,p_no,confidence,info_utility,win_probability
count,15100.0,15100.0,15082.0,15082.0,15082.0,15082.0,15082.0
mean,30300000.0,30300000.0,0.49,0.51,0.78,0.57,0.68
std,504000.0,503000.0,0.2,0.2,0.08,0.18,0.09
min,29000000.0,29000000.0,0.0,0.0,0.5,0.0,0.53
25%,29900000.0,29900000.0,0.3,0.4,0.8,0.5,0.6
50%,30300000.0,30300000.0,0.6,0.4,0.8,0.6,0.7
75%,30700000.0,30700000.0,0.6,0.7,0.8,0.7,0.75
max,31100000.0,31100000.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# Summary of the "vote" column: count, number of distinct values, most frequent value and its frequency.
dataset["vote"].describe()

count     15082
unique        2
top         Yes
freq       8985
Name: vote, dtype: object

### Normalize confidences

In [6]:
# Distinct values currently present in the "confidence" column.
dataset["confidence"].unique()

array([0.8 , 0.9 , 0.7 , 0.5 , 0.6 , 0.85, 0.75, 1.  , 0.99, 0.95])

In [7]:
# Collapse fine-grained confidence values onto a coarser grid (modifies `dataset` in place).
# Everything in [0.9, 1) becomes 0.9; exactly 1.0 stays a separate level.
dataset.loc[dataset["confidence"].ge(0.9) & dataset["confidence"].lt(1), "confidence"] = 0.9
# The remaining in-between levels are moved down to the next lower tenth.
for fine_level, coarse_level in ((0.85, 0.8), (0.75, 0.7)):
    dataset.loc[dataset["confidence"] == fine_level, "confidence"] = coarse_level
dataset["confidence"].unique()

array([0.8, 0.9, 0.7, 0.5, 0.6, 1. ])

### Check the percentage of wins vs confidence for all the tools

In [8]:
def accuracy(data):
    """Summarise how often ``vote`` matches ``currentAnswer`` in ``data``.

    Args:
        data: frame with "currentAnswer" and "vote" columns.

    Returns:
        A Series with three entries:
        "n_correct" - number of rows where the two columns are equal,
        "n_pred"    - number of comparison results as reported by ``Series.count``,
        "accuracy"  - n_correct / n_pred as a percentage; ``None`` when there are
                      no predictions and ``0`` when none of them is correct.
    """
    is_match = data["currentAnswer"] == data["vote"]
    hits = is_match.sum()
    total = is_match.count()

    if not total:
        # Nothing to score: leave the percentage undefined instead of dividing by zero.
        pct = None
    else:
        pct = hits / total * 100 if hits else 0

    return pd.Series({"n_correct": hits, "n_pred": total, "accuracy": pct})

In [9]:
def acc_per_tool(group, col: str, conf: float):
    """Apply ``accuracy`` to each group, restricted to rows where ``col`` equals ``conf``.

    Args:
        group: grouped frame; each group is filtered and scored independently.
        col: name of the column to filter on.
        conf: value that ``col`` must equal exactly for a row to be scored.

    Returns:
        Whatever ``group.apply`` produces for the per-group ``accuracy`` Series.
    """
    def _accuracy_at_level(frame):
        # Keep only this group's rows that sit exactly at the requested level.
        rows_at_level = frame[frame[col] == conf]
        return accuracy(rows_at_level)

    return group.apply(_accuracy_at_level)

In [10]:
def gen_stats(group, col: str):
    """Build an accuracy table with one block of columns per distinct value of ``col``.

    For every distinct value of ``dataset[col]`` (in ascending order) the
    per-group accuracy at that value is computed with ``acc_per_tool``; a final
    "total" block holds the accuracy of each group over all of its rows.

    Args:
        group: grouped frame to score.
        col: name of the column whose distinct values define the blocks.

    Returns:
        A DataFrame whose columns are keyed first by "<col>_<value>" (plus
        "total") and then by the entries returned from ``accuracy``.
    """
    # NOTE(review): the levels are still read from the notebook-level `dataset`,
    # so `group` is expected to be a grouping of that same frame.
    stats = {f"{col}_{prob}": acc_per_tool(group, col, prob) for prob in sorted(dataset[col].unique())}
    # Totals must come from the `group` argument, not from a notebook-level variable,
    # so the function works for any grouping and does not rely on hidden kernel state.
    stats["total"] = group.apply(accuracy)
    return pd.concat(stats.values(), axis=1, keys=stats.keys())

In [11]:
# Group the predictions by tool, then score every tool at each confidence level.
tools_group = dataset.groupby(by="tool")
tools_stats_per_conf = gen_stats(group=tools_group, col="confidence")
display(tools_stats_per_conf)

Unnamed: 0_level_0,confidence_0.5,confidence_0.5,confidence_0.5,confidence_0.6,confidence_0.6,confidence_0.6,confidence_0.7,confidence_0.7,confidence_0.7,confidence_0.8,confidence_0.8,confidence_0.8,confidence_0.9,confidence_0.9,confidence_0.9,confidence_1.0,confidence_1.0,confidence_1.0,total,total,total
Unnamed: 0_level_1,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy
tool,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
claude-prediction-offline,297.0,434.0,68.43,19.0,43.0,44.19,81.0,191.0,42.41,15.0,28.0,53.57,28.0,34.0,82.35,28.0,35.0,80.0,468.0,765.0,61.18
claude-prediction-online,63.0,83.0,75.9,96.0,128.0,75.0,188.0,331.0,56.8,21.0,47.0,44.68,1.0,4.0,25.0,0.0,0.0,,369.0,593.0,62.23
prediction-offline,0.0,0.0,,9.0,9.0,100.0,72.0,92.0,78.26,219.0,558.0,39.25,40.0,78.0,51.28,0.0,0.0,,340.0,737.0,46.13
prediction-offline-sme,0.0,0.0,,31.0,39.0,79.49,133.0,160.0,83.12,278.0,620.0,44.84,38.0,74.0,51.35,0.0,1.0,0.0,480.0,894.0,53.69
prediction-online,0.0,0.0,,50.0,56.0,89.29,774.0,998.0,77.56,3204.0,7271.0,44.07,586.0,1101.0,53.22,0.0,0.0,,4614.0,9426.0,48.95
prediction-online-sme,0.0,0.0,,47.0,62.0,75.81,350.0,499.0,70.14,932.0,1919.0,48.57,104.0,180.0,57.78,0.0,0.0,,1433.0,2660.0,53.87
prediction-sentence-embedding-bold,0.0,0.0,,0.0,0.0,,0.0,1.0,0.0,0.0,1.0,0.0,3.0,3.0,100.0,0.0,0.0,,3.0,5.0,60.0
prediction-sentence-embedding-conservative,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,2.0,2.0,100.0,0.0,0.0,,2.0,2.0,100.0


### Check the percentage of wins vs probability for all the tools

In [12]:
# Same per-tool breakdown, keyed by each distinct win probability.
tools_stats_per_prob = gen_stats(group=tools_group, col="win_probability")
tools_stats_per_prob

Unnamed: 0_level_0,win_probability_0.525,win_probability_0.525,win_probability_0.525,win_probability_0.55,win_probability_0.55,win_probability_0.55,win_probability_0.6,win_probability_0.6,win_probability_0.6,win_probability_0.65,win_probability_0.65,win_probability_0.65,win_probability_0.7,win_probability_0.7,win_probability_0.7,win_probability_0.75,win_probability_0.75,win_probability_0.75,win_probability_0.8,win_probability_0.8,win_probability_0.8,win_probability_0.85,win_probability_0.85,win_probability_0.85,win_probability_0.87,win_probability_0.87,win_probability_0.87,win_probability_0.9,win_probability_0.9,win_probability_0.9,win_probability_0.95,win_probability_0.95,win_probability_0.95,win_probability_0.99,win_probability_0.99,win_probability_0.99,win_probability_0.998,win_probability_0.998,win_probability_0.998,win_probability_0.999,win_probability_0.999,win_probability_0.999,win_probability_0.9999,win_probability_0.9999,win_probability_0.9999,win_probability_0.999999,win_probability_0.999999,win_probability_0.999999,win_probability_1.0,win_probability_1.0,win_probability_1.0,total,total,total
Unnamed: 0_level_1,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy
tool,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2,Unnamed: 49_level_2,Unnamed: 50_level_2,Unnamed: 51_level_2,Unnamed: 52_level_2,Unnamed: 53_level_2,Unnamed: 54_level_2
claude-prediction-offline,0.0,0.0,,0.0,2.0,0.0,140.0,303.0,46.2,2.0,2.0,100.0,71.0,128.0,55.47,0.0,0.0,,133.0,192.0,69.27,1.0,1.0,100.0,0.0,0.0,,63.0,67.0,94.03,1.0,1.0,100.0,49.0,60.0,81.67,0.0,0.0,,0.0,0.0,,1.0,1.0,100.0,1.0,1.0,100.0,6.0,7.0,85.71,468.0,765.0,61.18
claude-prediction-online,0.0,0.0,,1.0,1.0,100.0,176.0,323.0,54.49,9.0,12.0,75.0,38.0,83.0,45.78,2.0,3.0,66.67,125.0,151.0,82.78,4.0,4.0,100.0,0.0,0.0,,9.0,10.0,90.0,0.0,0.0,,5.0,5.0,100.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,1.0,0.0,369.0,593.0,62.23
prediction-offline,0.0,0.0,,1.0,3.0,33.33,104.0,309.0,33.66,5.0,16.0,31.25,96.0,222.0,43.24,6.0,14.0,42.86,118.0,162.0,72.84,0.0,0.0,,0.0,0.0,,4.0,5.0,80.0,1.0,1.0,100.0,1.0,1.0,100.0,1.0,1.0,100.0,1.0,1.0,100.0,2.0,2.0,100.0,0.0,0.0,,0.0,0.0,,340.0,737.0,46.13
prediction-offline-sme,0.0,0.0,,0.0,1.0,0.0,112.0,329.0,34.04,5.0,12.0,41.67,151.0,267.0,56.55,13.0,26.0,50.0,177.0,234.0,75.64,2.0,2.0,100.0,0.0,0.0,,13.0,15.0,86.67,4.0,4.0,100.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,3.0,4.0,75.0,480.0,894.0,53.69
prediction-online,0.0,0.0,,31.0,50.0,62.0,1502.0,4124.0,36.42,125.0,298.0,41.95,1427.0,2717.0,52.52,178.0,353.0,50.42,1275.0,1790.0,71.23,11.0,14.0,78.57,0.0,1.0,0.0,57.0,68.0,83.82,5.0,7.0,71.43,1.0,1.0,100.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,2.0,3.0,66.67,4614.0,9426.0,48.95
prediction-online-sme,0.0,1.0,0.0,8.0,14.0,57.14,492.0,1111.0,44.28,28.0,54.0,51.85,404.0,735.0,54.97,41.0,66.0,62.12,429.0,636.0,67.45,3.0,7.0,42.86,0.0,1.0,0.0,26.0,33.0,78.79,2.0,2.0,100.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,1433.0,2660.0,53.87
prediction-sentence-embedding-bold,0.0,0.0,,0.0,0.0,,0.0,1.0,0.0,0.0,0.0,,0.0,1.0,0.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,2.0,2.0,100.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,1.0,1.0,100.0,3.0,5.0,60.0
prediction-sentence-embedding-conservative,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,2.0,2.0,100.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,2.0,2.0,100.0


### List the well-performing tools

In [13]:
# Thresholds for selecting well-performing tools; the filter below applies both as strict lower bounds (>).
#   perf_threshold    - overall accuracy, in percent
#   samples_threshold - total number of predictions
perf_threshold = 55
samples_threshold = 100

In [14]:
# Boolean masks over the tools, built from the "total" block of the per-confidence table.
well_performing = tools_stats_per_conf[("total", "accuracy")].gt(perf_threshold)
enough_samples = tools_stats_per_conf[("total", "n_pred")].gt(samples_threshold)
# Show only the tools passing both checks, with their overall accuracy and sample count.
tools_stats_per_conf.loc[enough_samples & well_performing, [("total", "accuracy"), ("total", "n_pred")]]

Unnamed: 0_level_0,total,total
Unnamed: 0_level_1,accuracy,n_pred
tool,Unnamed: 1_level_2,Unnamed: 2_level_2
claude-prediction-offline,61.18,765.0
claude-prediction-online,62.23,593.0
