# Mech Tools Evaluation

### Create the dataset

In [None]:
# Build the image tagged `etl:0.0.1` from the current directory, then run it.
# NOTE(review): no volume is mounted and `--rm` is not passed, so anything the
# container writes stays inside it — confirm how `dataset.csv` reaches this notebook.
!docker build -t etl:0.0.1 . && docker run etl:0.0.1

### Load the dataset

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
# Render floats with two decimal places in displayed frames.
pd.set_option("display.precision", 2)
# Show every column of wide frames instead of truncating them.
pd.set_option('display.max_columns', None)

In [2]:
# Columns that must be handled as text (identifiers, free text and Yes/No labels).
str_cols = ("id", "currentAnswer", "title", "requestId", "prompt", "tool", "nonce", "vote")
# Declare the string dtype at read time so these columns never go through
# pandas' numeric type inference before being turned into strings.
dataset = pd.read_csv("dataset.csv", dtype={col: "string" for col in str_cols})
dataset.head()

Unnamed: 0,id,currentAnswer,title,requestId,prompt,tool,nonce,p_yes,p_no,confidence,info_utility,vote,win_probability
0,0x0094fa304017d5c2b355790e2976f769ea600492,No,Will the Hisense U8K be considered a top-tier ...,1429730407779530824523722231071959771311408049...,"With the given question ""Will the Hisense U8K ...",prediction-online,c6366b3f-eff5-4533-8dd9-d653b281b29d,0.6,0.4,0.8,0.5,Yes,0.6
1,0x0094fa304017d5c2b355790e2976f769ea600492,No,Will the Hisense U8K be considered a top-tier ...,1695055931594747475916883029584567955775422500...,"With the given question ""Will the Hisense U8K ...",prediction-online,1eed33a5-a3f0-41c4-beae-23e9022ffe22,0.6,0.4,0.8,0.7,Yes,0.6
2,0x0094fa304017d5c2b355790e2976f769ea600492,No,Will the Hisense U8K be considered a top-tier ...,5972945302788386668720465960403202339977906500...,"With the given question ""Will the Hisense U8K ...",prediction-online,dd376ef9-eb2c-4d9f-8a5a-cf9ae8deb0b3,0.6,0.4,0.8,0.7,Yes,0.6
3,0x0094fa304017d5c2b355790e2976f769ea600492,No,Will the Hisense U8K be considered a top-tier ...,1043402953919313937539182160739114840263108832...,"With the given question ""Will the Hisense U8K ...",prediction-online,91096f15-5e3b-4bf1-8178-f17f1efcf639,0.7,0.3,0.8,0.6,Yes,0.7
4,0x0094fa304017d5c2b355790e2976f769ea600492,No,Will the Hisense U8K be considered a top-tier ...,9433232780766388309643050548812272093999565778...,"With the given question ""Will the Hisense U8K ...",prediction-online,92321968-7888-4877-b33f-22fa4755fbc2,0.65,0.35,0.9,0.8,Yes,0.65


In [3]:
# (rows, columns) of the loaded dataset.
dataset.shape

(6987, 13)

In [4]:
# Summary statistics of the numeric columns.
dataset.describe()

Unnamed: 0,p_yes,p_no,confidence,info_utility,win_probability
count,6987.0,6987.0,6987.0,6987.0,6987.0
mean,0.49,0.51,0.79,0.58,0.69
std,0.21,0.21,0.08,0.18,0.09
min,0.0,0.0,0.5,0.0,0.53
25%,0.3,0.35,0.8,0.5,0.6
50%,0.6,0.4,0.8,0.6,0.7
75%,0.65,0.7,0.8,0.7,0.8
max,1.0,1.0,1.0,1.0,1.0


In [5]:
# Summary of the `vote` column: count, distinct values, most frequent value and its frequency.
dataset["vote"].describe()

count     6987
unique       2
top        Yes
freq      4123
Name: vote, dtype: object

### Normalize confidences

In [6]:
# Distinct confidence values present before they are bucketed.
dataset["confidence"].unique()

array([0.8 , 0.9 , 0.7 , 0.85, 0.6 , 0.5 , 1.  , 0.75, 0.99, 0.95])

In [7]:
# Collapse the confidence values into coarser buckets:
#   [0.9, 1) -> 0.9, 0.85 -> 0.8, 0.75 -> 0.7; every other value is kept as is.
dataset["confidence"] = (
    dataset["confidence"]
    .mask(lambda values: values.ge(0.9) & values.lt(1), 0.9)
    .replace({0.85: 0.8, 0.75: 0.7})
)
dataset["confidence"].unique()

array([0.8, 0.9, 0.7, 0.6, 0.5, 1. ])

### Check the percentage of wins vs confidence for all the tools

In [8]:
def accuracy(data):
    """Count how many rows have `vote` equal to `currentAnswer`.

    Args:
        data: DataFrame with `currentAnswer` and `vote` columns.

    Returns:
        A `pd.Series` with three entries:
        `n_correct` (rows where both columns are equal),
        `n_pred` (non-null comparison results) and
        `accuracy` (`n_correct / n_pred` as a percentage, or `None`
        when there is nothing to compare).
    """
    is_correct = data["currentAnswer"] == data["vote"]
    n_pred = is_correct.count()
    n_correct = is_correct.sum()

    if n_pred == 0:
        # Nothing to compare: the accuracy is undefined rather than zero.
        pct = None
    else:
        pct = n_correct / n_pred * 100 if n_correct != 0 else 0

    return pd.Series({"n_correct": n_correct, "n_pred": n_pred, "accuracy": pct})

In [9]:
def acc_per_tool(group, col: str, conf: float):
    """Compute `accuracy` for each group, restricted to rows where `col == conf`.

    Args:
        group: groupby object whose groups are DataFrames.
        col: column to filter on.
        conf: value that `col` must equal for a row to be kept.

    Returns:
        The result of `group.apply`, i.e. one `accuracy` result per group.
    """
    def _accuracy_for_value(frame):
        selected_rows = frame[frame[col] == conf]
        return accuracy(selected_rows)

    return group.apply(_accuracy_for_value)

In [10]:
def gen_stats(group, col: str):
    """Build a per-group accuracy table, broken down by each distinct value of `col`.

    For every distinct value of `dataset[col]` (sorted ascending), the
    `accuracy` statistics of each group are computed on the rows holding that
    value. A final "total" block holds the statistics over all rows of each group.

    Args:
        group: groupby object to summarise (e.g. `dataset.groupby("tool")`).
        col: column whose distinct values define the breakdown.

    Returns:
        A DataFrame with one row per group and two-level columns: the first
        level is `"<col>_<value>"` or `"total"`, the second level holds the
        `n_correct`, `n_pred` and `accuracy` fields returned by `accuracy`.
    """
    # NOTE: the breakdown values are read from the notebook-level `dataset`,
    # so `group` is expected to have been built from that same frame.
    stats = {f"{col}_{prob}": acc_per_tool(group, col, prob) for prob in sorted(dataset[col].unique())}
    # Totals must come from the groupby that was passed in, not from a global.
    stats["total"] = group.apply(accuracy)
    return pd.concat(stats.values(), axis=1, keys=stats.keys())

In [11]:
# Group the rows by tool and compute the accuracy of each tool per confidence value.
tools_group = dataset.groupby("tool")
tools_stats_per_conf = gen_stats(tools_group, "confidence")
display(tools_stats_per_conf)

Unnamed: 0_level_0,confidence_0.5,confidence_0.5,confidence_0.5,confidence_0.6,confidence_0.6,confidence_0.6,confidence_0.7,confidence_0.7,confidence_0.7,confidence_0.8,confidence_0.8,confidence_0.8,confidence_0.9,confidence_0.9,confidence_0.9,confidence_1.0,confidence_1.0,confidence_1.0,total,total,total
Unnamed: 0_level_1,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy
tool,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
claude-prediction-offline,81.0,143.0,56.64,12.0,19.0,63.16,34.0,64.0,53.12,30.0,44.0,68.18,30.0,37.0,81.08,32.0,40.0,80.0,219.0,347.0,63.11
claude-prediction-online,26.0,37.0,70.27,60.0,88.0,68.18,85.0,144.0,59.03,11.0,26.0,42.31,1.0,1.0,100.0,0.0,0.0,,183.0,296.0,61.82
prediction-offline,0.0,0.0,,0.0,0.0,,14.0,19.0,73.68,32.0,75.0,42.67,5.0,11.0,45.45,0.0,0.0,,51.0,105.0,48.57
prediction-offline-sme,0.0,0.0,,12.0,16.0,75.0,72.0,87.0,82.76,131.0,296.0,44.26,13.0,32.0,40.62,1.0,1.0,100.0,229.0,432.0,53.01
prediction-online,0.0,0.0,,33.0,40.0,82.5,434.0,561.0,77.36,1651.0,4118.0,40.09,372.0,748.0,49.73,2.0,3.0,66.67,2492.0,5470.0,45.56
prediction-online-sme,1.0,1.0,100.0,2.0,2.0,100.0,36.0,47.0,76.6,114.0,209.0,54.55,5.0,23.0,21.74,0.0,0.0,,158.0,282.0,56.03
prediction-sentence-embedding-bold,0.0,0.0,,0.0,0.0,,11.0,17.0,64.71,8.0,8.0,100.0,11.0,11.0,100.0,4.0,4.0,100.0,34.0,40.0,85.0
prediction-sentence-embedding-conservative,0.0,0.0,,0.0,0.0,,0.0,0.0,,2.0,6.0,33.33,4.0,9.0,44.44,0.0,0.0,,6.0,15.0,40.0


### Check the percentage of wins vs probability for all the tools

In [12]:
# Same breakdown as above, keyed by the distinct `win_probability` values.
tools_stats_per_prob = gen_stats(tools_group, "win_probability")
tools_stats_per_prob

Unnamed: 0_level_0,win_probability_0.525,win_probability_0.525,win_probability_0.525,win_probability_0.55,win_probability_0.55,win_probability_0.55,win_probability_0.6,win_probability_0.6,win_probability_0.6,win_probability_0.65,win_probability_0.65,win_probability_0.65,win_probability_0.7,win_probability_0.7,win_probability_0.7,win_probability_0.75,win_probability_0.75,win_probability_0.75,win_probability_0.8,win_probability_0.8,win_probability_0.8,win_probability_0.85,win_probability_0.85,win_probability_0.85,win_probability_0.9,win_probability_0.9,win_probability_0.9,win_probability_0.95,win_probability_0.95,win_probability_0.95,win_probability_0.99,win_probability_0.99,win_probability_0.99,win_probability_0.999999,win_probability_0.999999,win_probability_0.999999,win_probability_1.0,win_probability_1.0,win_probability_1.0,total,total,total
Unnamed: 0_level_1,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy,n_correct,n_pred,accuracy
tool,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2
claude-prediction-offline,0.0,0.0,,0.0,0.0,,44.0,76.0,57.89,0.0,0.0,,9.0,32.0,28.12,0.0,0.0,,54.0,102.0,52.94,2.0,3.0,66.67,37.0,45.0,82.22,2.0,2.0,100.0,65.0,80.0,81.25,1.0,1.0,100.0,5.0,6.0,83.33,219.0,347.0,63.11
claude-prediction-online,0.0,0.0,,2.0,3.0,66.67,77.0,138.0,55.8,7.0,9.0,77.78,21.0,46.0,45.65,0.0,3.0,0.0,71.0,90.0,78.89,1.0,1.0,100.0,1.0,3.0,33.33,0.0,0.0,,3.0,3.0,100.0,0.0,0.0,,0.0,0.0,,183.0,296.0,61.82
prediction-offline,0.0,0.0,,1.0,2.0,50.0,17.0,46.0,36.96,0.0,0.0,,11.0,25.0,44.0,1.0,2.0,50.0,20.0,29.0,68.97,0.0,0.0,,0.0,0.0,,0.0,0.0,,1.0,1.0,100.0,0.0,0.0,,0.0,0.0,,51.0,105.0,48.57
prediction-offline-sme,0.0,0.0,,1.0,1.0,100.0,56.0,150.0,37.33,3.0,7.0,42.86,62.0,132.0,46.97,7.0,8.0,87.5,92.0,123.0,74.8,0.0,0.0,,6.0,9.0,66.67,0.0,0.0,,0.0,0.0,,0.0,0.0,,2.0,2.0,100.0,229.0,432.0,53.01
prediction-online,0.0,0.0,,16.0,26.0,61.54,712.0,2312.0,30.8,82.0,190.0,43.16,777.0,1563.0,49.71,105.0,211.0,49.76,761.0,1120.0,67.95,4.0,6.0,66.67,26.0,32.0,81.25,1.0,1.0,100.0,1.0,1.0,100.0,0.0,0.0,,7.0,8.0,87.5,2492.0,5470.0,45.56
prediction-online-sme,0.0,1.0,0.0,0.0,1.0,0.0,48.0,106.0,45.28,4.0,13.0,30.77,56.0,86.0,65.12,6.0,7.0,85.71,40.0,63.0,63.49,1.0,1.0,100.0,3.0,4.0,75.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,158.0,282.0,56.03
prediction-sentence-embedding-bold,0.0,0.0,,0.0,0.0,,11.0,17.0,64.71,0.0,0.0,,2.0,2.0,100.0,0.0,0.0,,1.0,1.0,100.0,0.0,0.0,,5.0,5.0,100.0,6.0,6.0,100.0,0.0,0.0,,0.0,0.0,,9.0,9.0,100.0,34.0,40.0,85.0
prediction-sentence-embedding-conservative,0.0,0.0,,0.0,0.0,,2.0,5.0,40.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,4.0,9.0,44.44,0.0,0.0,,0.0,1.0,0.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,6.0,15.0,40.0


### List the well-performing tools

In [13]:
# Performance threshold, in percent: 55% accuracy.
perf_threshold = 55

In [14]:
# Boolean mask of the tools whose total accuracy is strictly above the threshold.
well_performing = tools_stats_per_conf.loc[:, ("total", "accuracy")] > perf_threshold
# Total accuracy of the selected tools only.
tools_stats_per_conf.loc[well_performing, ("total", "accuracy")]

tool
claude-prediction-offline             63.11
claude-prediction-online              61.82
prediction-online-sme                 56.03
prediction-sentence-embedding-bold    85.00
Name: (total, accuracy), dtype: float64