In [20]:
from tqdm import tqdm_notebook
from transformers import pipeline
import pandas as pd
import torch
from tqdm.notebook import tqdm

# Register tqdm's pandas integration so that `.progress_apply` (used in the
# prediction cell below) is available on Series/DataFrame objects.
tqdm.pandas()

In [21]:
# The last path component is parsed by later cells: everything before the
# trailing "-T<type>" picks the scores CSV, and "<type>" picks the threshold rule.
model_path = "./model/classification/cosine-bleu-C1-Tmean"
# Text-classification pipeline loaded from the local model directory above.
model = pipeline(
    "text-classification",
    model=model_path,
    # NOTE(review): presumably truncates over-long inputs to the model's
    # maximum sequence length — confirm against the transformers docs.
    truncation=True,
    # Device index 0 when CUDA is available, otherwise -1.
    device=0 if torch.cuda.is_available() else -1,
)

In [22]:
# Derive the scores file name from the model directory name:
#   "cosine-bleu-C1-Tmean" -> "cosine_bleu_C1_Tmean" -> "cosine_bleu_C1" + "_.csv"
# rsplit(..., 1) removes only the trailing "_T<threshold>" suffix, so a dataset
# name that itself contains "_T" is not cut off at its first occurrence.
# NOTE: despite its name, `data_dir` holds a CSV file name, not a directory.
data_dir = model_path.split("/")[-1].replace("-", "_").rsplit("_T", 1)[0] + "_.csv"
data = pd.read_csv("./evaluation/speed/" + data_dir)

In [23]:
def get_threshold(df, th_type):
    """Return the ``max_score`` cut-off selected by ``th_type``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric ``max_score`` column.
    th_type : str
        One of:

        * ``"mean"`` -- the mean of ``df["max_score"]``;
        * ``"%<p>"`` -- the p-th percentile of ``df["max_score"]``
          (``"%75"`` is the 0.75 quantile);
        * a numeric literal such as ``"0.65"`` -- that constant.

    Returns
    -------
    float
        The threshold value.

    Raises
    ------
    ValueError
        If ``th_type`` matches none of the forms above. The original
        implementation silently returned ``None`` in that case, which only
        surfaced later as a confusing comparison failure.
    """
    if th_type == "mean":
        return df["max_score"].mean()

    is_percentile = isinstance(th_type, str) and th_type.startswith("%")
    try:
        # Parse first, compute afterwards, so that errors raised by pandas
        # itself are never mistaken for a bad threshold specification.
        number = float(th_type[1:] if is_percentile else th_type)
    except (TypeError, ValueError):
        raise ValueError(
            f"Unknown threshold type {th_type!r}; expected 'mean', '%<percent>' "
            "or a numeric constant"
        ) from None

    if not is_percentile:
        return number
    if not 0 <= number <= 100:
        raise ValueError(
            f"Percentile threshold {th_type!r} must lie between %0 and %100"
        )
    return df["max_score"].quantile(number / 100)

In [24]:
# The final "-"-separated component of the model path is "T<type>"; dropping
# the "T" leaves the threshold type handed to get_threshold().
threshold = model_path.rsplit("-", 1)[-1].removeprefix("T")

# For every row, read the score stored in the column named by its "fastest" entry.
data["max_score"] = data.apply(lambda row: row[row["fastest"]], axis=1)

# Rows whose score reaches the threshold.
filtered = data.loc[data["max_score"] >= get_threshold(data, threshold)]

In [25]:
# Hold out every row that did NOT pass the threshold filter.
# The explicit .copy() makes `test_data` an independent frame, so the columns
# assigned to it in the prediction cell do not raise SettingWithCopyWarning.
test_data = data[~data.index.isin(filtered.index)].copy()

In [27]:
# Display the held-out rows (the rendering below is truncated in the middle).
test_data

Unnamed: 0,code,comment,bert,bow,doc2vec,tfidf,word2vec,max,fastest,max_score
2,protected AuditMessageType getPatientParticipa...,"Since we're using an enhanced for loop, does c...",0.0889,0.0235,0.0235,0.0235,0.0235,bert,bert,0.0889
3,public synchronized void registerListener(Drui...,I think it might be better to add to the list ...,0.0802,0.0498,0.0802,0.0498,0.0454,"bert,doc2vec",doc2vec,0.0802
4,public Step loadVariantsStep(StepBuilderFactor...,can we remove this then? I think the main reas...,0.1291,0.1282,0.1291,0.1282,0.1282,"bert,doc2vec",doc2vec,0.1291
5,public int hashCode() { int result = factType....,I'm still scared something stupid not using re...,0.0172,0.0206,0.0219,0.1192,0.0206,tfidf,tfidf,0.1192
7,public boolean visit(TypeDeclaration elem) { U...,Is this needed for externals?,0.0269,0.0130,0.0013,0.0415,0.0004,tfidf,tfidf,0.0415
...,...,...,...,...,...,...,...,...,...,...
150670,"public Config(Map<String, String> properties) ...",one more method where it can take file locatio...,0.0343,0.0343,0.0413,0.0343,0.0343,doc2vec,doc2vec,0.0413
150671,public CacheImpl(String name) { this.name = na...,I would prefer to initialize these inline in f...,0.1179,0.1254,0.0577,0.0867,0.0577,bow,bow,0.1254
150672,"public void sendResponse(ReplyHeader h, Record...","This IOException is swallowed either, should w...",0.0810,0.0161,0.0766,0.0162,0.0180,bert,bert,0.0810
150673,public void execute(IProgressMonitor m) throws...,1) When would getCause return null? 2) Why dro...,0.1098,0.1181,0.1181,0.0291,0.0206,"bow,doc2vec",doc2vec,0.1181


In [None]:
# Classify each code snippet (with a progress bar) and keep the top label.
test_data["predicted"] = test_data["code"].progress_apply(
    lambda snippet: model(snippet)[0]["label"]
)

# The predicted label is used as a column name: fetch that column's score per row.
test_data["predicted_score"] = test_data.apply(
    lambda row: row[row["predicted"]],
    axis=1,
)

In [16]:
# Persist the held-out rows, including the prediction columns, to the working directory.
test_data.to_csv("predicted.csv")

In [31]:
# Reload the saved predictions; the first CSV column is restored as the index.
res = pd.read_csv("predicted.csv", index_col=0)

In [34]:
# Display the reloaded predictions (the rendering below is truncated in the middle).
res

Unnamed: 0,code,comment,bert,bow,doc2vec,tfidf,word2vec,max,fastest,max_score,predicted,predicted_score
2,protected AuditMessageType getPatientParticipa...,"Since we're using an enhanced for loop, does c...",0.0889,0.0235,0.0235,0.0235,0.0235,bert,bert,0.0889,word2vec,0.0235
3,public synchronized void registerListener(Drui...,I think it might be better to add to the list ...,0.0802,0.0498,0.0802,0.0498,0.0454,"bert,doc2vec",doc2vec,0.0802,word2vec,0.0454
4,public Step loadVariantsStep(StepBuilderFactor...,can we remove this then? I think the main reas...,0.1291,0.1282,0.1291,0.1282,0.1282,"bert,doc2vec",doc2vec,0.1291,word2vec,0.1282
5,public int hashCode() { int result = factType....,I'm still scared something stupid not using re...,0.0172,0.0206,0.0219,0.1192,0.0206,tfidf,tfidf,0.1192,bert,0.0172
7,public boolean visit(TypeDeclaration elem) { U...,Is this needed for externals?,0.0269,0.0130,0.0013,0.0415,0.0004,tfidf,tfidf,0.0415,word2vec,0.0004
...,...,...,...,...,...,...,...,...,...,...,...,...
150670,"public Config(Map<String, String> properties) ...",one more method where it can take file locatio...,0.0343,0.0343,0.0413,0.0343,0.0343,doc2vec,doc2vec,0.0413,tfidf,0.0343
150671,public CacheImpl(String name) { this.name = na...,I would prefer to initialize these inline in f...,0.1179,0.1254,0.0577,0.0867,0.0577,bow,bow,0.1254,bow,0.1254
150672,"public void sendResponse(ReplyHeader h, Record...","This IOException is swallowed either, should w...",0.0810,0.0161,0.0766,0.0162,0.0180,bert,bert,0.0810,word2vec,0.0180
150673,public void execute(IProgressMonitor m) throws...,1) When would getCause return null? 2) Why dro...,0.1098,0.1181,0.1181,0.0291,0.0206,"bow,doc2vec",doc2vec,0.1181,word2vec,0.0206


In [19]:
# Column-wise means of the numeric columns; compare `max_score` with `predicted_score`.
# NOTE(review): this reads `test_data`, not the reloaded `res`, so on a fresh
# kernel it needs the prediction cell to have run first — confirm this is intended.
test_data.mean(numeric_only=True)

bert               0.060059
bow                0.058744
doc2vec            0.058614
tfidf              0.058947
word2vec           0.058431
max_score          0.098434
predicted_score    0.065411
dtype: float64