In [1]:
import joblib
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
from sklearn.metrics import precision_recall_curve

# Test sample the models were scored on (10K samples).
df = pd.read_csv("../data/test_all_users_sample.csv")

# Per-revision results for each model, precalculated with the test_model.py
# script and stored as joblib dumps.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load dumps
# that come from a trusted source.
results_rrml = joblib.load("../data/RRML_results.data")
results_rrla = joblib.load("../data/RRLA_results.data")


def precision_at_recall(y_true, y_scores, recall_thr=0.75):
    """Return the precision at the last curve point whose recall exceeds ``recall_thr``.

    The precision-recall curve is computed with ``precision_recall_curve``;
    among all points with recall strictly greater than ``recall_thr`` the one
    with the largest index is selected and its precision is returned.

    Args:
        y_true: Ground-truth labels, passed through to ``precision_recall_curve``.
        y_scores: Predicted scores, passed through to ``precision_recall_curve``.
        recall_thr: Recall level that the selected point must strictly exceed.

    Returns:
        The precision value at the selected curve point.

    Raises:
        IndexError: If no point on the curve has recall above ``recall_thr``.
    """
    precisions, recalls, _ = precision_recall_curve(y_true, y_scores)
    # Indices of every curve point that still satisfies the recall requirement.
    qualifying = np.flatnonzero(recalls > recall_thr)
    last_qualifying = qualifying[-1]
    return precisions[last_qualifying]



In [2]:
# Accumulators for the final comparison table. Each evaluated configuration
# appends exactly one entry to every list, so index i across the lists
# describes the same configuration.
names = []
model_error_rates = []
model_mean_time = []
model_auc = []
model_auc_anon = []
model_auc_auth = []
# Despite the "25" in its name, this list receives precision computed with
# recall_thr=0.75.
model_pr_r_25 = []

## Multilingual stable model: 

In [3]:
# Function for metrics calculation from dumped results:

def calculate_metrics(result_dump, true_data, model_name="", verbose=False):
    """Compute evaluation metrics for one model from its dumped scoring results.

    Records whose error field is set are counted as errors and excluded from
    every quality metric; the remaining records are matched to their labels
    by revision id.

    Parameters
    ----------
    result_dump : iterable
        Scoring records. Each record ``r`` is read positionally: ``r[0]`` is
        the revision id, ``r[1]`` an object exposing ``.probability``,
        ``r[2]`` the processing time and ``r[3]`` the error marker
        (``None`` means the record is usable).
    true_data : pandas.DataFrame
        Must provide the columns ``revision_id``,
        ``revision_is_identity_reverted`` and ``user_is_anonymous``.
    model_name : str, optional
        Label printed as the first line when ``verbose`` is true.
    verbose : bool, optional
        When true, print the metrics and the ``describe()`` summary of the
        processing times.

    Returns
    -------
    tuple
        ``(error_rate, mean_time, auc, auc_anonymous, auc_non_anonymous,
        precision_at_recall_0_75)``. ``error_rate`` is the number of errored
        records divided by ``len(true_data)``.

    Raises
    ------
    KeyError
        If a usable record refers to a revision id missing from ``true_data``.
    """
    # Lookup tables from revision id to label / anonymity flag.
    true_dict = dict(zip(true_data.revision_id, true_data.revision_is_identity_reverted))
    is_anon_dict = dict(zip(true_data.revision_id, true_data.user_is_anonymous))

    error_counts = 0
    true_labels, times, scores, is_anon = [], [], [], []

    for r in result_dump:
        if r[3] is not None:
            # Errored request: counts towards the error rate only.
            error_counts += 1
            continue
        rev_id = r[0]
        true_labels.append(true_dict[rev_id])
        times.append(r[2])
        scores.append(r[1].probability)
        is_anon.append(is_anon_dict[rev_id])

    def _group_auc(anonymous):
        """ROC AUC restricted to records whose anonymity flag's truthiness equals ``anonymous``."""
        labels = [l for l, flag in zip(true_labels, is_anon) if bool(flag) == anonymous]
        preds = [s for s, flag in zip(scores, is_anon) if bool(flag) == anonymous]
        return roc_auc_score(labels, preds)

    # Every metric is computed exactly once and shared by the report and the
    # return value.
    overall_auc = roc_auc_score(true_labels, scores)
    anon_auc = _group_auc(True)
    auth_auc = _group_auc(False)
    pr_at_recall = precision_at_recall(true_labels, scores, recall_thr=0.75)
    error_rate = error_counts / len(true_data)

    if verbose:
        print(model_name)
        print("AUC score: ", overall_auc)
        print("AUC authorised: ", auth_auc)
        print("AUC anon: ", anon_auc)
        print("Precision at Recall 0.75: ", pr_at_recall)
        print()
        print("Error rate: ", error_rate)
        print()
        print("Time distribution (20 simultanious requests on M2 CPU):\n", pd.Series(times).describe())

    return error_rate, np.mean(times), overall_auc, anon_auc, auth_auc, pr_at_recall



In [4]:
# Evaluate the multilingual model and record one entry per accumulator list.
# Note: every execution of this cell appends another entry.
model_name = "Multilingual RR"

error_rate, mean_time, auc, auc_anon, auc_auth, pr_r_25 = calculate_metrics(
    results_rrml, df, model_name=model_name, verbose=True
)

for metric_list, metric_value in (
    (model_error_rates, error_rate),
    (model_mean_time, mean_time),
    (model_auc, auc),
    (model_auc_anon, auc_anon),
    (model_auc_auth, auc_auth),
    (model_pr_r_25, pr_r_25),
    (names, model_name),
):
    metric_list.append(metric_value)

Multilingual RR
AUC score:  0.8785292720192289
AUC authorised:  0.7827275359463663
AUC anon:  0.7811554112293784
Precision at Recall 0.75:  0.28831312017640576

Error rate:  0.0063

Time distribution (20 simultanious requests on M2 CPU):
 count    9937.000000
mean        3.130323
std         3.106043
min         0.366666
25%         1.208242
50%         2.317784
75%         4.228241
max       140.010956
dtype: float64


## Language-agnostic model: 

In [5]:
# Evaluate the language-agnostic model and record one entry per accumulator list.
# Note: every execution of this cell appends another entry.
model_name = "Language-agnostic RR"

error_rate, mean_time, auc, auc_anon, auc_auth, pr_r_25 = calculate_metrics(
    results_rrla, df, model_name=model_name, verbose=True
)

for metric_list, metric_value in (
    (model_error_rates, error_rate),
    (model_mean_time, mean_time),
    (model_auc, auc),
    (model_auc_anon, auc_anon),
    (model_auc_auth, auc_auth),
    (model_pr_r_25, pr_r_25),
    (names, model_name),
):
    metric_list.append(metric_value)

Language-agnostic RR
AUC score:  0.8653014728780128
AUC authorised:  0.7803407351260373
AUC anon:  0.6999705047693502
Precision at Recall 0.75:  0.25801677355698077

Error rate:  0.0061

Time distribution (20 simultanious requests on M2 CPU):
 count    9939.000000
mean        0.538154
std         0.230301
min         0.322874
25%         0.387910
50%         0.450326
75%         0.608941
max         7.007304
dtype: float64


# ORES

We have all the ORES scores precalculated, so we use them for further analysis.

In [6]:
# Load the precalculated ORES scores and index them by revision id, skipping
# rows whose prediction is null.
ores_scores = pd.read_csv("../data/test_ores_scores_full_test.csv")
ores_scores_dict = {
    rev_id: pred
    for rev_id, pred in zip(ores_scores.revision_id, ores_scores.ores_pred)
    if not pd.isnull(pred)
}

# Example of one record. A fixed-seed generator is used so that the printed
# example is the same on every Restart & Run All.
random_key = np.random.default_rng(42).choice(list(ores_scores_dict.keys()))

print("Example:")
print(f"Revision_id: {random_key}, Score: {ores_scores_dict[random_key]}")

Example:
Revision_id: 1095887742, Score: 0.0080322643704155


In [7]:
# Collect labels and ORES scores for the evaluation of ORES.
# The loop walks the language-agnostic result records and keeps a revision
# only if (a) an ORES score exists for it and (b) the record has no error.
true_dict = dict(zip(df.revision_id, df.revision_is_identity_reverted))
is_anon_dict = dict(zip(df.revision_id, df.user_is_anonymous))

error_counts = 0
rev_ids, true_labels, times, scores, is_anon = [], [], [], [], []

# One flag per record in results_rrla: does ORES have a score for it?
ores_is_available = []

for r in results_rrla:
    has_ores_score = r[0] in ores_scores_dict
    ores_is_available.append(has_ores_score)
    if not has_ores_score:
        continue
    if r[3] is not None:
        # Record carries an error marker: count it and leave it out.
        error_counts += 1
        continue

    rev_ids.append(r[0])
    true_labels.append(true_dict[r[0]])
    times.append(r[2])
    # Membership was checked above, so the key is always present here.
    scores.append(ores_scores_dict[r[0]])
    is_anon.append(is_anon_dict[r[0]])

In [8]:
# ORES metrics on the collected evaluation set. Each metric is computed once
# and reused for both the printout and the comparison-table accumulators.
ores_auc = roc_auc_score(true_labels, scores)
print("AUC score: ", ores_auc)

# AUC restricted to anonymous edits.
anon_auc = roc_auc_score(
    [l for l, g in zip(true_labels, is_anon) if g],
    [s for s, g in zip(scores, is_anon) if g],
)
# AUC restricted to non-anonymous edits (the name `all_auc` is kept so that
# anything reading this variable still finds it).
all_auc = roc_auc_score(
    [l for l, g in zip(true_labels, is_anon) if not g],
    [s for s, g in zip(scores, is_anon) if not g],
)
ores_pr_at_recall = precision_at_recall(true_labels, scores, recall_thr=0.75)

print("AUC authorised: ", all_auc)
print("AUC anon: ", anon_auc)
print("Precision at Recall 0.75: ", ores_pr_at_recall)

# No error rate or processing time is recorded for ORES here, so those two
# columns get None.
names.append("ORES")
model_error_rates.append(None)
model_mean_time.append(None)
model_auc.append(ores_auc)
model_auc_anon.append(anon_auc)
model_auc_auth.append(all_auc)
model_pr_r_25.append(ores_pr_at_recall)

AUC score:  0.840866173596575
AUC authorised:  0.7095457138226181
AUC anon:  0.6823470085470085
Precision at Recall 0.75:  0.23596059113300494


## Final comparison table: 

In [9]:
# Assemble the collected metrics into one comparison table, one row per entry
# in `names`. The last column holds precision at recall 0.75: every value in
# `model_pr_r_25` is computed with recall_thr=0.75, so the column is labelled
# "Pr@R0.75" rather than "Pr@R0.25".
pd.DataFrame({
    "model": names,
    "error_rate": model_error_rates,
    "model_processing_time": model_mean_time,
    "AUC": model_auc,
    "AUC anonymous": model_auc_anon,
    "AUC authorised": model_auc_auth,
    "Pr@R0.75": model_pr_r_25,
})



Unnamed: 0,model,error_rate,model_processing_time,AUC,AUC anonymous,AUC authorised,Pr@R0.25
0,Multilingual RR,0.0063,3.130323,0.878529,0.781155,0.782728,0.288313
1,Language-agnostic RR,0.0061,0.538154,0.865301,0.699971,0.780341,0.258017
2,ORES,,,0.840866,0.682347,0.709546,0.235961
