# AI Detector - Evaluation

## 1. Import Necessary Dependencies

In [1]:
import os
import torch
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sentence_transformers import SentenceTransformer, util

  from tqdm.autonotebook import tqdm, trange


We also select the `device` so that evaluation runs on the GPU when one is available (falling back to the CPU otherwise).

In [2]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


## 2. Define Evaluation Metrics

In [3]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors `a` and `b`.

    Args:
        a: First vector (array-like accepted by `np.dot` / `np.linalg.norm`).
        b: Second vector, same length as `a`.

    Returns:
        `dot(a, b) / (||a|| * ||b||)`. When either vector has zero norm the
        cosine is undefined; 0.0 is returned in that case instead of `nan`,
        so a single degenerate embedding cannot poison downstream metrics.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    # Guard the 0/0 case: without it NumPy emits a RuntimeWarning and yields nan.
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product

def manhattan_distance(a, b):
    """Return the Manhattan (L1) distance: the sum of absolute element-wise differences of `a` and `b`."""
    offsets = a - b
    magnitudes = np.absolute(offsets)
    return np.sum(magnitudes)

def euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between `a` and `b`, i.e. the norm of their difference."""
    offsets = a - b
    return np.linalg.norm(offsets)

def dot_product(a, b):
    """Return the (un-normalised) dot product of `a` and `b` as computed by `np.dot`."""
    product = np.dot(a, b)
    return product

## 3. Define `evaluate_model()` function

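- **Params:**
  - `model` -> Loaded fine-tuned `SentenceTransformer` model (the model object itself, not its path)
  - `df` -> Preprocessed data with the columns `candidate_combined`, `ai_combined` and `similarity_score`
- **Returns:**
  - `metrics` -> Dictionary of 8 evaluation metrics (4 Spearman correlations, plus MSE, RMSE, MAE and R² computed from the cosine scores)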

In [4]:
def evaluate_model(model, df):
    """Score an embedding model against the reference similarity labels in `df`.

    Both text columns are embedded with `model.encode`, each candidate/AI pair
    is compared with four measures, and every measure is rank-correlated
    (Spearman) with the labels. The cosine scores are additionally compared to
    the labels with MSE, RMSE, MAE and R².

    Args:
        model: Object exposing `encode(list_of_str)` that returns one embedding
            per input string.
        df: DataFrame with the columns `candidate_combined`, `ai_combined` and
            `similarity_score`.

    Returns:
        dict mapping each metric's display name to its value, in the order the
        metrics are printed.

    Side effects:
        Prints every metric, formatted to four decimal places.
    """
    # Embed both sides; embedding i of each result belongs to row i of `df`.
    candidate_vecs = model.encode(df['candidate_combined'].tolist())
    ai_vecs = model.encode(df['ai_combined'].tolist())

    def score_pairs(measure):
        # Apply `measure` to each (candidate, ai) embedding pair, row by row.
        return [measure(cand, ai) for cand, ai in zip(candidate_vecs, ai_vecs)]

    # Distances are negated so that, like the similarities, a larger value
    # means "more alike" and the correlation sign is comparable across measures.
    predicted = {
        "Cosine Spearman": score_pairs(cosine_similarity),
        "Manhattan Spearman": score_pairs(lambda cand, ai: -manhattan_distance(cand, ai)),
        "Euclidean Spearman": score_pairs(lambda cand, ai: -euclidean_distance(cand, ai)),
        "Dot Product Spearman": score_pairs(dot_product),
    }

    targets = df['similarity_score'].tolist()

    # Rank correlation of every measure with the reference labels.
    metrics = {
        label: spearmanr(targets, scores).correlation
        for label, scores in predicted.items()
    }

    # Error metrics treat the raw cosine score as the predicted label
    # (assumes labels share the cosine scale — TODO confirm).
    cosine_predictions = predicted["Cosine Spearman"]
    mse = mean_squared_error(targets, cosine_predictions)
    metrics.update({
        "Mean Squared Error": mse,
        "Root Mean Squared Error": np.sqrt(mse),
        "Mean Absolute Error": mean_absolute_error(targets, cosine_predictions),
        "R-squared Score": r2_score(targets, cosine_predictions),
    })

    for label, value in metrics.items():
        print(f"{label}: {value:.4f}")

    return metrics

## 4. Evaluate the Model
Specify the model

In [5]:
# Name of the base checkpoint whose fine-tuned export is evaluated below.
# Other candidates (assign one of these instead to evaluate it):
#   "all-mpnet-base-v2"
#   "all-distilroberta-v1"
#   "all-MiniLM-L12-v2"
#   "all-MiniLM-L6-v2"
model_name = "multi-qa-mpnet-base-dot-v1"

Load the preprocessed data and define the path to the exported (fine-tuned) model

In [6]:
# Both folders sit one level above the current working directory.
data_dir = os.path.join(os.path.abspath(''), os.pardir, 'data')
model_dir = os.path.join(os.path.abspath(''), os.pardir, 'models')

# Location of the fine-tuned export for the selected checkpoint.
model_path = os.path.join(model_dir, f'fine-tuned_{model_name}')

# Read the preprocessed dataset that the model will be scored on.
# NOTE(review): the original comment called this "the train data" — if the model
# was fine-tuned on these same rows the metrics will be optimistic; confirm
# whether a held-out split should be used instead.
df = pd.read_csv(os.path.join(data_dir, 'preprocessed_data.csv'))

Evaluate the Model

In [7]:
# Load the fine-tuned export onto the selected device.
model = SentenceTransformer(model_path, device=device)

# Compute and print all metrics; keep the dict for later inspection.
metrics = evaluate_model(model=model, df=df)

Cosine Spearman: 0.9672
Manhattan Spearman: 0.9603
Euclidean Spearman: 0.9551
Dot Product Spearman: 0.9652
Mean Squared Error: 0.0086
Root Mean Squared Error: 0.0925
Mean Absolute Error: 0.0702
R-squared Score: 0.8805
