In [1]:
import numpy as np


def stylistic_features(text):
    """
    Extracts stylistic features from a given text, including text length.

    Parameters:
    - text (str): The input text from which stylistic features are extracted.

    Returns:
    - dict: A dictionary containing the sentence count, average word length, and text length.
    """
    text_length = len(text)
    sentences = text.split(".")
    sentence_count = len(sentences) - 1
    word_lengths = [len(word) for word in text.split()]
    average_word_length = sum(word_lengths) / len(word_lengths) if word_lengths else 0

    return {
        "text_length": text_length,
        "sentence_count": sentence_count,
        "average_word_length": average_word_length,
    }


def stylistic_similarity(
    text1,
    text2,
    features_to_compare=None,
):
    """
    Calculates the stylistic similarity between two texts based on customizable features, including consideration of text length.

    Parameters:
    - text1 (str): The first text.
    - text2 (str): The second text.
    - features_to_compare (list, optional): A list of features to compare for similarity. Options include 'sentence_count', 'average_word_length', and 'text_length'. Defaults to comparing all features.

    Returns:
    - float: The overall stylistic similarity between the two texts based on the selected features.
    """
    if features_to_compare is None:
        features_to_compare = ["text_length", "sentence_count", "average_word_length"]

    features1 = stylistic_features(text1)
    features2 = stylistic_features(text2)
    similarities = []

    for feature in features_to_compare:
        if feature in features1 and feature in features2:
            similarity = 1 - abs(features1[feature] - features2[feature]) / max(features1[feature], features2[feature], 1)
            similarities.append(similarity)

    if similarities:
        average_similarity = np.mean(similarities)
    else:
        average_similarity = 0

    return average_similarity


def extract_structural_features(text):
    """
    Extracts structural features from a given text, including headers, bullet points, and numbered lists.

    Parameters:
    - text (str): The input text from which structural features are extracted.

    Returns:
    - dict: A dictionary containing the header count, bullet points count, and numbered list count.
    """
    features = {}
    # Count headers more robustly by considering all levels of markdown headers
    features["header_count"] = sum(1 for line in text.split("\n") if line.strip().startswith("#"))
    features["bullet_point_count"] = text.count("\n- ") + text.count("\n* ")

    # Enhanced handling for numbered lists including first and second tier (e.g., 1., a.)
    def is_numbered_list_item(line):
        parts = line.strip().split(". ", 1)
        if len(parts) == 2:
            first_part, _ = parts
            return first_part.isdigit() or (first_part.isalpha() and len(first_part) == 1)
        return False

    features["numbered_list_count"] = sum(1 for line in text.split("\n") if is_numbered_list_item(line))

    return features


def structural_similarity(text1, text2):
    """
    Calculates the structural similarity between two texts, considering headers, bullet points, and numbered lists.
    Adjusts calculation based on the presence of structural features, rather than using a simple average.

    Parameters:
    - text1 (str): The first text.
    - text2 (str): The second text.

    Returns:
    - float: The overall structural similarity between the two texts.
    """
    features1 = extract_structural_features(text1)
    features2 = extract_structural_features(text2)

    similarities = []
    weights = []

    # Calculate header similarity if headers are present in either text
    if features1["header_count"] > 0 or features2["header_count"] > 0:
        header_similarity = 1 - abs(features1["header_count"] - features2["header_count"]) / max(features1["header_count"], features2["header_count"], 1)
        similarities.append(header_similarity)
        weights.append(max(features1["header_count"], features2["header_count"]))

    # Calculate bullet point similarity if bullet points are present in either text
    if features1["bullet_point_count"] > 0 or features2["bullet_point_count"] > 0:
        bullet_similarity = 1 - abs(features1["bullet_point_count"] - features2["bullet_point_count"]) / max(features1["bullet_point_count"], features2["bullet_point_count"], 1)
        similarities.append(bullet_similarity)
        weights.append(max(features1["bullet_point_count"], features2["bullet_point_count"]))

    # Calculate numbered list similarity if numbered lists are present in either text
    if features1["numbered_list_count"] > 0 or features2["numbered_list_count"] > 0:
        numbered_list_similarity = 1 - abs(features1["numbered_list_count"] - features2["numbered_list_count"]) / max(features1["numbered_list_count"], features2["numbered_list_count"], 1)
        similarities.append(numbered_list_similarity)
        weights.append(max(features1["numbered_list_count"], features2["numbered_list_count"]))

    # Calculate weighted average of similarities
    if similarities:
        weighted_average_similarity = sum(similarity * weight for similarity, weight in zip(similarities, weights)) / sum(weights)
    else:
        weighted_average_similarity = 1

    return weighted_average_similarity


def format_similarity(text1, text2):
    """
    Calculates a comprehensive format similarity between two texts, combining stylistic and structural scores, including the consideration of text length.

    Parameters:
    - text1 (str): The first text.
    - text2 (str): The second text.

    Returns:
    - float: The overall format similarity between the two texts.
    """
    features_to_compare = ["text_length", "sentence_count", "average_word_length"]
    stylistic_score = stylistic_similarity(text1, text2, features_to_compare[:])
    structural_score = structural_similarity(text1, text2)

    overall_score = stylistic_score * 0.3 + structural_score * 0.7
    return overall_score

In [2]:
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine


def BERTsimilarity(text1, text2):
    """
    Calculates the similarity between two texts using BERT embeddings.

    Parameters:
    - text1 (str): The first text.
    - text2 (str): The second text.

    Returns:
    - float: The similarity between the two texts.
    """
    model = SentenceTransformer("all-mpnet-base-v2")
    embedding1 = model.encode(text1)
    embedding2 = model.encode(text2)
    similarity = 1 - cosine(embedding1, embedding2)
    return similarity

In [3]:
import json
import pandas as pd

# Assuming the JSON structure is a list of dictionaries with keys: question, model, provider, response
# Load the JSON data
with open("responses.json", "r") as file:
    data = json.load(file)

# Convert to DataFrame
df = pd.DataFrame(data)


# Calculate similarities
# Initialize a column for similarities
df["BERT_similarity"] = 0.0
df["format_similarity"] = 0.0

# Iterate over each question and model to calculate similarities between OpenAI and Azure responses
for question in df["question"].unique():
    for model in df["model"].unique():
        openai_response = df[(df["question"] == question) & (df["model"] == model) & (df["provider"] == "openai")]["response"].iloc[0]
        azure_response = df[(df["question"] == question) & (df["model"] == model) & (df["provider"] == "azure")]["response"].iloc[0]

        # Update the DataFrame with the calculated similaritys
        df.loc[(df["question"] == question) & (df["model"] == model), "BERT_similarity"] = BERTsimilarity(openai_response, azure_response)
        df.loc[(df["question"] == question) & (df["model"] == model), "format_similarity"] = format_similarity(openai_response, azure_response)

In [4]:
def clean_data(df):
    """
    Cleans the DataFrame by removing specific columns and duplicates, and resetting the index.

    Parameters:
    - df (pd.DataFrame): The DataFrame to be cleaned.

    Returns:
    - pd.DataFrame: The cleaned DataFrame.
    """
    df = df.drop(columns=["provider", "response"])
    df = df.drop_duplicates()
    df = df.sort_values(["model"])

    # Use factorize to encode the unique questions, starting with 1
    # df["question"], _ = pd.factorize(df["question"])
    # Increment by 1 to start numbering from 1 instead of 0
    # df["question"] = df["question"] + 1

    df.reset_index(drop=True, inplace=True)
    return df


df_similarity = clean_data(df.copy(deep=True))
df_similarity

Unnamed: 0,question,model,BERT_similarity,format_similarity
0,Tell me some useful info and tips about AI.,gpt-4,0.846239,0.538891
1,Are you aware of Phantom Liberty? Please brief...,gpt-4,0.790551,0.977796
2,\nCyberpunk 2077 — Never Fade Away by P. T. Ad...,gpt-4,0.933726,0.972131
3,\nA bird in the hand is worth two in the bush\...,gpt-4,0.913258,0.257918
4,\nProve or disprove: “An individual is risk av...,gpt-4,0.924695,0.482897
5,Explain gradient descent.,gpt-4,0.838881,0.220882
6,Describe Sagittarius A* and TON 618.,gpt-4,0.880041,0.978038
7,Tell me some useful info and tips about AI.,gpt-4-turbo,0.892335,0.914315
8,Are you aware of Phantom Liberty? Please brief...,gpt-4-turbo,0.862975,0.951599
9,\nCyberpunk 2077 — Never Fade Away by P. T. Ad...,gpt-4-turbo,0.887388,0.786608


In [5]:
df_similarity["BERT_similarity"].describe()

count    14.000000
mean      0.889601
std       0.045067
min       0.790551
25%       0.866600
50%       0.889862
75%       0.921836
max       0.953328
Name: BERT_similarity, dtype: float64

In [6]:
df_similarity[df_similarity["model"] == "gpt-4"]["BERT_similarity"].describe()

count    7.000000
mean     0.875341
std      0.052710
min      0.790551
25%      0.842560
50%      0.880041
75%      0.918976
max      0.933726
Name: BERT_similarity, dtype: float64

In [7]:
df_similarity[df_similarity["model"] == "gpt-4o"]["BERT_similarity"].describe()

count    7.000000
mean     0.903860
std      0.033879
min      0.862975
25%      0.882433
50%      0.892335
75%      0.926758
max      0.953328
Name: BERT_similarity, dtype: float64

In [8]:
df_similarity["format_similarity"].describe()

count    14.000000
mean      0.726645
std       0.286258
min       0.220882
25%       0.496896
50%       0.850462
75%       0.976380
max       0.979215
Name: format_similarity, dtype: float64

In [9]:
df_similarity[df_similarity["model"] == "gpt-4"]["format_similarity"].describe()

count    7.000000
mean     0.632651
std      0.340309
min      0.220882
25%      0.370408
50%      0.538891
75%      0.974964
max      0.978038
Name: format_similarity, dtype: float64

In [10]:
df_similarity[df_similarity["model"] == "gpt-4o"]["format_similarity"].describe()

count    7.000000
mean     0.820640
std      0.202781
min      0.423542
25%      0.748843
50%      0.914315
75%      0.964859
max      0.979215
Name: format_similarity, dtype: float64