In [1]:
from text_feature_engine import TextFeatureEngine
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

df = pd.read_json(
    "../data/training-set/chatbot-arena-conversations.jsonl.gz",
    lines=True,
    compression="gzip"
)
# Initialize the feature engine
feature_engine = TextFeatureEngine()

# Process your dataframe
processed_df = feature_engine.process_dataframe(
    df,
    conversation_a_col='conversation_a',
    conversation_b_col='conversation_b'
)

In [2]:
processed_df.head()

Unnamed: 0,question_id,model_a,model_b,winner,judge,conversation_a,conversation_b,prompt,response_a,response_b,...,prompt_unique_words,response_a_unique_words,response_b_unique_words,is_question,prompt_contains_politeness,prompt_contains_negative,response_a_contains_politeness,response_a_contains_negative,response_b_contains_politeness,response_b_contains_negative
0,58210e39b3fd4441a2bd4a518bb44c2d,chatglm-6b,koala-13b,model_b,arena_user_973,[{'content': 'What is the difference between O...,[{'content': 'What is the difference between O...,What is the difference between OpenCL and CUDA?,OpenCL and CUDA are two different programming ...,OpenCL and CUDA are both programming languages...,...,8,88,134,1,0,0,0,0,0,1
1,2564acd09e3942fd97657d05282d4389,oasst-pythia-12b,alpaca-13b,tie,arena_user_973,[{'content': 'Why did my parent not invite me ...,[{'content': 'Why did my parent not invite me ...,Why did my parent not invite me to their wedding?,It is possible that your parent did not invite...,It is likely that they wanted to keep the gues...,...,10,67,61,1,0,1,0,1,0,0
2,90bfd142157948aba01931726c888e7f,koala-13b,oasst-pythia-12b,model_b,arena_user_973,"[{'content': 'Fuji vs. Nikon, which is better?...","[{'content': 'Fuji vs. Nikon, which is better?...","Fuji vs. Nikon, which is better?",Both Fuji and Nikon are popular camera brands ...,This is a subjective question and the answer d...,...,6,106,119,1,0,0,0,0,0,1
3,a7c5accc53e649a3bc6b2e41d962ebc4,vicuna-13b,oasst-pythia-12b,model_b,arena_user_973,[{'content': 'How to build an arena for chatbo...,[{'content': 'How to build an arena for chatbo...,How to build an arena for chatbots?,Building an arena for chatbots can be done by ...,Building an arena for chatbots is a great way ...,...,7,121,125,1,0,0,0,0,0,0
4,adf27e819a3c494cb6e993f0c660e097,vicuna-13b,koala-13b,model_a,arena_user_973,"[{'content': 'When is it today?', 'role': 'use...","[{'content': 'When is it today?', 'role': 'use...",When is it today?,"I'm sorry, I cannot determine the current date...","Today is February 23, 2023.",...,4,37,5,1,0,0,0,1,0,0


In [None]:
processed_df.columns

Index(['question_id', 'model_a', 'model_b', 'winner', 'judge',
       'conversation_a', 'conversation_b', 'prompt', 'response_a',
       'response_b', 'prompt_tokens', 'prompt_token_length',
       'response_a_tokens', 'response_a_token_length', 'response_b_tokens',
       'response_b_token_length', 'response_a_flesch_kincaid',
       'response_a_gunning_fog', 'response_a_smog',
       'response_b_flesch_kincaid', 'response_b_gunning_fog',
       'response_b_smog', 'response_a_ttr', 'response_a_lexical_diversity',
       'response_a_avg_syllable_count', 'response_a_complex_word_count',
       'response_b_ttr', 'response_b_lexical_diversity',
       'response_b_avg_syllable_count', 'response_b_complex_word_count',
       'response_jaccard_similarity', 'prompt_a_keyword_overlap',
       'prompt_b_keyword_overlap', 'is_question', 'prompt_contains_politeness',
       'prompt_contains_negative', 'response_a_contains_politeness',
       'response_a_contains_negative', 'response_b_contains_po

In [1]:
from embedding_feature_engine import EmbeddingFeatureEngine

# Initialize the embedding feature engine
embedding_engine = EmbeddingFeatureEngine()

# Process the embeddings by specifying file paths
clustering_dataframes = embedding_engine.process_dataframe(
    prompt_embeddings_path="../data/training-set/chatbot-arena-prompts-embeddings.npy",
    response_a_embeddings_path="../data/training-set/chatbot-arena-model_a_response-embeddings.npy",
    response_b_embeddings_path="../data/training-set/chatbot-arena-model_b_response-embeddings.npy"
)

# Iterate through the clustering DataFrames and print summary information
for method, df in clustering_dataframes.items():
    print(f"Clustering method: {method}")
    print("Generated features:", df.columns.tolist())
    print("Number of samples:", len(df))

hdbscan_df = clustering_dataframes['hdbscan']
kmeans_df = clustering_dataframes['kmeans']
agglo_df = clustering_dataframes['agglo']

Loaded embeddings with shapes:
Prompt embeddings: (25282, 256)
Response A embeddings: (25282, 256)
Response B embeddings: (25282, 256)
Clustering method: hdbscan
Generated features: ['cosine_sim_prompt_response_a', 'cosine_sim_prompt_response_b', 'cosine_sim_response_a_b', 'euclidean_dist_prompt_response_a', 'euclidean_dist_prompt_response_b', 'euclidean_dist_response_a_b', 'variance_prompt', 'variance_response_a', 'variance_response_b', 'mean_prompt', 'mean_response_a', 'mean_response_b', 'std_prompt', 'std_response_a', 'std_response_b', 'prompt_clusters', 'response_a_clusters', 'response_b_clusters', 'response_a_same_cluster_as_prompt', 'response_b_same_cluster_as_prompt', 'response_a_different_from_response_b']
Number of samples: 25282
Clustering method: kmeans
Generated features: ['cosine_sim_prompt_response_a', 'cosine_sim_prompt_response_b', 'cosine_sim_response_a_b', 'euclidean_dist_prompt_response_a', 'euclidean_dist_prompt_response_b', 'euclidean_dist_response_a_b', 'variance_

In [4]:


print(hdbscan_df.columns)
hdbscan_df

Index(['cosine_sim_prompt_response_a', 'cosine_sim_prompt_response_b',
       'cosine_sim_response_a_b', 'euclidean_dist_prompt_response_a',
       'euclidean_dist_prompt_response_b', 'euclidean_dist_response_a_b',
       'variance_prompt', 'variance_response_a', 'variance_response_b',
       'mean_prompt', 'mean_response_a', 'mean_response_b', 'std_prompt',
       'std_response_a', 'std_response_b', 'prompt_clusters',
       'response_a_clusters', 'response_b_clusters',
       'response_a_same_cluster_as_prompt',
       'response_b_same_cluster_as_prompt',
       'response_a_different_from_response_b'],
      dtype='object')


Unnamed: 0,cosine_sim_prompt_response_a,cosine_sim_prompt_response_b,cosine_sim_response_a_b,euclidean_dist_prompt_response_a,euclidean_dist_prompt_response_b,euclidean_dist_response_a_b,variance_prompt,variance_response_a,variance_response_b,mean_prompt,...,mean_response_b,std_prompt,std_response_a,std_response_b,prompt_clusters,response_a_clusters,response_b_clusters,response_a_same_cluster_as_prompt,response_b_same_cluster_as_prompt,response_a_different_from_response_b
0,0.751438,0.802665,0.853163,0.705070,0.628228,0.541917,0.003879,0.003875,0.003886,-0.005230,...,-0.004450,0.062281,0.062249,0.062341,391,-1,600,False,False,True
1,0.751248,0.502628,0.540338,0.705340,0.997368,0.958814,0.003905,0.003906,0.003906,0.001240,...,-0.000750,0.062488,0.062498,0.062495,20,513,514,False,False,True
2,0.815042,0.767400,0.851614,0.608208,0.682055,0.544769,0.003887,0.003882,0.003866,-0.004331,...,-0.006376,0.062350,0.062308,0.062174,-1,699,-1,False,True,True
3,0.829923,0.826861,0.918109,0.583227,0.588455,0.404700,0.003892,0.003871,0.003882,-0.003775,...,-0.004890,0.062386,0.062218,0.062308,661,280,298,False,False,True
4,0.364985,0.566376,0.352887,1.126956,0.931262,1.137640,0.003900,0.003877,0.003906,0.002495,...,0.000501,0.062450,0.062267,0.062498,142,357,122,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25277,0.735905,0.694663,0.741538,0.726767,0.781457,0.718975,0.003791,0.003841,0.003862,-0.010724,...,-0.006674,0.061573,0.061978,0.062143,76,-1,-1,False,False,False
25278,0.527935,0.566758,0.688008,0.971664,0.930851,0.789926,0.003886,0.003888,0.003895,0.004519,...,0.003372,0.062336,0.062354,0.062409,257,137,68,False,False,True
25279,0.800589,0.838094,0.820807,0.631523,0.569045,0.598654,0.003896,0.003864,0.003892,-0.003185,...,-0.003762,0.062419,0.062161,0.062387,687,-1,492,False,False,True
25280,0.771494,0.751828,0.858995,0.676027,0.704517,0.531046,0.003887,0.003859,0.003893,-0.004363,...,-0.003590,0.062348,0.062121,0.062397,687,355,-1,False,False,True


In [5]:
print(kmeans_df.shape)
print(kmeans_df.columns)
kmeans_df.head()

(25282, 23)
Index(['cosine_sim_prompt_response_a', 'cosine_sim_prompt_response_b',
       'cosine_sim_response_a_b', 'euclidean_dist_prompt_response_a',
       'euclidean_dist_prompt_response_b', 'euclidean_dist_response_a_b',
       'variance_prompt', 'variance_response_a', 'variance_response_b',
       'mean_prompt', 'mean_response_a', 'mean_response_b', 'std_prompt',
       'std_response_a', 'std_response_b', 'prompt_clusters',
       'response_a_clusters', 'response_b_clusters',
       'response_a_same_cluster_as_prompt',
       'response_b_same_cluster_as_prompt',
       'centroid_distance_prompt_response_a',
       'centroid_distance_prompt_response_b',
       'response_a_different_from_response_b'],
      dtype='object')


Unnamed: 0,cosine_sim_prompt_response_a,cosine_sim_prompt_response_b,cosine_sim_response_a_b,euclidean_dist_prompt_response_a,euclidean_dist_prompt_response_b,euclidean_dist_response_a_b,variance_prompt,variance_response_a,variance_response_b,mean_prompt,...,std_response_a,std_response_b,prompt_clusters,response_a_clusters,response_b_clusters,response_a_same_cluster_as_prompt,response_b_same_cluster_as_prompt,centroid_distance_prompt_response_a,centroid_distance_prompt_response_b,response_a_different_from_response_b
0,0.751438,0.802665,0.853163,0.70507,0.628228,0.541917,0.003879,0.003875,0.003886,-0.00523,...,0.062249,0.062341,9,2,7,False,False,2.488179,2.617597,True
1,0.751248,0.502628,0.540338,0.70534,0.997368,0.958814,0.003905,0.003906,0.003906,0.00124,...,0.062498,0.062495,7,7,2,True,False,0.0,2.308875,True
2,0.815042,0.7674,0.851614,0.608208,0.682055,0.544769,0.003887,0.003882,0.003866,-0.004331,...,0.062308,0.062174,2,7,5,False,False,2.308875,2.173924,True
3,0.829923,0.826861,0.918109,0.583227,0.588455,0.4047,0.003892,0.003871,0.003882,-0.003775,...,0.062218,0.062308,9,2,7,False,False,2.488179,2.617597,True
4,0.364985,0.566376,0.352887,1.126956,0.931262,1.13764,0.0039,0.003877,0.003906,0.002495,...,0.062267,0.062498,4,1,6,False,False,4.92383,4.249504,True


In [6]:
print(agglo_df.shape)
print(agglo_df.columns)
agglo_df.head()

(25282, 21)
Index(['cosine_sim_prompt_response_a', 'cosine_sim_prompt_response_b',
       'cosine_sim_response_a_b', 'euclidean_dist_prompt_response_a',
       'euclidean_dist_prompt_response_b', 'euclidean_dist_response_a_b',
       'variance_prompt', 'variance_response_a', 'variance_response_b',
       'mean_prompt', 'mean_response_a', 'mean_response_b', 'std_prompt',
       'std_response_a', 'std_response_b', 'prompt_clusters',
       'response_a_clusters', 'response_b_clusters',
       'response_a_same_cluster_as_prompt',
       'response_b_same_cluster_as_prompt',
       'response_a_different_from_response_b'],
      dtype='object')


Unnamed: 0,cosine_sim_prompt_response_a,cosine_sim_prompt_response_b,cosine_sim_response_a_b,euclidean_dist_prompt_response_a,euclidean_dist_prompt_response_b,euclidean_dist_response_a_b,variance_prompt,variance_response_a,variance_response_b,mean_prompt,...,mean_response_b,std_prompt,std_response_a,std_response_b,prompt_clusters,response_a_clusters,response_b_clusters,response_a_same_cluster_as_prompt,response_b_same_cluster_as_prompt,response_a_different_from_response_b
0,0.751438,0.802665,0.853163,0.70507,0.628228,0.541917,0.003879,0.003875,0.003886,-0.00523,...,-0.00445,0.062281,0.062249,0.062341,4,2,0,False,False,True
1,0.751248,0.502628,0.540338,0.70534,0.997368,0.958814,0.003905,0.003906,0.003906,0.00124,...,-0.00075,0.062488,0.062498,0.062495,3,5,1,False,False,True
2,0.815042,0.7674,0.851614,0.608208,0.682055,0.544769,0.003887,0.003882,0.003866,-0.004331,...,-0.006376,0.06235,0.062308,0.062174,6,3,1,False,False,True
3,0.829923,0.826861,0.918109,0.583227,0.588455,0.4047,0.003892,0.003871,0.003882,-0.003775,...,-0.00489,0.062386,0.062218,0.062308,2,1,0,False,False,True
4,0.364985,0.566376,0.352887,1.126956,0.931262,1.13764,0.0039,0.003877,0.003906,0.002495,...,0.000501,0.06245,0.062267,0.062498,7,1,5,False,False,True


In [17]:
from hardness_feature_engine import HardnessFeatureEngine
# Example usage:
# Initialize the feature engine
engine = HardnessFeatureEngine()

# Example DataFrame
df = pd.read_json(
    "../data/training-set/chatbot-arena-gpt3-scores.jsonl.gz",
    lines=True,
    compression="gzip"
)

# Process the DataFrame
score_columns = ["score_value_1", "score_value_2", "score_value_3"]
processed_df = engine.calculate_combined_hardness_score(df, score_columns)
processed_df.head()

Unnamed: 0,question_id,prompt,openai_scores_raw_choices_nested,topic_modeling_1,score_reason_1,score_value_1,topic_modeling_2,score_reason_2,score_value_2,topic_modeling_3,score_reason_3,score_value_3,combined_hardness_score
0,58210e39b3fd4441a2bd4a518bb44c2d,What is the difference between OpenCL and CUDA?,"[{'finish_reason': 'stop', 'index': 0, 'logpro...",Technical Comparison,This prompt requires the AI to accurately comp...,9.0,Software Comparison,This prompt assesses the AI's factual accuracy...,8.0,"Comparison, Technology",This prompt requires the AI to demonstrate kno...,9.0,8.666667
1,2564acd09e3942fd97657d05282d4389,Why did my parent not invite me to their wedding?,"[{'finish_reason': 'stop', 'index': 0, 'logpro...","Reasoning, Emotion",This prompt requires the AI to understand huma...,9.0,"Emotions, Relationships",This prompt involves understanding complex hum...,8.0,"Reasoning, Emotional",This prompt challenges the AI to infer motives...,8.0,8.333333
2,90bfd142157948aba01931726c888e7f,"Fuji vs. Nikon, which is better?","[{'finish_reason': 'stop', 'index': 0, 'logpro...",Camera comparison,This prompt does not require problem-solving s...,2.0,Comparative Analysis,This prompt assesses the AI's ability to analy...,6.0,Photography comparison,This prompt is subjective and does not provide...,2.0,3.333333
3,a7c5accc53e649a3bc6b2e41d962ebc4,How to build an arena for chatbots?,"[{'finish_reason': 'stop', 'index': 0, 'logpro...",Chatbot Arena,This prompt requires problem-solving skills an...,8.0,Chatbot Arena,This prompt requires the AI to engage in probl...,8.0,Chatbot Arena,This prompt requires problem-solving skills an...,8.0,8.0
4,adf27e819a3c494cb6e993f0c660e097,When is it today?,"[{'finish_reason': 'stop', 'index': 0, 'logpro...",Time Query,This prompt is very straightforward and does n...,2.0,Date Inquiry,This prompt is very straightforward and does n...,2.0,Time-based Inquiry,This prompt is too straightforward and simply ...,2.0,2.0
