In [87]:
import importlib
import models
import text_feature_engine
import embedding_feature_engine
import hardness_feature_engine
import numpy as np
import pandas as pd

In [154]:
# Re-import the local project modules so edits made to their .py files are
# picked up by this kernel without a restart.
# NOTE(review): objects created before a reload keep using the old class
# definitions, so any engines/models must be re-created after this cell runs.
importlib.reload(models)
importlib.reload(embedding_feature_engine)
importlib.reload(text_feature_engine)
importlib.reload(hardness_feature_engine)

<module 'hardness_feature_engine' from '/Users/surajr/classes/msse/ds200/grad-project/analysis/hardness_feature_engine.py'>

In [113]:
# All three inputs are gzip-compressed JSON Lines files, so they share the
# same reader options.
jsonl_gz_options = {"lines": True, "compression": "gzip"}

# Test-set prompts with both model responses.
test_convos = pd.read_json(
    "../data/test-set/arena-test-set-prompt-and-responses.jsonl.gz",
    **jsonl_gz_options,
)

# Training conversations.
df = pd.read_json(
    "../data/training-set/chatbot-arena-conversations.jsonl.gz",
    **jsonl_gz_options,
)

# Per-question scores for the training set (joined on 'question_id' later).
train_scores_df = pd.read_json(
    "../data/training-set/chatbot-arena-gpt3-scores.jsonl.gz",
    **jsonl_gz_options,
)

In [155]:
# Instantiate one feature engine of each kind from the local project modules.
# These must be re-created whenever the modules above are reloaded.
text_engine = text_feature_engine.TextFeatureEngine()
embedding_engine = embedding_feature_engine.EmbeddingFeatureEngine()
hardness_engine = hardness_feature_engine.HardnessFeatureEngine()

In [None]:
# Run the text feature engine over the training conversations; in this frame
# the two sides live in 'conversation_a' / 'conversation_b'.
processed_df = text_engine.process_dataframe(
    df,
    conversation_a_col='conversation_a',
    conversation_b_col='conversation_b'
)

# Same processing for the test set, where the two sides are stored in
# 'model_a_response' / 'model_b_response' instead.
processed_test_df = text_engine.process_dataframe(
    test_convos,
    conversation_a_col='model_a_response',
    conversation_b_col='model_b_response'
)

# Build the clustering feature frames from the pre-computed prompt/response
# embeddings of both the training and the test set. The result is indexed
# below as clustering_dataframes[<'train'|'test'>][<'hdbscan'|'kmeans'|'agglo'>].
#
# A second, train-only call (assigned to `clustering_dataframes_train`) used to
# follow this one. It passed un-prefixed keyword names that do not match the
# call below, its result was never read, and the recorded output shows a single
# combined train+test load, so it was a stale leftover and has been removed.
# NOTE(review): confirm nothing outside this notebook section relied on
# `clustering_dataframes_train`.
clustering_dataframes = embedding_engine.process_dataframe(
    train_prompt_embeddings_path="../data/training-set/chatbot-arena-prompts-embeddings.npy",
    train_response_a_embeddings_path="../data/training-set/chatbot-arena-model_a_response-embeddings.npy",
    train_response_b_embeddings_path="../data/training-set/chatbot-arena-model_b_response-embeddings.npy",
    test_prompt_embeddings_path="../data/test-set/arena-test-set-prompts-embeddings.npy",
    test_response_a_embeddings_path="../data/test-set/arena-test-set-model_a_response-embeddings.npy",
    test_response_b_embeddings_path="../data/test-set/arena-test-set-model_b_response-embeddings.npy",
)

Loaded embeddings with shapes:
Train Prompt embeddings: (25282, 256)
Train Response A embeddings: (25282, 256)
Train Response B embeddings: (25282, 256)
Test Prompt embeddings: (3200, 256)
Test Response A embeddings: (3200, 256)
Test Response B embeddings: (3200, 256)




In [None]:
# Pull the per-algorithm cluster feature frames out of the nested result.
test_clusterings = clustering_dataframes['test']
train_clusterings = clustering_dataframes['train']

# Test frames get a fresh index (the previous index is kept as a column by
# reset_index()); train frames are used exactly as returned.
hdbscan_df_test = test_clusterings['hdbscan'].reset_index()
kmeans_df_test = test_clusterings['kmeans'].reset_index()
agglo_df_test = test_clusterings['agglo'].reset_index()

hdbscan_df_train = train_clusterings['hdbscan']
kmeans_df_train = train_clusterings['kmeans']
agglo_df_train = train_clusterings['agglo']

In [162]:
def _training_frame(cluster_features):
    """Join text features, one set of cluster features and the scores.

    The text and cluster features are placed side by side (index-aligned),
    the scores are left-joined on 'question_id', and every row that has a
    missing value in any column is dropped.
    """
    side_by_side = pd.concat([processed_df, cluster_features], axis=1)
    with_scores = side_by_side.merge(train_scores_df, on='question_id', how='left')
    return with_scores.dropna()


train_df_hdbscan = _training_frame(hdbscan_df_train)
train_df_kmeans = _training_frame(kmeans_df_train)
train_df_agglo = _training_frame(agglo_df_train)

# Test frames have no scores to join: text features + cluster features only.
test_df_hdbscan = pd.concat([processed_test_df, hdbscan_df_test], axis=1)
test_df_kmeans = pd.concat([processed_test_df, kmeans_df_test], axis=1)
test_df_agglo = pd.concat([processed_test_df, agglo_df_test], axis=1)

In [164]:
# The three per-question score columns that feed the combined hardness score.
hardness_score_columns = ['score_value_1', 'score_value_2', 'score_value_3']

# Each call receives its own copy of the column list, as in the original cell.
train_df_hdbscan = hardness_engine.calculate_combined_hardness_score(
    train_df_hdbscan, list(hardness_score_columns)
)
train_df_kmeans = hardness_engine.calculate_combined_hardness_score(
    train_df_kmeans, list(hardness_score_columns)
)
train_df_agglo = hardness_engine.calculate_combined_hardness_score(
    train_df_agglo, list(hardness_score_columns)
)

In [None]:
# List every numeric column of the training frame, one name per line.
numeric_frame = train_df_hdbscan.select_dtypes(include=['number'])
for columns in numeric_frame.columns:
    print(columns)

In [109]:
# possible columns to care about: topic_modeling?, same_cluster_as_prompt ones, and model a/b --> maybe use models for elo rating
# response_a_same_cluster_as_prompt, response_b_same_cluster_as_prompt, topic
set(train_df_hdbscan.columns) - set(train_df_hdbscan.select_dtypes(include=['number']).columns)

{'conversation_a',
 'conversation_b',
 'judge',
 'model_a',
 'model_b',
 'openai_scores_raw_choices_nested',
 'prompt_tokens',
 'prompt_x',
 'prompt_y',
 'question_id',
 'response_a',
 'response_a_different_from_response_b',
 'response_a_same_cluster_as_prompt',
 'response_a_tokens',
 'response_b',
 'response_b_same_cluster_as_prompt',
 'response_b_tokens',
 'score_reason_1',
 'score_reason_2',
 'score_reason_3',
 'score_value_1',
 'score_value_2',
 'score_value_3',
 'topic_modeling_1',
 'topic_modeling_2',
 'topic_modeling_3',
 'winner'}

In [None]:
# Ten features ranked by correlation with the hardness score.
top_10_hardness_features_by_corr = [
    'prompt_unique_words',
    'prompt_token_length',
    'a_response_token_length',
    'b_response_token_length',
    'prompt_a_keyword_overlap',
    'prompt_b_keyword_overlap',
    'b_response_unique_words',
    'a_response_unique_words',
    'response_ab_keyword_overlap',
    'a_response_complex_word_count',
]

# Ten features ranked by random-forest importance.
top_10_features_by_rf = [
    'b_response_token_length',
    'cosine_sim_response_a_b',
    'a_response_token_length',
    'cosine_sim_prompt_response_b',
    'cosine_sim_prompt_response_a',
    'response_jaccard_similarity',
    'prompt_b_jaccard_similarity',
    'prompt_a_jaccard_similarity',
    'b_response_avg_syllable_count',
    'a_response_avg_syllable_count',
]

# Hand-picked set: the correlation list followed by the random-forest features
# not already in it. dict.fromkeys drops duplicates and keeps first-seen order.
my_chosen_features = list(
    dict.fromkeys(top_10_hardness_features_by_corr + top_10_features_by_rf)
)

# Every numeric column is a candidate predictor, except the raw score columns
# and the combined hardness score itself.
non_feature_columns = ['score_value_1', 'score_value_2', 'score_value_3', 'combined_hardness_score']
numeric_candidates = train_df_hdbscan.drop(columns=non_feature_columns).select_dtypes(include=['number'])
all_model_features = list(numeric_candidates.columns)

top_5_hardness_features_by_corr = top_10_hardness_features_by_corr[:5]

# Feature sets, penalties and tuning flags swept by the grid search below.
features_list = {'all': all_model_features, 'mine': my_chosen_features}
# Earlier, wider sweep kept for reference:
# features_list = {'top_5_corr': top_5_hardness_features_by_corr, 'top_10_corr': top_10_hardness_features_by_corr, 'all': list(hardness_df.drop(columns=['combined_hardness_score']).columns), 'top_5_rf': top_10_features_by_rf[:5], 'top_10_rf': top_10_features_by_rf}
regularizations = [None, 'l1', 'l2']
tune_hyperparameters = [True, False]

In [221]:
# Sanity check: correlation between prompt length and the hardness target.
train_df_hdbscan.loc[:, ['prompt_token_length', 'combined_hardness_score']].corr()

Unnamed: 0,prompt_token_length,combined_hardness_score
prompt_token_length,1.0,0.21633
combined_hardness_score,0.21633,1.0


In [216]:
# Sweep every (feature set, regularization, tuning flag) combination for the
# linear hardness model and record the MSE it reports.
# mse_dict is keyed by (feature set name, regularization, tuning flag, alpha).
mse_dict = {}

# Same visiting order as nesting the three loops: feature set outermost,
# tuning flag innermost.
search_grid = [
    (set_name, penalty, tune_flag)
    for set_name in features_list
    for penalty in regularizations
    for tune_flag in tune_hyperparameters
]

for feat_set, regularization, tune_hyperparameter in search_grid:
    # Only the training frame is passed here; no separate test frame.
    hardness_model = models.MultiLinearRegressionModel(train_df_hdbscan, regularization=regularization)
    hardness_model.preprocess(features=features_list[feat_set])
    hardness_model.train_model(tune_hyperparameters=tune_hyperparameter)

    print("Features: ", feat_set)
    print("Regularization: ", regularization)
    print("Tune Hyperparameter: ", tune_hyperparameter)

    pred = hardness_model.predict(on_test=True)
    mse = hardness_model.evaluate(pred, on_test=True)
    print("\n")

    result_key = (feat_set, regularization, tune_hyperparameter, hardness_model.alpha)
    mse_dict[result_key] = mse

Features:  all
Regularization:  None
Tune Hyperparameter:  True
The MSE is:  2.6817882104707897


Features:  all
Regularization:  None
Tune Hyperparameter:  False
The MSE is:  2.6817882104707897


Best parameters:  {'alpha': 0.02}
Features:  all
Regularization:  l1
Tune Hyperparameter:  True
The MSE is:  2.370829487010418


Features:  all
Regularization:  l1
Tune Hyperparameter:  False
The MSE is:  3.1254121060266384


Best parameters:  {'alpha': 0.98}
Features:  all
Regularization:  l2
Tune Hyperparameter:  True
The MSE is:  2.392324937359884


Features:  all
Regularization:  l2
Tune Hyperparameter:  False
The MSE is:  2.392324937359884




In [None]:
# Refit the chosen linear hardness model, this time also passing the test
# frame, and predict with on_test=True (presumably on that test frame — confirm
# in models.py).
# NOTE(review): the configuration is hard-coded to untuned L2. In the grid-search
# output recorded in this notebook, tuned L1 on the 'all' feature set reported a
# slightly lower MSE than L2 — confirm L2 is the intended choice.
best_hardness_model = models.MultiLinearRegressionModel(train_df_hdbscan, test_df_hdbscan, regularization='l2')
best_hardness_model.preprocess(features=features_list['all'])
best_hardness_model.train_model(tune_hyperparameters=False)
hardness_prediction = best_hardness_model.predict(on_test=True)

In [184]:
# Pick up edits to models.py made since the reload at the top of the notebook.
# NOTE(review): model objects created before this line still use the previous
# class definitions; a fresh Restart & Run All makes this mid-notebook reload
# unnecessary.
importlib.reload(models)

<module 'models' from '/Users/surajr/classes/msse/ds200/grad-project/analysis/models.py'>

In [185]:
# Train the XGBoost winner classifier on the HDBSCAN feature frames.
xgboost_model = models.XGBoostModel(train_df_hdbscan, test_df_hdbscan)
xgboost_model.preprocess(features=features_list['all'])
xgboost_model.train_model()
# predict() without on_test; NOTE(review): presumably this scores the training
# rows, which would make the report below a training-set figure — confirm.
xgboost_pred = xgboost_model.predict()
# NOTE(review): in the recorded run evaluate() printed an accuracy and a
# classification report, so the name `mse` is misleading; it also overwrites
# the `mse` variable left behind by the grid-search cell.
mse = xgboost_model.evaluate(xgboost_pred)
# Raw predictions for the test frame. NOTE(review): `winners_pred` is assigned
# again in the following cell, so this value is not the one that gets saved.
winners_pred = xgboost_model.predict(on_test=True)

The accuracy is:  0.5728754549770533
Classification report:                precision    recall  f1-score   support

           0       0.54      0.70      0.61      8998
           1       0.55      0.68      0.61      8852
           2       0.85      0.26      0.39      2791
           3       0.81      0.30      0.44      4635

    accuracy                           0.57     25276
   macro avg       0.69      0.49      0.51     25276
weighted avg       0.63      0.57      0.56     25276



In [190]:
# Predict on the test frame, then map the raw predictions to outcome labels.
raw_test_predictions = xgboost_model.predict(on_test=True)
winners_pred = xgboost_model.predict_to_outcome(raw_test_predictions)

In [191]:
# Assemble the submission file: one row per test question with the predicted
# winner and the predicted hardness score, then save it as CSV.
#
# The columns are combined by position, not by index label. The previous
# pd.concat(axis=1) aligned on the index, so a non-default index on the test
# frame or a length mismatch would silently produce NaN-padded rows. Building
# the frame from plain arrays raises a ValueError on a length mismatch instead.
submission_df = pd.DataFrame({
    'question_id': np.asarray(test_df_hdbscan['question_id']),
    'winner': np.asarray(winners_pred),
    'hardness_score': np.asarray(hardness_prediction),
})
submission_df.to_csv('predictions.csv', index=False)

In [193]:
# Load the full dataset; it carries a 'winner' column, which is used below as
# the ground truth for checking the test-set predictions.
all_data = pd.read_parquet('../all_data.parquet')

In [217]:
# Keep only the rows whose question_id appears in the test frame, and only the
# first row for each question_id.
is_test_question = all_data['question_id'].isin(test_df_hdbscan['question_id'])
all_data_test = all_data.loc[is_test_question].drop_duplicates(subset='question_id', keep='first')

In [220]:
# Inspect the matched ground-truth rows.
all_data_test

Unnamed: 0,question_id,model_a,model_b,winner,judge,conversation_a,conversation_b,turn,anony,language,tstamp,openai_moderation,toxic_chat_tag
21,4f332ebd8cdc4ff2be74aa8828ff20d5,koala-13b,vicuna-13b,tie,arena_user_778,[{'content': 'what do you think about the futu...,[{'content': 'what do you think about the futu...,2,True,English,1.682354e+09,"{'categories': {'harassment': False, 'harassme...","{'roberta-large': {'flagged': False, 'probabil..."
43,f2be6f13e5ed40e5b81443223996494c,stablelm-tuned-alpha-7b,vicuna-13b,model_b,arena_user_316,[{'content': 'Salut ! Tu es un méchant chatbot...,[{'content': 'Salut ! Tu es un méchant chatbot...,2,True,English,1.682355e+09,"{'categories': {'harassment': False, 'harassme...","{'roberta-large': {'flagged': False, 'probabil..."
55,5fafefb8a0c54243afb52d2892946cea,koala-13b,vicuna-13b,model_b,arena_user_316,[{'content': '⚔️ Chatbot Arena ⚔️ Rules:  C...,[{'content': '⚔️ Chatbot Arena ⚔️ Rules:  C...,3,True,English,1.682355e+09,"{'categories': {'harassment': False, 'harassme...","{'roberta-large': {'flagged': False, 'probabil..."
58,7834f572267f40709ecebb273a2b346b,chatglm-6b,stablelm-tuned-alpha-7b,tie,arena_user_316,[{'content': 'Guess the word that i have in my...,[{'content': 'Guess the word that i have in my...,3,True,English,1.682355e+09,"{'categories': {'harassment': False, 'harassme...","{'roberta-large': {'flagged': False, 'probabil..."
60,1ccc7e58290245c4bd5457fce45f8640,vicuna-13b,koala-13b,model_a,arena_user_1242,[{'content': 'You are a peasant living in the ...,[{'content': 'You are a peasant living in the ...,2,True,English,1.682355e+09,"{'categories': {'harassment': False, 'harassme...","{'roberta-large': {'flagged': False, 'probabil..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32948,eb08f8a7f20840c99efe9fc8c03f1c13,wizardlm-13b,fastchat-t5-3b,tie,arena_user_21242,[{'content': 'Tell me a little about yourself....,[{'content': 'Tell me a little about yourself....,2,True,English,1.687761e+09,"{'categories': {'harassment': False, 'harassme...","{'roberta-large': {'flagged': False, 'probabil..."
32953,4baca918f1f5440599ae9edb3bfa8cc1,RWKV-4-Raven-14B,vicuna-7b,model_a,arena_user_21280,[{'content': 'Create an detail outline for one...,[{'content': 'Create an detail outline for one...,3,True,English,1.687763e+09,"{'categories': {'harassment': False, 'harassme...","{'roberta-large': {'flagged': False, 'probabil..."
32966,a787ce60dc1440f39455ab20e3bffe33,guanaco-33b,gpt-4,model_b,arena_user_765,"[{'content': 'subquery in select statement', '...","[{'content': 'subquery in select statement', '...",2,True,English,1.687765e+09,"{'categories': {'harassment': False, 'harassme...","{'roberta-large': {'flagged': False, 'probabil..."
32971,3dc09f20eedb405ab3dc980cf7bff5d0,wizardlm-13b,gpt4all-13b-snoozy,model_a,arena_user_21326,"[{'content': 'how to', 'role': 'user'}, {'cont...","[{'content': 'how to', 'role': 'user'}, {'cont...",2,True,English,1.687766e+09,"{'categories': {'harassment': False, 'harassme...","{'roberta-large': {'flagged': False, 'probabil..."


In [203]:
# Number of test questions found in the full dataset.
# NOTE(review): the recorded value is 3199 while the test embeddings have 3200
# rows, so one test question appears to have no ground-truth match — confirm.
len(all_data_test)

3199

In [205]:
# Read back the predictions file written earlier in this notebook.
curr_pred = pd.read_csv('predictions.csv')

In [206]:
# Inspect the saved predictions (question_id, winner, hardness_score).
curr_pred

Unnamed: 0,question_id,winner,hardness_score
0,4f332ebd8cdc4ff2be74aa8828ff20d5,model_b,8.0
1,f2be6f13e5ed40e5b81443223996494c,model_b,6.0
2,5fafefb8a0c54243afb52d2892946cea,model_b,5.0
3,7834f572267f40709ecebb273a2b346b,model_a,5.0
4,1ccc7e58290245c4bd5457fce45f8640,model_a,7.0
...,...,...,...
3195,eb08f8a7f20840c99efe9fc8c03f1c13,tie,5.0
3196,4baca918f1f5440599ae9edb3bfa8cc1,model_b,7.0
3197,a787ce60dc1440f39455ab20e3bffe33,model_b,8.0
3198,3dc09f20eedb405ab3dc980cf7bff5d0,model_b,7.0


In [211]:
# Join the ground-truth winners with the saved predictions. Both sides have a
# 'winner' column, so the merge yields winner_x (ground truth, left side) and
# winner_y (prediction, right side).
# NOTE(review): this overwrites all_data_test, so the cell cannot be re-run on
# its own — after the first run there is no plain 'winner' column to select.
all_data_test = all_data_test[['question_id', 'winner']].merge(curr_pred, on='question_id', how='left')

In [214]:
# Count each (ground truth, prediction) pair: winner_x is the true winner,
# winner_y the predicted one. Rows where the two agree are correct predictions.
all_data_test[['winner_x', 'winner_y']].value_counts()

winner_x       winner_y     
model_a        model_a          614
model_b        model_b          522
               model_a          518
model_a        model_b          433
tie (bothbad)  model_a          250
               model_b          238
tie            model_a          150
               model_b          136
model_a        tie               72
model_b        tie               62
tie (bothbad)  tie (bothbad)     52
model_b        tie (bothbad)     46
model_a        tie (bothbad)     36
tie            tie               35
tie (bothbad)  tie               27
tie            tie (bothbad)      9
Name: count, dtype: int64

In [None]:
# (Leftover scratch cell removed: it held only a bare `a`, which is not defined
# anywhere in this part of the notebook and was never executed.)