In [14]:
# Import all the models and functions
import importlib
import models
importlib.reload(models)

import pandas as pd

# Example of using a model

In [3]:
# Load data
topic_and_hardness = pd.read_json(
    "../data/training-set/chatbot-arena-gpt3-scores.jsonl.gz",
    lines=True,
    compression="gzip"
)
# Perform any feature engineering
pattern_z = r"\b\w+\b"
topic_and_hardness['prompt_length'] = topic_and_hardness['prompt'].apply(lambda x: len(re.findall(pattern_z, x)))
topic_and_hardness = topic_and_hardness.dropna()
# Create a model
mlr = models.MultiLinearRegressionModel(topic_and_hardness, regularization='l1')
# Preprocess the data with the features you want to use
mlr.preprocess(features=['prompt_length'])
# Train the model
mlr.train_model()
# Evaluate the model based on a prediction
mlr.evaluate(mlr.predict())

NameError: name 're' is not defined

In [33]:
convos_df = pd.read_json(
    "../data/training-set/chatbot-arena-conversations.jsonl.gz",
    lines=True,
    compression="gzip"
)
# Perform any feature engineering
convos_df["prompt"] = convos_df["conversation_a"].str[0].str["content"]
convos_df['prompt_length'] = convos_df['prompt'].apply(lambda x: len(re.findall(pattern_z, x)))
convos_df = convos_df.dropna()
# Create a model
lr = models.RandomForestModel(convos_df)
# Preprocess the data with the features you want to use
lr.preprocess(features=['prompt_length'])
# Train the model
lr.train_model()
# Evaluate the model based on a prediction
lr.evaluate(lr.predict())

18.263910933739215
[[1215 1517]
 [1199 1429]]


In [38]:
svm = models.SVMModel(convos_df, decision_function_shape='ovo')
svm.preprocess(features=['prompt_length'])
svm.train_model()
svm.evaluate(svm.predict())

The accuracy is:  0.34568226763348714


In [39]:
svmr = models.SVMModel(convos_df, decision_function_shape='ovr')
svmr.preprocess(features=['prompt_length'])
svmr.train_model()
svmr.evaluate(svmr.predict())

The accuracy is:  0.34568226763348714


# Actual Modeling

In [4]:
hardness_df = pd.read_csv("../data/training-set/all_features_no_sentiment_draft_final.csv")
winners_df = pd.read_csv("../data/training-set/text_features_no_sentiment.csv")

In [11]:
winners_features = list(winners_df.select_dtypes(include=['number']).columns)

## Hardness Modeling

In [11]:
top_10_hardness_features_by_corr = [
    'prompt_unique_words',
    'prompt_token_length', 
    'a_response_token_length',
    'b_response_token_length',
    'prompt_a_keyword_overlap',
    'prompt_b_keyword_overlap',
    'b_response_unique_words',
    'a_response_unique_words',
    'response_ab_keyword_overlap',
    'a_response_complex_word_count'
]

top_10_features_by_rf = [
    'b_response_token_length',
    'cosine_sim_response_a_b',
    'a_response_token_length',
    'cosine_sim_prompt_response_b',
    'cosine_sim_prompt_response_a',
    'response_jaccard_similarity',
    'prompt_b_jaccard_similarity',
    'prompt_a_jaccard_similarity',
    'b_response_avg_syllable_count',
    'a_response_avg_syllable_count'
]

top_5_hardness_features_by_corr = top_10_hardness_features_by_corr[:5]

features_list = {'top_5_corr': top_5_hardness_features_by_corr, 'top_10_corr': top_10_hardness_features_by_corr, 'all': list(hardness_df.drop(columns=['combined_hardness_score']).columns), 'top_5_rf': top_10_features_by_rf[:5], 'top_10_rf': top_10_features_by_rf}
regularizations = [None, 'l1', 'l2']
tune_hyperparameters = [True, False]

In [39]:
ord_model_test = models.OrdinalRegressionModel(hardness_df)
ord_model_test.preprocess(features_list['all'])
ord_model_test.train_model()
ord_model_test_predict = ord_model_test.predict()

Optimization terminated successfully.
         Current function value: 1.376612
         Iterations: 92
         Function evaluations: 95
         Gradient evaluations: 95




In [40]:
ord_model_test.evaluate(ord_model_test_predict)

The MSE is:  2.6680749933984687


2.6680749933984687

In [45]:
mse_dict = {}
for feat_set in features_list.keys():
    for regularization in regularizations:
        for tune_hyperparameter in tune_hyperparameters:
            hardness_model = models.MultiLinearRegressionModel(hardness_df, regularization=regularization)
            hardness_model.preprocess(features=features_list[feat_set])
            hardness_model.train_model(tune_hyperparameters=tune_hyperparameter)
            print("Features: ", feat_set)
            print("Regularization: ", regularization)
            print("Tune Hyperparameter: ", tune_hyperparameter)
            mse = hardness_model.evaluate(hardness_model.predict())
            print("\n")
            mse_dict[(feat_set, regularization, tune_hyperparameter, hardness_model.alpha)] = mse

Features:  top_5_corr
Regularization:  None
Tune Hyperparameter:  True
The MSE is:  2.402957486136784


Features:  top_5_corr
Regularization:  None
Tune Hyperparameter:  False
The MSE is:  2.402957486136784


Best parameters:  {'alpha': 0.02}
Features:  top_5_corr
Regularization:  l1
Tune Hyperparameter:  True
The MSE is:  2.415764457354106


Features:  top_5_corr
Regularization:  l1
Tune Hyperparameter:  False
The MSE is:  3.0734090308951676


Best parameters:  {'alpha': 0.98}
Features:  top_5_corr
Regularization:  l2
Tune Hyperparameter:  True
The MSE is:  2.402825455505677


Features:  top_5_corr
Regularization:  l2
Tune Hyperparameter:  False
The MSE is:  2.402825455505677


Features:  top_10_corr
Regularization:  None
Tune Hyperparameter:  True
The MSE is:  2.362424082387114


Features:  top_10_corr
Regularization:  None
Tune Hyperparameter:  False
The MSE is:  2.362424082387114


Best parameters:  {'alpha': 0.02}
Features:  top_10_corr
Regularization:  l1
Tune Hyperparameter:  Tr

In [46]:
lowest_mse_settings = min(mse_dict, key=mse_dict.get)
lowsest_mse = mse_dict[lowest_mse_settings]
print("Lowest MSE: ", lowsest_mse)
print("Settings: ", lowest_mse_settings)

Lowest MSE:  2.2733034063902826
Settings:  ('all', 'l2', True, 0.98)


In [58]:
lasso_mse = []
ridge_mse = []
reg_mse = []
for key, val in mse_dict.items():
    if key[1] == 'l1':
        lasso_mse.append(val)
    elif key[1] == 'l2':
        ridge_mse.append(val)
    else:
        reg_mse.append(val)

## Winner Modeling

In [16]:
xgboost_model_test = models.XGBoostModel(winners_df)
xgboost_model_test.preprocess(features=winners_features)
xgboost_model_test.train_model()
xgboost_model_test_predict = xgboost_model_test.predict()
xgboost_model_test.evaluate(xgboost_model_test_predict)

The accuracy is:  0.46064601186552406
Classification report:                precision    recall  f1-score   support

           0       0.46      0.59      0.52      2615
           1       0.49      0.58      0.53      2669
           2       0.41      0.14      0.21       862
           3       0.36      0.20      0.26      1439

    accuracy                           0.46      7585
   macro avg       0.43      0.38      0.38      7585
weighted avg       0.45      0.46      0.44      7585



0.46064601186552406

In [59]:
decision_function_shapes = ['ovo', 'ovr']
winners_features = list(winners_df.drop(columns=['winner']).select_dtypes(include=['float64', 'int64', 'float32']).columns)

In [67]:
acc_dict = {}
for decision_function_shape in decision_function_shapes:
    winners_model = models.SVMModel(winners_df, decision_function_shape=decision_function_shape)
    winners_model.preprocess(features=winners_features)
    winners_model.train_model()
    print("Decision Function Shape: ", decision_function_shape)
    acc = winners_model.evaluate(winners_model.predict())
    print("\n")
    acc_dict[(decision_function_shape, winners_model.C)] = acc


Best parameters:  {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
Decision Function Shape:  ovo
The accuracy is:  0.4796308503625577
Classification report:                 precision    recall  f1-score   support

      model_a       0.47      0.67      0.55      2615
      model_b       0.50      0.62      0.55      2669
          tie       0.53      0.07      0.13       862
tie (bothbad)       0.44      0.13      0.19      1439

     accuracy                           0.48      7585
    macro avg       0.48      0.37      0.36      7585
 weighted avg       0.48      0.48      0.43      7585



Best parameters:  {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
Decision Function Shape:  ovr
The accuracy is:  0.4796308503625577
Classification report:                 precision    recall  f1-score   support

      model_a       0.47      0.67      0.55      2615
      model_b       0.50      0.62      0.55      2669
          tie       0.53      0.07      0.13       862
tie (bothbad)       0.44      0.