In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier, cv, DMatrix


In [2]:

data = pd.read_csv('./train.csv/train.csv')


In [3]:

# Define target variable with three classes: 0 (Model A wins), 1 (Model B wins), 2 (Tie)
data['target'] = np.where(data['winner_model_a'] == 1, 0, np.where(data['winner_model_b'] == 1, 1, 2))

data = data.drop(['winner_model_a', 'winner_model_b', 'winner_tie'], axis=1)


In [4]:

# Feature engineering: text lengths and basic similarity approximations
data['prompt_length'] = data['prompt'].apply(len)
data['response_a_length'] = data['response_a'].apply(len)
data['response_b_length'] = data['response_b'].apply(len)
data['prompt_word_count'] = data['prompt'].apply(lambda x: len(x.split()))
data['response_a_word_count'] = data['response_a'].apply(lambda x: len(x.split()))
data['response_b_word_count'] = data['response_b'].apply(lambda x: len(x.split()))
data['prompt_response_a_similarity'] = data['prompt_length'] / (data['response_a_length'] + 1e-5)
data['prompt_response_b_similarity'] = data['prompt_length'] / (data['response_b_length'] + 1e-5)

# Define features and target variable
features = data[['prompt_length', 'response_a_length', 'response_b_length',
                 'prompt_word_count', 'response_a_word_count', 'response_b_word_count',
                 'prompt_response_a_similarity', 'prompt_response_b_similarity']]
target = data['target']

print(data.head())


       id             model_a              model_b  \
0   30192  gpt-4-1106-preview           gpt-4-0613   
1   53567           koala-13b           gpt-4-0613   
2   65089  gpt-3.5-turbo-0613       mistral-medium   
3   96401    llama-2-13b-chat  mistral-7b-instruct   
4  198779           koala-13b   gpt-3.5-turbo-0314   

                                              prompt  \
0  ["Is it morally right to try to have a certain...   
1  ["What is the difference between marriage lice...   
2  ["explain function calling. how would you call...   
3  ["How can I create a test set for a very rare ...   
4  ["What is the best way to travel from Tel-Aviv...   

                                          response_a  \
0  ["The question of whether it is morally right ...   
1  ["A marriage license is a legal document that ...   
2  ["Function calling is the process of invoking ...   
3  ["Creating a test set for a very rare category...   
4  ["The best way to travel from Tel Aviv to Jeru...   

 

In [5]:

# ANOTHER TRY OF FEATURES ENGIREERING
# Combine text columns into one
data['combined_text'] = data['prompt'] + " " + data['response_a'] + " " + data['response_b']

# Apply TF-IDF
tfidf_combined = TfidfVectorizer(max_features=10000, ngram_range=(3, 6), stop_words='english')
# TfidfVectorizer(analyzer='char', ngram_range=(3, 6), max_features=1000)
tfidf_matrix = tfidf_combined.fit_transform(data['combined_text'])
tfidf_features = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_combined.get_feature_names_out())

# Define features and target
X = tfidf_features
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:

# Initialize XGBoost with basic parameters
xgb_model = XGBClassifier(objective='multi:softmax', num_class=3, eval_metric='mlogloss', random_state=42)

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred = xgb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"XGBoost Accuracy: {accuracy}")
print("Classification Report:\n", classification_report(y_test, y_pred))



XGBoost Accuracy: 0.3588204592901879
Classification Report:
               precision    recall  f1-score   support

           0       0.36      0.78      0.49      4030
           1       0.34      0.15      0.21      3929
           2       0.42      0.11      0.17      3537

    accuracy                           0.36     11496
   macro avg       0.37      0.35      0.29     11496
weighted avg       0.37      0.36      0.30     11496



In [8]:
# Create DMatrix
dtrain = DMatrix(X_train, label=y_train)

# Define parameters
params = {
    'objective': 'multi:softmax',
    'num_class': 3,
    'max_depth': 3,
    'eta': 0.1,
    'subsample': 0.4,
}

# Perform cross-validation
cv_results = cv(
    params,
    dtrain,
    num_boost_round=100,
    nfold=5,
    metrics="merror",
    early_stopping_rounds=10
)

# Train final model with optimal parameters
best_params = params.copy()
best_params.update({'num_boost_round': len(cv_results)})
final_model = XGBClassifier(**best_params)
final_model.fit(X_train, y_train)

# Predictions and evaluation
y_pred = final_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Optimized XGBoost Accuracy: {accuracy}")


KeyboardInterrupt: 

In [9]:

# Best model from grid search
best_xgb_model = grid_search.best_estimator_

# Make predictions
y_pred_best = best_xgb_model.predict(X_test)

# Evaluate the tuned model
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f"Optimized XGBoost Accuracy: {accuracy_best}")
print("Optimized Classification Report:\n", classification_report(y_test, y_pred_best))


AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'