In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier, cv, DMatrix
from transformers import BertTokenizer, BertModel, DistilBertTokenizer, DistilBertModel
import torch


In [4]:

data = pd.read_csv('./train.csv/train.csv')


In [5]:

# Define target variable with three classes: 0 (Model A wins), 1 (Model B wins), 2 (Tie)
data['target'] = np.where(data['winner_model_a'] == 1, 0, np.where(data['winner_model_b'] == 1, 1, 2))

data = data.drop(['winner_model_a', 'winner_model_b', 'winner_tie'], axis=1)


IF you want to run the program with the BERT algorithm, run the following code cells


In [6]:
# Load pre-trained BERT model and tokenizer
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertModel.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [7]:
def get_bert_embeddings_in_batches(text_list, batch_size=16):
    embeddings = []
    for i in range(0, len(text_list), batch_size):
        batch_texts = text_list[i:i + batch_size]
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
        
        # Ensure processing on CPU/GPU as available
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        inputs = {key: val.to(device) for key, val in inputs.items()}
        
        with torch.no_grad():
            outputs = model(**inputs)
        # Extract CLS embeddings
        cls_embeddings = outputs.last_hidden_state[:, 0, :]
        embeddings.append(cls_embeddings.cpu().numpy())
    
    # Concatenate all batches
    return np.concatenate(embeddings, axis=0)

In [8]:
# Combine text columns
data['combined_text'] = data['prompt'] + " " + data['response_a'] + " " + data['response_b']

# Generate BERT embeddings
bert_embeddings = get_bert_embeddings_in_batches(data['combined_text'].tolist(), batch_size=16)

# Define features and target
X = bert_embeddings
y = data['target']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

: 

IF you want to run the program with the TF - IDF algorithm, run the following code cell

In [4]:

# ANOTHER TRY OF FEATURES ENGIREERING
# Combine text columns into one
data['combined_text'] = data['prompt'] + " " + data['response_a'] + " " + data['response_b']

# Apply TF-IDF
tfidf_combined = TfidfVectorizer(max_features=10000, ngram_range=(3, 6), stop_words='english')
# TfidfVectorizer(analyzer='char', ngram_range=(3, 6), max_features=1000)
tfidf_matrix = tfidf_combined.fit_transform(data['combined_text'])
tfidf_features = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_combined.get_feature_names_out())

# Define features and target
X = tfidf_features
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
# Create DMatrix
dtrain = DMatrix(X_train, label=y_train)

# Define parameters
params = {
    'objective': 'multi:softmax',
    'num_class': 3,
    'max_depth': 3,
    'eta': 0.1,
    'subsample': 0.4,
}

# Perform cross-validation
cv_results = cv(
    params,
    dtrain,
    num_boost_round=100,
    nfold=5,
    metrics="merror",
    early_stopping_rounds=10,
    seed=42
)

# Get the optimal number of boosting rounds
optimal_boost_rounds = len(cv_results)
print(f"Optimal number of boosting rounds: {optimal_boost_rounds}")


Optimal number of boosting rounds: 19


In [8]:
# Train final model with optimal parameters
final_model = XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    max_depth=3,
    learning_rate=0.1,
    subsample=0.4,
    n_estimators=optimal_boost_rounds,
    random_state=42
)
final_model.fit(X_train, y_train)

# Predictions and evaluation
y_pred = final_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Optimized XGBoost Accuracy: {accuracy}")
print("Optimized Classification Report:\n", classification_report(y_test, y_pred))

Optimized XGBoost Accuracy: 0.3596033402922756
Optimized Classification Report:
               precision    recall  f1-score   support

           0       0.35      0.93      0.51      4030
           1       0.37      0.06      0.11      3929
           2       0.56      0.04      0.08      3537

    accuracy                           0.36     11496
   macro avg       0.43      0.34      0.23     11496
weighted avg       0.42      0.36      0.24     11496

