In [1]:
import pandas as pd

# Load the competition CSVs with the Python parsing engine.
# Malformed lines are still skipped (same resulting frames as before), but
# `on_bad_lines='warn'` reports every skipped line instead of discarding it
# silently. This matters most for the test set: its `id` column is what the
# submission is built from, so a silently dropped row is a missing prediction.
DATA_DIR = './kaggle/input/llm-classification-finetuning'

train_df = pd.read_csv(f'{DATA_DIR}/train.csv', engine='python', on_bad_lines='warn')

test_df = pd.read_csv(f'{DATA_DIR}/test.csv', engine='python', on_bad_lines='warn')

# Quick look at the first rows of each frame.
print("### First 5 rows of train_df:\n")
print(train_df.head())
print("\n### First 5 rows of test_df:\n")
print(test_df.head())

# Column dtypes and non-null counts (DataFrame.info prints directly).
print("\n### Info for train_df:\n")
train_df.info()
print("\n### Info for test_df:\n")
test_df.info()

# Per-column count of missing values.
print("\n### Missing values in train_df:\n")
print(train_df.isnull().sum())
print("\n### Missing values in test_df:\n")
print(test_df.isnull().sum())

### First 5 rows of train_df:

       id             model_a              model_b  \
0   30192  gpt-4-1106-preview           gpt-4-0613   
1   53567           koala-13b           gpt-4-0613   
2   65089  gpt-3.5-turbo-0613       mistral-medium   
3   96401    llama-2-13b-chat  mistral-7b-instruct   
4  198779           koala-13b   gpt-3.5-turbo-0314   

                                              prompt  \
0  ["Is it morally right to try to have a certain...   
1  ["What is the difference between marriage lice...   
2  ["explain function calling. how would you call...   
3  ["How can I create a test set for a very rare ...   
4  ["What is the best way to travel from Tel-Aviv...   

                                          response_a  \
0  ["The question of whether it is morally right ...   
1  ["A marriage license is a legal document that ...   
2  ["Function calling is the process of invoking ...   
3  ["Creating a test set for a very rare category...   
4  ["The best way to travel

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import numpy as np

def _join_text_columns(frame):
    """Return prompt, response_a and response_b of `frame` joined by single spaces.

    Each column is cast to `str` first, so the result is a string Series with
    the same index as `frame`.
    """
    prompt_text, answer_a_text, answer_b_text = (
        frame[column].astype(str) for column in ('prompt', 'response_a', 'response_b')
    )
    return prompt_text + " " + answer_a_text + " " + answer_b_text


# One free-text field per row: the prompt followed by both candidate responses.
train_df['combined_text'] = _join_text_columns(train_df)
test_df['combined_text'] = _join_text_columns(test_df)

# The vocabulary is learned from the training text only; the test text is
# projected onto that same vocabulary so both matrices share their columns.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['combined_text'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['combined_text'])

print("Shape of X_train_tfidf:", X_train_tfidf.shape)
print("Shape of X_test_tfidf:", X_test_tfidf.shape)

def get_winner_category(row):
    """Collapse the three winner flag columns of one row into a single label.

    Parameters
    ----------
    row : mapping-like
        Must support `row[key]` for 'winner_model_a', 'winner_model_b' and
        'winner_tie' (e.g. a DataFrame row passed by `apply(..., axis=1)`).

    Returns
    -------
    str or float
        'model_a', 'model_b' or 'tie' for the first flag (checked in that
        order) that equals 1; `np.nan` when none of them does.
    """
    flag_to_label = (
        ('winner_model_a', 'model_a'),
        ('winner_model_b', 'model_b'),
        ('winner_tie', 'tie'),
    )
    # Flags are inspected in priority order and the first hit wins, so later
    # flags are never read once an earlier one matched.
    for flag_column, label in flag_to_label:
        if row[flag_column] == 1:
            return label
    return np.nan

# Derive one categorical target per training row from the three winner flags.
train_df['winner_category'] = train_df.apply(get_winner_category, axis=1)

# get_winner_category returns NaN when no flag equals 1. Such rows cannot be
# given a class, and X_train_tfidf was already built from every row of
# train_df, so they cannot simply be dropped here without misaligning
# features and labels. Fail early with an explicit message instead of handing
# a mixed string/NaN column to the encoder.
unlabeled_mask = train_df['winner_category'].isna()
if unlabeled_mask.any():
    raise ValueError(
        f"{int(unlabeled_mask.sum())} training row(s) have no winner flag set; "
        "remove or relabel them before computing TF-IDF features and encoding labels."
    )

# Map the string labels to integer class ids for the classifier.
label_encoder = LabelEncoder()

y_train_encoded = label_encoder.fit_transform(train_df['winner_category'])

print("First 5 encoded target values:", y_train_encoded[:5])
print("Original categories and their encoded values:", list(label_encoder.classes_), label_encoder.transform(list(label_encoder.classes_)))

Shape of X_train_tfidf: (57477, 287185)
Shape of X_test_tfidf: (3, 287185)
First 5 encoded target values: [0 1 2 0 1]
Original categories and their encoded values: ['model_a', 'model_b', 'tie'] [0 1 2]


In [3]:
from sklearn.linear_model import LogisticRegression

# Hyper-parameters for the baseline classifier, gathered in one place.
# NOTE(review): recent scikit-learn releases may deprecate `multi_class` —
# confirm against the installed version before upgrading.
logreg_params = {
    'solver': 'liblinear',
    'multi_class': 'ovr',
    'random_state': 42,
    'max_iter': 1000,
}

# `fit` returns the estimator itself, so construction and training are chained.
model = LogisticRegression(**logreg_params).fit(X_train_tfidf, y_train_encoded)

print("Logistic Regression model trained successfully.")



Logistic Regression model trained successfully.


In [4]:
import pandas as pd

# Class probabilities for every test row; columns follow label_encoder.classes_.
y_pred_proba = model.predict_proba(X_test_tfidf)

class_names = label_encoder.classes_
class_order = list(class_names)

# Resolve the probability column of each label by name rather than assuming a
# fixed position. All lookups happen before any column is written.
proba_column_for = {
    f'winner_{label}': class_order.index(label)
    for label in ('model_a', 'model_b', 'tie')
}

submission_df = pd.DataFrame({'id': test_df['id']})
for submission_column, proba_column in proba_column_for.items():
    submission_df[submission_column] = y_pred_proba[:, proba_column]

print("### Submission DataFrame head:\n")
print(submission_df.head())

### Submission DataFrame head:

        id  winner_model_a  winner_model_b  winner_tie
0   136060        0.165830        0.377595    0.456575
1   211333        0.442286        0.273911    0.283802
2  1233961        0.398590        0.428564    0.172845


In [5]:
import pandas as pd

# Write the submission that the previous cell assembled in `submission_df`.
# That cell already ran predict_proba and filled the three probability
# columns, so repeating the same computation here was redundant.
required_columns = ['id', 'winner_model_a', 'winner_model_b', 'winner_tie']
absent_columns = [column for column in required_columns if column not in submission_df.columns]
if absent_columns:
    # Guard against running this cell before the submission frame is complete.
    raise ValueError(f"submission_df is missing required columns: {absent_columns}")

print("### Submission DataFrame head:\n")
print(submission_df.head())

# NOTE(review): this writes into the input data folder; Kaggle normally expects
# submission.csv in the working directory — confirm the intended location.
submission_df.to_csv('./kaggle/input/llm-classification-finetuning/submission.csv', index=False)
print("created")

### Submission DataFrame head:

        id  winner_model_a  winner_model_b  winner_tie
0   136060        0.165830        0.377595    0.456575
1   211333        0.442286        0.273911    0.283802
2  1233961        0.398590        0.428564    0.172845
created
