In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found, one path per line.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llm-classification-finetuning/sample_submission.csv
/kaggle/input/llm-classification-finetuning/train.csv
/kaggle/input/llm-classification-finetuning/test.csv


In [2]:
import pandas as pd

# Load the training and test splits (in that order) from the competition's
# read-only input directory.
Train, Test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/llm-classification-finetuning/train.csv',
        '/kaggle/input/llm-classification-finetuning/test.csv',
    )
)

In [3]:
def _as_text(column):
    """Return ``column`` with missing values replaced by '' and every entry cast to ``str``."""
    return column.fillna('').astype(str)


def lenght_features(df):
    """Add character- and word-length features for both responses to ``df``.

    (The function name keeps its original spelling so existing callers
    continue to work.)

    Reads ``df['response_a']`` and ``df['response_b']`` and writes six
    columns to ``df`` in place:

    * ``char_len_a`` / ``char_len_b`` -- number of characters in each response
    * ``word_len_a`` / ``word_len_b`` -- number of whitespace-separated tokens
    * ``char_len_diff`` / ``word_len_diff`` -- the ``a`` value minus the ``b`` value

    A missing response is treated as empty text, so it contributes 0 to both
    the character and the word count. Without this the character count would
    be NaN while the word count would be 1 (the single token "nan").

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``response_a`` and ``response_b`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the six feature columns added.
    """
    text_a = _as_text(df['response_a'])
    text_b = _as_text(df['response_b'])

    df['char_len_a'] = text_a.str.len()
    df['char_len_b'] = text_b.str.len()
    df['char_len_diff'] = df['char_len_a'] - df['char_len_b']

    # str.split() with no argument splits on any run of whitespace, so an
    # empty string yields zero tokens.
    df['word_len_a'] = text_a.map(lambda text: len(text.split()))
    df['word_len_b'] = text_b.map(lambda text: len(text.split()))
    df['word_len_diff'] = df['word_len_a'] - df['word_len_b']

    return df

# Attach the length features to both splits, train first and then test.
Train, Test = lenght_features(Train), lenght_features(Test)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fit one shared TF-IDF vocabulary (capped at 5000 terms) on every prompt
# and response text from both the train and the test split.
# NOTE(review): the test texts take part in fitting the vocabulary -- confirm
# this is intended.
text_columns = ('prompt', 'response_a', 'response_b')
corpus_parts = [split[column] for split in (Train, Test) for column in text_columns]

tfidf_sim = TfidfVectorizer(max_features=5000)
tfidf_sim.fit(pd.concat(corpus_parts))

def get_sim(row):
    """Compute prompt-to-response TF-IDF cosine similarities for one row.

    ``row`` must provide ``'prompt'``, ``'response_a'`` and ``'response_b'``.
    Each text is vectorized with the module-level ``tfidf_sim`` vectorizer.

    Returns a ``pd.Series`` indexed by ``sim_a``, ``sim_b`` and ``sim_diff``,
    where ``sim_diff`` is ``sim_a - sim_b``.
    """
    vec_prompt, vec_a, vec_b = (
        tfidf_sim.transform([row[field]])
        for field in ('prompt', 'response_a', 'response_b')
    )

    # cosine_similarity returns a 1x1 matrix here; take its only entry.
    score_a = cosine_similarity(vec_prompt, vec_a)[0][0]
    score_b = cosine_similarity(vec_prompt, vec_b)[0][0]

    return pd.Series(
        [score_a, score_b, score_a - score_b],
        index=['sim_a', 'sim_b', 'sim_diff'],
    )

def _tfidf_sim_columns(df):
    """Return the sim_a / sim_b / sim_diff columns for every row of ``df``.

    Vectorized replacement for ``df.apply(get_sim, axis=1)``: each text column
    is transformed once as a whole, instead of issuing three single-document
    ``transform`` calls per row.

    ``tfidf_sim`` is created with the default ``norm='l2'``, so every
    transformed row has unit length (or is all zeros). The row-wise dot
    product of two such matrices is therefore the cosine similarity, and
    0 when either text has no in-vocabulary terms.

    Returns a DataFrame indexed like ``df`` with columns ``sim_a``, ``sim_b``
    and ``sim_diff`` (``sim_a - sim_b``).
    """
    prompt_mat = tfidf_sim.transform(df['prompt'])
    resp_a_mat = tfidf_sim.transform(df['response_a'])
    resp_b_mat = tfidf_sim.transform(df['response_b'])

    # Element-wise product followed by a row sum == per-row dot product.
    sim_a = np.asarray(prompt_mat.multiply(resp_a_mat).sum(axis=1)).ravel()
    sim_b = np.asarray(prompt_mat.multiply(resp_b_mat).sum(axis=1)).ravel()

    return pd.DataFrame(
        {'sim_a': sim_a, 'sim_b': sim_b, 'sim_diff': sim_a - sim_b},
        index=df.index,
    )


Train[['sim_a', 'sim_b', 'sim_diff']] = _tfidf_sim_columns(Train)
Test[['sim_a', 'sim_b', 'sim_diff']] = _tfidf_sim_columns(Test)

In [5]:
from textblob import TextBlob

def get_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``.

    ``text`` is passed through ``str()`` first, so a non-string value is
    scored on its string representation.
    """
    blob = TextBlob(str(text))
    return blob.sentiment.polarity

# Add the polarity of each response, plus the a-minus-b difference, to both
# splits. Train is processed first, then Test.
for split in (Train, Test):
    split['sent_a'] = split['response_a'].apply(get_sentiment)
    split['sent_b'] = split['response_b'].apply(get_sentiment)
    split['sent_diff'] = split['sent_a'] - split['sent_b']

In [6]:
# Model input columns, one line per feature family (a, b, a-minus-b).
Features = [
    'char_len_a', 'char_len_b', 'char_len_diff',   # character counts
    'word_len_a', 'word_len_b', 'word_len_diff',   # word counts
    'sim_a', 'sim_b', 'sim_diff',                  # prompt/response TF-IDF similarity
    'sent_a', 'sent_b', 'sent_diff',               # sentiment polarity
]

In [7]:
import numpy as np

# Collapse the three winner columns into one class index per row:
# 0 -> winner_model_a, 1 -> winner_model_b, 2 -> winner_tie.
# (assumes exactly one of the three columns is set per row -- TODO confirm)
winner_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
y = Train[winner_columns].to_numpy().argmax(axis=1)

# Feature matrix restricted to the engineered columns.
X = Train.loc[:, Features]

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training rows for validation; the fixed seed makes the
# split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.1,
)

In [9]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees on the engineered features; hyper-parameters are
# collected in one dict so they are easy to spot and tweak.
gbm_params = {
    'n_estimators': 200,
    'learning_rate': 0.1,
    'random_state': 42,
}
model = GradientBoostingClassifier(**gbm_params)
model.fit(X_train, y_train)

In [10]:
from sklearn.metrics import log_loss

# Class-probability predictions for the hold-out rows; predict_proba returns
# one column per entry of model.classes_.
y_pred = model.predict_proba(X_val)

# Pass the model's class labels explicitly. Otherwise log_loss infers them
# from y_val alone, and a class missing from the validation split would
# misalign (or reject) the probability columns.
log_loss_val = log_loss(y_val, y_pred, labels=model.classes_)
print('Validation Log Loss: ', log_loss_val)

Validation Log Loss:  1.0431245370105229


In [11]:
# Score the test split on the same feature columns used for training.
X_test = Test.loc[:, Features]
test_pred = model.predict_proba(X_test)

In [12]:
# Assemble the submission: the test ids followed by one probability column
# per class, taken from test_pred columns 0, 1 and 2 in that order.
class_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']

submission_data = {'id': Test['id']}
for col_idx, col_name in enumerate(class_columns):
    submission_data[col_name] = test_pred[:, col_idx]

submission = pd.DataFrame(submission_data)
submission.to_csv('submission.csv', index=False)