In [12]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llm-classification-finetuning/sample_submission.csv
/kaggle/input/llm-classification-finetuning/train.csv
/kaggle/input/llm-classification-finetuning/test.csv


In [13]:
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

**Basic Text Statistics**

In [14]:
def text_stats(text):
    """Compute simple length and character-composition statistics for a text.

    Parameters
    ----------
    text : str or missing value
        The text to describe. Anything that is not a ``str`` (``None``,
        pandas ``NaN``, ...) is treated as the empty string so that missing
        cells in a DataFrame column do not crash feature extraction.

    Returns
    -------
    dict
        ``n_chars``, ``n_tokens`` (whitespace-separated), ``n_sentences``
        (non-blank pieces after splitting on ``.``, ``!``, ``?``),
        ``avg_token_len`` (0 when there are no tokens) and the
        ``punct_ratio`` / ``digit_ratio`` / ``uppercase_ratio`` character
        ratios.
    """
    # `text or ""` alone lets NaN through (NaN is a truthy float), which then
    # fails on `.split()`; guard on the type instead.
    if not isinstance(text, str):
        text = ""

    tokens = text.split()
    sentences = re.split(r'[.!?]', text)

    # The +1 keeps the ratios defined for empty text.
    denom = len(text) + 1

    return {
        "n_chars": len(text),
        "n_tokens": len(tokens),
        "n_sentences": len([s for s in sentences if len(s.strip()) > 0]),
        "avg_token_len": np.mean([len(t) for t in tokens]) if tokens else 0,
        "punct_ratio": sum(c in ".,;:!?" for c in text) / denom,
        "digit_ratio": sum(c.isdigit() for c in text) / denom,
        "uppercase_ratio": sum(c.isupper() for c in text) / denom,
    }

In [15]:
def formatting_features(text):
    """Extract markdown/formatting cues from a piece of text.

    Parameters
    ----------
    text : str or missing value
        The text to inspect. Anything that is not a ``str`` (``None``,
        pandas ``NaN``, ...) is treated as the empty string.

    Returns
    -------
    dict
        ``has_bullets`` / ``n_bullets``: lines that start (after optional
        whitespace) with ``-``, ``*`` or the bullet character U+2022.
        ``has_code``: 1 if the text contains any backtick.
        ``n_code_blocks``: number of triple-backtick fence markers (an opened
        and closed block therefore counts twice).
        ``n_urls``: occurrences of ``http://`` or ``https://``.
        ``has_steps``: 1 if "step", "first", "second" or "third" appears as a
        whole word (case-insensitive).
        ``question_marks`` / ``exclamations``: raw character counts.
    """
    # `text or ""` alone lets NaN through (NaN is a truthy float).
    if not isinstance(text, str):
        text = ""

    # The bullet is written as an escape so the pattern cannot be corrupted by
    # a file re-encoding; the mis-decoded form matched stray accented and
    # currency characters instead of the bullet.
    bullet_hits = re.findall(r"^\s*[-*\u2022]", text, re.MULTILINE)

    return {
        "has_bullets": int(bool(bullet_hits)),
        "n_bullets": len(bullet_hits),
        # A triple backtick always contains a single backtick, so one test suffices.
        "has_code": int("`" in text),
        "n_code_blocks": text.count("```"),
        "n_urls": len(re.findall(r"http[s]?://", text)),
        "has_steps": int(bool(re.search(r"\b(step|first|second|third)\b", text.lower()))),
        "question_marks": text.count("?"),
        "exclamations": text.count("!"),
    }


In [5]:
def fit_tfidf(df):
    """Fit a word uni/bi-gram TF-IDF vectorizer on prompts and both responses.

    The corpus is the ``prompt`` column followed by ``response_a`` and then
    ``response_b`` of ``df``. Returns the fitted ``TfidfVectorizer``.
    """
    documents = []
    for column in ("prompt", "response_a", "response_b"):
        documents.extend(df[column].tolist())

    vectorizer = TfidfVectorizer(
        stop_words="english",
        max_features=50_000,
        ngram_range=(1, 2),
    )
    # fit() returns the vectorizer itself.
    return vectorizer.fit(documents)

In [6]:
def tfidf_similarity(tfidf, text1, text2):
    """Return the cosine similarity of two texts in the given TF-IDF space.

    ``tfidf`` must already be fitted; both texts are transformed together and
    the single pairwise similarity value is returned.
    """
    pair = tfidf.transform([text1, text2])
    first_vec, second_vec = pair[0], pair[1]
    # cosine_similarity yields a 1x1 matrix for two single-row inputs.
    similarity = cosine_similarity(first_vec, second_vec)
    return similarity[0][0]

In [16]:
def build_response_features(df, response_col, prefix):
    """Build per-row text and formatting features for one response column.

    Each value of ``df[response_col]`` is passed through ``text_stats`` and
    ``formatting_features``; the two dicts are merged (formatting keys win on
    a clash) and every resulting column is renamed to ``"{prefix}_{name}"``.
    The returned frame has a fresh default index, one row per input row.
    """
    rows = [
        dict(text_stats(response), **formatting_features(response))
        for response in df[response_col]
    ]

    out = pd.DataFrame(rows)
    out.columns = [f"{prefix}_{name}" for name in out.columns]
    return out

In [17]:
def build_features(df, tfidf=None):
    """Assemble the full feature matrix for a frame of prompt/response pairs.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``prompt``, ``response_a``, ``response_b``, ``model_a``
        and ``model_b``. The input is not modified.
    tfidf : fitted vectorizer, optional
        A vectorizer previously returned by ``fit_tfidf``. When omitted, a new
        one is fitted on ``df`` (the original behaviour). Pass the training
        vectorizer when featurizing other data so the similarity features
        live in the same TF-IDF space.

    Returns
    -------
    pd.DataFrame
        One row per input row, indexed 0..n-1, holding:
        ``a_*`` / ``b_*`` per-response statistics, ``delta_*`` (a minus b),
        ``sim_a_prompt``, ``sim_b_prompt``, ``delta_prompt_sim``,
        ``len_ratio`` and the raw ``model_a`` / ``model_b`` labels (still
        strings, to be encoded later).
    """
    # The helper frames below carry a default RangeIndex. Resetting here keeps
    # every piece positionally aligned; with a non-default input index the
    # column-wise concat and the len_ratio assignment would otherwise align on
    # labels and produce NaNs / extra rows. reset_index also returns a new
    # frame, so the caller's df is left untouched.
    df = df.reset_index(drop=True)

    # --- Response-level features ---
    fa = build_response_features(df, "response_a", "a")
    fb = build_response_features(df, "response_b", "b")

    # --- Delta features ---
    # Swap only the leading "a_" prefix; str.replace would also rewrite any
    # later "a_" inside a feature name.
    delta_df = pd.DataFrame(
        fa.values - fb.values,
        columns=["delta_" + c[len("a_"):] for c in fa.columns],
    )

    # --- TF-IDF similarity ---
    if tfidf is None:
        tfidf = fit_tfidf(df)

    df["sim_a_prompt"] = [
        tfidf_similarity(tfidf, p, a)
        for p, a in zip(df["prompt"], df["response_a"])
    ]
    df["sim_b_prompt"] = [
        tfidf_similarity(tfidf, p, b)
        for p, b in zip(df["prompt"], df["response_b"])
    ]
    df["delta_prompt_sim"] = df["sim_a_prompt"] - df["sim_b_prompt"]

    # +1 on both sides keeps the ratio finite when a response has no tokens.
    df["len_ratio"] = (fa["a_n_tokens"] + 1) / (fb["b_n_tokens"] + 1)

    # --- Model identity (encode later) ---
    model_feats = df[["model_a", "model_b"]]

    # --- Final feature set ---
    X = pd.concat(
        [
            fa,
            fb,
            delta_df,
            df[
                [
                    "sim_a_prompt",
                    "sim_b_prompt",
                    "delta_prompt_sim",
                    "len_ratio",
                ]
            ],
            model_feats,
        ],
        axis=1,
    )

    return X

In [18]:
def build_target(df):
    """Turn the one-hot winner columns into a single soft target per row.

    Returns a float array: 1.0 where ``winner_model_a`` is 1, 0.0 where
    ``winner_model_b`` is 1, 0.5 where ``winner_tie`` is 1 and 0.0 when none
    is set. If several flags are set, tie takes precedence over b, and b
    over a.
    """
    a_wins = (df["winner_model_a"] == 1).to_numpy()
    b_wins = (df["winner_model_b"] == 1).to_numpy()
    tied = (df["winner_tie"] == 1).to_numpy()

    # np.select takes the first matching condition, so list them in
    # precedence order.
    return np.select([tied, b_wins, a_wins], [0.5, 0.0, 1.0], default=0.0)

In [19]:
def swap_augmentation(df, y):
    """Double the data by appending a copy with sides A and B exchanged.

    In the appended copy ``response_a``/``response_b`` and
    ``model_a``/``model_b`` trade places and the target becomes ``1 - y``.
    All other columns are carried over unchanged. Returns
    ``(df_aug, y_aug)`` where ``df_aug`` has a fresh 0..2n-1 index and the
    original rows come first.
    """
    # assign() works on a copy and the right-hand sides are read from the
    # untouched input, so this is a true swap.
    mirrored = df.assign(
        response_a=df["response_b"],
        response_b=df["response_a"],
        model_a=df["model_b"],
        model_b=df["model_a"],
    )

    stacked = pd.concat([df, mirrored], axis=0, ignore_index=True)
    targets = np.concatenate((y, 1 - y))

    return stacked, targets

In [20]:
import tensorflow as tf

In [22]:

# Load the training data, derive the soft targets, double the set with the
# A/B swap augmentation, then build the feature matrix.
# No tf.device(...) wrapper: every step here is pandas / scikit-learn work,
# which a TensorFlow device scope does not affect.
df = pd.read_csv("/kaggle/input/llm-classification-finetuning/train.csv")

y = build_target(df)
df_aug, y_aug = swap_augmentation(df, y)

X = build_features(df_aug)

In [23]:
# Preview the first five rows of the feature matrix (head() defaults to 5).
X.head()

Unnamed: 0,a_n_chars,a_n_tokens,a_n_sentences,a_avg_token_len,a_punct_ratio,a_digit_ratio,a_uppercase_ratio,a_has_bullets,a_n_bullets,a_has_code,...,delta_n_urls,delta_has_steps,delta_question_marks,delta_exclamations,sim_a_prompt,sim_b_prompt,delta_prompt_sim,len_ratio,model_a,model_b
0,4538,656,40,5.919207,0.020048,0.003966,0.013439,0,0,0,...,0.0,0.0,1.0,3.0,0.392832,0.293792,0.09904,3.204878,gpt-4-1106-preview,gpt-4-0613
1,3114,531,21,4.854991,0.016051,0.002889,0.008668,0,0,0,...,0.0,0.0,0.0,0.0,0.609627,0.597318,0.012309,0.93007,koala-13b,gpt-4-0613
2,921,138,10,5.65942,0.019523,0.0,0.017354,0,0,1,...,0.0,0.0,0.0,2.0,0.404646,0.336475,0.068171,0.494662,gpt-3.5-turbo-0613,mistral-medium
3,3182,536,30,4.938433,0.019164,0.00377,0.018222,0,0,0,...,0.0,0.0,0.0,0.0,0.574644,0.553549,0.021095,2.018797,llama-2-13b-chat,mistral-7b-instruct
4,1300,230,11,4.630435,0.019985,0.008455,0.031514,0,0,0,...,0.0,0.0,0.0,0.0,0.608045,0.730724,-0.122679,1.878049,koala-13b,gpt-3.5-turbo-0314


In [19]:
import lightgbm as lgb

from sklearn.model_selection import GroupKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder

In [None]:
# Test data: read the test split (this previously re-read train.csv, so the
# "test" features were just the training rows).
# NOTE(review): build_features selects the model_a / model_b columns; confirm
# test.csv provides them, otherwise this call raises KeyError.
test_df = pd.read_csv("/kaggle/input/llm-classification-finetuning/test.csv")
X_test = build_features(test_df)


In [None]:
def encode_models(X_train, X_valid, cols=("model_a", "model_b"), X_test=None):
    """Label-encode the model-identity columns of a train/validation split.

    For every column in ``cols`` a fresh ``LabelEncoder`` is fitted on the
    string form of the combined train and validation values, and the column
    is overwritten in both frames with its integer codes.

    Parameters
    ----------
    X_train, X_valid : pd.DataFrame
        Frames holding the raw label columns. Modified in place.
    cols : iterable of str
        Names of the columns to encode.
    X_test : pd.DataFrame, optional
        Extra frame to encode with the same per-column encoders (in place).
        Labels not present in train/validation are mapped to -1 instead of
        raising. Pass a frame that still holds raw labels: encoding an
        already-encoded frame again turns every value into -1.

    Returns
    -------
    tuple
        ``(X_train, X_valid)``, the same objects that were passed in.

    Notes
    -----
    The function no longer reaches for a global ``X_test``. Doing so
    re-encoded the same frame on every fold, and from the second call on the
    column held integer codes whose string form the encoder had never seen,
    so ``transform`` raised.
    """
    for c in cols:
        le = LabelEncoder()
        all_vals = pd.concat([X_train[c], X_valid[c]]).astype(str)
        le.fit(all_vals)

        X_train[c] = le.transform(X_train[c].astype(str))
        X_valid[c] = le.transform(X_valid[c].astype(str))

        if X_test is not None:
            # classes_ is sorted; position in it is the code transform() assigns.
            code_of = {label: code for code, label in enumerate(le.classes_)}
            X_test[c] = X_test[c].astype(str).map(code_of).fillna(-1).astype(int)
    return X_train, X_valid

In [21]:
# LightGBM training configuration: squared-error regression on the soft
# target, evaluated with RMSE.
lgb_params = dict(
    objective="regression",
    metric="rmse",
    learning_rate=0.03,
    num_leaves=64,
    max_depth=-1,
    min_data_in_leaf=50,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    lambda_l1=0.5,
    lambda_l2=0.5,
    verbosity=-1,
    seed=42,
)

In [22]:
def train_lgb_oof(X, y, prompts, n_splits=5, params=None):
    """Train one LightGBM model per fold and collect out-of-fold predictions.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix; must still contain the raw ``model_a`` / ``model_b``
        columns, which are label-encoded per fold via ``encode_models``.
    y : array-like
        Target values, positionally aligned with the rows of ``X``.
    prompts : array-like
        Group label per row. Rows sharing a label are kept in the same fold
        by ``GroupKFold``.
    n_splits : int
        Number of folds.
    params : dict, optional
        LightGBM parameters. Defaults to the module-level ``lgb_params``.

    Returns
    -------
    (np.ndarray, list)
        Out-of-fold predictions (clipped to [0.02, 0.98]) and the list of
        trained boosters, one per fold.
    """
    if params is None:
        params = lgb_params

    # Folds are positional indices; make sure y is indexed the same way even
    # if a labelled Series is passed in.
    y = np.asarray(y)

    oof_preds = np.zeros(len(X))
    models = []

    gkf = GroupKFold(n_splits=n_splits)

    for fold, (tr_idx, va_idx) in enumerate(gkf.split(X, y, groups=prompts)):
        print(f"\n\U0001F525 Fold {fold + 1}")

        # Copies, because encode_models overwrites columns in place.
        X_tr, X_va = X.iloc[tr_idx].copy(), X.iloc[va_idx].copy()
        y_tr, y_va = y[tr_idx], y[va_idx]

        # Encode model identity
        X_tr, X_va = encode_models(X_tr, X_va)

        train_data = lgb.Dataset(X_tr, label=y_tr)
        valid_data = lgb.Dataset(X_va, label=y_va)

        model = lgb.train(
            params,
            train_data,
            valid_sets=[valid_data],
            num_boost_round=3000,
            callbacks=[
                lgb.early_stopping(100),
                lgb.log_evaluation(200),
            ],
        )

        preds = model.predict(X_va, num_iteration=model.best_iteration)
        # Keep predictions away from the extremes of the [0, 1] target range.
        preds = np.clip(preds, 0.02, 0.98)

        oof_preds[va_idx] = preds
        models.append(model)

        # Reported on the clipped predictions.
        print(
            "Fold RMSE:",
            np.sqrt(np.mean((y_va - preds) ** 2)),
        )

    return oof_preds, models


In [23]:
# X and y_aug were built in the feature-engineering cell above.
# Folds are grouped by prompt: swap_augmentation leaves the prompt column
# untouched, so a row and its swapped copy always land in the same fold.
prompts = df_aug["prompt"]

oof_lgb, lgb_models = train_lgb_oof(X, y_aug, prompts)

print("\n\u2705 Overall OOF RMSE:",
      np.sqrt(np.mean((y_aug - oof_lgb) ** 2)))



ðŸ”¥ Fold 1
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.384406
[400]	valid_0's rmse: 0.383942
[600]	valid_0's rmse: 0.383832
Early stopping, best iteration is:
[500]	valid_0's rmse: 0.38372
Fold RMSE: 0.3837141828791263

ðŸ”¥ Fold 2
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.386393
[400]	valid_0's rmse: 0.385658
Early stopping, best iteration is:
[382]	valid_0's rmse: 0.385644
Fold RMSE: 0.38564225575798927

ðŸ”¥ Fold 3
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.384794
[400]	valid_0's rmse: 0.383987
[600]	valid_0's rmse: 0.383871
Early stopping, best iteration is:
[540]	valid_0's rmse: 0.383807
Fold RMSE: 0.38379807445071246

ðŸ”¥ Fold 4
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.383608
[400]	valid_0's rmse: 0.382734
Early stopping, best iteration is:
[481]	valid_0's rmse: 0.382563
Fold RMSE: 0.3825621900379248

ðŸ”¥

In [24]:
def format_submission(p, tie_prob=0.0):
    """Turn P(model A wins) into the three submission probability columns.

    Parameters
    ----------
    p : array-like
        Predicted probability that model A wins, one value per row. Lists and
        tuples are accepted; a pandas Series keeps its index in the result.
    tie_prob : float
        Constant probability assigned to ``winner_tie``. The remaining
        ``1 - tie_prob`` is split between A and B in proportion to ``p`` and
        ``1 - p``, so every row still sums to 1. The default of 0.0 gives the
        original output: ``p``, ``1 - p`` and a zero tie column.

    Returns
    -------
    pd.DataFrame
        Columns ``winner_model_a``, ``winner_model_b``, ``winner_tie``.

    Raises
    ------
    ValueError
        If ``tie_prob`` is outside [0, 1].
    """
    if not 0.0 <= tie_prob <= 1.0:
        raise ValueError(f"tie_prob must be in [0, 1], got {tie_prob!r}")

    # `1 - p` is undefined for plain lists/tuples; arrays and Series pass through.
    if not isinstance(p, (np.ndarray, pd.Series)):
        p = np.asarray(p, dtype=float)

    decided = 1.0 - tie_prob
    return pd.DataFrame({
        "winner_model_a": p * decided,
        "winner_model_b": (1 - p) * decided,
        "winner_tie": np.full(len(p), float(tie_prob)),
    })

In [26]:
# Display the list of boosters (one per CV fold) returned by train_lgb_oof.
lgb_models

[<lightgbm.basic.Booster at 0x78ffc371b390>,
 <lightgbm.basic.Booster at 0x78ffc3718c10>,
 <lightgbm.basic.Booster at 0x79000857b990>,
 <lightgbm.basic.Booster at 0x78ffe075a150>,
 <lightgbm.basic.Booster at 0x78ff9e55c290>]