In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [46]:
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

**Basic Text Statistics**

In [47]:
def text_stats(text):
    text = text or ""
    tokens = text.split()
    sentences = re.split(r'[.!?]', text)
    return {
        "n_chars": len(text),
        "n_tokens": len(tokens),
        "n_sentences":len([s for s in sentences if len(s.strip())>0]),
        "avg_token_len":np.mean([len(t) for t in tokens]) if tokens else 0,
        "punct_ratio": sum(c in ".,;:!?" for c in text)/(len(text)+1),
        "digit_ratio": sum(c.isdigit() for c in text) / (len(text) + 1),
        "uppercase_ratio": sum(c.isupper() for c in text) / (len(text) + 1),
    }

In [48]:
def formatting_features(text):
    text = text or ""

    return {
        "has_bullets": int(bool(re.search(r"^\s*[-*â€¢]", text, re.MULTILINE))),
        "n_bullets": len(re.findall(r"^\s*[-*â€¢]", text, re.MULTILINE)),
        "has_code": int("```" in text or "`" in text),
        "n_code_blocks": text.count("```"),
        "n_urls": len(re.findall(r"http[s]?://", text)),
        "has_steps": int(bool(re.search(r"\b(step|first|second|third)\b", text.lower()))),
        "question_marks": text.count("?"),
        "exclamations": text.count("!"),
    }


In [49]:
def fit_tfidf(df):
    corpus = (
        df["prompt"].tolist()
        + df["response_a"].tolist()
        + df["response_b"].tolist()
    )
    tfidf = TfidfVectorizer(
        ngram_range=(1,2),
        max_features=50_000,
        stop_words="english"
    )
    tfidf.fit(corpus)
    return tfidf

In [50]:
def tfidf_similarity(tfidf, text1, text2):
    vecs = tfidf.transform([text1, text2])
    return cosine_similarity(vecs[0], vecs[1])[0][0]

In [51]:
def build_response_features(df, response_col, prefix):
    features = []

    for text in df[response_col]:
        stats = text_stats(text)
        fmt = formatting_features(text)
        features.append({**stats, **fmt})

    feat_df = pd.DataFrame(features)
    feat_df.columns = [f"{prefix}_{c}" for c in feat_df.columns]
    return feat_df

In [52]:
def build_features(df):
    df = df.copy()
    # --- Response-level features ---
    fa = build_response_features(df, "response_a", "a")
    fb = build_response_features(df, "response_b", "b")
    # --- Delta features ---
    delta = fa.values - fb.values
    delta_df = pd.DataFrame(
        delta,
        columns=[c.replace("a_", "delta_") for c in fa.columns]
    )
    # --- TF-IDF similarity ---
    tfidf = fit_tfidf(df)

    df["sim_a_prompt"] = [
        tfidf_similarity(tfidf, p, a)
        for p, a in zip(df["prompt"], df["response_a"])
    ]
    df["sim_b_prompt"] = [
        tfidf_similarity(tfidf, p, b)
        for p, b in zip(df["prompt"], df["response_b"])
    ]
    df["delta_prompt_sim"] = df["sim_a_prompt"] - df["sim_b_prompt"]
    df["len_ratio"] = (
        (fa["a_n_tokens"] + 1) / (fb["b_n_tokens"] + 1)
    )
    # --- Model identity (encode later) ---
    model_feats = df[["model_a", "model_b"]]
    # --- Final feature set ---
    X = pd.concat(
        [
            fa,
            fb,
            delta_df,
            df[
                [
                    "sim_a_prompt",
                    "sim_b_prompt",
                    "delta_prompt_sim",
                    "len_ratio",
                ]
            ],
            model_feats,
        ],
        axis=1,
    )

    return X

In [None]:
def build_target(df):
    y = np.zeros(len(df))
    y[df["winner_model_a"] == 1] = 1.0
    y[df["winner_model_b"] == 1] = 0.0
    y[df["winner_tie"] == 1] = 0.5
    return y

In [None]:
def swap_augmentation(df, y):
    df_swapped = df.copy()

    df_swapped["response_a"], df_swapped["response_b"] = (
        df["response_b"],
        df["response_a"],
    )
    df_swapped["model_a"], df_swapped["model_b"] = (
        df["model_b"],
        df["model_a"],
    )

    y_swapped = 1 - y

    df_aug = pd.concat([df, df_swapped], axis=0).reset_index(drop=True)
    y_aug = np.concatenate([y, y_swapped])

    return df_aug, y_aug

In [None]:
import tensorflow as tf

In [None]:

with tf.device('/GPU:0'):
    
    df = pd.read_csv("/kaggle/input/llm-classification-finetuning/train.csv")
    
    y = build_target(df)
    df_aug, y_aug = swap_augmentation(df, y)
    
    X = build_features(df_aug)

In [None]:
X.head(5)

In [None]:
import lightgbm as lgb

from sklearn.model_selection import GroupKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder

In [None]:
#test data
with tf.device('/GPU:0'):
    test_df = pd.read_csv("/kaggle/input/llm-classification-finetuning/train.csv")
    X_test = build_features(test_df)


In [None]:
X_test

In [None]:
def encode_models(X_train, X_valid, cols=["model_a", "model_b"]):
    for c in cols:
        le = LabelEncoder()
        all_vals = pd.concat([X_train[c], X_valid[c], X_test[c]]).astype(str)
        le.fit(all_vals)

        X_train[c] = le.transform(X_train[c].astype(str))
        X_valid[c] = le.transform(X_valid[c].astype(str))
        X_test[c] = le.transform(X_test[c].astype(str))
    return X_train, X_valid

In [None]:
lgb_params = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.03,
    "num_leaves": 64,
    "max_depth": -1,
    "min_data_in_leaf": 50,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "lambda_l1": 0.5,
    "lambda_l2": 0.5,
    "verbosity": -1,
    "seed": 42,
}

In [None]:
def train_lgb_oof(X, y, prompts, n_splits=5):
    oof_preds = np.zeros(len(X))
    models = []

    gkf = GroupKFold(n_splits=n_splits)

    for fold, (tr_idx, va_idx) in enumerate(gkf.split(X, y, groups=prompts)):
        print(f"\nðŸ”¥ Fold {fold + 1}")

        X_tr, X_va = X.iloc[tr_idx].copy(), X.iloc[va_idx].copy()
        y_tr, y_va = y[tr_idx], y[va_idx]

        # Encode model identity
        X_tr, X_va = encode_models(X_tr, X_va)

        train_data = lgb.Dataset(X_tr, label=y_tr)
        valid_data = lgb.Dataset(X_va, label=y_va)

        model = lgb.train(
            lgb_params,
            train_data,
            valid_sets=[valid_data],
            num_boost_round=3000,
            callbacks=[
                lgb.early_stopping(100),
                lgb.log_evaluation(200),
            ],
        )

        preds = model.predict(X_va, num_iteration=model.best_iteration)
        preds = np.clip(preds, 0.02, 0.98)

        oof_preds[va_idx] = preds
        models.append(model)

        print(
            "Fold RMSE:",
            np.sqrt(np.mean((y_va - preds) ** 2)),
        )

    return oof_preds, models


In [None]:
# X, y_aug already built from feature engineering
with tf.device('/GPU:0'):
    prompts = df_aug["prompt"]

    oof_lgb, lgb_models = train_lgb_oof(X, y_aug, prompts)

    print("\nâœ… Overall OOF RMSE:",
      np.sqrt(np.mean((y_aug - oof_lgb) ** 2)))


In [53]:
test_pred = lgb_models[4].predict(X_test)

In [54]:
test_pred

array([0.61589651, 0.55476302, 0.46846517, ..., 0.54120125, 0.55018336,
       0.59073206])

In [55]:
submission = pd.DataFrame({
    "winner_model_a": test_pred,
    "winner_model_b": 1 - test_pred,
    "winner_tie": np.zeros(len(test_pred))
})

In [56]:
submission

Unnamed: 0,winner_model_a,winner_model_b,winner_tie
0,0.615897,0.384103,0.0
1,0.554763,0.445237,0.0
2,0.468465,0.531535,0.0
3,0.541663,0.458337,0.0
4,0.388269,0.611731,0.0
...,...,...,...
57472,0.615788,0.384212,0.0
57473,0.531420,0.468580,0.0
57474,0.541201,0.458799,0.0
57475,0.550183,0.449817,0.0


In [59]:
submission.to_csv("submission.csv", index=True)


In [None]:
# def format_submission(p):
#     return pd.DataFrame({
#         "winner_model_a": p,
#         "winner_model_b": 1 - p,
#         "winner_tie": np.zeros(len(p)),
#     })