In [None]:
# Step 1: Baseline with Logistic Regression

In [None]:
# 1. 导入必要的库
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import log_loss, classification_report, confusion_matrix
import string

# 2. Load the train and test tables from the Kaggle input directory
TRAIN_PATH = '/kaggle/input/llm-classification-finetuning/train.csv'
TEST_PATH = '/kaggle/input/llm-classification-finetuning/test.csv'
train, test = pd.read_csv(TRAIN_PATH), pd.read_csv(TEST_PATH)

# 3. 特征工程：提取统计特征

def create_features(df):
    """Add length, word-count, punctuation-count and A-vs-B difference features.

    The new columns are written onto ``df`` itself and the same object is
    returned, so all pre-existing columns stay available to the caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the text columns ``prompt``, ``response_a`` and
        ``response_b``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with eleven additional numeric columns.
    """
    import re  # local import: `re` is not in this cell's import list

    # Escape every punctuation character before putting it in a character
    # class. Unescaped, the backslash in string.punctuation is consumed as an
    # escape for the following "]", so backslashes were never counted.
    punct_pattern = '[' + re.escape(string.punctuation) + ']'

    text_cols = ('prompt', 'response_a', 'response_b')
    # Missing text is treated as an empty string so that every feature is a
    # finite number instead of NaN.
    texts = {col: df[col].fillna('').astype(str) for col in text_cols}

    # Character lengths
    for col in text_cols:
        df[f'{col}_length'] = texts[col].str.len()
    # Whitespace-separated word counts
    for col in text_cols:
        df[f'{col}_word_count'] = texts[col].str.split().str.len()
    # Punctuation counts
    for col in text_cols:
        df[f'{col}_punc_count'] = texts[col].str.count(punct_pattern)

    # Response A minus response B
    df['response_length_diff'] = df['response_a_length'] - df['response_b_length']
    df['response_word_diff'] = df['response_a_word_count'] - df['response_b_word_count']
    return df

train = create_features(train)
test = create_features(test)

# 4. Label-encode the model names (optional: not used for test)
# Fit ONE encoder on the union of both columns so that a given model name maps
# to the same integer in model_a_enc and model_b_enc. Refitting the encoder per
# column can assign different codes to the same model whenever the two columns
# do not contain exactly the same set of names.
le = LabelEncoder()
le.fit(pd.concat([train['model_a'], train['model_b']], ignore_index=True))
model_cols = []
for col in ['model_a', 'model_b']:
    train[f'{col}_enc'] = le.transform(train[col])
    model_cols.append(f'{col}_enc')

# 5. Build the target label
#    winner_model_a=1 -> class 0, winner_model_b=1 -> class 1, winner_tie=1 -> class 2
winner_cols = ['winner_model_a', 'winner_model_b', 'winner_tie']
train['target'] = train[winner_cols].to_numpy().argmax(axis=1)

# 6. Select all numeric features
feature_cols = [
    # character lengths
    'prompt_length',
    'response_a_length',
    'response_b_length',
    # word counts
    'prompt_word_count',
    'response_a_word_count',
    'response_b_word_count',
    # punctuation counts
    'prompt_punc_count',
    'response_a_punc_count',
    'response_b_punc_count',
    # response A minus response B
    'response_length_diff',
    'response_word_diff',
    # optionally, model_a_enc / model_b_enc could be appended here
]

X = train.loc[:, feature_cols]
y = train['target']

# 7. Train/validation split + standardisation
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
# The scaler is fitted on the training fold only and then reused as-is.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# 8. Train the logistic regression
# `multi_class='ovr'` is no longer passed: the parameter is deprecated
# (scikit-learn 1.5+ warns on it). With the default setting one multinomial
# model is fitted for the three classes. Wrap the estimator in
# sklearn.multiclass.OneVsRestClassifier if one-vs-rest is specifically wanted.
clf = LogisticRegression(max_iter=1000, random_state=42)
clf.fit(X_train_scaled, y_train)

# 9. Validation metrics
val_pred_proba = clf.predict_proba(X_val_scaled)
val_pred = clf.predict(X_val_scaled)
print('Validation Log Loss:', log_loss(y_val, val_pred_proba))
print('Classification Report:\n', classification_report(y_val, val_pred, digits=4))

# 10. Select the test features, scale them with the fitted scaler, and predict
X_test = test.loc[:, feature_cols]
X_test_scaled = scaler.transform(X_test)
test_pred_proba = clf.predict_proba(X_test_scaled)

# 11. Write the Kaggle submission file
# Probability columns 0, 1, 2 are paired with the three winner_* names in order.
submission = pd.DataFrame({
    'id': test['id'],
    **dict(zip(
        ['winner_model_a', 'winner_model_b', 'winner_tie'],
        test_pred_proba.T,
    )),
})
submission.to_csv('submission_baseline.csv', index=False)
print(submission.head())


In [None]:
# Step 2: Embedding-based model

In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
# Load the train/test tables and the sentence-embedding model.
TRAIN_PATH = '/kaggle/input/llm-classification-finetuning/train.csv'
TEST_PATH = '/kaggle/input/llm-classification-finetuning/test.csv'
train, test = pd.read_csv(TRAIN_PATH), pd.read_csv(TEST_PATH)
model_name = 'sentence-transformers/all-MiniLM-L12-v2'
model = SentenceTransformer(model_name)
def concat_text(df):
    """Join the prompt with each candidate response.

    Returns a 2-tuple of Series: ``prompt + ' ' + response_a`` and
    ``prompt + ' ' + response_b``, with every column cast to ``str`` first.
    """
    prompt_text = df['prompt'].astype(str)

    def _with_response(column):
        # Prompt, one separating space, then the response text.
        return prompt_text + ' ' + df[column].astype(str)

    return _with_response('response_a'), _with_response('response_b')
# Build the "prompt + response" texts for both candidates.
train_a, train_b = concat_text(train)
test_a, test_b = concat_text(test)


def _embed_texts(texts):
    """Encode a Series of texts with the settings shared by every call in this cell."""
    return model.encode(texts.tolist(), batch_size=32, show_progress_bar=True)


def _stack_pair(first, second):
    """Return np.hstack of the A embeddings, the B embeddings and A - B."""
    return np.hstack([first, second, first - second])


# Training features and labels (argmax over the three winner_* columns).
emb_a = _embed_texts(train_a)
emb_b = _embed_texts(train_b)
X = _stack_pair(emb_a, emb_b)
y = train[['winner_model_a', 'winner_model_b', 'winner_tie']].to_numpy().argmax(axis=1)

# Stratified 80/20 split, fit, and report the validation log loss.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
clf = LogisticRegression(max_iter=200)
clf.fit(X_train, y_train)
print('Val Log Loss:', log_loss(y_val, clf.predict_proba(X_val)))

# Same feature construction for the test set, then predict.
emb_a_test = _embed_texts(test_a)
emb_b_test = _embed_texts(test_b)
X_test = _stack_pair(emb_a_test, emb_b_test)
proba = clf.predict_proba(X_test)

# Probability columns 0, 1, 2 are paired with the three winner_* names in order.
submission = pd.DataFrame({
    'id': test['id'],
    **dict(zip(['winner_model_a', 'winner_model_b', 'winner_tie'], proba.T)),
})
submission.to_csv('submission_emb.csv', index=False)

In [None]:
# Step 5: Final Model (Baseline Features + Embedding + Weight-Search Ensemble)

In [None]:
# === Step 5: Final Model (Baseline + Embeddings) with Weight Search Ensemble ===
import os, string, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

# 1) Load the data (same files as the earlier steps)
TRAIN_PATH = '/kaggle/input/llm-classification-finetuning/train.csv'
TEST_PATH = '/kaggle/input/llm-classification-finetuning/test.csv'
train, test = pd.read_csv(TRAIN_PATH), pd.read_csv(TEST_PATH)

# 2) Labels: winner_model_a=0, winner_model_b=1, winner_tie=2
y = train.loc[:, ['winner_model_a', 'winner_model_b', 'winner_tie']].to_numpy().argmax(axis=1)

# 3) One shared split, so both models are compared on the same validation rows
row_positions = np.arange(len(train))
train_idx, val_idx = train_test_split(
    row_positions, test_size=0.2, random_state=42, stratify=y
)
y_train = y[train_idx]
y_val = y[val_idx]

# -----------------------------
# A) Baseline（数值统计特征 + LR）
# -----------------------------
def create_features(df):
    """Build the numeric baseline features for a prompt/response table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the text columns ``prompt``, ``response_a`` and
        ``response_b``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame on ``df.index`` with eleven columns: character length,
        word count and punctuation count for each text column, plus the
        response A minus response B differences in length and word count.
    """
    import re  # local import: `re` is not in this cell's import list

    # Escape every punctuation character before putting it in a character
    # class. Unescaped, the backslash in string.punctuation is consumed as an
    # escape for the following "]", so backslashes were never counted.
    punct_pattern = '[' + re.escape(string.punctuation) + ']'

    text_cols = ('prompt', 'response_a', 'response_b')
    # Missing text becomes '' rather than the literal string 'nan', which
    # would otherwise be measured as three characters and one word.
    texts = {col: df[col].fillna('').astype(str) for col in text_cols}

    out = pd.DataFrame(index=df.index)
    # Character lengths
    for col in text_cols:
        out[f'{col}_length'] = texts[col].str.len()
    # Whitespace-separated word counts
    for col in text_cols:
        out[f'{col}_word_count'] = texts[col].str.split().str.len()
    # Punctuation counts
    for col in text_cols:
        out[f'{col}_punc_count'] = texts[col].str.count(punct_pattern)

    # Response A minus response B
    out['response_length_diff'] = out['response_a_length'] - out['response_b_length']
    out['response_word_diff'] = out['response_a_word_count'] - out['response_b_word_count']
    return out

# Baseline feature matrices for all training rows and for the test rows.
X_all_base  = create_features(train)
X_test_base = create_features(test)

# Slice with the shared split so this model sees the same rows as the others.
Xtr_base = X_all_base.iloc[train_idx]
Xva_base = X_all_base.iloc[val_idx]

# Fit the scaler on the training partition only, then apply it everywhere.
scaler = StandardScaler()
Xtrb = scaler.fit_transform(Xtr_base)
Xvab = scaler.transform(Xva_base)
Xteb  = scaler.transform(X_test_base)

# `multi_class='ovr'` is no longer passed: the parameter is deprecated
# (scikit-learn 1.5+ warns on it). With the default setting one multinomial
# model is fitted for the three classes. Wrap the estimator in
# sklearn.multiclass.OneVsRestClassifier if one-vs-rest is specifically wanted.
clf_base = LogisticRegression(max_iter=1000, random_state=42)
clf_base.fit(Xtrb, y_train)

proba_val_base = clf_base.predict_proba(Xvab)
proba_test_base = clf_base.predict_proba(Xteb)
ll_base = log_loss(y_val, proba_val_base)
print(f"[Baseline] Val logloss: {ll_base:.6f}")

# -----------------------------
# B) Embedding（MiniLM + LR）
# -----------------------------
# Embedding predictions stay None unless the whole branch below succeeds far
# enough to produce them; any failure switches use_embedding off.
use_embedding = True
proba_val_emb = None
proba_test_emb = None

if use_embedding:
    try:
        from sentence_transformers import SentenceTransformer

        def concat_text(df):
            """Return two lists of str: prompt + ' ' + response_a / response_b."""
            prompt_text = df['prompt'].astype(str)
            a = (prompt_text + ' ' + df['response_a'].astype(str)).tolist()
            b = (prompt_text + ' ' + df['response_b'].astype(str)).tolist()
            return a, b

        def _pick_rows(texts, positions):
            """Select the texts at the given integer positions, in order."""
            return [texts[i] for i in positions]

        def _encode_texts(texts):
            """Encode a list of texts with the settings shared by every call here."""
            return st_model.encode(texts, batch_size=32, show_progress_bar=True)

        def _pair_matrix(first, second):
            """Return np.hstack of the A embeddings, the B embeddings and A - B."""
            return np.hstack([first, second, first - second])

        model_name = 'sentence-transformers/all-MiniLM-L12-v2'  # same model as Step 2
        st_model = SentenceTransformer(model_name)

        # Encode the training and validation partitions separately, following
        # the shared split indices.
        a_all, b_all = concat_text(train)
        a_test, b_test = concat_text(test)

        a_tr, b_tr = _pick_rows(a_all, train_idx), _pick_rows(b_all, train_idx)
        a_va, b_va = _pick_rows(a_all, val_idx), _pick_rows(b_all, val_idx)

        emb_a_tr = _encode_texts(a_tr)
        emb_b_tr = _encode_texts(b_tr)
        Xtr_emb = _pair_matrix(emb_a_tr, emb_b_tr)

        emb_a_va = _encode_texts(a_va)
        emb_b_va = _encode_texts(b_va)
        Xva_emb = _pair_matrix(emb_a_va, emb_b_va)

        clf_emb = LogisticRegression(max_iter=200, random_state=42)
        clf_emb.fit(Xtr_emb, y_train)
        proba_val_emb = clf_emb.predict_proba(Xva_emb)
        ll_emb = log_loss(y_val, proba_val_emb)
        print(f"[Embedding] Val logloss: {ll_emb:.6f}")

        # Test-set embeddings and predictions
        emb_a_te = _encode_texts(a_test)
        emb_b_te = _encode_texts(b_test)
        Xte_emb = _pair_matrix(emb_a_te, emb_b_te)
        proba_test_emb = clf_emb.predict_proba(Xte_emb)

    except Exception as err:
        # Best-effort branch: report the error and continue with the baseline only.
        print("[Embedding] 加载/推理失败，将仅使用 Baseline。错误：", repr(err))
        use_embedding = False

# -----------------------------
# C) 权重融合（在同一验证集上网格搜索 alpha）
# -----------------------------
if not use_embedding or proba_val_emb is None:
    # No embedding predictions available: use the baseline alone.
    print("[Ensemble] 仅使用 Baseline 结果。")
    proba_test_final = proba_test_base
else:
    # Score every candidate blend weight on the shared validation split.
    alphas = np.linspace(0, 1, 21)  # 0.00, 0.05, ..., 1.00
    blend_losses = [
        log_loss(y_val, w * proba_val_base + (1 - w) * proba_val_emb)
        for w in alphas
    ]
    # argmin returns the first minimum, which is the same weight a
    # strict "<" running-best update would keep.
    best_pos = int(np.argmin(blend_losses))
    best_alpha, best_ll = alphas[best_pos], blend_losses[best_pos]
    print(f"[Ensemble] Best alpha={best_alpha:.2f}, Val logloss={best_ll:.6f}")

    # Blend the test predictions with the selected weight.
    proba_test_final = best_alpha * proba_test_base + (1 - best_alpha) * proba_test_emb

# -----------------------------
# D) 生成最终提交
# -----------------------------
# Probability columns 0, 1, 2 are paired with the three winner_* names in order.
submission = pd.DataFrame({
    'id': test['id'],
    **dict(zip(
        ['winner_model_a', 'winner_model_b', 'winner_tie'],
        proba_test_final.T,
    )),
})
submission.to_csv('submission.csv', index=False)
print("Saved -> submission.csv")
submission.head()
