In [None]:
# Step 1: Baseline with Logistic Regression

In [None]:
# 1. 导入必要的库
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import log_loss, classification_report, confusion_matrix
import string

# 2. Load the competition train/test tables from the Kaggle input directory
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/llm-classification-finetuning/train.csv',
        '/kaggle/input/llm-classification-finetuning/test.csv',
    )
)

# 3. 特征工程：提取统计特征

def create_features(df):
    """Return a copy of ``df`` extended with simple text-statistics columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the text columns ``prompt``, ``response_a`` and
        ``response_b``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (the input is left untouched) with, for each of the
        three text columns, ``<col>_length``, ``<col>_word_count`` and
        ``<col>_punc_count``, plus ``response_length_diff`` and
        ``response_word_diff`` (A minus B). Missing text counts as empty, so
        the added columns never contain NaN.
    """
    import re  # local import: this cell's import block does not bring in `re`

    out = df.copy()

    # Escape the punctuation so the character class does not depend on how the
    # raw characters happen to be ordered inside string.punctuation.
    punct_pattern = f'[{re.escape(string.punctuation)}]'

    # Missing cells become '' so every statistic below is a plain number.
    texts = {
        col: df[col].fillna('').astype(str)
        for col in ('prompt', 'response_a', 'response_b')
    }

    # Character lengths
    for col, text in texts.items():
        out[f'{col}_length'] = text.str.len()
    # Whitespace-separated word counts
    for col, text in texts.items():
        out[f'{col}_word_count'] = text.str.split().str.len()
    # Punctuation character counts
    for col, text in texts.items():
        out[f'{col}_punc_count'] = text.str.count(punct_pattern)

    # Difference features (response A minus response B)
    out['response_length_diff'] = out['response_a_length'] - out['response_b_length']
    out['response_word_diff'] = out['response_a_word_count'] - out['response_b_word_count']
    return out

train = create_features(train)
test = create_features(test)

# 4. Label-encode the model names (optional: test has no model columns, so
#    these encodings are train-only).
#    The encoder is fitted ONCE on the union of both columns so that a given
#    model name maps to the same integer in model_a_enc and model_b_enc.
#    Re-fitting per column would assign each column its own, unrelated codes.
le = LabelEncoder()
le.fit(pd.concat([train['model_a'], train['model_b']], ignore_index=True))
model_cols = []
for col in ['model_a', 'model_b']:
    train[f'{col}_enc'] = le.transform(train[col])
    model_cols.append(f'{col}_enc')

# 5. Build the target
#   winner_model_a=1 -> class 0, winner_model_b=1 -> class 1, winner_tie=1 -> class 2
train['target'] = train[['winner_model_a', 'winner_model_b', 'winner_tie']].values.argmax(axis=1)

# 6. Numeric feature columns used by the baseline
#    (model_a_enc / model_b_enc could be appended here if desired)
length_cols = ['prompt_length', 'response_a_length', 'response_b_length']
word_cols = ['prompt_word_count', 'response_a_word_count', 'response_b_word_count']
punc_cols = ['prompt_punc_count', 'response_a_punc_count', 'response_b_punc_count']
diff_cols = ['response_length_diff', 'response_word_diff']
feature_cols = length_cols + word_cols + punc_cols + diff_cols

X, y = train[feature_cols], train['target']

# 7. Stratified 80/20 hold-out split, then standardise with statistics
#    learned from the training part only
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# 8. Fit the logistic-regression baseline
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases
# (1.5+) - confirm the installed version before upgrading the environment.
clf = LogisticRegression(max_iter=1000, random_state=42, multi_class='ovr')
clf.fit(X_train_scaled, y_train)

# 9. Hold-out metrics
val_pred_proba = clf.predict_proba(X_val_scaled)
val_pred = clf.predict(X_val_scaled)
print('Validation Log Loss:', log_loss(y_val, val_pred_proba))
print('Classification Report:\n', classification_report(y_val, val_pred, digits=4))

# 10. Score the test set with the scaler and classifier fitted above
X_test = test[feature_cols]
X_test_scaled = scaler.transform(X_test)
test_pred_proba = clf.predict_proba(X_test_scaled)

# 11. Write the Kaggle submission file: probability column k goes to the
#     k-th winner_* column
submission = pd.DataFrame({'id': test['id']})
for class_idx, column in enumerate(['winner_model_a', 'winner_model_b', 'winner_tie']):
    submission[column] = test_pred_proba[:, class_idx]
submission.to_csv('submission_baseline.csv', index=False)
print(submission.head())


In [None]:
#Step 2: Embedding-based model

In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
# NOTE: a StandardScaler fit/transform on X_train / X_val / X_test used to run
# here. Those names are only assigned further down in this cell, so the calls
# either raised NameError on a fresh kernel or rescaled arrays left over from
# the Step 1 cell, and the results were overwritten before being used. They
# were removed; the embedding features below are used unscaled, as before.

train = pd.read_csv('/kaggle/input/llm-classification-finetuning/train.csv')
test = pd.read_csv('/kaggle/input/llm-classification-finetuning/test.csv')

# Load the embedding model from a local path (the path must match the name of
# the dataset attached through "Add Input")
model = SentenceTransformer('/kaggle/input/minilm-l12-v2-local/other/default/1/minilm_l12_v2_local')

# Build the two texts to embed: the prompt followed by each response
def concat_text(df):
    """Return ``(text_a, text_b)``: prompt + single space + response A / response B."""
    prompt_text = df['prompt'].astype(str)
    joined = [
        prompt_text + ' ' + df[response_col].astype(str)
        for response_col in ('response_a', 'response_b')
    ]
    return tuple(joined)
train_a, train_b = concat_text(train)
test_a, test_b = concat_text(test)

# Shared encoder settings for every model.encode call in this cell
encode_options = dict(batch_size=32, show_progress_bar=True)

# Training features: [emb_a | emb_b | emb_a - emb_b]
emb_a = model.encode(train_a.tolist(), **encode_options)
emb_b = model.encode(train_b.tolist(), **encode_options)
X = np.hstack([emb_a, emb_b, emb_a - emb_b])
# Class id = position of the winning one-hot column (a=0, b=1, tie=2)
y = np.argmax(train[['winner_model_a','winner_model_b','winner_tie']].values, axis=1)

# Stratified 80/20 hold-out split and logistic regression on the embeddings
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
val_proba = clf.predict_proba(X_val)
print('Val Log Loss:', log_loss(y_val, val_proba))

# Test features are assembled exactly like the training features
emb_a_test = model.encode(test_a.tolist(), **encode_options)
emb_b_test = model.encode(test_b.tolist(), **encode_options)
X_test = np.hstack([emb_a_test, emb_b_test, emb_a_test - emb_b_test])
proba = clf.predict_proba(X_test)

# Submission: probability column k goes to the k-th winner_* column
submission = pd.DataFrame({'id': test['id']})
for class_idx, column in enumerate(['winner_model_a', 'winner_model_b', 'winner_tie']):
    submission[column] = proba[:, class_idx]
submission.to_csv('submission_emb.csv', index=False)


In [None]:
#Step 3. Model Extensions

In [None]:
# =============================================================================
# Step 3: Model Extensions - Advanced Features and Ensemble Methods
# =============================================================================

print("Starting Step 3: Model Extensions...")

# 安装必要的库（如果尚未安装）
!pip install xgboost lightgbm --quiet

# 导入所有必要的库
import re
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from sentence_transformers import SentenceTransformer
from sklearn.base import clone
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Suppress all warnings for the remainder of the run
warnings.filterwarnings('ignore')

print("所有库导入完成")

# Load the competition train/test tables
print("加载数据...")
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/llm-classification-finetuning/train.csv',
        '/kaggle/input/llm-classification-finetuning/test.csv',
    )
)

# Report the (rows, columns) of both tables
print(f"训练集大小: {train.shape}")
print(f"测试集大小: {test.shape}")

# =============================================================================
# 1. 偏差感知特征工程 (Bias-aware Features)
# =============================================================================

def extract_bias_aware_features(df):
    """
    Build position-, verbosity- and formatting-related features for each response pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the text columns ``response_a`` and ``response_b``.

    Returns
    -------
    pandas.DataFrame
        Eighteen numeric columns sharing ``df``'s index. Missing responses are
        treated as empty text, so the result never contains NaN.
    """
    print("提取偏差感知特征...")
    features = pd.DataFrame(index=df.index)

    # Normalise once: missing cells become '' and every value becomes a str.
    # Without this, .str.len() / .str.count() return NaN for missing cells and
    # that NaN ends up in the stacked feature matrix.
    resp_a = df['response_a'].fillna('').astype(str)
    resp_b = df['response_b'].fillna('').astype(str)

    # Position bias: both columns are constants (1 and 0) on every row,
    # because response_a is always the first response.
    features['position_a_first'] = 1
    features['position_b_second'] = 0

    # Verbosity bias
    features['response_a_length'] = resp_a.str.len()
    features['response_b_length'] = resp_b.str.len()
    features['length_diff'] = features['response_a_length'] - features['response_b_length']
    # +1 keeps the denominator non-zero when response B is empty
    features['length_ratio'] = features['response_a_length'] / (features['response_b_length'] + 1)

    # Lexical richness
    def lexical_richness(text):
        """Distinct whitespace-separated tokens divided by total tokens; 0 when there are none."""
        words = text.split()
        if not words:
            return 0
        return len(set(words)) / len(words)

    features['richness_a'] = resp_a.apply(lexical_richness)
    features['richness_b'] = resp_b.apply(lexical_richness)
    features['richness_diff'] = features['richness_a'] - features['richness_b']

    # Formatting score in 0..3: one point per detected kind of markup
    def format_complexity(text):
        """Count which of the three markup heuristics (list, code, heading) fire on ``text``."""
        score = 0
        # List heuristic: digits followed by '.', or any '*' or '-' anywhere
        if re.search(r'\d+\.|\*|\-', text):
            score += 1
        # Code heuristic: a triple backtick or a run of four spaces
        if '```' in text or '    ' in text:
            score += 1
        # Heading heuristic: a line starting with '#' characters and whitespace
        if re.search(r'^#+\s', text, re.MULTILINE):
            score += 1
        return score

    features['format_a'] = resp_a.apply(format_complexity)
    features['format_b'] = resp_b.apply(format_complexity)
    features['format_diff'] = features['format_a'] - features['format_b']

    # Question-mark and exclamation-mark counts
    features['question_a'] = resp_a.str.count(r'\?')
    features['question_b'] = resp_b.str.count(r'\?')
    features['exclamation_a'] = resp_a.str.count(r'!')
    features['exclamation_b'] = resp_b.str.count(r'!')

    features['question_diff'] = features['question_a'] - features['question_b']
    features['exclamation_diff'] = features['exclamation_a'] - features['exclamation_b']

    print(f"偏差特征提取完成，特征数量: {features.shape[1]}")
    return features

# Build the bias-aware feature tables (train first, then test)
bias_features_train, bias_features_test = (
    extract_bias_aware_features(split_frame) for split_frame in (train, test)
)

# =============================================================================
# 2. 基础统计特征 (Basic Statistical Features)
# =============================================================================

def extract_basic_features(df):
    """
    Compute basic text statistics (similar to Step 1, with one extra difference column).

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the text columns ``prompt``, ``response_a`` and ``response_b``.

    Returns
    -------
    pandas.DataFrame
        Twelve numeric columns sharing ``df``'s index: length, word count and
        punctuation count per text column, plus three A-minus-B differences.
        Missing text is treated as empty, so the result never contains NaN.
    """
    print("提取基础统计特征...")
    features = pd.DataFrame(index=df.index)

    import string  # kept local: the cell-level imports do not include `string`
    punct_pattern = f'[{re.escape(string.punctuation)}]'

    # Missing cells become '' so .str.len()/.str.count() return 0 instead of NaN
    texts = {
        col: df[col].fillna('').astype(str)
        for col in ('prompt', 'response_a', 'response_b')
    }

    # Character lengths
    for col, text in texts.items():
        features[f'{col}_length'] = text.str.len()

    # Whitespace-separated word counts
    for col, text in texts.items():
        features[f'{col}_word_count'] = text.str.split().str.len()

    # Punctuation character counts
    for col, text in texts.items():
        features[f'{col}_punc_count'] = text.str.count(punct_pattern)

    # Difference features (response A minus response B)
    features['response_length_diff'] = features['response_a_length'] - features['response_b_length']
    features['response_word_diff'] = features['response_a_word_count'] - features['response_b_word_count']
    features['response_punc_diff'] = features['response_a_punc_count'] - features['response_b_punc_count']

    print(f"基础特征提取完成，特征数量: {features.shape[1]}")
    return features

# Extract the basic statistical features
basic_features_train = extract_basic_features(train)
basic_features_test = extract_basic_features(test)

# =============================================================================
# 3. Embedding Features
# =============================================================================

print("加载嵌入模型...")
# Prefer the pre-trained model that is attached as a Kaggle input
try:
    model = SentenceTransformer('/kaggle/input/minilm-l12-v2-local/other/default/1/minilm_l12_v2_local')
    print("使用本地预训练模型")
except Exception as load_error:
    # Only ordinary load failures trigger the fallback; KeyboardInterrupt and
    # SystemExit propagate instead of being swallowed by a bare `except:`.
    # NOTE(review): the fallback name says "L6" while the local path says
    # "l12" - presumably a different checkpoint; confirm this is intended.
    print(f"本地模型加载失败，改用在线模型: {load_error!r}")
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    print("使用在线预训练模型")

def create_embedding_features(df, model):
    """
    Encode "prompt [SEP] response" texts and stack pairwise embedding features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``prompt``, ``response_a`` and ``response_b``.
    model : object
        Encoder exposing ``encode(list_of_str, batch_size=..., show_progress_bar=...)``.

    Returns
    -------
    numpy.ndarray
        Horizontal stack of ``[emb_a, emb_b, emb_a - emb_b, |emb_a - emb_b|, emb_a * emb_b]``.
    """
    print("生成嵌入特征...")

    # Build both texts first, then encode A before B
    prompt_text = df['prompt'].astype(str)
    paired_texts = [
        prompt_text + ' [SEP] ' + df[response_col].astype(str)
        for response_col in ('response_a', 'response_b')
    ]
    vec_a, vec_b = (
        model.encode(text.tolist(), batch_size=32, show_progress_bar=True)
        for text in paired_texts
    )

    # Pairwise feature engineering on the two embeddings
    feature_parts = [
        vec_a,
        vec_b,
        vec_a - vec_b,          # signed difference
        np.abs(vec_a - vec_b),  # absolute difference
        vec_a * vec_b,          # element-wise interaction
    ]
    embedding_features = np.hstack(feature_parts)

    print(f"嵌入特征生成完成，特征维度: {embedding_features.shape}")
    return embedding_features

# Create the embedding features (train first, then test)
print("为训练集创建嵌入特征...")
embedding_features_train = create_embedding_features(train, model)
print("为测试集创建嵌入特征...")
embedding_features_test = create_embedding_features(test, model)

# =============================================================================
# 4. Combine all feature groups
# =============================================================================

print("合并所有特征...")
# Column layout for both splits: [embedding | bias-aware | basic statistics]
train_blocks = [
    embedding_features_train,
    bias_features_train.to_numpy(),
    basic_features_train.to_numpy(),
]
test_blocks = [
    embedding_features_test,
    bias_features_test.to_numpy(),
    basic_features_test.to_numpy(),
]
X_ensemble = np.hstack(train_blocks)
X_test_ensemble = np.hstack(test_blocks)

# Target: position of the winning one-hot column (a=0, b=1, tie=2)
y = np.argmax(train[['winner_model_a', 'winner_model_b', 'winner_tie']].to_numpy(), axis=1)

print(f"最终特征矩阵大小: {X_ensemble.shape}")
print(f"测试集特征矩阵大小: {X_test_ensemble.shape}")

# =============================================================================
# 5. 数据准备
# =============================================================================

print("准备训练和验证数据...")
# Stratified 80/20 hold-out split
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_ensemble, y, **split_options)

# Standardise every split with statistics learned from the training part only
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(matrix) for matrix in (X_train, X_val, X_test_ensemble)
)

print("数据准备完成")

# =============================================================================
# 6. 训练多个基础模型
# =============================================================================

print("训练多个基础模型...")

# Base learners, all created with random_state=42
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'XGBoost': xgb.XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        random_state=42,
        eval_metric='mlogloss',
        n_jobs=-1
    ),
    'LightGBM': lgb.LGBMClassifier(
        n_estimators=100,
        learning_rate=0.1,
        random_state=42,
        n_jobs=-1
    )
}

# Validation probabilities and log loss of each base model, keyed by name
base_model_predictions = {}
base_model_scores = {}

# The loop variable is deliberately not called `model`: that global holds the
# SentenceTransformer loaded earlier in this cell, and rebinding it here would
# leave a classifier under that name for any later embedding call.
for name, estimator in models.items():
    print(f"训练 {name}...")
    estimator.fit(X_train_scaled, y_train)

    # Score on the hold-out split
    val_pred_proba = estimator.predict_proba(X_val_scaled)
    val_score = log_loss(y_val, val_pred_proba)

    base_model_predictions[name] = val_pred_proba
    base_model_scores[name] = val_score
    print(f"  {name} 验证集 Log Loss: {val_score:.4f}")

# =============================================================================
# 7. 集成方法
# =============================================================================

print("\n训练集成模型...")

# 7.1 Soft-voting ensemble over all base estimators
print("训练投票集成模型...")
voting_clf = VotingClassifier(estimators=list(models.items()), voting='soft')
voting_clf.fit(X_train_scaled, y_train)
voting_pred_proba = voting_clf.predict_proba(X_val_scaled)
voting_score = log_loss(y_val, voting_pred_proba)
print(f"投票集成验证集 Log Loss: {voting_score:.4f}")

# 7.2 Probability calibration of the best-scoring base model
print("进行概率校准...")
best_base_model_name = min(base_model_scores.items(), key=lambda entry: entry[1])[0]
print(f"选择 {best_base_model_name} 进行概率校准")

calibrated_clf = CalibratedClassifierCV(
    models[best_base_model_name],
    method='isotonic',
    cv=3,
)
calibrated_clf.fit(X_train_scaled, y_train)
calibrated_pred_proba = calibrated_clf.predict_proba(X_val_scaled)
calibrated_score = log_loss(y_val, calibrated_pred_proba)
print(f"校准后验证集 Log Loss: {calibrated_score:.4f}")

# 7.3 Weighted-average ensemble
print("进行加权平均集成...")
# Weight = inverse validation log loss, normalised over all base models
# (a lower loss gives a larger weight)
inverse_losses = {name: 1/score for name, score in base_model_scores.items()}
total_performance = sum(inverse_losses.values())
weights = {name: inverse / total_performance for name, inverse in inverse_losses.items()}

print("模型权重分配:")
for name, weight in weights.items():
    print(f"  {name}: {weight:.3f}")

def weighted_average_predict(models, weights, X):
    """Blend the class probabilities of several fitted models using per-model weights.

    Parameters
    ----------
    models : dict
        Maps a model name to a fitted object exposing ``predict_proba(X)``.
    weights : dict
        Maps each model name in ``models`` to a non-negative weight. The
        weights do not have to be normalised.
    X : array-like
        Feature matrix passed unchanged to every ``predict_proba`` call.

    Returns
    -------
    numpy.ndarray
        ``sum_i w_i * p_i / sum_i w_i``. When every model returns rows that sum
        to one, so does the blend.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    blended = None
    weight_total = 0.0
    for name, fitted_model in models.items():
        contribution = fitted_model.predict_proba(X) * weights[name]
        blended = contribution if blended is None else blended + contribution
        weight_total += weights[name]

    if blended is None:
        raise ValueError("weighted_average_predict needs at least one model")

    # Divide by the weight total, not by the number of models: averaging the
    # already-weighted terms with np.mean shrank every row to 1/n_models.
    return blended / weight_total

weighted_pred_proba = weighted_average_predict(models, weights, X_val_scaled)
weighted_score = log_loss(y_val, weighted_pred_proba)
print(f"加权平均集成验证集 Log Loss: {weighted_score:.4f}")

# =============================================================================
# 8. Model comparison and selection
# =============================================================================

print("\n" + "="*50)
print("模型性能比较")
print("="*50)

# Gather the validation log loss of every candidate (base models first)
all_scores = dict(base_model_scores)
all_scores['Voting Ensemble'] = voting_score
all_scores['Calibrated Model'] = calibrated_score
all_scores['Weighted Average'] = weighted_score

# Ascending by log loss, so the best candidate comes first
sorted_scores = sorted(all_scores.items(), key=lambda entry: entry[1])

print("\n模型性能排名:")
print("-" * 40)
for name, score in sorted_scores:
    print(f"{name:20} | Log Loss: {score:.4f}")

# Pick the candidate with the lowest validation log loss
best_model_name, best_score = sorted_scores[0]
print(f"\n最佳模型: {best_model_name}")
print(f"最佳分数: {best_score:.4f}")

# =============================================================================
# 9. 使用最佳模型进行最终预测
# =============================================================================

print(f"\n使用 {best_model_name} 进行最终预测...")

# Two rules keep this step consistent with the rest of the notebook:
#  * Full-data refits get their OWN scaler, and the test matrix is transformed
#    with that same scaler. Re-fitting the shared `scaler` while predicting on
#    X_test_scaled (built from the earlier fit) mixed two different scalings.
#  * Refits are done on clones. The estimators fitted on the training split
#    (and `final_model`, which the error analysis reads) stay untouched, so
#    validation metrics are never computed by a model that saw those rows.
if best_model_name == 'Calibrated Model':
    final_model = calibrated_clf
    # Note: CalibratedClassifierCV already used cross-validation, so it is not
    # retrained; it predicts on the test matrix scaled by the original scaler.
    final_predictions = final_model.predict_proba(X_test_scaled)

else:
    full_scaler = StandardScaler()
    X_all_scaled = full_scaler.fit_transform(X_ensemble)
    X_test_full_scaled = full_scaler.transform(X_test_ensemble)

    if best_model_name == 'Weighted Average':
        # Retrain a copy of every base model on all data for the weighted blend
        print("在所有数据上重新训练基础模型用于加权平均...")
        full_models = {
            name: clone(base_model).fit(X_all_scaled, y)
            for name, base_model in models.items()
        }
        final_predictions = weighted_average_predict(full_models, weights, X_test_full_scaled)

    else:
        # Voting ensemble or a single base model
        if best_model_name == 'Voting Ensemble':
            final_model = voting_clf
        else:
            final_model = models[best_model_name]
        # Retrain a copy on all data; `final_model` keeps its training-split fit
        full_model = clone(final_model).fit(X_all_scaled, y)
        final_predictions = full_model.predict_proba(X_test_full_scaled)

# =============================================================================
# 10. 生成提交文件
# =============================================================================

print("生成提交文件...")
# Probability column k goes to the k-th winner_* column
submission = pd.DataFrame({'id': test['id']})
for class_idx, column in enumerate(['winner_model_a', 'winner_model_b', 'winner_tie']):
    submission[column] = final_predictions[:, class_idx]

submission.to_csv('submission_advanced_ensemble.csv', index=False)
print("高级集成模型提交文件已生成: submission_advanced_ensemble.csv")

print("\n提交文件预览:")
print(submission.head())

# =============================================================================
# 11. 误差分析 (修复后的版本)
# =============================================================================

print("\n" + "="*50)
print("误差分析")
print("="*50)

# Validation-set probabilities of the selected model
val_predictions_final = (
    weighted_average_predict(models, weights, X_val_scaled)
    if best_model_name == 'Weighted Average'
    else final_model.predict_proba(X_val_scaled)
)
val_pred_labels = val_predictions_final.argmax(axis=1)

# Log loss restricted to the rows of each true class; `labels` is passed
# explicitly because each subset contains a single true class
class_names = ['model_a', 'model_b', 'tie']
print("\n各类别Log Loss分析:")
for class_idx, class_name in enumerate(class_names):
    class_mask = (y_val == class_idx)
    if not class_mask.any():
        print(f"  类别 {class_name}: 无样本")
        continue
    class_loss = log_loss(y_val[class_mask], val_predictions_final[class_mask], labels=[0, 1, 2])
    print(f"  类别 {class_name}: {class_loss:.4f}")

# Confusion matrix
cm = confusion_matrix(y_val, val_pred_labels)
print("\n混淆矩阵:")
print(cm)

# Accuracy: share of validation rows whose arg-max class equals the true class
accuracy = (y_val == val_pred_labels).mean()
print(f"\n验证集准确率: {accuracy:.4f}")

# =============================================================================
# 12. 与基线模型比较
# =============================================================================

# Final summary of Step 3, printed one line at a time
summary_lines = [
    "\n" + "="*50,
    "Step 3 完成总结",
    "="*50,
    "Step 3 高级集成模型完成!",
    f"最佳模型: {best_model_name}",
    f"验证集Log Loss: {best_score:.4f}",
    f"验证集准确率: {accuracy:.4f}",
    "提交文件: submission_advanced_ensemble.csv",
    "\n请将此文件提交到Kaggle查看最终分数!",
]
for summary_line in summary_lines:
    print(summary_line)