# 数据的加载

### 从UCI官方加载数据集

In [None]:
from ucimlrepo import fetch_ucirepo

# Download the dataset registered under id 45 in the UCI repository.
uci_dataset = fetch_ucirepo(id=45)

# Feature table and target table (returned as pandas dataframes).
X, y = uci_dataset.data.features, uci_dataset.data.targets

# Show the per-variable metadata shipped with the dataset.
print(uci_dataset.variables)

### 从UCI加载数据，转化格式，转换为数字形式，并保存为mat数据 
X 为特征数据，形状为 (num_instances, num_features)
Y 为标签（label），形状为 (num_instances, 1)，即一个列向量

In [None]:

import numpy as np
from sklearn.preprocessing import LabelEncoder
from ucimlrepo import fetch_ucirepo
from scipy.io import savemat

# Dataset ids that have been used with this cell:
#   76  Nursery
#   30  Contraceptive Method Choice
#   146 Satellite
#   33  Dermatology
#   23  Chess
uci_dataset = fetch_ucirepo(id=23)

# Step 1: pull the features and the targets out as plain arrays.
X = uci_dataset.data.features.values
X_columns = uci_dataset.data.features.columns.tolist()
y = uci_dataset.data.targets.values

# Step 2: label-encode the data.
label_encoder = LabelEncoder()
X_encoded = np.array(X)  # encode into a copy so X keeps the raw values

# The encoder is re-fitted on every feature column in turn, so each column
# gets its own value -> integer code mapping.
for i in range(X.shape[1]):
    X_encoded[:, i] = label_encoder.fit_transform(X_encoded[:, i])

# Only the first target column is encoded.
y_encoded = label_encoder.fit_transform(y[:, 0])

# Cast both arrays to int.
X_encoded = X_encoded.astype(int)
y_encoded = y_encoded.astype(int)

# Write a .mat file with X as the feature matrix and Y as a column vector.
data_dict = dict(X=X_encoded, Y=y_encoded[:, np.newaxis])
savemat('Chess2.mat', data_dict)

# Report that the file was written.
print("数据保存成功！")

### 五折交叉验证

In [3]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.base import clone
import numpy as np


def _take_rows(data, index):
    """Select rows of *data* by integer position for arrays and pandas objects."""
    # Plain ``frame[index]`` on a DataFrame selects columns, so pandas objects
    # must go through positional ``.iloc`` instead.
    if hasattr(data, "iloc"):
        return data.iloc[index]
    return data[index]


def k_fold_cross_validation_with_soft_labels(model, X, y, n_splits=5):
    """
    Run shuffled K-fold cross-validation and collect out-of-fold soft labels.

    Each sample's soft label is the probability vector predicted by a model
    that was fitted on the folds that do not contain that sample. Folds are
    shuffled with a fixed ``random_state=42`` so the split is reproducible.

    Parameters:
    - model: A sklearn-compatible estimator with `fit` and `predict_proba`.
      It is cloned for every fold; the passed instance is never fitted.
    - X: Feature matrix (numpy array or pandas DataFrame).
    - y: Target vector (numpy array or pandas Series). A single-column 2-D
      array of shape (n_samples, 1) is flattened.
    - n_splits: Number of folds (default 5).

    Returns:
    - soft_labels: A numpy array of shape (n_samples, n_classes). Column j
      holds the probability of the j-th entry of `np.unique(y)`.
    - scores: A list with the accuracy score of each fold.
    """
    y = np.asarray(y)
    if y.ndim == 2 and y.shape[1] == 1:
        y = y.ravel()  # accept an (n_samples, 1) column vector

    classes = np.unique(y)  # sorted class labels; defines the column order
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    soft_labels = np.zeros((len(y), len(classes)))
    scores = []

    for train_index, test_index in kf.split(X):
        # Split data into train and test
        X_train, X_test = _take_rows(X, train_index), _take_rows(X, test_index)
        y_train, y_test = y[train_index], y[test_index]

        # Clone and fit the model on the training set
        model_clone = clone(model)
        model_clone.fit(X_train, y_train)

        # Generate soft labels (probability predictions)
        y_proba = np.asarray(model_clone.predict_proba(X_test))

        # predict_proba only has columns for the classes seen in this training
        # fold. Scatter them into the full class layout so a class missing
        # from the fold yields a zero column instead of a shape error.
        fold_classes = getattr(model_clone, "classes_", np.unique(y_train))
        columns = np.searchsorted(classes, fold_classes)
        fold_proba = np.zeros((len(test_index), len(classes)))
        fold_proba[:, columns] = y_proba
        soft_labels[test_index] = fold_proba

        # Evaluate the model: map the winning column back to its class label,
        # since the column index equals the label only for labels 0..K-1.
        y_pred = classes[np.argmax(fold_proba, axis=1)]
        scores.append(accuracy_score(y_test, y_pred))

    return soft_labels, scores


# Usage example
if __name__ == "__main__":
    from sklearn.datasets import load_iris
    from sklearn.ensemble import RandomForestClassifier

    # Load the iris dataset and unpack features / targets.
    data = load_iris()
    X = data.data
    y = data.target

    # Classifier whose out-of-fold probabilities become the soft labels.
    model = RandomForestClassifier(random_state=42)

    # Run the 5-fold cross-validation.
    soft_labels, scores = k_fold_cross_validation_with_soft_labels(
        model, X, y, n_splits=5
    )
    # Hard label = index of the most probable column for each sample.
    hard_labels = soft_labels.argmax(axis=1)

    # Report the results.
    for caption, value in (
        ("Soft labels:", soft_labels),
        ("Hard labels:", hard_labels),
        ("Cross-validation scores:", scores),
        ("Mean accuracy:", np.mean(scores)),
    ):
        print(caption, value)


Soft labels: [[1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [0.99 0.01 0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [0.95 0.05 0.  ]
 [0.98 0.02 0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [0.94 0.06 0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [0.9  0.1  0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [0.99 0.01 0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [0.   0.95 0.05]
 [0.   1.   0.  ]
 [0.   0.8  0.2 ]
 [0.   0.98 0.02]
 [0.   0.93 0.0