### 从UCI加载数据，转化格式，转换为数字形式，并保存为mat数据 
X为特征数据：num_instances*features
Y为lable：(num_instances,1) 是一个列向量

In [4]:

import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import pandas as pd
from ucimlrepo import fetch_ucirepo
from scipy.io import savemat
# 76 Nursery 
# 30 Contraceptive Method Choice
# 146 Satellite


# 数据集
uci_dataset = fetch_ucirepo(id=146)
# 示例数据集
data = uci_dataset.data
print(data.headers.tolist())  # 
# 1. 特征和标签分离
X = data.features.values
X_columns = data.features.columns.tolist()
y = data.targets.values

# 3. 特征编码
label_encoder = LabelEncoder()
X_encoded = np.copy(X)
# 将特征数据每一列都进行编码
for i in range(X.shape[1]):
    X_encoded[:, i] = label_encoder.fit_transform(X_encoded[:, i])
y_encoded = label_encoder.fit_transform(y)
#indexs = np.where(y_encoded == 2)

#X_encoded = np.delete(X_encoded, indexs[0], axis=0)
# 转成float格式
X_encoded = X_encoded.astype(float)
#y_encoded = np.delete(y_encoded, indexs[0], axis=0)
y_encoded = label_encoder.fit_transform(y_encoded)
# 保存为 .mat 文件
data_dict = {'X': X_encoded, 'Y': y_encoded.reshape(-1, 1)}  # 以字典形式存储
savemat('Satellite.mat', data_dict)

# 将数据集分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.3, random_state=42)

# 数据标准化
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# 构建并训练MLP模型
mlp = MLPClassifier(hidden_layer_sizes=(15,), max_iter=100, random_state=42,learning_rate_init=0.1)
mlp.fit(X_train, y_train)
index_pred_proba = mlp.predict_proba(X_test)
# 预测和评估模型
y_pred = mlp.predict(X_test)

# 输出结果
print("准确率:", accuracy_score(y_test, y_pred))
print("\n分类报告:\n", classification_report(y_test, y_pred))
# 打印混淆矩阵
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print(index_pred_proba.shape)
index_pred_proba

['Attribute1', 'Attribute2', 'Attribute3', 'Attribute4', 'Attribute5', 'Attribute6', 'Attribute7', 'Attribute8', 'Attribute9', 'Attribute10', 'Attribute11', 'Attribute12', 'Attribute13', 'Attribute14', 'Attribute15', 'Attribute16', 'Attribute17', 'Attribute18', 'Attribute19', 'Attribute20', 'Attribute21', 'Attribute22', 'Attribute23', 'Attribute24', 'Attribute25', 'Attribute26', 'Attribute27', 'Attribute28', 'Attribute29', 'Attribute30', 'Attribute31', 'Attribute32', 'Attribute33', 'Attribute34', 'Attribute35', 'Attribute36', 'class']
准确率: 0.23303987571206627

分类报告:
               precision    recall  f1-score   support

           0       0.23      1.00      0.38       450
           1       0.00      0.00      0.00       186
           2       0.00      0.00      0.00       416
           3       0.00      0.00      0.00       201
           4       0.00      0.00      0.00       219
           5       0.00      0.00      0.00       459

    accuracy                           0.23   

  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


array([[0.24326894, 0.11259506, 0.21212099, 0.10328532, 0.10662391,
        0.22210578],
       [0.24326894, 0.11259506, 0.21212099, 0.10328532, 0.10662391,
        0.22210578],
       [0.24326894, 0.11259506, 0.21212099, 0.10328532, 0.10662391,
        0.22210578],
       ...,
       [0.24326894, 0.11259506, 0.21212099, 0.10328532, 0.10662391,
        0.22210578],
       [0.24326894, 0.11259506, 0.21212099, 0.10328532, 0.10662391,
        0.22210578],
       [0.24326894, 0.11259506, 0.21212099, 0.10328532, 0.10662391,
        0.22210578]])

## numpy转mat的demo

In [2]:
import numpy as np
from scipy.io import savemat

# 示例数据
X = np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6], [7.7, 8.8, 9.9]])  # 特征数据
y = np.array([[0, 1, 0]])  # 标签数据
print(X.shape)
print(y.shape)
y = y.reshape(-1, 1)
# 保存为 .mat 文件
data_dict = {'X': X, 'y': y}  # 以字典形式存储
savemat('dataset.mat', data_dict)

print("数据已保存为 dataset.mat 文件。")

from scipy.io import loadmat

# 加载 .mat 文件
loaded_data = loadmat('dataset.mat')

XX = loaded_data['X']
YY = loaded_data['y']
# 查看加载的数据
print("加载的 X：\n", XX)
print("加载的 y：\n", YY)
print(YY[:, 0].shape)

(3, 3)
(1, 3)
数据已保存为 dataset.mat 文件。
加载的 X：
 [[1.1 2.2 3.3]
 [4.4 5.5 6.6]
 [7.7 8.8 9.9]]
加载的 y：
 [[0]
 [1]
 [0]]
(3,)


### 测试保存的mat

In [7]:
from utils.dataset_utils import get_classes_indexes_counts
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import confusion_matrix
# 导入必要的库
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
import scipy.io as sio  # 从.mat文件中读取数据集
import numpy as np
# 加载鸢尾花数据集
# data = load_iris()
mat_data = sio.loadmat('Satellite.mat')
X = mat_data['X']  # 特征
y = mat_data['Y'][:, 0]  # 标签
classes, counts = get_classes_indexes_counts(y)
print(counts)
print(X.shape)
print(y.shape)
# 将数据集分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# classes, counts = get_classes_indexes_counts(np.argmax(y_test, axis=1))
classes, counts = get_classes_indexes_counts(y_test)
print(counts)
# 数据标准化
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)
# One-hot encode target variable 强制将类别转换为0-1序列，0表示不是该类，1表示属于该类

# 构建并训练MLP模型
mlp = MLPClassifier(hidden_layer_sizes=(15,), max_iter=200, random_state=42,learning_rate_init=0.001)
# mlp.fit(X_train, np.argmax(y_train, axis=1))
mlp.fit(X_train, y_train)
print(mlp.classes_)
index_pred_proba = mlp.predict_proba(X_test)
# 预测和评估模型
y_pred = mlp.predict(X_test)
#y_test_labels = np.argmax(y_test, axis=1)
# y_pred_labels = np.argmax(y_pred, axis=1)
y_pred_labels = y_pred
# 输出结果
print("准确率:", accuracy_score(y_test, y_pred_labels))
print("\n分类报告:\n", classification_report(y_test, y_pred_labels))
# 打印混淆矩阵
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_labels))

print(index_pred_proba.shape)
res=np.sum(index_pred_proba, axis=1)
print(res)

[1533  703 1358  626  707 1508]
(6435, 36)
(6435,)
[450 186 416 201 219 459]
[0 1 2 3 4 5]
准确率: 0.8208182288969446

分类报告:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96       450
           1       0.98      0.96      0.97       186
           2       0.81      0.82      0.82       416
           3       0.46      0.32      0.38       201
           4       0.83      0.80      0.82       219
           5       0.75      0.86      0.80       459

    accuracy                           0.82      1931
   macro avg       0.80      0.79      0.79      1931
weighted avg       0.81      0.82      0.81      1931

Confusion Matrix:
[[431   1   7   2   8   1]
 [  0 178   0   2   6   0]
 [  6   0 343  42   1  24]
 [  1   0  48  64   5  83]
 [  9   3   1   4 176  26]
 [  0   0  26  25  15 393]]
(1931, 6)
[1. 1. 1. ... 1. 1. 1.]




### k-folds交叉验证


In [None]:
# 导入必要的库
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

# 加载鸢尾花数据集
iris = load_iris()
X, y = iris.data, iris.target

# 定义MLP模型
mlp_model = MLPClassifier(hidden_layer_sizes=(10,), max_iter=1000, random_state=42)

# 定义5折交叉验证
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 使用交叉验证进行预测
y_pred = cross_val_predict(mlp_model, X, y, cv=kf)

# 输出分类结果报告
report = classification_report(y, y_pred, target_names=iris.target_names)
print(report)


In [None]:
import random

# 设置参数
lambda_ = 1.2  # 指数分布的参数λ（lambda）
threshold = 1.0  # 阈值（阈值决定了生成0或1）

def generate_sequence(n):
    sequence = []
    for _ in range(n):
        # 生成一个指数分布的随机数
        value = random.expovariate(lambda_)
        # 根据值与阈值的比较，生成 0 或 1
        if value < threshold:
            sequence.append(1)
        else:
            sequence.append(0)
    return sequence

# 生成一个包含100个元素的0和1的序列
sequence = generate_sequence(100)
print(sequence)
random.randint(1,1)