# 安装依赖

# 最小-最大规范化

In [None]:
# MinMaxScaler linearly rescales every feature (column) to the [0, 1] range.
from sklearn.preprocessing import MinMaxScaler
# load_iris returns the bundled iris flower dataset.
from sklearn.datasets import load_iris

# Load the iris dataset and keep only its first six samples as demo data.
iris = load_iris()
data = iris.data[:6]

# Show the values before normalization.
print(data)

# Fit the scaler on the six samples and rescale them in a single step.
scaler = MinMaxScaler()
mms = scaler.fit_transform(data)

# Show the min-max normalized values.
print(mms)

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]]
[[0.625      0.55555556 0.25       0.        ]
 [0.375      0.         0.25       0.        ]
 [0.125      0.22222222 0.         0.        ]
 [0.         0.11111111 0.5        0.        ]
 [0.5        0.66666667 0.25       0.        ]
 [1.         1.         1.         1.        ]]


# 属性 A 和属性 B 的协方差与相关系数

In [None]:
import pandas as pd  # Series statistics (cov / corr)
import numpy as np   # array construction

# Observed values of attribute A.
a = [47, 83, 81, 18, 72, 41, 50, 66, 47, 20, 96, 21, 16, 60, 37, 59, 22, 16, 32, 63]
# Observed values of attribute B.
b = [56, 96, 84, 21, 87, 67, 43, 64, 85, 67, 68, 64, 95, 58, 56, 75, 6, 11, 68, 63]

# Stack the two lists as rows, then transpose so that each row is one
# observation and each column is one attribute.
data = np.transpose(np.array([a, b]))

# Wrap the array in a DataFrame with one named column per attribute.
dfab = pd.DataFrame(data, columns=['A', 'B'])
col_a = dfab['A']
col_b = dfab['B']

# Report the covariance between attributes A and B.
print('属性A和B的协方差:', col_a.cov(col_b))
# Report the correlation coefficient between attributes A and B.
print('属性A和B的相关系数:', col_a.corr(col_b))

属性A和B的协方差: 310.2157894736842
属性A和B的相关系数: 0.49924871046524394


# Z-score 规范化

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris

# Load the iris dataset and keep only its first six samples as demo data.
iris = load_iris()
data = iris.data[0:6]

# Show the raw values before standardization.
print(data)

# Apply Z-score standardization: each column is centred on its mean and
# divided by its standard deviation. This step was missing originally, so
# the cell imported StandardScaler but only printed the raw data.
zscore = StandardScaler().fit_transform(data)

# Show the standardized values.
print(zscore)

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]]


# 方差筛选法

In [None]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.datasets import load_iris

# Load the full iris dataset.
iris = load_iris()

# Show the first five samples of the unfiltered data.
print("\n原始数据:\n", iris.data[:5])

# Fit a variance filter with threshold 0.5 on all samples and keep only the
# columns that pass it.
selector = VarianceThreshold(threshold=0.5)
x_var = selector.fit_transform(iris.data)

# Show the first five samples of the filtered data.
print("方差筛选法过滤后数据\n", str(x_var[:5]))


原始数据:
 [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
方差筛选法过滤后数据
 [[5.1 1.4 0.2]
 [4.9 1.4 0.2]
 [4.7 1.3 0.2]
 [4.6 1.5 0.2]
 [5.  1.4 0.2]]


# 数据替换

In [None]:
import re

# Matches "区<street>路<community>小区": group 1 is the street name and
# group 2 is the residential-community name.
pattern = "区(.*?)路(.*?)小区"


def mask_address(text):
    """Return *text* with its digits and street/community names masked.

    Every ASCII digit is replaced by ``*``. The first
    ``区<street>路<community>小区`` span has both names replaced by a
    single ``*`` each, leaving the rest of the address untouched.

    The names are masked at the matched position only. Replacing the
    captured names everywhere in the string would also hide unrelated
    parts that happen to contain the same characters (for example a
    district that shares the street's name). When the pattern is absent,
    the digit-masked text is returned instead of raising ``IndexError``.
    """
    digits_masked = re.sub('[0-9]', '*', text)
    # count=1 limits the substitution to the first match.
    return re.sub(pattern, '区*路*小区', digits_masked, count=1)


# Sample address to anonymize.
address = "江苏省徐州市鼓楼区幸福路碧园小区19栋18号1单元202室"

# Mask the digits and the street/community names.
address = mask_address(address)

# Show the masked address.
print(address)

江苏省徐州市鼓楼区*路*小区**栋**号*单元***室


# 分箱

In [None]:
import numpy as np
import math

# 1. Raw data: sixteen salary values.
salary = np.array([2200, 2300, 2400, 2500,
                   2500, 2800, 3000, 3200,
                   3500, 3800, 4000, 4500,
                   4700, 4800, 4900, 5000])

# 2. Equal-depth binning: four values per bin, so the sixteen values are
#    laid out as a 4 x 4 grid in which every row is one bin.
bin_size = 4
depth = salary.reshape(salary.size // bin_size, bin_size)
print("等深分箱:")
print(depth)

# ============ Smoothing by bin means ============
# Every value is replaced by the mean of its bin; the (n_bins, 1) column of
# means is broadcast across the row.
mean_depth = np.zeros(depth.shape, dtype=float)
mean_depth[:] = depth.mean(axis=1, keepdims=True)

print("\n等深分箱 -- 均值平滑:")
print(mean_depth)

# ============ Smoothing by bin medians ============
# Every value is replaced by the median of its bin.
median_depth = np.zeros(depth.shape, dtype=float)
median_depth[:] = np.median(depth, axis=1, keepdims=True)

print("\n等深分箱 -- 中值平滑:")
print(median_depth)

# ============ Smoothing by bin boundaries ============
# The boundaries of a bin are its first and last values.
edge_left = depth[:, 0].astype(float)
edge_right = depth[:, -1].astype(float)

# Reshape the boundaries into columns so they broadcast against each row.
left_col = edge_left[:, np.newaxis]
right_col = edge_right[:, np.newaxis]

# Each value snaps to the nearer boundary (squared distance); a tie goes to
# the left boundary. The first and last value of a bin have distance zero to
# their own boundary, so they keep it without any special-casing.
nearer_left = (depth - left_col) ** 2 <= (depth - right_col) ** 2
edge_depth = np.where(nearer_left, left_col, right_col)

print("\n等深分箱 -- 边界值平滑:")
print(edge_depth)


等深分箱:
[[2200 2300 2400 2500]
 [2500 2800 3000 3200]
 [3500 3800 4000 4500]
 [4700 4800 4900 5000]]

等深分箱 -- 均值平滑:
[[2350. 2350. 2350. 2350.]
 [2875. 2875. 2875. 2875.]
 [3950. 3950. 3950. 3950.]
 [4850. 4850. 4850. 4850.]]

等深分箱 -- 中值平滑:
[[2350. 2350. 2350. 2350.]
 [2900. 2900. 2900. 2900.]
 [3900. 3900. 3900. 3900.]
 [4850. 4850. 4850. 4850.]]

等深分箱 -- 边界值平滑:
[[2200. 2200. 2500. 2500.]
 [2500. 2500. 3200. 3200.]
 [3500. 3500. 3500. 4500.]
 [4700. 4700. 5000. 5000.]]
