In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import datasets

# Load scikit-learn's bundled digits dataset; `digits.data` is reused by later cells.
digits = datasets.load_digits()

In [2]:
# Standardize the digits feature matrix before applying PCA.
features = StandardScaler().fit_transform(digits.data)

# Create a PCA that keeps enough components to retain 99% of the variance;
# whiten=True additionally rescales the projected components.
pca = PCA(n_components=0.99, whiten=True)

# Fit the PCA on the standardized features and project them.
features_pca = pca.fit_transform(features)

# Show the number of features before and after the reduction.
print('Original:', features.shape[1])
print('Reduced:', features_pca.shape[1])


Original: 64
Reduced: 54


In [3]:
# Feature reduction for data that is not linearly separable.
from sklearn.decomposition import PCA, KernelPCA
from sklearn.datasets import make_circles

# Generate linearly inseparable data; the second return value of make_circles
# is discarded.
features,_ = make_circles(n_samples=1000, random_state=1, noise=0.1, factor=0.1)

# Kernel PCA with an RBF kernel, projecting down to a single component.
kpca = KernelPCA(kernel='rbf', gamma=15, n_components=1)
features_kpca = kpca.fit_transform(features)

# Show the number of features before and after the reduction.
print('Original:', features.shape[1])
print('Reduced:', features_kpca.shape[1])

Original: 2
Reduced: 1


In [7]:
# Feature reduction by maximizing the separability between classes.

'''
LDA is a feature-extraction technique that projects each class's center onto a
lower-dimensional axis.
    The class centers should end up far apart while every center stays close to
    its own samples; that is what lets the different classes be told apart.

'''
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Load the iris dataset and take its feature matrix and class labels.
iris = datasets.load_iris()
features = iris.data
target = iris.target

# Create and fit the LDA, then use it to transform the features.
lda = LinearDiscriminantAnalysis(n_components=1)
features_lda = lda.fit(features, target).transform(features)

# Show the number of features before and after the reduction.
print('Original:', features.shape[1])
print('Reduced:', features_lda.shape[1])

Original: 4
Reduced: 1


In [5]:
# Inspect the share of variance explained by each retained LDA component.
lda.explained_variance_ratio_

array([ 0.99147248])

In [6]:
# Refit LDA with n_components=None so that every available discriminant
# component is kept and its explained-variance ratio can be inspected.
lda = LinearDiscriminantAnalysis(n_components=None)
# fit() returns the estimator itself, not transformed data, so its result is
# deliberately not bound to `features_lda` (which holds the projected features).
lda.fit(features, target)

# Array with the fraction of variance explained by each component.
lda_var_ratios = lda.explained_variance_ratio_

# Helper: count the components needed to reach a target explained variance.
def select_n_components(var_ratio, goal_var: float)-> int:
    """Return how many leading components are needed to reach ``goal_var``.

    Args:
        var_ratio: Iterable of per-component explained-variance ratios, in
            the order in which they should be accumulated.
        goal_var: Cumulative explained-variance threshold to reach.

    Returns:
        The number of leading entries whose running sum first becomes
        ``>= goal_var``. If the threshold is never reached, the total number
        of entries is returned; an empty ``var_ratio`` yields 0.
    """
    running_total = 0.0
    used = 0
    for used, ratio in enumerate(var_ratio, start=1):
        running_total += ratio
        # Stop as soon as the cumulative ratio meets the target.
        if running_total >= goal_var:
            return used
    # Threshold never reached (or no input at all): report everything consumed.
    return used

# Number of LDA components needed to explain at least 95% of the variance.
select_n_components(lda_var_ratios, 0.95)

1

In [8]:
# Feature reduction via matrix factorization.

'''
NMF (non-negative matrix factorization) splits the matrix into factor matrices;
the factorized representation has fewer features than the original.
'''
from sklearn.decomposition import NMF

# Use the digits data as loaded (no standardization is applied in this cell).
features = digits.data

# Factorize into 10 components; random_state=1 fixes the seed.
nmf = NMF(n_components=10, random_state=1)
features_nmf = nmf.fit_transform(features)

# Show the number of features before and after the reduction.
print('Original:', features.shape[1])
print('Reduced:', features_nmf.shape[1])

Original: 64
Reduced: 10


In [9]:
# Feature reduction for sparse data.
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
import numpy as np

# Standardize the digits feature matrix.
features = StandardScaler().fit_transform(digits.data)

# Convert to a CSR sparse matrix. NOTE: the name is spelled "spares" (sic);
# it is left unchanged so any other cell that references it keeps working.
features_spares = csr_matrix(features)

# Create the truncated SVD. Its default solver is randomized, so the seed is
# pinned (as the NMF and make_circles cells do) to make reruns reproducible.
tsvd = TruncatedSVD(n_components=10, random_state=1)

# Fit the truncated SVD on the sparse matrix and project it.
features_sparse_tsvd = tsvd.fit_transform(features_spares)

# Show the number of features before and after the reduction.
print('Original:', features_spares.shape[1])
print('Reduced:', features_sparse_tsvd.shape[1])

Original: 64
Reduced: 10
