# 对半导体制造数据降维

In [1]:
#导入 numpy 库
import numpy as np
import codecs
from sklearn.preprocessing import Imputer

In [5]:
#解析文本数据函数
# filename 文件名txt
# delimitor  分割符号
def loadDataSet(filename,delimitor='\t'):
    #打开文本文件
    file=codecs.open(filename,'r','utf8')
    #对文本文件的每一行分割开来 存入列表中  作为列表的某一行
    #行中的每一列对应各个分隔开的特征
    matrics=[line.strip().split(delimitor) for line in file.readlines()]
    #利用map函数将每个line 映射为 float
    data=[list( map(float,line)) for line in matrics]
    #将float的值转为矩阵
    return np.array(data)

In [6]:
#数据清洗_处理缺失值数据
def clean_NanData(dataSet):
    imp=Imputer(missing_values='NaN',strategy='mean',axis=0)
    imp.fit(dataSet)  #模型
    result=imp.transform(dataSet)
    return result

In [7]:
#解析数据
dataSet=loadDataSet('secom.data',' ')
print("清理前的：",dataSet[0])
result=clean_NanData(dataSet)
print("清理后的",result[0])

清理前的： [  3.03093000e+03   2.56400000e+03   2.18773330e+03   1.41112650e+03
   1.36020000e+00   1.00000000e+02   9.76133000e+01   1.24200000e-01
   1.50050000e+00   1.62000000e-02  -3.40000000e-03   9.45500000e-01
   2.02439600e+02   0.00000000e+00   7.95580000e+00   4.14871000e+02
   1.00433000e+01   9.68000000e-01   1.92396300e+02   1.25190000e+01
   1.40260000e+00  -5.41900000e+03   2.91650000e+03  -4.04375000e+03
   7.51000000e+02   8.95500000e-01   1.77300000e+00   3.04900000e+00
   6.42333000e+01   2.02220000e+00   1.63200000e-01   3.51910000e+00
   8.33971000e+01   9.51260000e+00   5.06170000e+01   6.42588000e+01
   4.93830000e+01   6.63141000e+01   8.69555000e+01   1.17513200e+02
   6.12900000e+01   4.51500000e+00   7.00000000e+01   3.52717300e+02
   1.01841000e+01   1.30369100e+02   7.23309200e+02   1.30720000e+00
   1.41228200e+02   1.00000000e+00   6.24314500e+02   2.18317400e+02
   0.00000000e+00   4.59200000e+00   4.84100000e+00   2.83400000e+03
   9.31700000e-01   9.484000

In [8]:
result.shape

(1567, 590)

In [10]:
#输出到pca的中间结果
n=2
meanValues=np.mean(result,axis=0)
meanRemoved=result-meanValues
covValues=np.cov(meanRemoved,rowvar=0)  #  求协方差矩阵
eigValues,eigVectors=np.linalg.eig(np.mat(covValues))   #求特征值与向量
print('特征值')
print(eigValues)
print(shape(eigValues))
print('前',n,'个特征所占的比重:',np.sum(eigValues[0:n])/np.sum(eigValues) )
print('特征向量')
print(eigVectors)
print(shape(eigVectors))

'''
从上面的特征值  我们看到如下的信息
1.里面很多值都是0，这意味着这些特征值都是其他特征的副本，都可以通过其他的特征来表示，其本身没有提供额外的信息
2.可以看到最前面的15个特征值得数量级大于105  后面的特征值都特别小  这表明  这里面特征只有部分是重要特征
'''

特征值
[  5.34151979e+07 +0.00000000e+00j   2.17466719e+07 +0.00000000e+00j
   8.24837662e+06 +0.00000000e+00j   2.07388086e+06 +0.00000000e+00j
   1.31540439e+06 +0.00000000e+00j   4.67693557e+05 +0.00000000e+00j
   2.90863555e+05 +0.00000000e+00j   2.83668601e+05 +0.00000000e+00j
   2.37155830e+05 +0.00000000e+00j   2.08513836e+05 +0.00000000e+00j
   1.96098849e+05 +0.00000000e+00j   1.86856549e+05 +0.00000000e+00j
   1.52422354e+05 +0.00000000e+00j   1.13215032e+05 +0.00000000e+00j
   1.08493848e+05 +0.00000000e+00j   1.02849533e+05 +0.00000000e+00j
   1.00166164e+05 +0.00000000e+00j   8.33473762e+04 +0.00000000e+00j
   8.15850591e+04 +0.00000000e+00j   7.76560524e+04 +0.00000000e+00j
   6.66060410e+04 +0.00000000e+00j   6.52620058e+04 +0.00000000e+00j
   5.96776503e+04 +0.00000000e+00j   5.16269933e+04 +0.00000000e+00j
   5.03324580e+04 +0.00000000e+00j   4.54661746e+04 +0.00000000e+00j
   4.41914029e+04 +0.00000000e+00j   4.15532551e+04 +0.00000000e+00j
   3.55294040e+04 +0.00000000e

NameError: name 'shape' is not defined

In [11]:
'''
以下函数实现降维过程，首先根据数据矩阵的协方差的特征值和特征向量，得到最大的N个特征值对应特征向量组成的矩阵，可以称为压缩矩阵
得到了压缩矩阵之后，将去均值的数矩阵乘以压缩矩阵，就实现了原始数据特征转化为新的空间特征，进而使数据特征得到了压缩处理

然后在根据压缩矩阵和特征均值，反构得到了原始数据矩阵   通过这样的方式可以用于调试和验证
'''
#pca 封装函数
def pca(dataMatrics,topNfeat=4096):
    '''
    dataMatrics  待降维矩阵
    topNfeat    要保留的特征数
    '''
    #求数据矩阵每一列的均值
    meanValues=np.mean(dataMatrics,axis=0)
    #数据矩阵每一列特征减去该列的特征均值
    meanRemoved=dataMatrics-meanValues
    #计算协方差矩阵，除数n-1是为了得到协方差的无偏估计
    #  cov(x,0)=cov(x)  除数  n-1  （n为样本个数）
    #  cov(x,1)  除数是n
    covMatrics=np.cov(meanRemoved,rowvar=0)
    #计算协方差矩阵的特征值及其对应的特征向量均保持在相应的矩阵中
    eigValues,eigVectors=np.linalg.eig(np.mat(covMatrics) )
    #sort()  对特征值矩阵排序(由小到大)
    #argsort(): 对特征值矩阵进行由小到大排序  返回对应排序后的索引
    eigValInd=np.argsort(eigValues)
    #从排好序的矩阵最后一个开始自下而上选取最大的N个特征值  返回其对应的索引
    eigValInd=eigValInd[:-(topNfeat+1):-1]
    #将特征值最大的N个特征值对应索引的特征向量提取出来，组成亚索矩阵
    redEigVects=eigVectors[:,eigValInd]
    #将去除均值后的   数据矩阵*压缩矩阵  转换到新的空间，使维度降低到N
    lowDDataMatrics=meanRemoved*redEigVects
    #利用降维后的矩阵反构出原数据矩阵
    recoverMat=(lowDDataMatrics*redEigVects.T)+meanValues
    #返回值：降维后的低维矩阵,还原的矩阵
    return lowDDataMatrics,recoverMat

In [14]:
print('pca降维')
n=20
newMat,recoverMat=pca(result,n)
print('只保留前',n,'个特征后的新低维数据：')
print(newMat)
print(newMat.shape)
print('还原后的高维近似数据：')
print(recoverMat)
print(recoverMat.shape)

pca降维
只保留前 20 个特征后的新低维数据：
[[  5.18389617e+03+0.j   3.02264772e+03+0.j  -6.88386243e+02+0.j ...,
   -4.36932721e+02+0.j  -1.09048232e+02+0.j  -5.37639954e+02+0.j]
 [  1.86669728e+03+0.j   4.02163902e+03+0.j   1.50557353e+03+0.j ...,
   -1.08154052e+02+0.j   2.31447780e+02+0.j   2.74388089e+02+0.j]
 [  3.15474165e+03+0.j   3.46198582e+03+0.j   1.85544208e+03+0.j ...,
   -4.07803532e+01+0.j   4.03787771e+02+0.j  -4.18093198e+02+0.j]
 ..., 
 [  3.82121714e+03+0.j   1.57303288e+02+0.j   1.19846485e+03+0.j ...,
   -4.78419433e+01+0.j  -1.73710715e+01+0.j   1.51656937e+01+0.j]
 [  4.27104024e+03+0.j   1.30047276e+03+0.j  -3.81634520e+02+0.j ...,
   -8.00295443e+01+0.j   1.01895360e+00+0.j  -9.57019615e+00+0.j]
 [  3.56287329e+03+0.j   3.72760720e+03+0.j   4.18435474e+02+0.j ...,
   -1.44795769e+02+0.j  -9.48838494e+01+0.j  -1.09570868e+01+0.j]]
(1567, 20)
还原后的高维近似数据：
[[  3.01047911e+03+0.j   2.50672705e+03+0.j   2.20329423e+03+0.j ...,
    1.49018647e-02+0.j   4.74453671e-03+0.j   1.03157960e

# PCA 算法模拟：利用奇异值分解

In [15]:
A=loadDataSet('secom.data',' ')
A[0:1]

array([[  3.03093000e+03,   2.56400000e+03,   2.18773330e+03,
          1.41112650e+03,   1.36020000e+00,   1.00000000e+02,
          9.76133000e+01,   1.24200000e-01,   1.50050000e+00,
          1.62000000e-02,  -3.40000000e-03,   9.45500000e-01,
          2.02439600e+02,   0.00000000e+00,   7.95580000e+00,
          4.14871000e+02,   1.00433000e+01,   9.68000000e-01,
          1.92396300e+02,   1.25190000e+01,   1.40260000e+00,
         -5.41900000e+03,   2.91650000e+03,  -4.04375000e+03,
          7.51000000e+02,   8.95500000e-01,   1.77300000e+00,
          3.04900000e+00,   6.42333000e+01,   2.02220000e+00,
          1.63200000e-01,   3.51910000e+00,   8.33971000e+01,
          9.51260000e+00,   5.06170000e+01,   6.42588000e+01,
          4.93830000e+01,   6.63141000e+01,   8.69555000e+01,
          1.17513200e+02,   6.12900000e+01,   4.51500000e+00,
          7.00000000e+01,   3.52717300e+02,   1.01841000e+01,
          1.30369100e+02,   7.23309200e+02,   1.30720000e+00,
        

In [16]:
A=clean_NanData(A)
A[0:1]

array([[  3.03093000e+03,   2.56400000e+03,   2.18773330e+03,
          1.41112650e+03,   1.36020000e+00,   1.00000000e+02,
          9.76133000e+01,   1.24200000e-01,   1.50050000e+00,
          1.62000000e-02,  -3.40000000e-03,   9.45500000e-01,
          2.02439600e+02,   0.00000000e+00,   7.95580000e+00,
          4.14871000e+02,   1.00433000e+01,   9.68000000e-01,
          1.92396300e+02,   1.25190000e+01,   1.40260000e+00,
         -5.41900000e+03,   2.91650000e+03,  -4.04375000e+03,
          7.51000000e+02,   8.95500000e-01,   1.77300000e+00,
          3.04900000e+00,   6.42333000e+01,   2.02220000e+00,
          1.63200000e-01,   3.51910000e+00,   8.33971000e+01,
          9.51260000e+00,   5.06170000e+01,   6.42588000e+01,
          4.93830000e+01,   6.63141000e+01,   8.69555000e+01,
          1.17513200e+02,   6.12900000e+01,   4.51500000e+00,
          7.00000000e+01,   3.52717300e+02,   1.01841000e+01,
          1.30369100e+02,   7.23309200e+02,   1.30720000e+00,
        

In [17]:
#计算协方差矩阵
C=np.cov(A,rowvar=0)
C

array([[  5.39940056e+03,  -8.47962623e+02,   1.02671010e+01, ...,
         -1.67440688e-02,  -5.93197815e-03,   2.87879850e+01],
       [ -8.47962623e+02,   6.43649877e+03,   1.35942679e+01, ...,
          1.21967287e-02,   2.32652705e-03,   3.37335304e+02],
       [  1.02671010e+01,   1.35942679e+01,   8.63239193e+02, ...,
         -7.59126039e-03,  -2.59521865e-03,  -9.07023669e+01],
       ..., 
       [ -1.67440688e-02,   1.21967287e-02,  -7.59126039e-03, ...,
          7.75231441e-05,   2.45865358e-05,   3.22979001e-01],
       [ -5.93197815e-03,   2.32652705e-03,  -2.59521865e-03, ...,
          2.45865358e-05,   8.21484994e-06,   1.04706789e-01],
       [  2.87879850e+01,   3.37335304e+02,  -9.07023669e+01, ...,
          3.22979001e-01,   1.04706789e-01,   8.81006310e+03]])

In [18]:
C=np.dot(A.T,A)
C

array([[  1.42476689e+10,   1.17881893e+10,   1.03946263e+10, ...,
          7.77954220e+04,   2.49473158e+04,   4.70851360e+08],
       [  1.17881893e+10,   9.77134310e+09,   8.60635593e+09, ...,
          6.44524062e+04,   2.06667458e+04,   3.90337629e+08],
       [  1.03946263e+10,   8.60635593e+09,   7.58940595e+09, ...,
          5.67978265e+04,   1.82142305e+04,   3.43546029e+08],
       ..., 
       [  7.77954220e+04,   6.44524062e+04,   5.67978265e+04, ...,
          5.46720252e-01,   1.74897972e-01,   3.07888498e+03],
       [  2.49473158e+04,   2.06667458e+04,   1.82142305e+04, ...,
          1.74897972e-01,   5.66050836e-02,   9.89137650e+02],
       [  4.70851360e+08,   3.90337629e+08,   3.43546029e+08, ...,
          3.07888498e+03,   9.89137650e+02,   2.93633282e+07]])

In [20]:
# 对协方差矩阵进行奇异值分解得到特征向量U
U,S,V=np.linalg.svd(C)
print(np.shape(U))
print('特征向量U',U)
print('奇异值',S)

(590, 590)
特征向量U [[ -1.91438429e-01  -8.90909168e-02  -7.86943130e-02 ...,  -1.51395837e-09
   -4.12374079e-09  -1.55314502e-08]
 [ -1.58455424e-01  -7.43927238e-02  -6.46661696e-02 ...,  -2.03895778e-07
    1.15572618e-07   7.76085130e-08]
 [ -1.39663949e-01  -6.57311230e-02  -5.78244633e-02 ...,   9.02966037e-07
   -7.77863624e-08   6.62918764e-07]
 ..., 
 [ -1.04418320e-06  -5.20651509e-07  -4.35089604e-07 ...,  -1.21405939e-02
    9.06673967e-03  -2.51635673e-03]
 [ -3.35008866e-07  -1.63366439e-07  -1.47472066e-07 ...,   3.65224594e-02
   -2.80282158e-02   6.77754446e-03]
 [ -6.30520268e-03  -3.43211242e-03  -2.51913217e-03 ...,   1.75564216e-08
    1.67556288e-08   5.30598125e-08]]
奇异值 [  3.67352695e+11   7.05784844e+10   3.05930686e+10   1.29105409e+10
   2.93124016e+09   2.04041652e+09   7.32396872e+08   4.55490118e+08
   4.27766150e+08   3.58446528e+08   3.26305561e+08   2.99648769e+08
   2.92364892e+08   1.77354439e+08   1.69907461e+08   1.61156019e+08
   1.57193562e+08   1.3

In [22]:
#原有590个特征
oldn=590
#保留的特征数
n=20
print('还原率：',np.sum(S[0:n])/np.sum(S))
U_reduce=U[:,0:n].reshape(oldn,n)
print(np.shape(U_reduce))
U_reduce

还原率： 0.997782769785
(590, 20)


array([[ -1.91438429e-01,  -8.90909168e-02,  -7.86943130e-02, ...,
          1.49561848e-02,  -1.17834672e-02,   1.18394406e-02],
       [ -1.58455424e-01,  -7.43927238e-02,  -6.46661696e-02, ...,
          8.45945870e-03,  -1.57044486e-02,   2.10535880e-02],
       [ -1.39663949e-01,  -6.57311230e-02,  -5.78244633e-02, ...,
          1.32791236e-02,  -2.44579145e-03,   1.17822441e-02],
       ..., 
       [ -1.04418320e-06,  -5.20651509e-07,  -4.35089604e-07, ...,
         -6.88690100e-07,   4.70935761e-07,  -2.03945606e-06],
       [ -3.35008866e-07,  -1.63366439e-07,  -1.47472066e-07, ...,
         -2.48865018e-07,   1.41403549e-07,  -6.04526137e-07],
       [ -6.30520268e-03,  -3.43211242e-03,  -2.51913217e-03, ...,
         -7.27704556e-03,   2.84480816e-03,   2.38974104e-03]])

In [23]:
#利用主成分特征矩阵来进行降维处理，得到的结果就是一个降维后的数据
Z=np.dot(A,U_reduce)
print(np.shape(Z))
Z

(1567, 20)


array([[ -1.16479384e+04,  -5.22680871e+03,  -3.74826764e+03, ...,
         -4.21440992e+02,  -1.34337995e+02,   5.40334982e+02],
       [ -1.24833511e+04,  -1.86484154e+03,  -4.22575485e+03, ...,
         -9.13171257e+01,   2.16346930e+02,  -2.35946380e+02],
       [ -1.26570861e+04,  -3.42116855e+03,  -4.06491658e+03, ...,
         -7.19991782e+01,   4.06130425e+02,   4.01908141e+02],
       ..., 
       [ -1.30333250e+04,  -4.65986722e+03,  -9.41440980e+02, ...,
         -4.89508785e+01,  -1.84272561e+01,  -1.55170387e+01],
       [ -1.24533510e+04,  -4.77016653e+03,  -2.02594816e+03, ...,
         -8.10711888e+01,  -2.01348452e+00,   1.11255379e+01],
       [ -1.17065440e+04,  -3.39866634e+03,  -4.06510839e+03, ...,
         -9.06562469e+01,  -1.23072048e+02,   5.67544699e+01]])