In [2]:
# PLS analyses two datasets whose dimensionality may differ but which are
# related to each other.
# Print the module docstring (in an IPython session this is the
# auto-generated module description).
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_decomposition import PLSCanonical, PLSRegression, CCA

# #############################################################################
# Dataset based on a latent-variable model: build two datasets that are
# correlated both with each other and internally.
# NOTE(review): no random seed is set in the visible code, so the numbers
# differ from run to run -- confirm whether that is intended.

n = 500
# 2 latent variables
l1 = np.random.normal(size=n)
l2 = np.random.normal(size=n)

# Columns 0-1 are driven by l1 and columns 2-3 by l2.
latents = np.vstack((l1, l1, l2, l2)).T
X = latents + np.random.normal(size=(n, 4))  # X and Y both have shape (500, 4)
Y = latents + np.random.normal(size=(n, 4))

# First half of the rows is the training split, second half the test split
# (each split has shape (250, 4)).
half = n // 2
X_train, X_test = X[:half], X[half:]
Y_train, Y_test = Y[:half], Y[half:]

# Feature-by-feature correlation matrices, rounded to two decimals.
print("Corr(X)")
print(np.corrcoef(X, rowvar=False).round(2))
print("Corr(Y)")
print(np.corrcoef(Y, rowvar=False).round(2))

Automatically created module for IPython interactive environment
Corr(X)
[[ 1.    0.44  0.    0.03]
 [ 0.44  1.   -0.01  0.08]
 [ 0.   -0.01  1.    0.55]
 [ 0.03  0.08  0.55  1.  ]]
Corr(Y)
[[ 1.    0.39 -0.09 -0.01]
 [ 0.39  1.    0.01  0.02]
 [-0.09  0.01  1.    0.54]
 [-0.01  0.02  0.54  1.  ]]


In [15]:
# Inspect the dimensions of X (samples x features).
X.shape

(500, 4)

In [3]:
# Transform data
# ~~~~~~~~~~~~~~
# Build a two-component PLSCanonical model and fit it on the training split.
plsca = PLSCanonical(n_components=2).fit(X_train, Y_train)

# Project both splits onto the fitted components; each transformed array has
# one column per component.
X_train_r, Y_train_r = plsca.transform(X_train, Y_train)
X_test_r, Y_test_r = plsca.transform(X_test, Y_test)

In [12]:
# Transformed test scores of Y: one column per PLS component.
Y_test_r.shape

(250, 2)

In [16]:
# Scatter-plot matrix of the PLSCanonical scores, drawn on ONE 2x2 figure.
#
# Fixes relative to the previous version of this cell:
#   * the unterminated plt.xlabel("X comp. 1" call (a SyntaxError) is closed;
#   * the stray plt.show() between the diagonal and off-diagonal panels is
#     removed, so all four panels land on the same 12x8 figure;
#   * the four copy-pasted panels share one helper.


def _scatter_scores(position, train_x, train_y, test_x, test_y,
                    marker, size, xlabel, ylabel, title_fmt):
    """Draw one train-vs-test scatter panel of the score matrix.

    position   -- subplot code passed to plt.subplot (e.g. 221)
    train_x/y  -- 1-D score arrays of the training split (plotted in blue)
    test_x/y   -- 1-D score arrays of the test split (plotted in red)
    marker     -- matplotlib marker symbol
    size       -- marker size passed as ``s``
    xlabel     -- x-axis label
    ylabel     -- y-axis label
    title_fmt  -- %-format string with one ``%.2f`` slot, filled with the
                  test-split correlation between test_x and test_y
    """
    plt.subplot(position)
    plt.scatter(train_x, train_y, label="train", marker=marker, c="b", s=size)
    plt.scatter(test_x, test_y, label="test", marker=marker, c="r", s=size)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title_fmt % np.corrcoef(test_x, test_y)[0, 1])
    # Scores have no meaningful scale here, so hide the tick marks.
    plt.xticks(())
    plt.yticks(())
    plt.legend(loc="best")


plt.figure(figsize=(12, 8))

# 1) On diagonal plot X vs Y scores on each components
_scatter_scores(221,
                X_train_r[:, 0], Y_train_r[:, 0],
                X_test_r[:, 0], Y_test_r[:, 0],
                "o", 25, "x scores", "y scores",
                'Comp. 1: X vs Y (test corr = %.2f)')
_scatter_scores(224,
                X_train_r[:, 1], Y_train_r[:, 1],
                X_test_r[:, 1], Y_test_r[:, 1],
                "o", 25, "x scores", "y scores",
                'Comp. 2: X vs Y (test corr = %.2f)')

# 2) Off diagonal plot components 1 vs 2 for X and Y
_scatter_scores(222,
                X_train_r[:, 0], X_train_r[:, 1],
                X_test_r[:, 0], X_test_r[:, 1],
                "*", 50, "X comp. 1", "X comp. 2",
                'X comp. 1 vs X comp. 2 (test corr = %.2f)')
_scatter_scores(223,
                Y_train_r[:, 0], Y_train_r[:, 1],
                Y_test_r[:, 0], Y_test_r[:, 1],
                "*", 50, "Y comp. 1", "Y comp. 2",
                'Y comp. 1 vs Y comp. 2 , (test corr = %.2f)')

# Render the figure once, after all four panels exist.
plt.show()

<matplotlib.legend.Legend at 0x7f358ccdbe48>

In [18]:
# PLS regression, with multivariate response, a.k.a. PLS2: recover the
# coefficient matrix B.
n, q, p = 1000, 3, 10
X = np.random.normal(size=(n, p))  # predictors, shape (1000, 10)

# True coefficients, shape (10, 3): every response column is [1, 2, 0, ..., 0].
beta_column = [1, 2] + [0] * (p - 2)
B = np.array([beta_column] * q).T

# each Yj = 1*X1 + 2*X2 + noise, shifted by a constant 5; Y has shape (1000, 3)
noise = np.random.normal(size=(n, q))
Y = X @ B + noise + 5

# 1) build the estimator and 2) fit it on the two variable groups
pls2 = PLSRegression(n_components=3).fit(X, Y)

print("True B (such that: Y = XB + Err)")
print(B)
# compare pls2.coef_ with B
print("Estimated B")
print(pls2.coef_.round(1))  # 3) estimated coefficients of the fitted model

# 4) predictions for X, which can be compared against Y
pls2.predict(X)


True B (such that: Y = XB + Err)
[[1 1 1]
 [2 2 2]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]]
Estimated B
[[ 1.   1.   1. ]
 [ 2.   2.   2. ]
 [-0.  -0.1 -0. ]
 [ 0.   0.   0. ]
 [ 0.   0.1  0. ]
 [ 0.   0.1  0. ]
 [-0.  -0.  -0. ]
 [ 0.   0.   0. ]
 [-0.  -0.  -0. ]
 [-0.   0.  -0. ]]


array([[ 11.21766342,  11.17820073,  11.23946269],
       [ 10.26093161,  10.36890095,  10.31673387],
       [  5.10036084,   5.41543701,   5.16181511],
       ..., 
       [  6.61104897,   6.55847013,   6.60367931],
       [  4.26116591,   4.17009048,   4.23844096],
       [  7.16917081,   7.36383648,   7.22391533]])

In [19]:
# Display the observed responses Y, for comparison with pls2.predict(X).
Y


array([[ 12.99191204,   9.69847327,  10.43729544],
       [ 11.19854231,  10.7012581 ,  10.34205222],
       [  5.3577617 ,   4.37670438,   5.14891213],
       ..., 
       [  6.787191  ,   7.67052644,   5.99125415],
       [  3.36314466,   4.41601134,   7.56772785],
       [  8.16989952,   8.60363434,   8.39949284]])

In [25]:
# PLS regression, with univariate response, a.k.a. PLS1
n = 1000
p = 10
X = np.random.normal(size=n * p).reshape((n, p))  # predictors, shape (1000, 10)
# y = 1*X1 + 2*X2 + noise + 5; y is a 1-D array of length 1000
y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size=n * 1) + 5

# 1) build the estimator
pls1 = PLSRegression(n_components=3)
# 2) fit the model
pls1.fit(X, y)
# note that the number of components exceeds 1 (the dimension of y)
print("Estimated betas")
print(np.round(pls1.coef_, 1))  # 3) estimated coefficients

# 4) predict once and reuse the result (the earlier version called predict
#    twice and threw the first result away); compare y_pred with y
y_pred = pls1.predict(X)
y_pred.shape

Estimated betas
[[ 1. ]
 [ 2. ]
 [-0. ]
 [ 0. ]
 [ 0. ]
 [-0. ]
 [ 0. ]
 [ 0. ]
 [ 0.1]
 [-0. ]]


(1000, 1)

In [26]:
# The target y is 1-D, unlike the 2-D array returned by pls1.predict(X).
y.shape

(1000,)