In [None]:
import numpy as np

from sklearn.datasets import load_breast_cancer

# Load scikit-learn's bundled breast-cancer dataset; later cells read its
# .data, .target and .feature_names attributes.
data = load_breast_cancer()


In [None]:
# Show the names of the feature columns.
data.feature_names

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import pandas as pd

# scatter_matrix lives in pandas.plotting; the old pandas.tools.plotting
# module was deprecated in pandas 0.20 and has since been removed, so try the
# current location first and fall back only for very old installations.
try:
    from pandas.plotting import scatter_matrix
except ImportError:  # pandas < 0.20
    from pandas.tools.plotting import scatter_matrix

In [None]:
# Pairwise scatter plots for the first ten feature columns.
df = pd.DataFrame(data=data.data[:, :10], columns=data.feature_names[:10])
scatter_matrix(frame=df, figsize=(10, 10));

In [None]:
# Pairwise scatter plots for feature columns 6 and 7 only.
df = pd.DataFrame(data=data.data[:, 6:8], columns=data.feature_names[6:8])
scatter_matrix(frame=df, figsize=(3, 3));

In [None]:
# Pairwise scatter plots for feature columns 0 and 2 only.
df = pd.DataFrame(data=data.data[:, [0, 2]], columns=data.feature_names[[0, 2]])
scatter_matrix(frame=df, figsize=(3, 3));

In [None]:
# Work with feature columns 0 and 2 only; keep the labels and the matching
# column names alongside.
X, y, names = data.data[:, [0, 2]], data.target, data.feature_names[[0, 2]]

In [None]:
# Display the shapes of the selected feature matrix and the label vector.
X.shape, y.shape

In [None]:
# Scatter of the two selected features against each other, with fixed axis
# ranges and the feature names as axis labels.
plt.scatter(x=X[:, 0], y=X[:, 1])
plt.xlabel(names[0])
plt.xlim(left=0, right=180)
plt.ylim(bottom=20, top=200)
plt.ylabel(names[1])

In [None]:
from sklearn.decomposition import PCA

In [None]:
# PCA estimator with default settings (no n_components given); the same
# object is re-fitted on different data in later cells.
pca = PCA()

In [None]:
# Fit the PCA on the currently selected feature matrix X.
pca.fit(X)

In [None]:
# Transform X with the fitted PCA (coordinates along the principal axes).
X_new = pca.transform(X)

In [None]:
# Scatter of the data in principal-component coordinates.
plt.scatter(x=X_new[:, 0], y=X_new[:, 1])
plt.ylim(bottom=-60, top=120)

In [None]:
# Variance captured by each principal component.
pca.explained_variance_

In [None]:
# Normalise the variances by hand so they sum to one; expected to agree with
# pca.explained_variance_ratio_ shown in the next cell.
pca.explained_variance_ / pca.explained_variance_.sum()

In [None]:
# Share of the total variance attributed to each principal component.
pca.explained_variance_ratio_

In [None]:
# Switch to feature columns 6 and 7 and plot them against each other.
X, y, names = data.data[:, [6, 7]], data.target, data.feature_names[[6, 7]]

plt.scatter(x=X[:, 0], y=X[:, 1])
plt.xlabel(names[0])
plt.xlim(left=0, right=0.5)
plt.ylim(bottom=0, top=0.5)
plt.ylabel(names[1])

In [None]:
# Re-fit the PCA on the new pair of features and plot the projected points.
# fit() returns the estimator itself, so the calls can be chained.
X_new = pca.fit(X).transform(X)
plt.scatter(x=X_new[:, 0], y=X_new[:, 1])
plt.xlim(left=-0.1, right=0.4)
plt.ylim(bottom=-0.25, top=0.25)

In [None]:
# Variance captured by each principal component for the re-fitted PCA.
pca.explained_variance_

In [None]:
pca.explained_variance_ratio_ # contribution ratio (share of variance per component)

In [None]:
# Cross-check the PCA result with an explicit eigen-decomposition.
# Step 1: per-column mean of the feature matrix.
m = np.mean(X, axis=0)

In [None]:
# Centre the data by removing the column means.
Xp = X - m

In [None]:
# Sample covariance matrix of the centred data. Dividing by (n_samples - 1)
# puts its eigenvalues on the same scale as pca.explained_variance_, so the
# cross-check against the PCA output is a direct comparison rather than one
# that only agrees after normalisation.
C = Xp.T.dot(Xp) / (Xp.shape[0] - 1)

In [None]:
# C is symmetric, so use the symmetric solver: it returns real eigenvalues in
# ascending order. Reverse them to get the largest-first ordering that PCA
# uses. Only the eigenvalues are needed, and not assigning to `_` keeps
# IPython's last-output variable intact.
w = np.linalg.eigvalsh(C)[::-1]

In [None]:
# Display the eigenvalues computed in the previous cell.
w

In [None]:
w / w.sum() # contribution ratio (each eigenvalue as a share of the total)

In [None]:
# From here on, use every feature column of the dataset.
X, y = data.data, data.target

from sklearn.model_selection import ShuffleSplit

# One shuffled 80/20 split; random_state is fixed so the split is repeatable.
ss = ShuffleSplit(n_splits=1, train_size=0.8, test_size=0.2, random_state=0)
train_index, test_index = next(ss.split(X, y))

X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]



In [None]:
# Fit the PCA on the training split only.
pca.fit(X_train)

In [None]:
# Plot the explained-variance ratio of each principal component.
plt.plot(pca.explained_variance_ratio_)

In [None]:
# Cumulative explained-variance ratio over the principal components.
plt.plot(np.cumsum(pca.explained_variance_ratio_))

In [None]:
# Transform both splits with the PCA that was fitted on the training data.
X_train_pca, X_test_pca = (pca.transform(part) for part in (X_train, X_test))

In [None]:
from sklearn import linear_model
# Logistic-regression classifier with default hyper-parameters; this single
# object is re-fitted in each of the experiments in later cells.
clf = linear_model.LogisticRegression()

In [None]:
# Train on every principal component and report test accuracy.
# fit() returns the classifier itself, so score() can be chained onto it.
clf.fit(X_train_pca, y_train).score(X_test_pca, y_test)

In [None]:
# Train on the first principal component only and report test accuracy.
clf.fit(X_train_pca[:, :1], y_train).score(X_test_pca[:, :1], y_test)

In [None]:
# Train on the first three principal components and report test accuracy.
clf.fit(X_train_pca[:, :3], y_train).score(X_test_pca[:, :3], y_test)

In [None]:
# Test accuracy of the classifier as a function of how many leading principal
# components it is given.
# The upper bound comes from the projected data rather than a hard-coded 30:
# column slices clamp silently, so a stale constant would produce duplicated
# (or missing) points without raising any error.
i_range = range(1, X_train_pca.shape[1] + 1)

scores = np.array([
    clf.fit(X_train_pca[:, :i], y_train).score(X_test_pca[:, :i], y_test)
    for i in i_range
])

In [None]:
# Accuracy versus number of principal components used, zoomed to 0.7-1.0.
plt.plot(i_range, scores)
plt.ylim(bottom=0.7, top=1);

In [None]:
# Train on the first two principal components and report test accuracy.
clf.fit(X_train_pca[:, :2], y_train).score(X_test_pca[:, :2], y_test)

In [None]:
from sklearn.preprocessing import PolynomialFeatures

degree 1: $x_1, x_2, x_3$

degree 2: $x_1 x_2, x_1 x_3, x_2 x_3$

degree 3: $x_1 x_2 x_3$

degree 1: $x_1, x_2, x_3, x_4$

degree 2: $x_1 x_2, x_1 x_3, x_1 x_4, x_2 x_3, x_2 x_4, x_3 x_4$

degree 3: $x_1 x_2 x_3, x_1 x_2 x_4, x_1 x_3 x_4, x_2 x_3 x_4$

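degree 4: $x_1 x_2 x_3 x_4$

Note: the lists above show only the interaction (cross) terms. With its default settings (`interaction_only=False`, `include_bias=True`), `PolynomialFeatures` additionally generates the constant term and the pure powers such as $x_1^2$.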

In [None]:
# Polynomial feature expansion of degree 2.
polf = PolynomialFeatures(degree=2)

In [None]:
# Fit the expansion on the training features.
polf.fit(X_train)

In [None]:
# Expand both splits with the fitted polynomial transformer.
X_train_poly, X_test_poly = (polf.transform(part) for part in (X_train, X_test))

In [None]:
# Compare the training-split shape before and after the polynomial expansion.
X_train.shape, X_train_poly.shape

In [None]:
# Compare the test-split shape before and after the polynomial expansion.
X_test.shape, X_test_poly.shape

In [None]:
# Test accuracy when training on the polynomial features.
clf.fit(X_train_poly, y_train).score(X_test_poly, y_test)

In [None]:
# Baseline: test accuracy when training on the original features.
clf.fit(X_train, y_train).score(X_test, y_test)

In [None]:
# Re-fit the PCA, this time on the polynomial training features.
pca.fit(X_train_poly)

In [None]:
# Cumulative explained-variance ratio for the PCA of the polynomial features.
plt.plot(np.cumsum(pca.explained_variance_ratio_))

In [None]:
# Sweep the number of leading principal components (of the polynomial
# features) given to the classifier, recording test accuracy for each.
X_train_poly_pca = pca.transform(X_train_poly)
X_test_poly_pca  = pca.transform(X_test_poly)

n_features = X_train_poly.shape[1]

# PCA yields at most min(n_samples, n_features) components, so the projection
# can be narrower than the polynomial feature matrix. Bound the sweep by the
# projected width: a column slice past it silently returns the same full-width
# data, which would add duplicate points under misleading x-axis values.
n_components = X_train_poly_pca.shape[1]
i_range = range(1, n_components, 10)

scores = np.array([
    clf.fit(X_train_poly_pca[:, :i], y_train).score(X_test_poly_pca[:, :i], y_test)
    for i in i_range
])

In [None]:
# Accuracy curve of the sweep; the title reports the best score and the
# component count at which it was reached.
plt.plot(i_range, scores);
plt.title("max {0:.4f} at {1}".format(np.max(scores), i_range[scores.argmax()]))

In [None]:
# For polynomial degrees 2-4: expand the features, run PCA on the expansion,
# then plot test accuracy against the number of leading components used.
for d in [2, 3, 4]:
    print("d=", d)

    polf = PolynomialFeatures(degree=d)
    polf.fit(X_train)
    X_train_poly = polf.transform(X_train)
    X_test_poly  = polf.transform(X_test)

    pca.fit(X_train_poly)
    X_train_poly_pca = pca.transform(X_train_poly)
    X_test_poly_pca  = pca.transform(X_test_poly)

    scores = []
    n_features = X_train_poly.shape[1]
    # Cap the sweep at 500, but never beyond the number of components PCA
    # actually produced (at most min(n_samples, n_features)). Slicing past the
    # projected width silently repeats the full-width result.
    n_components = min(500, X_train_poly_pca.shape[1])
    i_range = range(1, n_components, 10)

    print("max dimension: ", n_features)

    print("i=", end="")
    for i in i_range:
        print(i, end=",")
        clf.fit(X_train_poly_pca[:, 0:i], y_train)
        scores.append( clf.score(X_test_poly_pca[:, 0:i], y_test) )
    print("")

    scores = np.array(scores)
    plt.plot(i_range, scores, label="d={0}".format(d))

plt.xlabel("number of principal components")
plt.ylabel("test accuracy")
plt.legend();