In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn

In [None]:
# Call mglearn's packaged `plot_scaling` helper; what it draws is defined inside mglearn, not in this notebook.
mglearn.plots.plot_scaling()

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
# Load the breast-cancer dataset object; later cells read its .data, .target and .feature_names.
cancer=load_breast_cancer()

In [None]:
# Reload the dataset and hold out a test split (random_state=1 fixes the shuffle).
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=1
)
# Report the shape of each split.
for split in (X_train, X_test):
    print(split.shape)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the scaling parameters from the training split only.
scaler = MinMaxScaler()
scaler.fit(X_train)

In [None]:
# Apply the scaler that was fitted on the training split.
X_train_scaled = scaler.transform(X_train)

# Per-feature minimum, before and after scaling.
print(X_train.min(axis=0))
print(X_train_scaled.min(axis=0))

# Per-feature maximum, before and after scaling
# (the scaled line must report the maximum, not the minimum again).
print(X_train.max(axis=0))
print(X_train_scaled.max(axis=0))

# Per-feature mean, before and after scaling.
print(np.average(X_train, axis=0))
print(np.average(X_train_scaled, axis=0))


In [None]:
from sklearn.datasets import make_blobs

# Small synthetic dataset; the labels returned by make_blobs are not needed here.
X, _ = make_blobs(n_samples=50, centers=5, random_state=4, cluster_std=2)
X_train, X_test = train_test_split(X, random_state=5, test_size=.1)

# Correct: one scaler, fitted on the training split, applied to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Deliberately wrong: a second scaler fitted on the test split itself.
test_scaler = MinMaxScaler().fit(X_test)
X_test_scaled_bad = test_scaler.transform(X_test)

fig, axes = plt.subplots(ncols=3, nrows=1, figsize=(9, 3), tight_layout=True)

# One [train, test] pair per panel, in the same order as the titles.
datasets = [
    [X_train, X_test],
    [X_train_scaled, X_test_scaled],
    [X_train_scaled, X_test_scaled_bad],
]
titles = ['original', 'correct scaled', 'bad scaled']

for ax, (train_pts, test_pts), title in zip(axes.ravel(), datasets, titles):
    ax.scatter(train_pts[:, 0], train_pts[:, 1],
               color=mglearn.cm2(0), label='Training set', s=60)
    ax.scatter(test_pts[:, 0], test_pts[:, 1],
               color=mglearn.cm2(1), label='Test set', s=60, marker='^')
    ax.set_title(title)
axes[0].legend(loc='upper left')

In [None]:
from sklearn.svm import SVC

# Fresh split of the cancer data (random_state=0).
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0
)

# Baseline: SVC trained directly on the unscaled features.
svc = SVC(C=100)
svc.fit(X_train, y_train)
test_accuracy = svc.score(X_test, y_test)
print(f'accuracy on test : {test_accuracy:.2f}')

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Fit the scaler on the training split only, then transform both splits with it.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

# Same SVC settings as the unscaled baseline, now on min-max scaled features.
svc = SVC(C=100)
svc.fit(X_train_scaled, y_train)
test_accuracy = svc.score(X_test_scaled, y_test)
print(f'accuracy on test : {test_accuracy:.2f}')

In [None]:
# Standardise using statistics learned from the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Same SVC settings again, this time on standardised features.
svc = SVC(C=100)
svc.fit(X_train_scaled, y_train)
print(f'accuracy on test : {svc.score(X_test_scaled, y_test):.2f}')

In [None]:
# Call mglearn's packaged `plot_pca_illustration` helper; the figure content is defined inside mglearn.
mglearn.plots.plot_pca_illustration()

In [None]:
# Number of values in one sample row, i.e. the size of the second axis of the data matrix.
cancer.data.shape[1]

In [None]:
# Echo the dataset's feature names (also used below as subplot titles and tick labels).
cancer.feature_names

In [None]:
fig, axes = plt.subplots(15, 2, figsize=(10, 20), tight_layout=True)

# Split the samples by class label: target 0 is plotted as malignant, target 1 as benign.
malignant = cancer.data[cancer.target == 0]
benign = cancer.data[cancer.target == 1]
ax = axes.ravel()

for i in range(30):
    panel = ax[i]
    # Bin edges are computed from all samples so both classes share the same bins.
    _, bins = np.histogram(cancer.data[:, i], bins=50)
    for samples, shade in ((malignant, 0), (benign, 2)):
        panel.hist(samples[:, i], bins=bins, color=mglearn.cm3(shade), alpha=.5)
    panel.set_title(cancer.feature_names[i])
    panel.set_yticks(())
# Axis labels and legend are attached to the first panel only.
ax[0].set_xlabel('Feature magnitude')
ax[0].set_ylabel('Frequency')
ax[0].legend(['malignant', 'benign'], loc='best')

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardise every feature, then keep the first two principal components.
X_scaled = StandardScaler().fit_transform(cancer.data)
pca = PCA(n_components=2)
pca.fit(X_scaled)
X_pca = pca.transform(X_scaled)
print(f'original shape : {X_scaled.shape}')
print(f'decomposed shape : {X_pca.shape}')

In [None]:
# One scatter call per class so each class gets its own marker and its own default colour.
for class_id, marker in ((0, 'o'), (1, '^')):
    members = cancer.target == class_id
    plt.scatter(X_pca[members, 0], X_pca[members, 1], marker=marker)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.gca().set_aspect("equal")
plt.legend(['malignant', 'benign'], loc='best')


In [None]:
# List the keys available on the loaded dataset object.
cancer.keys()

In [None]:
# Heat map of the fitted PCA components: one row per component, one column per input feature.
plt.matshow(pca.components_, cmap='viridis')
plt.yticks([0, 1], ['PC1', 'PC2'])
n_features = len(cancer.feature_names)
plt.xticks(range(n_features), cancer.feature_names, rotation=60, ha='left')
plt.colorbar()

In [None]:
from sklearn.datasets import fetch_lfw_people


In [None]:
# Load the LFW faces: only people with at least 20 images, images resized by a factor of 0.7.
people = fetch_lfw_people(min_faces_per_person=20, resize=.7, download_if_missing=True)


In [None]:
# Remember the per-image shape; later cells use it to reshape flat vectors back into images.
image_shape = people.images[0].shape
fig, axes = plt.subplots(
    2, 5,
    figsize=(15, 8),
    subplot_kw={'xticks': (), 'yticks': ()},
    tight_layout=True,
)

# zip stops at the shortest input, so only the first 10 images (one per axis) are drawn.
for panel, person_id, face in zip(axes.ravel(), people.target, people.images):
    panel.imshow(face)
    panel.set_title(people.target_names[person_id])

In [None]:
# Number of images per target id.
counts = np.bincount(people.target)
# Print three "name count" entries per output row.
for position, (count, name) in enumerate(zip(counts, people.target_names), start=1):
    row_end = '   \n' if position % 3 == 0 else '   '
    print(f'{name:25} {count:3}', end=row_end)

In [None]:
# Echo the shape of the label array.
people.target.shape

In [None]:
# Indices of (at most) the first 50 samples that share the label of sample 0.
np.where(people.target == people.target[0])[0][:50]

In [None]:
# Build a boolean mask that keeps at most 50 images per person.
mask = np.zeros(people.target.shape, dtype=bool)
for person_id in np.unique(people.target):
    keep = np.where(people.target == person_id)[0][:50]
    mask[keep] = 1
# Select the kept samples and divide the pixel values by 255.
# NOTE(review): this assumes raw 0-255 pixel values — confirm for the installed scikit-learn version.
X_people = people.data[mask] / 255.
y_people = people.target[mask]

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Stratify on the labels so both splits keep the same per-person proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X_people, y_people, stratify=y_people, random_state=0
)
# 1-nearest-neighbour classifier on the raw pixel vectors.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
print(f'1-nn accuracy on train:{knn.score(X_train,y_train):.2f}')
print(f'1-nn accuracy on test:{knn.score(X_test,y_test):.2f}')

In [None]:
# Call mglearn's packaged `plot_pca_whitening` helper; the figure content is defined inside mglearn.
mglearn.plots.plot_pca_whitening()

In [None]:
from sklearn.decomposition import PCA

# Fit on the training images only, then project both splits onto the 100 whitened components.
pca = PCA(n_components=100, whiten=True, random_state=0)
pca.fit(X_train)
X_train_pca, X_test_pca = pca.transform(X_train), pca.transform(X_test)
print(f'X_train shape : {X_train.shape}')
print(f'X_train_pca shape : {X_train_pca.shape}')

In [None]:
# 1-nearest-neighbour classifier trained on the PCA-projected features.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train_pca, y_train)
print(f'1-nn accuracy on train:{knn.score(X_train_pca,y_train):.2f}')
print(f'1-nn accuracy on test:{knn.score(X_test_pca,y_test):.2f}')

In [None]:
fig, axes = plt.subplots(3, 5, figsize=(15, 12), subplot_kw={'xticks': (), 'yticks': ()})
# zip stops after the 15 axes, so only the first 15 components are shown.
for i, (component, ax) in enumerate(zip(pca.components_, axes.ravel())):
    # Each component is a flat vector; reshape it back to the image shape for display.
    ax.imshow(component.reshape(image_shape), cmap='viridis')
    ax.set_title(f'{i+1}.component')

In [None]:
# Call mglearn's packaged `plot_pca_faces` helper with the face splits and the image shape.
mglearn.plots.plot_pca_faces(X_train,X_test,image_shape)

In [None]:
# Scatter of the first two PCA features of the training faces, grouped by label.
first_pc, second_pc = X_train_pca[:, 0], X_train_pca[:, 1]
mglearn.discrete_scatter(first_pc, second_pc, y_train)
plt.xlabel('PC1')
plt.ylabel('PC2')

In [None]:
# Call mglearn's packaged `plot_nmf_illustration` helper; the figure content is defined inside mglearn.
mglearn.plots.plot_nmf_illustration()

In [None]:

# NOTE(review): bare tuple — as the cell's last expression it only echoes the two arrays and the image shape.
# It looks like the argument list of a call whose function name was dropped; confirm the intent.
(X_train,X_test,image_shape)

In [None]:
# Source signals from mglearn's generator, drawn as lines on a wide, short figure.
S = mglearn.datasets.make_signals()
plt.figure(figsize=(6, 1))
plt.plot(S, '-')
plt.xlabel('time')
plt.ylabel('signal')

In [None]:
# Fixed-seed random mixing matrix of shape (100, 3).
mixing_rng = np.random.RandomState(0)
A = mixing_rng.uniform(size=(100, 3))
# Mix the sources: every row of S is multiplied by the transposed mixing matrix.
X = np.dot(S, A.T)
print(f'Shape of measurements: {X.shape}')

In [None]:
from sklearn.decomposition import NMF, PCA

# Decompose the mixed measurements into three components with each method.
nmf_model = NMF(n_components=3, random_state=42)
S_ = nmf_model.fit_transform(X)
pca_model = PCA(n_components=3)
H_ = pca_model.fit_transform(X)

In [None]:
# Arrays to compare, listed in the same order as their panel titles.
models = [X, S, S_, H_]
names = [
    'observations (first three data)',
    'True source',
    'NMF recovered',
    'PCA recovered',
]
fig, axes = plt.subplots(
    nrows=4, ncols=1, figsize=(16, 8),
    gridspec_kw={'hspace': .5},
    subplot_kw={'xticks': (), 'yticks': ()},
)

# Only the first three columns of each array are drawn.
for panel, series, caption in zip(axes, models, names):
    panel.set_title(caption)
    panel.plot(series[:, :3], '-')

In [None]:
from sklearn.datasets import load_digits

digits = load_digits()
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(10, 5),
                         subplot_kw={'xticks': (), 'yticks': ()})
# zip stops after the 10 axes, so only the first ten digit images are shown.
for panel, digit_image in zip(axes.ravel(), digits.images):
    panel.imshow(digit_image)

In [None]:
# Project the digits with PCA; only the first two of the ten components are plotted.
digit_pca = PCA(n_components=10).fit_transform(digits.data)
plt.figure(figsize=(10, 10))
pc1, pc2 = digit_pca[:, 0], digit_pca[:, 1]
# Pad the axis limits by one unit on each side.
plt.xlim(pc1.min() - 1, pc1.max() + 1)
plt.ylim(pc2.min() - 1, pc2.max() + 1)
# Draw every sample as its digit label, coloured by class, instead of as a marker.
for px, py, digit in zip(pc1, pc2, digits.target):
    plt.text(px, py, str(digit),
             color=plt.cm.tab10(digit),
             fontdict={'weight': 'bold', 'size': 9},
             ha='center', va='center')
plt.xlabel('PC1')
plt.ylabel('PC2')

In [None]:
from sklearn.manifold import TSNE

# Embed the digits with t-SNE (random_state=42 fixes the embedding).
digit_tsne = TSNE(random_state=42).fit_transform(digits.data)
plt.figure(figsize=(10, 10))
# Pad the axis limits by one unit on each side.
plt.xlim(digit_tsne[:, 0].min() - 1, digit_tsne[:, 0].max() + 1)
plt.ylim(digit_tsne[:, 1].min() - 1, digit_tsne[:, 1].max() + 1)
# Draw every sample as its digit label, coloured by class, instead of as a marker.
for i in range(len(digits.data)):
    plt.text(
        digit_tsne[i, 0],
        digit_tsne[i, 1],
        str(digits.target[i]),
        color=plt.cm.tab10(digits.target[i]),
        fontdict={'weight': 'bold', 'size': 9},
        ha='center',
        va='center'
        )
# These axes are t-SNE embedding coordinates, not principal components.
plt.xlabel('t-SNE feature 0')
plt.ylabel('t-SNE feature 1')

In [None]:
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

X, y = make_blobs(random_state=1)

# random_state fixes the centroid initialisation so the labels and centres
# shown in the following cells are the same on every run.
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(X)

In [None]:
# Print the cluster label that the fitted k-means model assigned to each sample.
print(f'Cluster membership:{kmeans.labels_}')

In [None]:
# Samples grouped by assigned cluster, then the three cluster centres drawn as triangles.
mglearn.discrete_scatter(X[:, 0], X[:, 1], kmeans.labels_, markers='o')
centers = kmeans.cluster_centers_
mglearn.discrete_scatter(
    centers[:, 0], centers[:, 1], [0, 1, 2], markers='^', markeredgewidth=2
)

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(12, 6), tight_layout=True)

# One panel per cluster count, 1 through 6.
for i, ax in enumerate(axes.ravel()):
    d = i + 1
    # Fixed random_state so every panel looks the same on each run.
    kmeans = KMeans(n_clusters=d, random_state=0)
    kmeans.fit(X)
    # Samples grouped by assigned cluster, then the d centres as triangles.
    mglearn.discrete_scatter(X[:, 0], X[:, 1], kmeans.labels_, markers='o', ax=ax)
    mglearn.discrete_scatter(
        kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
        range(d), markers='^', markeredgewidth=2, ax=ax)
    ax.set_title(f'n_cluster={d}')

In [None]:
# Blobs generated with three different standard deviations.
X_varied, y_varied = make_blobs(n_samples=200, cluster_std=[1, 2.5, .5], random_state=170)

fig, axes = plt.subplots(ncols=2, figsize=(6, 3))

y_pred = KMeans(n_clusters=3, random_state=0).fit_predict(X_varied)
# Left panel: k-means assignment; right panel: the labels returned by make_blobs.
for panel, labels in zip(axes, (y_pred, y_varied)):
    mglearn.discrete_scatter(X_varied[:, 0], X_varied[:, 1], labels, ax=panel)
axes[0].set_title('pred_cluster')
axes[1].set_title('true_cluster')


In [None]:
X, y = make_blobs(random_state=170, n_samples=600)
# Apply a fixed-seed random 2x2 linear map to the blob coordinates.
rng = np.random.RandomState(74)
transformation = rng.normal(size=(2, 2))
X = np.dot(X, transformation)
# random_state fixes the centroid initialisation so the figure is reproducible.
kmeans = KMeans(n_clusters=3, random_state=0).fit(X)
y_pred = kmeans.predict(X)
fig, axes = plt.subplots(ncols=2)

# Left: k-means assignment with the cluster centres marked as red triangles.
axes[0].scatter(X[:, 0], X[:, 1], c=y_pred, alpha=.5)
axes[0].scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], c='r', marker='^')
axes[0].set_title('kmeans n_clusters = 3')
# Right: the labels returned by make_blobs.
axes[1].scatter(X[:, 0], X[:, 1], c=y, alpha=.5)
axes[1].set_title('true cluster')

In [None]:
from sklearn.datasets import make_moons

fig, axes = plt.subplots(ncols=3, nrows=1, figsize=(6, 2))

# Same sample size and seed in every panel; only the noise level changes.
for panel, noise_level in zip(axes, [1, .1, .01]):
    X, y = make_moons(n_samples=200, noise=noise_level, random_state=0)
    panel.scatter(X[:, 0], X[:, 1], c=y)

In [None]:
X, y = make_moons(
    n_samples=200,
    noise=.05,
    random_state=0
)

# random_state fixes the centroid initialisation so the panels are reproducible.
kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
y_pred = kmeans.predict(X)

fig, axes = plt.subplots(1, 3, figsize=(12, 4))
# Panel 0: k-means with 2 clusters, centres marked as red triangles.
axes[0].scatter(X[:, 0], X[:, 1], c=y_pred, alpha=.5)
axes[0].scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], c='r', marker='^', s=100)
axes[0].set_title('prediction (n_clusters=2)')
# Panel 1: the labels returned by make_moons.
axes[1].scatter(X[:, 0], X[:, 1], c=y, alpha=.5)
axes[1].set_title('true')

# Refit with 10 clusters; `kmeans` keeps this 10-cluster model after the cell finishes.
kmeans = KMeans(n_clusters=10, random_state=0).fit(X)
y_pred = kmeans.predict(X)

# Panel 2: k-means with 10 clusters, titled differently from panel 0 so the two are distinguishable.
axes[2].scatter(X[:, 0], X[:, 1], c=y_pred, alpha=.5)
axes[2].scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], c='r', marker='^', s=100)
axes[2].set_title('prediction (n_clusters=10)')

In [None]:
# Transform X with the most recently fitted k-means model and echo the result.
# NOTE(review): relies on whichever `kmeans` was fitted last in an earlier cell — confirm it is the intended model.
distance_features=kmeans.transform(X)
distance_features

次元圧縮では元の次元数以下の特徴量しか得られないが、クラスタリングでは元の次元数より多いクラスタに分け、各クラスタ中心からの距離を新たな特徴量とすることで、次元を増やすことができる。

In [None]:
# Call mglearn's packaged `plot_agglomerative_algorithm` helper; the figure content is defined inside mglearn.
mglearn.plots.plot_agglomerative_algorithm()

In [None]:
# Call mglearn's packaged `plot_agglomerative` helper; the figure content is defined inside mglearn.
mglearn.plots.plot_agglomerative()

In [None]:
from sklearn.cluster import AgglomerativeClustering
X, y = make_blobs(random_state=1)
# fit_predict fits the model and returns the labels in one pass,
# so a separate .fit() beforehand would only repeat the clustering.
agg = AgglomerativeClustering(n_clusters=3)
y_pred = agg.fit_predict(X)

plt.scatter(X[:, 0], X[:, 1], c=y_pred)


In [None]:
from scipy.cluster.hierarchy import dendrogram, ward

# Ward linkage on a 12-sample dataset, drawn as a dendrogram.
X, y = make_blobs(random_state=0, n_samples=12)
linkage_ar = ward(X)
dendrogram(linkage_ar)

ax = plt.gca()
bounds = ax.get_xbound()
cuts = ((7.25, 'two clusters'), (4, 'three clusters'))
# Dashed horizontal lines at the two cut heights.
for height, _caption in cuts:
    ax.plot(bounds, [height, height], '--', c='k')

# Captions placed just to the right of the plotted x-range.
for height, caption in cuts:
    ax.text(bounds[1] + 1, height, caption, va='center', fontdict={'size': 15})
ax.set_xlabel('sample index')
ax.set_ylabel('cluster distance')

In [None]:
from sklearn.cluster import DBSCAN

X, y = make_blobs(random_state=0, n_samples=12)

# DBSCAN constructed with no arguments, i.e. with its default parameters.
dbscan = DBSCAN()
y_pred = dbscan.fit_predict(X)

print(f'cluster membership {y_pred}')

In [None]:
# Call mglearn's packaged `plot_dbscan` helper; the figure content is defined inside mglearn.
mglearn.plots.plot_dbscan()

In [None]:
X, y = make_moons(
    n_samples=200,
    noise=.05,
    random_state=0
)

from sklearn.preprocessing import StandardScaler
# Standardise the two coordinates before clustering.
X_scaled = StandardScaler().fit_transform(X)

min_samples = [1, 5, 10]
eps = np.linspace(.1, .3, 5)
fig, axes = plt.subplots(
    len(min_samples), len(eps),
    figsize=(len(eps) * 3.5, len(min_samples) * 3),
)

# Each row of axes uses one min_samples value; each column uses one eps value.
for ax_row, m in zip(axes, min_samples):
    for ax, e in zip(ax_row, eps):
        # eps comes from linspace, so format it to two decimals; the raw float
        # can render with a long tail in the title.
        ax.set_title(f'min_samples={m} eps={e:.2f}')
        y_pred = DBSCAN(min_samples=m, eps=e).fit_predict(X_scaled)
        # Points labelled -1 are drawn in gray; the rest are coloured by cluster label.
        unassigned = y_pred == -1
        ax.scatter(X_scaled[unassigned, 0], X_scaled[unassigned, 1], c='gray')
        ax.scatter(X_scaled[~unassigned, 0], X_scaled[~unassigned, 1], c=y_pred[~unassigned])