In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
import seaborn as sns
sns.set()

- データセットの準備

In [None]:
# Load the "forge" dataset bundled with mglearn: X holds the features
# (indexed as two columns in the cells below), y the per-sample label.
X,y=mglearn.datasets.make_forge()

In [None]:
# Scatter the first feature column against the second; y is passed as the per-point label.
mglearn.discrete_scatter(X[:,0],X[:,1],y)

In [None]:
# Put the forge arrays into a DataFrame so they can be plotted by column name.
df = pd.DataFrame({
    'feature1': X[:, 0],
    'feature2': X[:, 1],
    'class': y,
})
df.head()

In [None]:
# Same scatter drawn with seaborn: the 'class' column drives both colour and marker style.
sns.scatterplot(x='feature1', y='feature2', hue='class', style='class', data=df)

In [None]:
# Load the single-feature "wave" dataset (40 samples) and plot feature vs. target.
X,y = mglearn.datasets.make_wave(n_samples=40)
df=pd.DataFrame(X,columns=['feature'])
# The column name is also used as the y-axis label, so it is spelled
# 'response' (the original 'responce' was a typo shown on the plot).
df['response']=y
sns.scatterplot(
    data=df,
    x='feature',
    y='response'
    )

In [None]:
# Load scikit-learn's breast cancer dataset; the last line displays the keys it exposes.
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
cancer.keys()

In [None]:
# Tabular view of the cancer data: one column per feature, plus the numeric
# target and its human-readable name.
class_df=pd.DataFrame(cancer['data'],columns=cancer['feature_names'])
class_df['target']=cancer['target']
# Vectorised lookup: index the array of names with the whole label array at
# once instead of calling a Python lambda once per row.
class_df['target_name']=cancer['target_names'][cancer['target']]
class_df.head()

In [None]:
# Count how many samples fall into each target class.
class_df['target_name'].value_counts()

In [None]:
# Load mglearn's "extended Boston" regression data as features X and target y.
# NOTE(review): presumably depends on scikit-learn's Boston housing loader, which
# newer scikit-learn releases no longer ship - confirm against the installed versions.
X,y = mglearn.datasets.load_extended_boston()

In [None]:
# Wrap the feature array in a DataFrame (columns get default integer names).
reg_df = pd.DataFrame(X)

In [None]:
# Display the frame to inspect its size and values.
reg_df

- k最近傍法

In [None]:
# mglearn's built-in illustration of k-nearest-neighbours classification with 3 neighbours.
mglearn.plots.plot_knn_classification(n_neighbors=3)

In [None]:
# Reload the forge data (X, y were overwritten by later datasets above) and
# split it into train and test parts; random_state=0 fixes the shuffle.
from sklearn.model_selection import train_test_split
X,y=mglearn.datasets.make_forge()
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=0)

In [None]:
# Create (but do not yet fit) a k-NN classifier that uses 3 neighbours.
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=3)

In [None]:
# Fit the classifier on the training split.
clf.fit(X_train,y_train)

In [None]:
# Predict labels for the held-out samples.
clf.predict(X_test)

In [None]:
# Score the fitted classifier on the held-out split (stored as "accuracy" in the sweep further down).
clf.score(X_test,y_test)

In [None]:
# Decision regions of k-NN for k = 1, 3 and 9, one panel per k.
fig,axes = plt.subplots(nrows=1,ncols=3,figsize=(10,3),tight_layout=True)
axes = axes.ravel()
for n,ax in zip((1,3,9),axes):
    clf = KNeighborsClassifier(n_neighbors=n)
    clf.fit(X_train,y_train)
    # Shade the predicted regions first, then overlay the actual samples.
    mglearn.plots.plot_2d_separator(clf,X,fill=True,eps=0.5,ax=ax,alpha=.4)
    mglearn.discrete_scatter(X[:,0],X[:,1],y,ax=ax)
    ax.set(title=f'{n} neighbor(s)',xlabel='feature 1',ylabel='feature 2')

In [None]:
# Sweep k from 1 to 20 on the breast cancer data and record train/test accuracy.
X_train,X_test,y_train,y_test=train_test_split(cancer['data'],cancer['target'],random_state=3)
neighbor_counts=range(1,21)
training_accuracy=[]
test_accuracy=[]
for n in neighbor_counts:
    clf=KNeighborsClassifier(n_neighbors=n).fit(X_train,y_train)
    training_accuracy.append(clf.score(X_train,y_train))
    test_accuracy.append(clf.score(X_test,y_test))

# Index the frame by k itself: with the default 0-based index the x-axis
# showed 0..19 and every point was read one neighbour too low.
accuracy_df=pd.DataFrame(
    {
        'training_accuracy':training_accuracy,
        'test_accuracy':test_accuracy
    },
    index=pd.Index(neighbor_counts,name='n_neighbors'),
)
ax=sns.lineplot(data=accuracy_df)
ax.set(xlabel='n_neighbors',ylabel='accuracy')

In [None]:
# mglearn's built-in illustrations of k-NN regression, first with 1 neighbour, then with 10.
mglearn.plots.plot_knn_regression(n_neighbors=1)
mglearn.plots.plot_knn_regression(n_neighbors=10)

In [None]:
# Fit a 3-neighbour k-NN regressor on a train split of the 40-sample wave data.
# X, y and the split variables are rebound here, replacing the cancer split above.
from sklearn.neighbors import KNeighborsRegressor
X,y = mglearn.datasets.make_wave(n_samples=40)
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)
reg=KNeighborsRegressor(n_neighbors=3).fit(X_train,y_train)

In [None]:
# Score the regressor on the held-out split (scikit-learn regressors report R^2 here).
reg.score(X_test,y_test)

In [None]:
# Scratch check: reshape(-1,1) turns the 1-D range 1..10 into a single-column 2-D array,
# the same reshape applied to the query points in the next cell.
np.arange(1,11).reshape(-1,1)

In [None]:
# Plot k-NN regression predictions for several k over a dense grid of query points.
# 1000 evenly spaced points in [-3, 3], reshaped to one column for predict().
test_sampls=np.linspace(-3,3,1000).reshape(-1,1)
# Dedicated list name: the original called the list `n` and then used `n` as
# the loop variable, so the list was overwritten by the loop it was driving.
neighbor_counts=[1,3,7,9]
fig,axes=plt.subplots(2,2,figsize=(8,8),tight_layout=True)
axes=axes.ravel()
for n,ax in zip(neighbor_counts,axes):
    reg=KNeighborsRegressor(n_neighbors=n).fit(X_train,y_train)
    preds=reg.predict(test_sampls)
    # Label each artist directly so the legend cannot be mismatched by draw order.
    ax.plot(test_sampls,preds,label='predict')
    ax.scatter(X_train,y_train,marker='^',c='b',label='train')
    ax.scatter(X_test,y_test,marker='v',c='r',label='test')
    ax.set_title(f'{n} neighbor(s)')
    ax.set_ylabel('response')  # was misspelled 'responce'
    ax.set_xlabel('feature')

# One legend on the first panel is enough; all panels share the same styling.
axes[0].legend(loc='best')


- 線型モデル

In [None]:
# mglearn's built-in illustration of a linear regression fit on the wave data.
mglearn.plots.plot_linear_regression_wave()

In [None]:
# Fit ordinary least squares on the 60-sample wave data and report parameters and scores.
from sklearn.linear_model import LinearRegression
X,y = mglearn.datasets.make_wave(n_samples=60)
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)
ln=LinearRegression().fit(X_train,y_train)
print(f'coef:{ln.coef_}')
print(f'intercept:{ln.intercept_}')
# Fixed two-decimal formatting (same style as the logistic-regression cells);
# np.round inside an f-string drops trailing zeros, e.g. 0.7 instead of 0.70.
print(f'train_score:{ln.score(X_train,y_train):.2f}')
print(f'test_score:{ln.score(X_test,y_test):.2f}')

In [None]:
# Same linear model on the extended Boston data. The split variables are
# rebound here and reused by the Ridge and Lasso cells that follow.
X,y = mglearn.datasets.load_extended_boston()
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)
ln=LinearRegression().fit(X_train,y_train)
# Only the scores are printed; fixed two-decimal formatting keeps trailing zeros.
print(f'train_score:{ln.score(X_train,y_train):.2f}')
print(f'test_score:{ln.score(X_test,y_test):.2f}')

- リッジ回帰  
係数（重み）の二乗和にペナルティを課し、各係数をできるだけ0に近づけるように制約をかける＝L2正則化

In [None]:
# Ridge regression with the default alpha on the current train/test split.
from sklearn.linear_model import Ridge
ridge=Ridge().fit(X_train,y_train)
# The stale commented-out prints were removed: they referenced `ln`, not this model.
print(f'train_score:{ridge.score(X_train,y_train):.2f}')
print(f'test_score:{ridge.score(X_test,y_test):.2f}')

In [None]:
# Ridge with a larger alpha (10) for comparison with the default model.
ridge10=Ridge(alpha=10).fit(X_train,y_train)
# The stale commented-out prints were removed: they referenced `ln`, not this model.
print(f'train_score:{ridge10.score(X_train,y_train):.2f}')
print(f'test_score:{ridge10.score(X_test,y_test):.2f}')

In [None]:
# Ridge with a smaller alpha (0.1) for comparison with the default model.
ridge01=Ridge(alpha=.1).fit(X_train,y_train)
# The stale commented-out prints were removed: they referenced `ln`, not this model.
print(f'train_score:{ridge01.score(X_train,y_train):.2f}')
print(f'test_score:{ridge01.score(X_test,y_test):.2f}')

In [None]:
# Compare the coefficients of the three Ridge models and the unregularised linear model.
plt.plot(ridge.coef_,'s',label='alpha=1')
plt.plot(ridge10.coef_,'^',label='alpha=10')
plt.plot(ridge01.coef_,'v',label='alpha=.1')
plt.plot(ln.coef_,'o',label='linear')
plt.xlabel('coefficient_no')
# The original called plt.xlabel twice, so 'weight' overwrote the x label and
# the y-axis stayed unlabelled; the weights are on the y-axis.
plt.ylabel('weight')
# Clip the y-range to +/-25 so the other series stay readable.
plt.ylim(-25,25)
plt.legend()

In [None]:
# mglearn's built-in Ridge plot; judging by its name it varies the number of samples - TODO confirm.
mglearn.plots.plot_ridge_n_samples()

- Lasso回帰  
L1正則化。係数の絶対値の和にペナルティを課すため、リッジ回帰と違って影響の小さい特徴量の係数がちょうど0になりやすく、結果として変数が削除（自動選択）される。

In [None]:
from sklearn.linear_model import Lasso
lasso = Lasso().fit(X_train,y_train)
print(f'train_score:{np.round(lasso.score(X_train,y_train),2)}')
print(f'test_score:{np.round(lasso.score(X_test,y_test),2)}')
print(f'used:{np.sum(lasso.coef_!=0)}')

In [None]:
# Lasso with a smaller alpha (0.01); max_iter is raised to 100000 for the solver.
lasso001 = Lasso(alpha=0.01,max_iter=100000).fit(X_train,y_train)
print(f'train_score:{lasso001.score(X_train,y_train):.2f}')
print(f'test_score:{lasso001.score(X_test,y_test):.2f}')
# Number of features with a non-zero coefficient.
print(f'used:{np.sum(lasso001.coef_!=0)}')

In [None]:
# Lasso with a very small alpha (0.0001); max_iter is raised to 100000 for the solver.
lasso00001 = Lasso(alpha=0.0001,max_iter=100000).fit(X_train,y_train)
print(f'train_score:{lasso00001.score(X_train,y_train):.2f}')
print(f'test_score:{lasso00001.score(X_test,y_test):.2f}')
# Number of features with a non-zero coefficient.
print(f'used:{np.sum(lasso00001.coef_!=0)}')

In [None]:
# Compare Lasso coefficients for alpha=1 and alpha=0.01.
# The commented-out lines are optional extra series that can be re-enabled.
# plt.plot(ridge.coef_,'o',label='ridge alpha=1')
plt.plot(lasso.coef_,'s',label='lasso alpha=1')
plt.plot(lasso001.coef_,'^',label='lasso alpha=.01')
# plt.plot(lasso00001.coef_,'v',label='lasso alpha=.0001')

plt.xlabel('coefficient_no')
# The original called plt.xlabel twice, so 'weight' replaced the x label and
# the y-axis stayed unlabelled; the weights are on the y-axis.
plt.ylabel('weight')
# plt.ylim(-25,25)
plt.legend()

- 線形モデルによる分類：線形SVMとロジスティック回帰

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# Fit both linear classifiers on the forge data and draw each decision boundary in its own panel.
X,y = mglearn.datasets.make_forge()
fig,axes = plt.subplots(nrows=1,ncols=2,figsize=(10,3))

for model,ax in zip((LinearSVC(),LogisticRegression()),axes):
    clf = model.fit(X,y)
    # Boundary line only (fill=False), with the samples drawn on top.
    mglearn.plots.plot_2d_separator(clf,X,fill=False,eps=.5,ax=ax,alpha=.7)
    mglearn.discrete_scatter(X[:,0],X[:,1],y,ax=ax)
    ax.set(title=type(clf).__name__,xlabel='feature0',ylabel='feature1')

In [None]:
# mglearn's built-in figure on LinearSVC regularisation (per its name) - TODO confirm what it varies.
mglearn.plots.plot_linear_svc_regularization()

In [None]:
# Logistic regression on the breast cancer data with the default C.
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
# stratify passes the target so the split is stratified on it; random_state fixes the shuffle.
X_train,X_test,y_train,y_test=train_test_split(
    cancer.data,
    cancer.target,
    stratify=cancer.target,
    random_state=0
    )
# max_iter is raised to 10000 for the solver.
logreg=LogisticRegression(max_iter=10000).fit(X_train,y_train)
print(f'training set score:{logreg.score(X_train,y_train):.2f}')
print(f'test set score:{logreg.score(X_test,y_test):.2f}')

In [None]:
# Same model with C=100, for comparison with the default-C model above.
logreg100=LogisticRegression(C=100,max_iter=10000).fit(X_train,y_train)
print(f'training set score:{logreg100.score(X_train,y_train):.2f}')
print(f'test set score:{logreg100.score(X_test,y_test):.2f}')

In [None]:
# Same model with C=0.01, for comparison with the default-C model above.
logreg001=LogisticRegression(C=.01,max_iter=10000).fit(X_train,y_train)
print(f'training set score:{logreg001.score(X_train,y_train):.2f}')
print(f'test set score:{logreg001.score(X_test,y_test):.2f}')

In [None]:
# Compare the logistic-regression coefficients for the three values of C.
plt.plot(logreg.coef_.T,'o',label='C=1')
plt.plot(logreg100.coef_.T,'^',label='C=100')
# Distinct marker: the original reused '^' here, so C=0.01 and C=100 could
# not be told apart by marker shape.
plt.plot(logreg001.coef_.T,'v',label='C=0.01')
plt.xlabel('feature')
plt.ylabel('coefficient value')
plt.legend()
# One tick per feature, labelled with its name and rotated to stay readable.
plt.xticks(range(cancer.data.shape[1]),cancer.feature_names,rotation=90)

In [None]:
# L1-penalised logistic regression on the breast cancer data for three values of C.
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
cancer = load_breast_cancer()
X_train,X_test,y_train,y_test=train_test_split(
    cancer.data,
    cancer.target,
    stratify=cancer.target,
    random_state=0
    )
for C,marker in zip([0.001,1,100],['o','^','v']):
    # The 'liblinear' solver is selected explicitly together with the L1 penalty.
    lr_l1=LogisticRegression(C=C,penalty='l1',solver='liblinear').fit(X_train,y_train)
    # Label each series with its own C; the original hard-coded 'C=1' for all three.
    plt.plot(lr_l1.coef_.T,marker,label=f'C={C}')
# The labels were never shown without a legend call.
plt.legend()


- 多クラス分類

In [None]:
from sklearn.datasets import make_blobs
X,y = make_blobs(random_state=42)
# One scatter call per class, each with its own colour and marker.
for class_id,(color,marker) in enumerate([('r','o'),('g','^'),('b','v')]):
    members = X[y==class_id]
    plt.scatter(x=members[:,0],y=members[:,1],c=color,marker=marker)
plt.legend(['class0','class1','class2'])

In [None]:

# Fit a linear SVM on the three-class blobs and print the shapes of its
# coefficient and intercept arrays.
from sklearn.svm import LinearSVC
linear_svc = LinearSVC().fit(X,y)
print(f'coeficient shape:{linear_svc.coef_.shape}')
print(f'intercept shape:{linear_svc.intercept_.shape}')

分類境界は `coef1*x1 + coef2*x2 + intercept = 0` を満たす直線。  
x2について解くと、`x2 = -(coef1*x1 + intercept) / coef2`

In [None]:
# Plot the three blobs together with the boundary line learned for each class.
class_styles=[('r','o'),('g','^'),('b','v')]  # (colour, marker) for class 0, 1, 2
for class_id,(color,marker) in enumerate(class_styles):
    members=X[y==class_id]
    # Label each artist directly: a positional label list passed to plt.legend
    # relies on matplotlib's handle ordering, which differs between versions
    # when lines and scatter collections are mixed.
    plt.scatter(x=members[:,0],y=members[:,1],c=color,marker=marker,label=f'class{class_id}')

line=np.linspace(-15,15)
for class_id,(coef,intercept) in enumerate(zip(linear_svc.coef_,linear_svc.intercept_)):
    # Boundary: coef[0]*x1 + coef[1]*x2 + intercept = 0, solved for x2.
    plt.plot(
        line,
        -((line*coef[0])+intercept)/coef[1],
        c=class_styles[class_id][0],
        label=f'class{class_id} line',
    )

plt.xlim(-15,15)
plt.ylim(-15,15)

plt.legend()

In [None]:
# Shade the region predicted for each class, overlay the samples, then draw
# each class's boundary line on top.
mglearn.plots.plot_2d_classification(linear_svc,X,fill=True,alpha=.7)
mglearn.discrete_scatter(X[:,0],X[:,1],y)
line=np.linspace(-15,15)
# NOTE(review): the colour order b, r, g presumably matches the marker colours that
# mglearn.discrete_scatter assigns to classes 0, 1, 2 - confirm against the rendered plot.
for coef,intercept,color in zip(linear_svc.coef_,linear_svc.intercept_,['b','r','g']):
    # Boundary: coef[0]*x1 + coef[1]*x2 + intercept = 0, solved for x2.
    plt.plot(line,-((line*coef[0])+intercept)/coef[1],c=color)
# loc=(1.01,0.3) gives the legend position in axes coordinates, just right of the plot area.
plt.legend(['class0','class1','class2','class0 line','class1 line','class2 line'],loc=(1.01,0.3))

- ナイーブベイズ  
3種類ある。  
Gaussianは連続値  
Bernoulliは2値  
Multinomialは頻度（カウント）  

In [None]:
# Toy data: four samples with four 0/1 features each, and one class label per sample.
X = np.array([[0, 1, 0, 1], [1, 0, 1, 1], [0, 0, 0, 1], [1, 0, 1, 0]])
y = np.array([0, 1, 0, 1])

In [None]:
# For every class, sum each feature column over that class's rows, i.e. count
# how often each 0/1 feature is set within the class.
counts = {label: X[y == label].sum(axis=0) for label in np.unique(y)}
print(counts)

- 決定木

In [None]:
# Fit a decision tree with no depth limit on the breast cancer data and report its accuracy.
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
# stratify passes the target so the split is stratified on it; random_state fixes the shuffle.
X_train,X_test,y_train,y_test=train_test_split(
    cancer.data,cancer.target,stratify=cancer.target,random_state=42
)
# random_state=0 makes the fitted tree reproducible across runs.
tree=DecisionTreeClassifier(random_state=0).fit(X_train,y_train)
print(f'accuracy on training : {tree.score(X_train,y_train):.3f}')
print(f'accuracy on test : {tree.score(X_test,y_test):.3f}')

In [None]:
# Refit with the depth capped at 4. This rebinds `tree`, so the export and
# importance cells below use this depth-limited model.
tree=DecisionTreeClassifier(max_depth=4,random_state=0).fit(X_train,y_train)
print(f'accuracy on training : {tree.score(X_train,y_train):.3f}')
print(f'accuracy on test : {tree.score(X_test,y_test):.3f}')

In [None]:
# Show the feature names, used as labels in the tree export and importance plot below.
cancer.feature_names

In [None]:
# Export the fitted tree to 'tree.dot' in the current working directory,
# with class and feature names passed in from the dataset.
from sklearn.tree import export_graphviz
export_graphviz(
    tree,
    out_file='tree.dot',
    class_names=cancer.target_names,
    feature_names=cancer.feature_names,
    impurity=False,
    filled=True,
)

In [None]:
# Print the fitted tree's feature importance array.
print(f'feature importances:{tree.feature_importances_}')

In [None]:
def feature_importances_plot(model,feature_names=None):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model
        Fitted estimator exposing a ``feature_importances_`` array.
    feature_names : sequence of str, optional
        Tick labels, one per importance value. When omitted, the
        notebook-level ``cancer.feature_names`` is used, which is what this
        function always used before the parameter existed.

    Returns
    -------
    None
        The bars are drawn on the current matplotlib axes.
    """
    importances=model.feature_importances_
    if feature_names is None:
        feature_names=cancer.feature_names
    # Take the bar count from the model rather than from the global dataset,
    # so models fitted on other data can be plotted too.
    n_features=len(importances)
    plt.barh(range(n_features),importances,align='center')
    plt.yticks(np.arange(n_features),feature_names)
    plt.xlabel('feature importance')
    plt.ylabel('feature')

In [None]:
# Plot the importances of the tree currently bound to `tree`.
feature_importances_plot(tree)