# Standardization

trainの平均と分散だけを使って、trainとtestの両方を標準化する（testの統計量は使わない）。

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then reuse that fitted scaler
# for the test split so that no test-set statistics are used.
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)

# Range Scaling

各特徴量の最小値・最大値が指定した範囲（ここでは[-1, 1]）に収まるようにスケーリングする。最小値・最大値はtrainから求めるため、testの値は範囲外になることがある。

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale each feature into [-1, 1] using the minimum and maximum observed on
# the training split; the same fitted scaler is then applied to the test split.
# feature_range is given as a tuple (and by keyword): recent scikit-learn
# releases validate the parameter type and reject a list such as [-1, 1].
mmscaler = MinMaxScaler(feature_range=(-1, 1))
mmscaler.fit(X_train)
X_train_mms = mmscaler.transform(X_train)
X_test_mms = mmscaler.transform(X_test)

# Normalization

In [None]:
from sklearn.preprocessing import Normalizer
# Normalizer rescales each sample (row) so that its norm becomes 1. Nothing is
# aggregated per column, so calling normalizer.fit(X_train) first is not needed.
normalizer = Normalizer()
X_train_norm, X_test_norm = (
    normalizer.transform(split) for split in (X_train, X_test)
)

# Try each norm in turn: normalize both splits, refit `clf` (defined outside
# this cell) on the normalized training data and print its test-split score.
for norm in ('l2', 'l1', 'max'):
    normalizer = Normalizer(norm=norm)
    X_train_norm = normalizer.fit_transform(X_train)
    X_test_norm = normalizer.transform(X_test)
    clf.fit(X_train_norm, y_train)
    print(norm, clf.score(X_test_norm, y_test))

# PCA Whitening

PCAで主成分に変換（無相関化）したあと、各主成分を標準化（平均0, 分散1）する。

In [None]:
from sklearn.decomposition import PCA
# PCA whitening: project X onto the principal components with whiten=True so
# that the projected components are additionally rescaled.
pca = PCA(whiten=True).fit(X)
X_new = pca.transform(X)

# ZCA Whitening

PCA Whiteningをしてから、元の座標系に座標変換して戻す。

In [None]:
from sklearn.decomposition import PCA
# ZCA whitening: first apply PCA whitening ...
pca = PCA(whiten=True).fit(X)
X_new = pca.transform(X)
# ... then map the whitened data back into the original coordinate system.
# pca.components_ is the diagonalizing matrix found by PCA.
X_new2 = X_new @ pca.components_

# 2値化

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Binarizer

# Binarize the features, then classify. LogisticRegression is expected to be
# imported outside this cell.
pipe = Pipeline(steps=[
    ('bin', Binarizer()),
    ('clf', LogisticRegression()),
])

# Candidate binarization thresholds; the 'bin__' prefix routes the parameter
# to the pipeline step named 'bin'.
param = {
    'bin__threshold': [0.001, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4],
}

# Grid-search the threshold on the training split.
gs1 = GridSearchCV(estimator=pipe, param_grid=param, n_jobs=-1, verbose=2)
gs1.fit(X_train, y_train)