In [None]:
import sklearn as sk
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from sklearn import datasets
# Load the Iris dataset bundled with scikit-learn and keep handles to its
# feature matrix and its target (class label) vector.
iris = datasets.load_iris()
X_iris = iris.data
y_iris = iris.target

In [None]:
# Show the dimensions of the feature matrix and of the label vector
print(X_iris.shape, y_iris.shape)

In [None]:
# Show the feature row and the label of the first sample
print(X_iris[0], y_iris[0])

In [None]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the splitter
# now lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
# Keep only the first two attributes of every sample
# (labelled 'Sepal length' and 'Sepal width' in the plots below)
X, y = X_iris[:, :2], y_iris
# Split the dataset into a training and a testing set.
# 25% of the samples are drawn at random for the test set; the fixed
# random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

In [None]:
# Dimensions of the training features and training labels
print(X_train.shape, y_train.shape)

In [None]:
# Print the training features; in notebook order this cell comes before the
# standardization cell, so the values shown are the raw measurements
print(X_train)

In [None]:
# Standardize the features.
# TODO (original author's note): understand how standardization works in
# sklearn.preprocessing.
# The scaler is fitted on the training split only and then applied to both
# splits, so no statistics are learned from the test data.
# NOTE: X_train / X_test are overwritten here; re-running this cell without
# re-running the split would refit the scaler on already-scaled data.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Print the training features again; in notebook order this cell follows the
# standardization cell, so the values shown are the scaled features
print(X_train)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline 
# 在Jupyter内显示图像
colors = ['red', 'greenyellow', 'blue']
for i in range(len(colors)):
    xs = X_train[:, 0][y_train == i]
    ys = X_train[:, 1][y_train == i]
    plt.scatter(xs, ys, c = colors[i])
plt.legend(iris.target_names)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')

In [None]:
from sklearn.linear_model import SGDClassifier
# SGD training is stochastic: without a seed the learned coefficients (and
# every metric computed below) change on each run. Fix the seed, using the
# same value as the train/test split, so Restart & Run All is reproducible.
clf = SGDClassifier(random_state=33)
clf.fit(X_train, y_train)

In [None]:
# Print the fitted coefficients of the classifier
print(clf.coef_)

In [None]:
# Print the fitted intercept term(s) of the classifier
print(clf.intercept_)

In [None]:
# Plot, for each of the three classes, the training points together with the
# linear boundary the classifier learned for "class i versus the rest".
# Pad the plotting window by 0.5 around the training data on both axes.
x_min, x_max = X_train[:, 0].min() - .5, X_train[:, 0].max() + .5
y_min, y_max = X_train[:, 1].min() - .5, X_train[:, 1].max() + .5
xs = np.arange(x_min, x_max, 0.5)
fig, axes = plt.subplots(1, 3)
fig.set_size_inches(10, 6)
for i, ax in enumerate(axes):
    ax.set_aspect('equal')
    ax.set_title('Class ' + str(i) + ' versus the rest')
    ax.set_xlabel('Sepal length')
    ax.set_ylabel('Sepal width')
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train,
               cmap=plt.cm.prism)
    # Boundary of classifier i: coef[i, 0] * x + coef[i, 1] * y + intercept[i] = 0,
    # solved for y.
    ys = (-clf.intercept_[i] -
          xs * clf.coef_[i, 0]) / clf.coef_[i, 1]
    # The `hold` keyword was removed in matplotlib 3.0; successive draw calls
    # on the same Axes always overlay, so it is simply dropped.
    ax.plot(xs, ys)


In [None]:
# Classify a new sample: the raw measurements are passed through the fitted
# scaler first, then handed to the classifier
print(clf.predict(scaler.transform([[4.7, 3.1]])))

In [None]:
# Decision-function output for the raw sample [4.7, 3.1] after scaling
print(clf.decision_function(scaler.transform([[4.7, 3.1]])))

In [None]:
from sklearn import metrics
# Accuracy of the classifier on the training split
y_train_pred = clf.predict(X_train)
train_accuracy = metrics.accuracy_score(y_train, y_train_pred)
print(train_accuracy)

In [None]:
# Accuracy of the classifier on the test split
y_pred = clf.predict(X_test)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)

In [None]:
# Classification report for the test-set predictions, with rows labelled by
# the dataset's target names
print(metrics.classification_report(y_test, y_pred, target_names=iris.target_names))

In [None]:
# Confusion matrix of true test labels versus predicted labels
print(metrics.confusion_matrix(y_test, y_pred))

In [None]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; use
# `sklearn.model_selection` instead.
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Create a composite estimator: a pipeline of the standardization step and
# the linear model. Scaling inside the pipeline means the scaler is refitted
# on each training fold. The classifier is seeded so the scores are
# reproducible across runs.
clf = Pipeline([
        ('scaler', StandardScaler()),
        ('linear_model', SGDClassifier(random_state=33))
    ])
# Create a k-fold cross-validation iterator with k=5 folds.
# The model_selection KFold takes the number of splits, not the sample count;
# the samples themselves are supplied later through cross_val_score.
cv = KFold(n_splits=5, shuffle=True, random_state=33)
# By default the score used is the one returned by the estimator's score
# method (accuracy for classifiers)
scores = cross_val_score(clf, X, y, cv=cv)
print(scores)

In [None]:
from scipy.stats import sem
def mean_score(scores):
    """Summarize a collection of scores as a one-line string.

    Args:
        scores: Sequence or array of numeric scores.

    Returns:
        A string of the form ``"Mean score: M (+/- S)"`` where ``M`` is the
        arithmetic mean and ``S`` the standard error of the mean
        (``scipy.stats.sem``), both formatted with three decimals.
    """
    average = np.mean(scores)
    std_error = sem(scores)
    return "Mean score: {0:.3f} (+/- {1:.3f})".format(average, std_error)
# Report the mean and standard error of the cross-validation scores
print(mean_score(scores))