## 数据加载

In [60]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
dataset = np.loadtxt("../data/pima-indians-diabetes.data", delimiter=",")
# separate the data from the target attributes
X = dataset[:,0:8]
y = dataset[:,8]
X,X_test,y,y_test=train_test_split(X,y,test_size=0.2)
scaler=StandardScaler()
X=scaler.fit_transform(X)
X_test=scaler.transform(X_test)

## logistic regression

In [61]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X, y)
print(model)

expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

expected_test = y_test
predicted_test = model.predict(X_test)
print(metrics.classification_report(expected_test, predicted_test))
print(metrics.confusion_matrix(expected_test, predicted_test))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
             precision    recall  f1-score   support

        0.0       0.79      0.88      0.83       389
        1.0       0.74      0.60      0.67       225

avg / total       0.78      0.78      0.77       614

[[342  47]
 [ 89 136]]
             precision    recall  f1-score   support

        0.0       0.81      0.88      0.84       111
        1.0       0.61      0.47      0.53        43

avg / total       0.75      0.77      0.76       154

[[98 13]
 [23 20]]


## naive bayes

In [62]:
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X, y)
print(model)
# make predictions
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

expected_test = y_test
predicted_test = model.predict(X_test)
print(metrics.classification_report(expected_test, predicted_test))
print(metrics.confusion_matrix(expected_test, predicted_test))

GaussianNB(priors=None)
             precision    recall  f1-score   support

        0.0       0.80      0.85      0.82       389
        1.0       0.70      0.63      0.66       225

avg / total       0.76      0.77      0.76       614

[[330  59]
 [ 84 141]]
             precision    recall  f1-score   support

        0.0       0.84      0.88      0.86       111
        1.0       0.65      0.56      0.60        43

avg / total       0.78      0.79      0.79       154

[[98 13]
 [19 24]]


## k-nearest neighbors

In [63]:
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
# fit a k-nearest neighbor model to the data
model = KNeighborsClassifier()
model.fit(X, y)
print(model)
# make predictions
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

expected_test = y_test
predicted_test = model.predict(X_test)
print(metrics.classification_report(expected_test, predicted_test))
print(metrics.confusion_matrix(expected_test, predicted_test))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
             precision    recall  f1-score   support

        0.0       0.83      0.90      0.86       389
        1.0       0.79      0.67      0.73       225

avg / total       0.81      0.82      0.81       614

[[350  39]
 [ 74 151]]
             precision    recall  f1-score   support

        0.0       0.81      0.82      0.82       111
        1.0       0.52      0.51      0.52        43

avg / total       0.73      0.73      0.73       154

[[91 20]
 [21 22]]


## decision tree

In [64]:
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
# fit a CART model to the data
model = DecisionTreeClassifier()
model.fit(X, y)
print(model)
# make predictions
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

expected_test = y_test
predicted_test = model.predict(X_test)
print(metrics.classification_report(expected_test, predicted_test))
print(metrics.confusion_matrix(expected_test, predicted_test))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00       389
        1.0       1.00      1.00      1.00       225

avg / total       1.00      1.00      1.00       614

[[389   0]
 [  0 225]]
             precision    recall  f1-score   support

        0.0       0.79      0.77      0.78       111
        1.0       0.44      0.47      0.45        43

avg / total       0.69      0.69      0.69       154

[[86 25]
 [23 20]]


## SVM

In [65]:
from sklearn import metrics
from sklearn.svm import SVC
# fit a SVM model to the data
model = SVC()
model.fit(X, y)
print(model)
# make predictions
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

expected_test = y_test
predicted_test = model.predict(X_test)
print(metrics.classification_report(expected_test, predicted_test))
print(metrics.confusion_matrix(expected_test, predicted_test))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
             precision    recall  f1-score   support

        0.0       0.82      0.93      0.87       389
        1.0       0.85      0.65      0.74       225

avg / total       0.83      0.83      0.82       614

[[363  26]
 [ 78 147]]
             precision    recall  f1-score   support

        0.0       0.81      0.88      0.84       111
        1.0       0.61      0.47      0.53        43

avg / total       0.75      0.77      0.76       154

[[98 13]
 [23 20]]


## Ridge

In [66]:
import numpy as np
from sklearn.linear_model import Ridge
rg=Ridge(alpha=1.0)
model.fit(X,y)
print(model)
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

expected_test = y_test
predicted_test = model.predict(X_test)
print(metrics.classification_report(expected_test, predicted_test))
print(metrics.confusion_matrix(expected_test, predicted_test))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
             precision    recall  f1-score   support

        0.0       0.82      0.93      0.87       389
        1.0       0.85      0.65      0.74       225

avg / total       0.83      0.83      0.82       614

[[363  26]
 [ 78 147]]
             precision    recall  f1-score   support

        0.0       0.81      0.88      0.84       111
        1.0       0.61      0.47      0.53        43

avg / total       0.75      0.77      0.76       154

[[98 13]
 [23 20]]


## gridsearch

In [67]:
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
# prepare a range of alpha values to test
alphas = np.array([1,0.1,0.01,0.001,0.0001,0])
# create and fit a ridge regression model, testing each alpha
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))
grid.fit(X, y)
print(grid)
# summarize the results of the grid search
print(grid.best_score_)
print(grid.best_estimator_.alpha)

GridSearchCV(cv=None, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': array([  1.00000e+00,   1.00000e-01,   1.00000e-02,   1.00000e-03,
         1.00000e-04,   0.00000e+00])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)
0.299566989927
1.0


In [68]:
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV
# prepare a uniform distribution to sample for the alpha parameter
param_grid = {'alpha': sp_rand()}
# create and fit a ridge regression model, testing random alpha values
model = Ridge()
rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100)
rsearch.fit(X, y)
print(rsearch)
# summarize the results of the random parameter search
print(rsearch.best_score_)
print(rsearch.best_estimator_.alpha)

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
          fit_params={}, iid=True, n_iter=100, n_jobs=1,
          param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000000071FDD68>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=0)
0.299566872643
0.998254285681


## feature selection

In [69]:
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(X, y)
# display the relative importance of each attribute
print(model.feature_importances_)

[ 0.10220783  0.24359043  0.0959296   0.08445285  0.08467504  0.13074808
  0.11363674  0.14475944]
