In [1]:
import numpy as np
dataset = np.loadtxt(r'C:\Users\Zishi\Desktop\pima-indians-diabetes.data.txt',
                      delimiter=',')
X = dataset[:,0:7]
y = dataset[:,8]

In [2]:
from sklearn import preprocessing

normalized_X = preprocessing.normalize(X)
standardized_X = preprocessing.scale(X)

In [4]:
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(X,y)
print(model.feature_importances_)

[ 0.12319434  0.27680159  0.11091331  0.09056323  0.08668441  0.16689549
  0.14494764]


In [13]:
np.sort(model.feature_importances_)[-3:]

array([ 0.14494764,  0.16689549,  0.27680159])

In [21]:
from sklearn.feature_selection import SelectFromModel
selected = SelectFromModel(model, prefit=True)
X_new = selected.transform(X)
X_new.shape

(768, 3)

In [24]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
lrmodel = LogisticRegression()
rfe = RFE(lrmodel,3)
rfe = rfe.fit(X,y)
print(rfe.support_)
print(rfe.ranking_)

[ True False False False False  True  True]
[1 2 3 5 4 1 1]


In [25]:
from sklearn import metrics
lrmodel.fit(X,y)
print(lrmodel)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [26]:
expected = y
predicted = lrmodel.predict(X)
print(metrics.classification_report(expected,predicted))

             precision    recall  f1-score   support

        0.0       0.79      0.89      0.84       500
        1.0       0.74      0.55      0.63       268

avg / total       0.77      0.77      0.77       768



In [27]:
print(metrics.confusion_matrix(expected,predicted))

[[447  53]
 [120 148]]


In [28]:
from sklearn.naive_bayes import GaussianNB
bmodel = GaussianNB()
bmodel.fit(X,y)
print(bmodel)

GaussianNB(priors=None)


In [29]:
expected = y
predicted = bmodel.predict(X)
print(metrics.classification_report(expected,predicted))

             precision    recall  f1-score   support

        0.0       0.80      0.86      0.83       500
        1.0       0.69      0.60      0.64       268

avg / total       0.76      0.77      0.76       768



In [30]:
print(metrics.confusion_matrix(expected,predicted))

[[429  71]
 [108 160]]


In [39]:
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
# fit a k-nearest neighbor model to the data
knmodel = KNeighborsClassifier()
knmodel.fit(X,y)
print(knmodel)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')


In [40]:
expected = y
predicted = knmodel.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))

             precision    recall  f1-score   support

        0.0       0.82      0.90      0.86       500
        1.0       0.77      0.63      0.69       268

avg / total       0.80      0.80      0.80       768



In [33]:
print(metrics.confusion_matrix(expected, predicted))

[[448  52]
 [ 98 170]]


In [34]:
from sklearn.tree import DecisionTreeClassifier
# fit a CART model to the data
dartmodel = DecisionTreeClassifier()
dartmodel.fit(X, y)
print(dartmodel)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')


In [35]:
expected = y
predicted = dartmodel.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))

             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00       500
        1.0       1.00      1.00      1.00       268

avg / total       1.00      1.00      1.00       768



In [36]:
print(metrics.confusion_matrix(expected, predicted))

[[500   0]
 [  0 268]]


In [42]:
from sklearn.svm import SVC
# fit a SVM model to the data
svcmodel = SVC()
svcmodel.fit(X, y)
print(svcmodel)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [43]:
expected = y
predicted = svcmodel.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))

             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00       500
        1.0       1.00      1.00      1.00       268

avg / total       1.00      1.00      1.00       768



In [44]:
print(metrics.confusion_matrix(expected, predicted))

[[500   0]
 [  0 268]]


In [47]:
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
# prepare a range of alpha values to test
alphas = np.array([1,0.1,0.01,0.001,0.0001,0])
# create and fit a ridge regression model, testing each alpha
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))
grid.fit(X, y)
print(grid)
# summarize the results of the grid search
print(grid.best_score_)
print(grid.best_estimator_.alpha)

GridSearchCV(cv=None, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': array([  1.00000e+00,   1.00000e-01,   1.00000e-02,   1.00000e-03,
         1.00000e-04,   0.00000e+00])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)
0.282118955686
1.0


In [50]:
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV
# prepare a uniform distribution to sample for the alpha parameter
param_grid = {'alpha': sp_rand()}
# create and fit a ridge regression model, testing random alpha values
model = Ridge()
rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100)
rsearch.fit(X, y)
print(rsearch)
# summarize the results of the random parameter search
print(rsearch.best_score_)
print(rsearch.best_estimator_.alpha)

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
          fit_params={}, iid=True, n_iter=100, n_jobs=1,
          param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000021C9FF60898>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=0)
0.28211884204
0.99578299868


In [52]:
lrmodel = LogisticRegression(multi_class='multinomial',solver='lbfgs')
lrmodel.fit(X,y)
print(lrmodel)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)


In [53]:
expected = y
predicted = lrmodel.predict(X)
print(metrics.classification_report(expected,predicted))

             precision    recall  f1-score   support

        0.0       0.79      0.89      0.84       500
        1.0       0.73      0.56      0.64       268

avg / total       0.77      0.78      0.77       768



In [54]:
print(metrics.confusion_matrix(expected,predicted))

[[445  55]
 [117 151]]


In [56]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
ldamodel = LinearDiscriminantAnalysis()
ldamodel.fit(X,y)
print(ldamodel)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)


In [57]:
expected = y
predicted = ldamodel.predict(X)
print(metrics.classification_report(expected,predicted))

             precision    recall  f1-score   support

        0.0       0.79      0.89      0.84       500
        1.0       0.74      0.56      0.64       268

avg / total       0.77      0.78      0.77       768



In [58]:
from sklearn.ensemble import RandomForestClassifier
# fit a CART model to the data
rfmodel = RandomForestClassifier()
rfmodel.fit(X, y)
print(rfmodel)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)


In [59]:
expected = y
predicted = rfmodel.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))

             precision    recall  f1-score   support

        0.0       0.97      1.00      0.99       500
        1.0       0.99      0.95      0.97       268

avg / total       0.98      0.98      0.98       768



In [60]:
print(metrics.confusion_matrix(expected,predicted))

[[498   2]
 [ 13 255]]


In [64]:
from sklearn.tree import DecisionTreeClassifier
# fit a CART model to the data
treemodel = DecisionTreeClassifier(criterion='entropy')
treemodel.fit(X, y)
print(treemodel)
expected = y
predicted = treemodel.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00       500
        1.0       1.00      1.00      1.00       268

avg / total       1.00      1.00      1.00       768



In [65]:
len(y)

768

In [67]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=123)

In [68]:
X_train.shape

(537, 7)

In [69]:
lrmodel = LogisticRegression(multi_class='multinomial',solver='lbfgs')
lrmodel.fit(X_train,y_train)
print(lrmodel)
expected = y_test
predicted = lrmodel.predict(X_test)
print(metrics.classification_report(expected,predicted))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)
             precision    recall  f1-score   support

        0.0       0.81      0.88      0.84       143
        1.0       0.77      0.66      0.71        88

avg / total       0.79      0.80      0.79       231



In [70]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
ldamodel = LinearDiscriminantAnalysis()
ldamodel.fit(X_train,y_train)
print(ldamodel)
expected = y_test
predicted = lrmodel.predict(X_test)
print(metrics.classification_report(expected,predicted))

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)
             precision    recall  f1-score   support

        0.0       0.81      0.88      0.84       143
        1.0       0.77      0.66      0.71        88

avg / total       0.79      0.80      0.79       231



In [73]:
from sklearn.tree import DecisionTreeClassifier
# fit a CART model to the data
treemodel = DecisionTreeClassifier(criterion='entropy')
treemodel.fit(X_train, y_train)
print(treemodel)
expected = y_test
predicted = treemodel.predict(X_test)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected,predicted))

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
             precision    recall  f1-score   support

        0.0       0.80      0.79      0.80       143
        1.0       0.67      0.68      0.67        88

avg / total       0.75      0.75      0.75       231

[[113  30]
 [ 28  60]]


In [72]:
from sklearn.ensemble import RandomForestClassifier
# fit a CART model to the data
rfmodel = RandomForestClassifier()
rfmodel.fit(X_train, y_train)
print(rfmodel)
expected = y_test
predicted = rfmodel.predict(X_test)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected,predicted))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
             precision    recall  f1-score   support

        0.0       0.76      0.85      0.80       143
        1.0       0.69      0.57      0.62        88

avg / total       0.74      0.74      0.73       231

[[121  22]
 [ 38  50]]


In [74]:
from sklearn.svm import SVC
# fit a SVM model to the data
svcmodel = SVC()
svcmodel.fit(X_train, y_train)
print(svcmodel)
expected = y_test
predicted = svcmodel.predict(X_test)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected,predicted))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
             precision    recall  f1-score   support

        0.0       0.62      1.00      0.76       143
        1.0       0.00      0.00      0.00        88

avg / total       0.38      0.62      0.47       231

[[143   0]
 [ 88   0]]


  'precision', 'predicted', average, warn_for)


In [75]:
from sklearn.naive_bayes import GaussianNB
bmodel = GaussianNB()
bmodel.fit(X_train,y_train)
print(bmodel)
expected = y_test
predicted = bmodel.predict(X_test)
print(metrics.classification_report(expected,predicted))
print(metrics.confusion_matrix(expected,predicted))

GaussianNB(priors=None)
             precision    recall  f1-score   support

        0.0       0.78      0.87      0.82       143
        1.0       0.74      0.60      0.66        88

avg / total       0.76      0.77      0.76       231

[[124  19]
 [ 35  53]]
