In [1]:
from sklearn.datasets import load_breast_cancer

In [2]:
breast_cancer = load_breast_cancer()

In [4]:
data = breast_cancer.data

In [5]:
target = breast_cancer.target

In [6]:
print(breast_cancer.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, f

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
train_data, test_data, train_class, test_class = train_test_split(data, target, test_size=0.3)

In [9]:
import numpy as np

In [10]:
np.shape(train_data)

(398, 30)

In [12]:
np.shape(test_data)

(171, 30)

In [20]:
np.count_nonzero(train_class)/len(train_class)

0.6331658291457286

In [21]:
np.count_nonzero(test_class)/len(test_class)

0.6140350877192983

In [22]:
np.count_nonzero(train_class)/(len(train_class) - np.count_nonzero(train_class))

1.726027397260274

In [23]:
np.count_nonzero(test_class)/(len(test_class) - np.count_nonzero(test_class))

1.5909090909090908

In [25]:
from sklearn.naive_bayes import GaussianNB

In [26]:
gb = GaussianNB()

In [27]:
gb.fit(train_data, train_class)

GaussianNB(priors=None, var_smoothing=1e-09)

In [29]:
pred = gb.predict(test_data)

In [30]:
from sklearn.metrics import accuracy_score, classification_report

In [32]:
accuracy_score(pred, test_class)

0.8947368421052632

In [37]:
def accuracy(pred, original):
    ctr = 0
    for i in range(len(pred)):
        if(pred[i] == original[i]):
            ctr = ctr + 1
    return ctr/len(pred)

In [38]:
accuracy(pred, test_class)

0.8947368421052632

In [40]:
print(classification_report(pred, test_class))

              precision    recall  f1-score   support

           0       0.79      0.93      0.85        56
           1       0.96      0.88      0.92       115

   micro avg       0.89      0.89      0.89       171
   macro avg       0.87      0.90      0.89       171
weighted avg       0.90      0.89      0.90       171



In [41]:
# 100 object => 90 are ball, 10 are chocolate

In [42]:
# you build a model to predict how many are ball and chocolate

In [43]:
# your model says, 92 are ball, 8 are chocolate

'''                Actual Ball   |    Actual Chocolate   Total
Pred Ball              88                  4               92
Pred Chocolate         2                   6               8
Total                  90                  10
'''

In [45]:
# Precision(ball) = Pred Ball/ (Pred ball + Pred Chocolate) = 88/90 
# Recall(ball) = Pred Ball /(Actual Ball + Actual Chocolate) = 88/92

In [46]:
# Precision(chocolate) = 6/10 
# Recall(chocolate) = 6/8

In [47]:
from sklearn.tree import DecisionTreeClassifier

In [72]:
tree = DecisionTreeClassifier(min_samples_split=5)

In [73]:
tree.fit(train_data, train_class)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [74]:
pred = tree.predict(test_data)

In [75]:
print(accuracy_score(pred, test_class))
print(classification_report(pred, test_class))

0.9181286549707602
              precision    recall  f1-score   support

           0       0.80      0.98      0.88        54
           1       0.99      0.89      0.94       117

   micro avg       0.92      0.92      0.92       171
   macro avg       0.90      0.94      0.91       171
weighted avg       0.93      0.92      0.92       171



In [76]:
from sklearn.ensemble import RandomForestClassifier

In [93]:
rf = RandomForestClassifier(n_estimators=100, min_samples_leaf=2)

In [94]:
rf.fit(train_data, train_class)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [95]:
pred = rf.predict(test_data)

In [96]:
print(accuracy_score(pred, test_class))
print(classification_report(pred, test_class))

0.9239766081871345
              precision    recall  f1-score   support

           0       0.85      0.95      0.90        59
           1       0.97      0.91      0.94       112

   micro avg       0.92      0.92      0.92       171
   macro avg       0.91      0.93      0.92       171
weighted avg       0.93      0.92      0.92       171



In [97]:
from sklearn.ensemble import AdaBoostClassifier

In [122]:
ada = AdaBoostClassifier(n_estimators=100)

In [123]:
ada.fit(train_data, train_class)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=100, random_state=None)

In [124]:
pred = ada.predict(test_data)

In [125]:
print(accuracy_score(pred, test_class))
print(classification_report(pred, test_class))

0.9473684210526315
              precision    recall  f1-score   support

           0       0.86      1.00      0.93        57
           1       1.00      0.92      0.96       114

   micro avg       0.95      0.95      0.95       171
   macro avg       0.93      0.96      0.94       171
weighted avg       0.95      0.95      0.95       171



In [126]:
from sklearn.ensemble import BaggingClassifier

In [143]:
bag = BaggingClassifier(n_estimators=10)

In [144]:
bag.fit(train_data, train_class)

BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=None, oob_score=False, random_state=None,
         verbose=0, warm_start=False)

In [145]:
pred = bag.predict(test_data)

In [146]:
print(accuracy_score(pred, test_class))
print(classification_report(pred, test_class))

0.935672514619883
              precision    recall  f1-score   support

           0       0.88      0.95      0.91        61
           1       0.97      0.93      0.95       110

   micro avg       0.94      0.94      0.94       171
   macro avg       0.93      0.94      0.93       171
weighted avg       0.94      0.94      0.94       171



In [147]:
from sklearn.svm import SVC

In [160]:
svm = SVC(C= 0.1, degree=4, kernel='linear')

In [161]:
svm.fit(train_data, train_class)

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=4, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [162]:
pred = svm.predict(test_data)

In [163]:
print(accuracy_score(pred, test_class))
print(classification_report(pred, test_class))

0.9415204678362573
              precision    recall  f1-score   support

           0       0.86      0.98      0.92        58
           1       0.99      0.92      0.95       113

   micro avg       0.94      0.94      0.94       171
   macro avg       0.93      0.95      0.94       171
weighted avg       0.95      0.94      0.94       171



In [164]:
from sklearn.datasets import load_wine

In [165]:
from sklearn.datasets import load_iris