## Problem Statement
*****************
The dataset contains diagnosis data about breast cancer patients
 and whether they are Benign (healthy) or Malignant
 (possible disease). We need to predict whether new patients 
 are benign or malignant based on model built on this data.

## Techniques Used

1. Principal Component Analysis
2. Training and Testing
3. Confusion Matrix
4. Bagging
5. Boosting


In [4]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import os
import matplotlib.pylab as plt
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import sklearn.metrics

In [6]:
#os.chdir("C:\Personal\V2Maestros\Modules\Machine Learning Algorithms\Advanced Methods")

### Data Engineering and Analysis

In [9]:
#Load the dataset

cancer_data = pd.read_csv("breast_cancer.csv")
cancer_data.dtypes
cancer_data.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,points_worst,symmetry_worst,dimension_worst
0,87139402,B,12.32,12.39,78.85,464.1,0.1028,0.06981,0.03987,0.037,...,13.5,15.64,86.97,549.1,0.1385,0.1266,0.1242,0.09391,0.2827,0.06771
1,8910251,B,10.6,18.95,69.28,346.4,0.09688,0.1147,0.06387,0.02642,...,11.88,22.94,78.28,424.8,0.1213,0.2515,0.1916,0.07926,0.294,0.07587
2,905520,B,11.04,16.83,70.92,373.2,0.1077,0.07804,0.03046,0.0248,...,12.41,26.44,79.93,471.4,0.1369,0.1482,0.1067,0.07431,0.2998,0.07881
3,868871,B,11.28,13.39,73.0,384.8,0.1164,0.1136,0.04635,0.04796,...,11.92,15.77,76.53,434.0,0.1367,0.1822,0.08669,0.08611,0.2102,0.06784
4,9012568,B,15.19,13.21,97.65,711.8,0.07963,0.06934,0.03393,0.02657,...,16.2,15.73,104.5,819.1,0.1126,0.1737,0.1362,0.08178,0.2487,0.06766


### Principal Component Analysis

In this section, we first scale the data and discover the
 principal components of the data. Then we only pick the 
 top components that have the heaviest influence on the 
 target.

In [10]:
from sklearn.decomposition import PCA

predictors = cancer_data.iloc[0:,2:]
targets = cancer_data.diagnosis

#Do PCA
pca=PCA(n_components=4)
reduced_predictors=pca.fit_transform(predictors)
reduced_predictors

array([[ -3.83867453e+02,   8.91186935e+00,  -1.14565022e+00,
          8.06649530e+00],
       [ -5.50815536e+02,  -2.73989111e+01,   1.79447405e+01,
          2.90133273e-01],
       [ -4.97867645e+02,  -2.88272353e+01,   1.46491508e+00,
          1.82202175e+00],
       ..., 
       [  1.07058280e+02,  -3.82196566e-01,  -2.71023046e+01,
         -1.19375395e+00],
       [ -1.17871335e+02,   5.89600115e+01,  -8.53244289e+00,
          4.88865434e+00],
       [  9.39399721e+02,   2.85852964e+02,  -5.51258179e+01,
          1.11445421e+01]])

In [13]:
#Convert target to integer
targets[targets == 'B']=0
targets[targets == 'M']=1
targets=targets.astype('int64')

  result = getattr(x, name)(y)


TypeError: invalid type comparison

In [15]:
#Correlations
DataFrame(reduced_predictors).join(targets).corr()

Unnamed: 0,0,1,2,3,diagnosis
0,1.0,-6.477203000000001e-17,0.0,3.05636e-17,0.733037
1,-6.477203000000001e-17,1.0,-6.635643000000001e-17,-3.800068e-18,-0.038906
2,0.0,-6.635643000000001e-17,1.0,2.041112e-18,-0.096222
3,3.05636e-17,-3.800068e-18,2.041112e-18,1.0,-0.344393
diagnosis,0.7330368,-0.03890556,-0.09622202,-0.3443929,1.0


In [16]:
#Split as training and testing
pred_train, pred_test, tar_train, tar_test  =   train_test_split(DataFrame(reduced_predictors), targets, test_size=.3)

pred_train.shape
pred_test.shape
tar_train.shape
tar_test.shape

(171,)

In [17]:
#Build model on training data

#Using support vector machines
from sklearn import ensemble
from sklearn.tree import DecisionTreeClassifier
#classifier=ensemble.BaggingClassifier(DecisionTreeClassifier())
classifier=ensemble.AdaBoostClassifier(DecisionTreeClassifier())


classifier=classifier.fit(pred_train,tar_train)

predictions=classifier.predict(pred_test)

sklearn.metrics.confusion_matrix(tar_test,predictions)
sklearn.metrics.accuracy_score(tar_test, predictions)
sklearn.metrics.classification_report(tar_test, predictions)

'             precision    recall  f1-score   support\n\n          0       0.96      0.96      0.96       103\n          1       0.94      0.94      0.94        68\n\navg / total       0.95      0.95      0.95       171\n'