In [None]:
# This data can be found on https://www.kaggle.com/uciml/breast-cancer-wisconsin-data which has a compilation of data to predict 
# whether breast cancer is malignant or benign. To compare, I will use "Breast Cancer Analysis and Prediction" which was 
# who used two logistic regression models to create an ensemle with an accuracy of 97.1% (Lugat, 2018).

In [40]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.model_selection import KFold
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

In [24]:
data = pd.read_csv("C:/Users/Tara/Documents/A_GCU/DSC-540/cancer.csv")

data.head

<bound method NDFrame.head of            id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0      842302         M        17.99         10.38          122.80     1001.0   
1      842517         M        20.57         17.77          132.90     1326.0   
2    84300903         M        19.69         21.25          130.00     1203.0   
3    84348301         M        11.42         20.38           77.58      386.1   
4    84358402         M        20.29         14.34          135.10     1297.0   
..        ...       ...          ...           ...             ...        ...   
564    926424         M        21.56         22.39          142.00     1479.0   
565    926682         M        20.13         28.25          131.20     1261.0   
566    926954         M        16.60         28.08          108.30      858.1   
567    927241         M        20.60         29.33          140.10     1265.0   
568     92751         B         7.76         24.54           47.92      181.0  

In [25]:
# See if there are any missing values in the data:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [26]:
# Drop unneeded variables
data = data.drop(['Unnamed: 32','id'], axis = 1)

# Assign target variable
data.diagnosis.replace(to_replace = dict(M = 1, B = 0), inplace = True)

In [27]:
# Define X and Y
y = np.array(data.diagnosis.tolist())
data = data.drop('diagnosis', 1)
X = np.array(data)

# Normalize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [37]:
# Implement a 10-fold cross-validation for a decision tree classifier with 100 trees.
kfold = model_selection.KFold(n_splits=10)
cart = DecisionTreeClassifier()
num_trees = 100

#  Now use this model to create a bagging based Ensemble
model = BaggingClassifier(base_estimator=cart, n_estimators=num_trees)
results = model_selection.cross_val_score(model, X, y, cv=kfold)
print(results.mean())

# We can see that we have an accuracy of 96.67%. Let's see if we can make it better.

0.9666666666666666


In [42]:
# Now lets try a voting based ensemble which involves logistic regression, decision tree, and support vector machine:

kfold = model_selection.KFold(n_splits=10)

# create the models for the ensemble:
estimators = []
model1 = LogisticRegression()
estimators.append(('logistic', model1))

model2 = DecisionTreeClassifier()
estimators.append(('cart', model2))

model3 = SVC()
estimators.append(('svm', model3))

# create the ensemble model:
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, X, y, cv=kfold)
print(results.mean())

# We can see that we have an accuracy of 97.3% which is even higher than our comparison article!

0.9736215538847116


In [43]:
# Whereas Lugat used two logistic regression models to create their ensemble, I first implemented a decision tree 
# with bagging ensemble which resulted in a 96.6% accuracy. However, when I implemented a logistic regression, decison tree 
# classifier and support vector machine, I found an accuracy of 97.4%, which prevails Lugat's ensemble. 

In [None]:
# Resources:

# Brownlee, J. (2016, June 3). Ensemble Machine Learning Algorithms in Python with scikit-learn. 
# Machine Learning Mastery. https://machinelearningmastery.com/ensemble-machine-learning-algorithms-python-scikit-learn/

# Lugat, V. (2018). Breast Cancer Analysis and Prediction. Kaggle.
# https://www.kaggle.com/vincentlugat/breast-cancer-analysis-and-prediction

# Paul, S. (2018, Sept 6). Ensemble Learning in Python.
# https://www.datacamp.com/community/tutorials/ensemble-learning-python 

# Samanta, S. (2021). Breast_cancer. Kaggle.
# https://www.kaggle.com/sudipsamanta35/breast-cancer 

# Singh, R.K. (2021). Breast Cancer- Classification. Kaggle.
# https://www.kaggle.com/sudipsamanta35/breast-cancer