In [None]:
# We will use "Ensemble Decision Making System for Breast Cancer Data" by Lavanya and Usha Rani, which uses the 
# "Breast Cancer Wisconsin (Diagnostic) Data Set" as well as the original data set. I could not find a downloadable dataset for 
# the original, so I will just analyze the diagnostic data, as the code would be the same anways. The authors first implement 
# a CART model, then a CART model with feature selection, and then CART model with feature selection and boosting. 

In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.model_selection import KFold
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import std
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold

In [2]:
data = pd.read_csv("C:/Users/Tara/Documents/A_GCU/DSC-540/cancer.csv")

data.head

<bound method NDFrame.head of            id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0      842302         M        17.99         10.38          122.80     1001.0   
1      842517         M        20.57         17.77          132.90     1326.0   
2    84300903         M        19.69         21.25          130.00     1203.0   
3    84348301         M        11.42         20.38           77.58      386.1   
4    84358402         M        20.29         14.34          135.10     1297.0   
..        ...       ...          ...           ...             ...        ...   
564    926424         M        21.56         22.39          142.00     1479.0   
565    926682         M        20.13         28.25          131.20     1261.0   
566    926954         M        16.60         28.08          108.30      858.1   
567    927241         M        20.60         29.33          140.10     1265.0   
568     92751         B         7.76         24.54           47.92      181.0  

In [3]:
# See if there are any missing values in the data:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [4]:
# Drop unneeded variables
data = data.drop(['Unnamed: 32','id'], axis = 1)

# Assign target variable
data.diagnosis.replace(to_replace = dict(M = 1, B = 0), inplace = True)

In [5]:
# Define X and Y
y = np.array(data.diagnosis.tolist())
data = data.drop('diagnosis', 1)
X = np.array(data)

# Normalize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [8]:
# Implement a 10-fold cross-validation (the same amount as the author used) for a decision tree classifier with 100 trees.
kfold = model_selection.KFold(n_splits=10)
cart = DecisionTreeClassifier()
num_trees = 100
results = model_selection.cross_val_score(cart, X, y, cv=kfold)
print(results.mean())

0.9332393483709274


In [21]:
# CART model + Feature Selection (FS)
rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=5)
model = DecisionTreeClassifier()
pipeline = Pipeline(steps=[('s',rfe),('m',model)])

# evaluate model
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')

# report performance
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Accuracy: 0.929 (0.030)


In [24]:
kfold = model_selection.KFold(n_splits=10)

# create the models for the ensemble:
estimators = []

model1 = DecisionTreeClassifier()
estimators.append(('cart', model1))

model2 = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=5)
estimators.append(('rfe', model2))

model3 = AdaBoostClassifier(n_estimators=70)
estimators.append(('boost', model3))

# create the ensemble model:
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, X, y, cv=kfold)
print(results.mean())

0.9454260651629072


In [None]:
# The authors had an accuracy of 92.97% for their CART model, 94.72% for CART with FS, and 95.43% for CART with FS and Boosting.
# While my accuracies are 93.3%, 92.9%, and 94.5%, these inconsistencies can be explained by the article not listing the exact 
# number of trees used for their CART model or how they choose to implement feature selection. 

In [None]:
# Resources:

# Brownlee, J. (2020, Aug 28). Recursive Feature Elimination (RFE) for Feature Selection in Python. 
# Machine Learning Mastery. https://machinelearningmastery.com/rfe-feature-selection-in-python/ 

# Lavanya, D. & Usha Rani, K. (2012). Ensemble Decision Making System for Breast Cancer Data. 
# International Journal of Computer Applications. 51. 19-23. 
# http://doi.org/10.5120/8134-1823

# Lugat, V. (2018). Breast Cancer Analysis and Prediction. Kaggle.
# https://www.kaggle.com/vincentlugat/breast-cancer-analysis-and-prediction

# Paul, S. (2018, Sept 6). Ensemble Learning in Python.
# https://www.datacamp.com/community/tutorials/ensemble-learning-python 