In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [2]:
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data', header=None)

In [3]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [4]:
data.columns = ['Sample code', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
                'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
                'Normal Nucleoli', 'Mitoses', 'Class']
data.head()

Unnamed: 0,Sample code,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [5]:
# Drop the 'Sample code' column as it's not required for prediction
data.drop(['Sample code'], axis=1, inplace=True)

# Replace '?' with 0 to handle missing data, ensuring numerical analysis accuracy and preventing calculation errors.
data = data.replace('?', np.nan)

# Convert the 'Bare Nuclei' column to an integer type to ensure a uniform data type across the column,
# which is essential for consistent data manipulation and analysis.
data['Bare Nuclei'] = pd.to_numeric(data['Bare Nuclei']).fillna(0).astype('int64')

In [6]:
data.head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


In [7]:
values = data.values
X = values[:, 0:9]
y = values[:, 9]

In [8]:
# Import required libraries
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Create an instance of SimpleImputer to fill in missing values
imputer = SimpleImputer()

# Impute the missing values in the dataset and obtaining imputed data
imputeData = imputer.fit_transform(values)

# Create an instance of MinMaxScaler to scale features to a range of (0, 1)
scaler = MinMaxScaler(feature_range=(0, 1))

# Scale the imputed data features to a range of (0, 1) to obtain normalized data
normalizedData = scaler.fit_transform(imputeData)

In [9]:
# Import libraries
from sklearn.ensemble import VotingClassifier
from sklearn import model_selection

# Set up KFold cross-validation with 10 splits
kfold = model_selection.KFold(n_splits=10, random_state=7, shuffle=True)

# Initialize list for base models
estimators = []

# Add Logistic Regression model to estimators
model1 = LogisticRegression()
estimators.append(('logistic', model1))

# Add Decision Tree model to estimators
model2 = DecisionTreeClassifier()
estimators.append(('cart', model2))

# Add Support Vector Machine model to estimators
model3 = SVC()
estimators.append(('svm', model3))

# Create Voting Classifier ensemble with estimators
ensemble = VotingClassifier(estimators)

# Evaluate ensemble model using cross-validation
results = model_selection.cross_val_score(ensemble, X, y, cv=kfold)

# Print mean accuracy score of ensemble model
print(results.mean())

0.9642028985507245


In [11]:
# Initialize classifiers: Logistic Regression, Decision Tree, and Support Vector Classifier (SVC)
LogRefg_clf = LogisticRegression()
Dtree_clf = DecisionTreeClassifier()
svc_slf = SVC()

# Train classifiers on the normalized dataset
LogRefg_clf.fit(X, y)
Dtree_clf.fit(X, y)
svc_slf.fit(X, y)

# Generate predictions for each classifier
LogRefg_pred = LogRefg_clf.predict(X)
Dtree_pred = Dtree_clf.predict(X)
svc_pred = svc_slf.predict(X)

# Average the predictions from all classifiers
avg_preds = np.mean([LogRefg_pred, Dtree_pred, svc_pred], axis=0)

# Evaluate the accuracy of the averaged predictions using R-squared score
acc = r2_score(y, avg_preds)

# Print the R-squared score
print(acc)

0.9521523008812143


In [12]:
y

array([2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 4, 2, 4, 4, 2, 2, 4, 2, 4, 4,
       2, 4, 2, 4, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 4, 4, 2, 4, 4, 4,
       4, 2, 4, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 2, 4,
       2, 4, 4, 2, 2, 4, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 2, 4,
       2, 4, 4, 4, 2, 2, 2, 4, 2, 2, 2, 2, 4, 4, 4, 2, 4, 2, 4, 2, 2, 2,
       4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 2, 4, 2, 4, 4, 2,
       2, 4, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 4, 4, 4,
       2, 4, 2, 4, 2, 2, 2, 4, 4, 2, 4, 4, 4, 2, 4, 4, 2, 2, 2, 2, 2, 2,
       2, 2, 4, 4, 2, 2, 2, 4, 4, 2, 2, 2, 4, 4, 2, 4, 4, 4, 2, 2, 4, 2,
       2, 4, 4, 4, 4, 2, 4, 4, 2, 4, 4, 4, 2, 4, 2, 2, 4, 4, 4, 4, 2, 2,
       2, 2, 2, 2, 4, 4, 2, 2, 2, 4, 2, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4,
       4, 2, 4, 4, 4, 2, 4, 2, 4, 4, 2, 2, 2, 2, 2, 4, 2, 2, 4, 4, 4, 4,
       4, 2, 4, 4, 2, 2, 4, 4, 2, 4, 2, 2, 2, 4, 4,

In [13]:
avg_preds

array([2.        , 3.33333333, 2.        , 3.33333333, 2.        ,
       4.        , 2.        , 2.        , 2.        , 2.        ,
       2.        , 2.        , 2.66666667, 2.        , 4.        ,
       4.        , 2.        , 2.        , 4.        , 2.        ,
       4.        , 4.        , 2.        , 4.        , 2.        ,
       4.        , 2.        , 2.        , 2.        , 2.        ,
       2.        , 2.        , 4.        , 2.        , 2.        ,
       2.        , 4.        , 2.        , 4.        , 4.        ,
       3.33333333, 4.        , 4.        , 3.33333333, 4.        ,
       2.        , 4.        , 2.        , 2.        , 4.        ,
       4.        , 2.66666667, 4.        , 4.        , 4.        ,
       4.        , 4.        , 4.        , 4.        , 4.        ,
       4.        , 2.        , 4.        , 3.33333333, 2.        ,
       4.        , 2.        , 4.        , 4.        , 2.        ,
       2.        , 4.        , 2.        , 4.        , 4.     