In [6]:
import pandas as pd

In [7]:
# Load the dataset from a CSV path that is relative to the notebook's working directory.
df = pd.read_csv('./data/breast_cancer.csv')

In [8]:
#highly correlated columns to remove
to_drop = ['worst radius', 'worst texture', 'worst perimeter', 'worst area','worst concave points','mean concave points','radius error','area error','mean radius','mean perimeter']
# Rebind `df` instead of mutating it in place, and ignore labels that are
# already gone, so re-running this cell is a no-op rather than a KeyError.
df = df.drop(columns=to_drop, errors='ignore')

In [9]:
## Separate the label column (dependent) from the feature columns (independent)
target_column = 'target'
y = df[target_column]
X = df.drop(columns=[target_column])

In [10]:
# Display the target Series (pandas abbreviates the repr of a long Series).
y

0      0
1      0
2      0
3      0
4      0
      ..
564    0
565    0
566    0
567    0
568    1
Name: target, Length: 569, dtype: int64

In [11]:
#rectifying target imbalance
from imblearn.combine import SMOTETomek

# SMOTETomek is stochastic: fix the seed so the resampled data (and every
# metric computed downstream) is reproducible across kernel restarts. The seed
# matches the one used for the train/test split.
# NOTE(review): resampling happens before the train/test split, so synthetic
# rows derived from the full dataset can land in the test set. Consider
# splitting first and resampling the training portion only.
resampler = SMOTETomek(sampling_strategy="auto", random_state=30)
X , y = resampler.fit_resample(X, y)

In [12]:
# Shapes of the feature rows per class after resampling: (class 1, class 0).
X[y==1].shape , X[y==0].shape

((338, 20), (338, 20))

In [13]:
#All features in X are numerical in nature
# Keep the full column index; it is handed to the ColumnTransformer later.
numerical_cols = X.columns
# Bare name as the last expression so the notebook displays the index.
numerical_cols

Index(['mean texture', 'mean area', 'mean smoothness', 'mean compactness',
       'mean concavity', 'mean symmetry', 'mean fractal dimension',
       'texture error', 'perimeter error', 'smoothness error',
       'compactness error', 'concavity error', 'concave points error',
       'symmetry error', 'fractal dimension error', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst symmetry',
       'worst fractal dimension'],
      dtype='object')

In [14]:
# All features are numerical, so only a numerical preprocessing pipeline is needed
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [15]:
# Numerical preprocessing: fill missing values with the column median,
# then standardize each feature.
numerical_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_pipeline = Pipeline(steps=numerical_steps)

# Route every column listed in `numerical_cols` through the numerical pipeline.
column_transformers = [
    ('numerical_pipeline', numerical_pipeline, numerical_cols),
]
preprocessor = ColumnTransformer(column_transformers)

In [16]:
## Train test split

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split stable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=30,
)
X_train, X_test, y_train, y_test = split_parts

In [17]:
# Fit the preprocessor on the training rows only, then reuse the fitted
# transformer on the test rows.
train_matrix = preprocessor.fit_transform(X_train)
test_matrix = preprocessor.transform(X_test)

# Wrap the transformed arrays back into DataFrames labelled with the
# transformer-generated feature names.
feature_names = preprocessor.get_feature_names_out()
X_train = pd.DataFrame(train_matrix, columns=feature_names)
X_test = pd.DataFrame(test_matrix, columns=feature_names)

In [18]:
# (rows, columns) of the transformed train and test feature frames.
X_train.shape , X_test.shape

((473, 20), (203, 20))

In [19]:
# Preview the first rows of the transformed training features.
X_train.head()

Unnamed: 0,numerical_pipeline__mean texture,numerical_pipeline__mean area,numerical_pipeline__mean smoothness,numerical_pipeline__mean compactness,numerical_pipeline__mean concavity,numerical_pipeline__mean symmetry,numerical_pipeline__mean fractal dimension,numerical_pipeline__texture error,numerical_pipeline__perimeter error,numerical_pipeline__smoothness error,numerical_pipeline__compactness error,numerical_pipeline__concavity error,numerical_pipeline__concave points error,numerical_pipeline__symmetry error,numerical_pipeline__fractal dimension error,numerical_pipeline__worst smoothness,numerical_pipeline__worst compactness,numerical_pipeline__worst concavity,numerical_pipeline__worst symmetry,numerical_pipeline__worst fractal dimension
0,-0.745779,-0.082082,1.431951,1.018398,0.878597,1.277479,0.725524,-0.501486,-0.07212,-0.063419,-0.060916,-0.14962,0.035237,-0.544469,0.017941,2.390396,1.139527,0.843165,1.109273,1.196897
1,-1.578308,-0.911618,0.034678,-1.155442,-1.051957,0.022887,-0.237389,-0.823367,-0.842852,0.614984,-1.094812,-0.979942,-0.738256,0.468233,-0.9603,-0.013341,-1.193646,-1.152852,-0.108323,-0.95111
2,0.592297,-0.678155,-1.134959,-1.138819,-1.086639,-1.074402,-0.424744,-0.135957,-0.769731,-0.693461,-0.904431,-1.096598,-1.374082,-0.536739,-0.623422,-0.393821,-0.603076,-0.893676,0.295865,-0.158485
3,-0.196171,-0.87795,-1.174645,-1.234164,-0.828464,-1.181828,-0.873522,-0.513023,-0.64424,0.702703,-1.080464,-0.583308,-0.611643,0.204105,-0.732692,-0.241629,-1.18212,-0.786686,-0.878125,-1.02367
4,-0.094134,0.101156,0.907787,0.204426,0.588717,0.410391,-0.686168,-0.814906,-0.020243,-0.311901,-0.731132,-0.2402,-0.272225,-0.808596,-0.596645,0.904287,-0.21887,0.386706,-0.14522,-0.260402


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import  accuracy_score , roc_auc_score

In [21]:
# a function to evaluate the model
def evaluate_model(true, predicted, scores=None):
    """Compute accuracy and ROC AUC for one set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth class labels.
    predicted : array-like
        Hard class labels predicted by the model; always used for accuracy.
    scores : array-like, optional
        Continuous scores for the positive class, e.g.
        ``model.predict_proba(X)[:, 1]`` or ``model.decision_function(X)``.
        When supplied, ROC AUC is computed from these scores. When omitted,
        ROC AUC falls back to ``predicted``, which is the original behavior.

    Returns
    -------
    tuple
        ``(accuracy, roc_score)``.

    Notes
    -----
    With hard binary labels the ROC curve has a single operating point, so
    the resulting AUC reduces to the mean of sensitivity and specificity.
    Pass ``scores`` to obtain a threshold-independent AUC.
    """
    accuracy = accuracy_score(true, predicted)
    # Prefer continuous scores when the caller provides them.
    roc_input = predicted if scores is None else scores
    roc_score = roc_auc_score(true, roc_input)

    return accuracy , roc_score

In [29]:
# Candidate classifiers, each constructed with its library-default settings.
models={
    'LogisticRegression':LogisticRegression(),
    'RidgeClassifier':RidgeClassifier(),
    'BernoulliNB':BernoulliNB(),
    'DecisionTreeClassifier':DecisionTreeClassifier(),
    'KNeighborsClassifier':KNeighborsClassifier(),
    'AdaBoostClassifier':AdaBoostClassifier(),
    'GradientBoostingClassifier':GradientBoostingClassifier(),
    'BaggingClassifier':BaggingClassifier(),
    'RandomForestClassifier':RandomForestClassifier(),
    'SVC':SVC(),
    'XGBClassifier':XGBClassifier()
}
model_list=[]
roc=[]
acc = []
performance = []  # (model name, accuracy) pairs used to pick the best model

# Iterate name/estimator pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) on every iteration.
for name, model in models.items():
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    # Both metrics are computed on the held-out test split.
    accuracy , roc_score =evaluate_model(y_test,y_pred)

    print(name)
    model_list.append(name)
    roc.append(roc_score)
    acc.append(accuracy)
    performance.append((name , accuracy ))

    # NOTE(review): the label below says "Training", but the numbers are
    # test-set metrics; the printed text is kept unchanged here.
    print('Model Training Performance')
    print("ROC:",roc_score)
    print("Accuracy:",accuracy)

    print('='*35)
    print('\n')

# Sort once and take the last entry: highest accuracy, and on ties the model
# that appears latest in `models` (same tie-breaking as before).
best_entry = sorted(performance , key = lambda x: x[1])[-1]
print("Best Model" , best_entry)
model = models[best_entry[0]]
print(model)


LogisticRegression
Model Training Performance
ROC: 0.9901283796926669
Accuracy: 0.9901477832512315


RidgeClassifier
Model Training Performance
ROC: 0.9665434740322895
Accuracy: 0.9655172413793104


BernoulliNB
Model Training Performance
ROC: 0.9073137521882902
Accuracy: 0.9064039408866995


DecisionTreeClassifier
Model Training Performance
ROC: 0.9253063606302275
Accuracy: 0.9261083743842364


KNeighborsClassifier
Model Training Performance
ROC: 0.960075860727485
Accuracy: 0.9605911330049262


AdaBoostClassifier
Model Training Performance
ROC: 0.9798191013421514
Accuracy: 0.9802955665024631


GradientBoostingClassifier
Model Training Performance
ROC: 0.9755397782532581
Accuracy: 0.9753694581280788


BaggingClassifier
Model Training Performance
ROC: 0.9493289243337871
Accuracy: 0.9507389162561576


RandomForestClassifier
Model Training Performance
ROC: 0.9699474810348182
Accuracy: 0.9704433497536946


SVC
Model Training Performance
ROC: 0.9699474810348182
Accuracy: 0.9704433497536946

