In [218]:
import pandas as pd

In [219]:
# Load the breast-cancer table from the project's data folder.
df = pd.read_csv(filepath_or_buffer="./data/breast_cancer.csv")

In [220]:
# Columns flagged as highly correlated with other features; they are removed
# from `df` in place before modelling.
to_drop = [
    'worst radius',
    'worst texture',
    'worst perimeter',
    'worst area',
    'worst concave points',
    'mean concave points',
    'radius error',
    'area error',
    'mean radius',
    'mean perimeter',
]
df.drop(columns=to_drop, inplace=True)

In [221]:
## Split the frame into the dependent feature (y) and the independent features (X)
y = df['target']
X = df.drop(columns='target')

In [222]:
# Show the target column taken from df['target'] as the cell output.
y

0      0
1      0
2      0
3      0
4      0
      ..
564    0
565    0
566    0
567    0
568    1
Name: target, Length: 569, dtype: int64

In [223]:
#rectifying target imbalance
from imblearn.combine import SMOTETomek

# A fixed seed keeps the resampled data -- and every score computed from it --
# identical across Restart & Run All; 30 matches the seed used for the
# train/test split in this notebook.
# NOTE(review): resampling is applied to the full dataset before the
# train/test split, so synthetic points derived from rows that later land in
# the test split can influence training. Consider resampling only the
# training split.
resampler = SMOTETomek(sampling_strategy="auto", random_state=30)
X , y = resampler.fit_resample(X, y)

In [224]:
# Shapes of the feature rows for target == 1 and target == 0 after resampling.
X[y==1].shape , X[y==0].shape

((339, 20), (339, 20))

In [225]:
# All features in X are numerical in nature, so every column of X is kept.
# This column index is later passed to the ColumnTransformer as its column selection.
numerical_cols = X.columns
# Echo the selected column names as the cell output.
numerical_cols

Index(['mean texture', 'mean area', 'mean smoothness', 'mean compactness',
       'mean concavity', 'mean symmetry', 'mean fractal dimension',
       'texture error', 'perimeter error', 'smoothness error',
       'compactness error', 'concavity error', 'concave points error',
       'symmetry error', 'fractal dimension error', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst symmetry',
       'worst fractal dimension'],
      dtype='object')

In [226]:
# All features are numerical, so only a numerical pipeline is created
from sklearn.impute import SimpleImputer ## Handling missing values
from sklearn.preprocessing import StandardScaler # Handling feature scaling
## Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [227]:
# Numeric preprocessing: fill missing values with the column median, then
# standardise with StandardScaler.
numerical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Apply the numeric pipeline to the columns listed in `numerical_cols`.
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical_pipeline", numerical_pipeline, numerical_cols),
    ]
)

In [228]:
## Train test split: 30% of the rows are held out, with a fixed seed

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=30,
)

In [229]:
# Fit the preprocessor on the training split only, then reuse the fitted
# transform on the test split. Both names are overwritten with the transformed
# frames, so this cell is meant to run once per fresh split.
X_train = pd.DataFrame(
    data=preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
)
X_test = pd.DataFrame(
    data=preprocessor.transform(X_test),
    columns=preprocessor.get_feature_names_out(),
)

In [230]:
# Shapes of the transformed training and test feature frames.
X_train.shape , X_test.shape

((474, 20), (204, 20))

In [231]:
# Preview the first rows of the transformed training features.
X_train.head()

Unnamed: 0,numerical_pipeline__mean texture,numerical_pipeline__mean area,numerical_pipeline__mean smoothness,numerical_pipeline__mean compactness,numerical_pipeline__mean concavity,numerical_pipeline__mean symmetry,numerical_pipeline__mean fractal dimension,numerical_pipeline__texture error,numerical_pipeline__perimeter error,numerical_pipeline__smoothness error,numerical_pipeline__compactness error,numerical_pipeline__concavity error,numerical_pipeline__concave points error,numerical_pipeline__symmetry error,numerical_pipeline__fractal dimension error,numerical_pipeline__worst smoothness,numerical_pipeline__worst compactness,numerical_pipeline__worst concavity,numerical_pipeline__worst symmetry,numerical_pipeline__worst fractal dimension
0,0.223076,0.729335,0.21951,0.3363,0.166768,1.014736,-0.214057,0.167426,0.951461,0.421676,0.079397,0.15401,0.693709,0.978883,0.121362,-0.132586,-0.232513,-0.23322,0.450012,-0.442349
1,0.501031,-0.266558,0.082267,-0.014229,0.344514,0.175139,-0.14312,-0.351551,-0.241577,-0.378874,0.256528,0.545656,-0.114941,-0.034891,0.132563,0.489808,0.999763,1.043518,0.989288,0.986451
2,0.256602,-1.409219,-0.223954,0.365634,0.3469,1.400644,3.013059,1.470488,-0.867819,2.097317,1.71055,1.611877,-0.345823,0.293467,3.394618,1.32675,1.046224,1.137778,0.562858,3.668844
3,0.000191,-0.446572,-0.485488,-0.875253,-0.530316,-0.868541,-0.813956,-0.104233,-0.026777,-0.331633,-0.903807,-0.421128,-0.474885,-0.628505,-0.773143,0.285121,-0.737369,-0.185237,-0.131478,-0.587182
4,-1.162047,-0.913649,-1.029068,-1.211074,-1.217655,-0.232151,-0.208135,-0.395798,-0.741153,-0.477514,-1.114373,-1.166404,-1.554885,-0.418169,-0.705025,-0.783799,-1.102992,-1.436624,-0.24086,-0.560472


In [232]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import  accuracy_score , roc_auc_score , f1_score

In [233]:
# Scoring helper shared by every model in the comparison.
def evaluate_model(true, predicted):
    """Score one set of predictions against the reference labels.

    Parameters
    ----------
    true
        Reference labels.
    predicted
        Predictions to score; the same values are passed unchanged to all
        three metrics.

    Returns
    -------
    tuple
        ``(accuracy, roc_auc, f1)`` as returned by ``accuracy_score``,
        ``roc_auc_score`` and ``f1_score`` respectively.
    """
    return (
        accuracy_score(true, predicted),
        roc_auc_score(true, predicted),
        f1_score(true, predicted),
    )

In [234]:
# Candidate classifiers, each constructed with its default arguments.
models={
    'LogisticRegression':LogisticRegression(),
    'RidgeClassifier':RidgeClassifier(),
    'BernoulliNB':BernoulliNB(),
    'DecisionTreeClassifier':DecisionTreeClassifier(),
    'KNeighborsClassifier':KNeighborsClassifier(),
    'AdaBoostClassifier':AdaBoostClassifier(),
    'GradientBoostingClassifier':GradientBoostingClassifier(),
    'BaggingClassifier':BaggingClassifier(),
    'RandomForestClassifier':RandomForestClassifier(),
    'SVC':SVC(),
    'XGBClassifier':XGBClassifier()
}

# Per-model results, appended in the same order as `models`.
model_list=[]
roc=[]
acc = []
f1 = []
performance = []  # (model name, accuracy) pairs used to pick the best model

for name, estimator in models.items():
    estimator.fit(X_train, y_train)

    # Make predictions on the held-out test split
    y_pred = estimator.predict(X_test)

    accuracy, roc_score, f_1_score = evaluate_model(y_test, y_pred)

    print(name)
    model_list.append(name)
    roc.append(roc_score)
    acc.append(accuracy)
    f1.append(f_1_score)
    performance.append((name, accuracy))

    # These scores come from X_test / y_test, so label them as test performance.
    print('Model Test Performance')
    print("ROC:", roc_score)
    print("Accuracy:", accuracy)

    print('='*35)
    print('\n')

# Highest test accuracy wins. The sort is stable, so when several models tie
# the one listed last in `models` is selected.
best = sorted(performance, key=lambda x: x[1])[-1]
print("Best Model", best)
model = models[best[0]]
print(model)

# Summary table. The F1 column uses the `f1` list filled in the loop above
# (the previous `f_1` name was never defined and raised NameError).
metrics = pd.DataFrame({"models": model_list, "accuracy": acc, "roc_auc_score": roc, "f1_score": f1})
print(metrics.sort_values('accuracy', ascending=False))


LogisticRegression
Model Training Performance
ROC: 0.9607843137254901
Accuracy: 0.9607843137254902


RidgeClassifier
Model Training Performance
ROC: 0.9558823529411765
Accuracy: 0.9558823529411765


BernoulliNB
Model Training Performance
ROC: 0.8529411764705883
Accuracy: 0.8529411764705882


DecisionTreeClassifier
Model Training Performance
ROC: 0.9117647058823529
Accuracy: 0.9117647058823529


KNeighborsClassifier
Model Training Performance
ROC: 0.9313725490196079
Accuracy: 0.9313725490196079


AdaBoostClassifier
Model Training Performance
ROC: 0.9509803921568627
Accuracy: 0.9509803921568627


GradientBoostingClassifier
Model Training Performance
ROC: 0.9509803921568627
Accuracy: 0.9509803921568627


BaggingClassifier
Model Training Performance
ROC: 0.9313725490196079
Accuracy: 0.9313725490196079


RandomForestClassifier
Model Training Performance
ROC: 0.9460784313725491
Accuracy: 0.946078431372549


SVC
Model Training Performance
ROC: 0.9705882352941176
Accuracy: 0.9705882352941176



In [235]:
# Display the estimator that the comparison cell selected as the best model.
model