In [106]:
#Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
from sklearn.tree import plot_tree
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_score,recall_score,f1_score
import warnings
warnings.filterwarnings("ignore")

In [89]:
df=pd.read_csv('./datasets/BreastCancer.csv')
df.drop(['id','Unnamed: 32'],axis=1,inplace=True)

In [90]:
df.shape

(569, 31)

In [91]:
df.tail(5).T

Unnamed: 0,564,565,566,567,568
diagnosis,M,M,M,M,B
radius_mean,21.56,20.13,16.6,20.6,7.76
texture_mean,22.39,28.25,28.08,29.33,24.54
perimeter_mean,142.0,131.2,108.3,140.1,47.92
area_mean,1479.0,1261.0,858.1,1265.0,181.0
smoothness_mean,0.111,0.0978,0.08455,0.1178,0.05263
compactness_mean,0.1159,0.1034,0.1023,0.277,0.04362
concavity_mean,0.2439,0.144,0.09251,0.3514,0.0
concave points_mean,0.1389,0.09791,0.05302,0.152,0.0
symmetry_mean,0.1726,0.1752,0.159,0.2397,0.1587


In [92]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
radius_mean,569.0,14.127292,3.524049,6.981,11.7,13.37,15.78,28.11
texture_mean,569.0,19.289649,4.301036,9.71,16.17,18.84,21.8,39.28
perimeter_mean,569.0,91.969033,24.298981,43.79,75.17,86.24,104.1,188.5
area_mean,569.0,654.889104,351.914129,143.5,420.3,551.1,782.7,2501.0
smoothness_mean,569.0,0.09636,0.014064,0.05263,0.08637,0.09587,0.1053,0.1634
compactness_mean,569.0,0.104341,0.052813,0.01938,0.06492,0.09263,0.1304,0.3454
concavity_mean,569.0,0.088799,0.07972,0.0,0.02956,0.06154,0.1307,0.4268
concave points_mean,569.0,0.048919,0.038803,0.0,0.02031,0.0335,0.074,0.2012
symmetry_mean,569.0,0.181162,0.027414,0.106,0.1619,0.1792,0.1957,0.304
fractal_dimension_mean,569.0,0.062798,0.00706,0.04996,0.0577,0.06154,0.06612,0.09744


In [93]:
df.isnull().sum()

diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64

In [94]:
def find_outlier_z_score(features,df):
    summary_outliers_df=pd.DataFrame()
    summary_outliers_data = {'Feature_Name':[],'Count_Outliers':[],'Percentage_Outliers':[]}
    feature_outliers_df = pd.DataFrame()
    for feature in features:
        z_scores = st.zscore(df[feature])
        threshold = 3
        outliers_mask = (np.absolute(z_scores)>threshold)
        outliers_indices = np.column_stack(np.where(outliers_mask))
        feature_outliers_df = df.iloc[outliers_indices[:, 0]]
        summary_outliers_data['Feature_Name'].append(feature)
        summary_outliers_data['Count_Outliers'].append(feature_outliers_df.shape[0])
        summary_outliers_data['Percentage_Outliers'].append(np.round((feature_outliers_df.shape[0]/df[feature].shape[0])*100,2))
        summary_outliers_df=pd.DataFrame(summary_outliers_data)
        summary_outliers_df
    summary_outliers_df=summary_outliers_df[summary_outliers_df['Count_Outliers'] !=0]
    summary_outliers_df.sort_values(by='Count_Outliers',ascending=False,inplace=True)
    return summary_outliers_df
def remove_outlier(feature,df):
    z_scores = st.zscore(df[feature])
    threshold = 3
    outliers_mask = (np.absolute(z_scores)>threshold)
    outliers_indices = np.column_stack(np.where(outliers_mask))
    feature_outliers_df = df.iloc[outliers_indices[:, 0]]
    return df[~df[feature].isin(feature_outliers_df[feature])]

In [95]:
l=df.columns.to_list()
print(l)

['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst']


In [96]:
find_outlier_z_score(df.columns.to_list()[1:],df.drop('diagnosis',axis=1))

Unnamed: 0,Feature_Name,Count_Outliers,Percentage_Outliers
15,compactness_se,12,2.11
18,symmetry_se,11,1.93
25,compactness_worst,10,1.76
23,area_worst,10,1.76
19,fractal_dimension_se,10,1.76
29,fractal_dimension_worst,9,1.58
28,symmetry_worst,9,1.58
5,compactness_mean,9,1.58
6,concavity_mean,9,1.58
11,texture_se,9,1.58


In [97]:
columns = ['Model Name', 'CM','Accuracy','precision', 'recall', 'f1']
model_summary_stats = pd.DataFrame(columns=columns)
def UpdateSummaryTable(model_name,cm,acc,precision,recall,f1):
    global model_summary_stats
    new_row = pd.DataFrame([[model_name,cm,acc,precision,recall,f1]],columns=columns)
    model_summary_stats=pd.concat([model_summary_stats,new_row])
    return model_summary_stats

### Model Building

#### Full Model with all features

In [98]:
x=df.drop('diagnosis',axis=1)
y=df['diagnosis']

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=234)
print("X-train:",x_train.shape)
print("X-test:",x_test.shape)
print("Y-train:",y_train.shape)
print("Y-test:",y_test.shape)

X-train: (398, 30)
X-test: (171, 30)
Y-train: (398,)
Y-test: (171,)


In [99]:
df.diagnosis.value_counts()

diagnosis
B    357
M    212
Name: count, dtype: int64

In [100]:
full_classifier=RandomForestClassifier(n_estimators=10,criterion='entropy')

In [101]:
full_classifier.fit(x_train,y_train)

In [102]:
full_classifier.feature_importances_

array([0.00139938, 0.02359325, 0.06618757, 0.        , 0.00196104,
       0.01003006, 0.00942355, 0.14689378, 0.00382   , 0.00210564,
       0.05992014, 0.00302521, 0.00620244, 0.00127085, 0.00074339,
       0.00788703, 0.00474339, 0.01219007, 0.00726241, 0.        ,
       0.26448776, 0.01703604, 0.15035651, 0.09731042, 0.00817468,
       0.01105745, 0.02414152, 0.02915913, 0.02297306, 0.00664423])

In [103]:
y_predict_full = full_classifier.predict(x_test)
full_cm=confusion_matrix(y_test,y_predict_full)
full_accuracy = accuracy_score(y_test,y_predict_full)
full_precision = precision_score(y_test,y_predict_full,pos_label='M')
full_recall = recall_score(y_test,y_predict_full,pos_label='M')
full_f1_score= f1_score(y_test,y_predict_full,pos_label='M')

In [104]:
UpdateSummaryTable("Full model",full_cm,full_accuracy,full_precision,full_recall,full_f1_score)

Unnamed: 0,Model Name,CM,Accuracy,precision,recall,f1
0,Full model,"[[106, 2], [5, 58]]",0.959064,0.966667,0.920635,0.943089


In [105]:
feature_importance = pd.DataFrame({'features':x_train.columns,"Importance":full_classifier.feature_importances_})
feature_importance.sort_values(by='Importance')

Unnamed: 0,features,Importance
19,fractal_dimension_se,0.0
3,area_mean,0.0
14,smoothness_se,0.000743
13,area_se,0.001271
0,radius_mean,0.001399
4,smoothness_mean,0.001961
9,fractal_dimension_mean,0.002106
11,texture_se,0.003025
8,symmetry_mean,0.00382
16,concavity_se,0.004743


#### Droping less Important features

In [109]:
less_importance_features=feature_importance.loc[feature_importance['Importance']==0]['features'].to_list()
less_importance_features

['area_mean', 'fractal_dimension_se']

In [112]:
importance_features_df=df.drop(less_importance_features,axis=1)

In [118]:
x=importance_features_df.drop('diagnosis',axis=1)
y=df['diagnosis']

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=234)
print("X-train:",x_train.shape)
print("X-test:",x_test.shape)
print("Y-train:",y_train.shape)
print("Y-test:",y_test.shape)

X-train: (398, 28)
X-test: (171, 28)
Y-train: (398,)
Y-test: (171,)


In [120]:
importance_classifier=RandomForestClassifier(n_estimators=10,criterion='entropy')

In [121]:
importance_classifier.fit(x_train,y_train)

In [123]:
y_predict_imp = importance_classifier.predict(x_test)
imp_cm=confusion_matrix(y_test,y_predict_imp)
imp_accuracy = accuracy_score(y_test,y_predict_imp)
imp_precision = precision_score(y_test,y_predict_imp,pos_label='M')
imp_recall = recall_score(y_test,y_predict_imp,pos_label='M')
imp_f1_score= f1_score(y_test,y_predict_imp,pos_label='M')

In [124]:
UpdateSummaryTable("Importance features model",imp_cm,imp_accuracy,imp_precision,imp_recall,imp_f1_score)

Unnamed: 0,Model Name,CM,Accuracy,precision,recall,f1
0,Full model,"[[106, 2], [5, 58]]",0.959064,0.966667,0.920635,0.943089
0,Importance features model,"[[106, 2], [4, 59]]",0.964912,0.967213,0.936508,0.951613
