## Importing Libraries

## Loading the Data

In [43]:
%matplotlib inline
from pandas import read_csv
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import jaccard_score
import plotly.express as px
import numpy as np
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

In [44]:
df = read_csv("Breast_Cancer_Processed.csv", header=0, delimiter=',')
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Tumor Size,Regional Node Examined,Reginol Node Positive,Survival Months,Status,Race_Black,Race_Other,Race_White,...,Grade_ anaplastic; Grade IV,Grade_1,Grade_2,Grade_3,A Stage_Distant,A Stage_Regional,Estrogen Status_Negative,Estrogen Status_Positive,Progesterone Status_Negative,Progesterone Status_Positive
0,0,0.794872,0.42446,0.333333,0.0,0.613208,Alive,0,0,1,...,0,1,0,0,0,1,0,1,0,1
1,1,0.794872,0.115108,0.116667,0.0,0.443396,Alive,0,0,1,...,0,0,1,0,0,1,0,1,0,1
2,2,0.333333,0.856115,0.233333,0.177778,0.764151,Alive,0,0,1,...,0,0,0,1,0,1,1,0,1,0
3,3,0.641026,0.208633,0.7,0.133333,0.867925,Alive,0,0,1,...,0,0,0,1,0,1,0,1,0,1
4,4,0.512821,0.172662,0.266667,0.244444,0.226415,Dead,0,0,1,...,0,0,0,1,0,1,0,1,0,1


## Train Test Split

In [45]:
y = df.pop('Status').values
X = df.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

## Undersampling

In [46]:
print('Original dataset shape:', Counter(y_train))

Original dataset shape: Counter({'Alive': 3061, 'Dead': 560})


In [47]:
undersample = RandomUnderSampler(sampling_strategy='majority')
X_train, y_train = undersample.fit_resample(X_train, y_train)
print('Resampled dataset shape:', Counter(y_train))

Resampled dataset shape: Counter({'Alive': 560, 'Dead': 560})


## Random Forest

In [48]:
rfmodel=RandomForestClassifier(n_estimators=600)
rfmodel.fit(X_train, y_train)
y_pred=rfmodel.predict(X_test)

In [49]:
print('accuracy_score: ',accuracy_score(y_test,y_pred))
print('recall_score: ',recall_score(y_test,y_pred,pos_label='Dead'))
print('jaccard_score: ',jaccard_score(y_test,y_pred,pos_label='Dead'))
print('f1_score: ',f1_score(y_test,y_pred,pos_label='Dead'))
print('precision_score: ',precision_score(y_test,y_pred,pos_label='Dead'))

fig = px.imshow(np.around(confusion_matrix(y_test, y_pred, normalize='true'),2), 
                template='ggplot2',
                text_auto=True, 
                aspect="True", 
                labels=dict(x="Predicted Label", y="Correct Label", color='Value'))
fig.update_xaxes(side="bottom")
fig.show()

accuracy_score:  0.8089330024813896
recall_score:  0.6964285714285714
jaccard_score:  0.33620689655172414
f1_score:  0.503225806451613
precision_score:  0.3939393939393939
