## Importing Libraries

In [343]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score, jaccard_score
import plotly.express as px
import numpy as np
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from sklearn.neighbors import LocalOutlierFactor


## Loading the Data

In [344]:
df = pd.read_csv("Breast_Cancer.csv", header=0, delimiter=',')
df.head()

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4,Positive,Positive,24,1,60,Alive
1,50,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,35,Positive,Positive,14,5,62,Alive
2,58,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63,Positive,Positive,14,7,75,Alive
3,58,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,18,Positive,Positive,2,1,84,Alive
4,47,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,41,Positive,Positive,3,1,50,Alive


## Train Test Split

In [345]:
X = df.loc[:,df.columns != 'Status']
y = df['Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

## Preprocessing

In [346]:
def preprocessing(X, y, scaler=None):
    numerical_cols = ['Age','Tumor Size','Regional Node Examined','Reginol Node Positive','Survival Months']
    categorical_cols = ['Race','Marital Status','T Stage ','N Stage','6th Stage','differentiate','Grade','A Stage','Estrogen Status','Progesterone Status']
    # We scale all the columns
    if scaler is None: 
        # Outlier detection
        joined = X.join(y)
        local_outlier_factor = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
        result = local_outlier_factor.fit_predict(joined[['Age', 'Tumor Size','Regional Node Examined','Reginol Node Positive', 'Survival Months']])
        no_outliers = result == 1
        joined = joined[no_outliers]
        y = joined.pop('Status')
        X = joined
        # We only want the scaler to fit the train data
        scaler = MinMaxScaler()
        X[numerical_cols] = scaler.fit_transform(X[numerical_cols])
    else: 
        X[numerical_cols] = scaler.transform(X[numerical_cols])
    # One Hot Encoding
    X = pd.get_dummies(X, columns = categorical_cols)
    # Encode output
    status_map = {'Dead': 1, 'Alive': 0}
    y = y.map(status_map)
    y = y.rename('Dead')
    return X, y, scaler

X_train, y_train, scaler = preprocessing(X_train,y_train)
X_test, y_test, _ = preprocessing(X_test,y_test,scaler)

## Undersampling

In [347]:
print('Original dataset shape:', Counter(y_train))

Original dataset shape: Counter({0: 3075, 1: 546})


In [348]:
undersample = RandomUnderSampler(sampling_strategy='majority')
X_train, y_train = undersample.fit_resample(X_train, y_train)
print('Resampled dataset shape:', Counter(y_train))

Resampled dataset shape: Counter({0: 546, 1: 546})


## Random Forest

In [349]:
rfmodel=RandomForestClassifier(n_estimators=600)
rfmodel.fit(X_train, y_train)
y_pred=rfmodel.predict(X_test)

In [350]:
print('accuracy_score: ',accuracy_score(y_test,y_pred))
print('recall_score: ',recall_score(y_test,y_pred))
print('jaccard_score: ',jaccard_score(y_test,y_pred))
print('f1_score: ',f1_score(y_test,y_pred))
print('precision_score: ',precision_score(y_test,y_pred))

fig = px.imshow(np.around(confusion_matrix(y_test, y_pred, normalize='true'),2), 
                template='ggplot2',
                text_auto=True, 
                aspect="True", 
                labels=dict(x="Predicted Label", y="Correct Label", color='Value'))
fig.update_xaxes(side="bottom")
fig.show()

accuracy_score:  0.8064516129032258
recall_score:  0.8285714285714286
jaccard_score:  0.4264705882352941
f1_score:  0.5979381443298969
precision_score:  0.46774193548387094


In [351]:
results_df = pd.DataFrame(index=[], columns= ['Accuracy', 'F1', 'Precision', 'Recall', 'Jaccard'])

In [352]:
from sklearn.model_selection import StratifiedKFold


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cross_val_results = pd.DataFrame(cross_validate(rfmodel , X_train, y_train, cv = cv, 
                            scoring = ['accuracy', 'f1', 'precision', 'recall', 'jaccard'] ))

results_df.loc['Random Forest',:] = cross_val_results[['test_accuracy', 'test_f1',
       'test_precision', 'test_recall', 'test_jaccard']].mean().values
results_df

Unnamed: 0,Accuracy,F1,Precision,Recall,Jaccard
Random Forest,0.780227,0.770402,0.802655,0.743486,0.62781
