## Importing Libraries

In [86]:
%matplotlib inline
from pandas import read_csv
from pandas import get_dummies
import matplotlib.pyplot as plt
from numpy import random
from sklearn.preprocessing import MinMaxScaler
from collections import Counter
from sklearn.neighbors import LocalOutlierFactor

## Loading the Data

In [87]:
# Read the raw table from the comma-separated file; its first row supplies the column names.
data = read_csv("Breast_Cancer.csv", sep=",", header=0)
# Show (rows, columns) as the cell output.
data.shape

(4024, 16)

In [88]:
# List the column labels exactly as read from the file (note that 'T Stage ' carries a trailing space).
data.columns

Index(['Age', 'Race', 'Marital Status', 'T Stage ', 'N Stage', '6th Stage',
       'differentiate', 'Grade', 'A Stage', 'Tumor Size', 'Estrogen Status',
       'Progesterone Status', 'Regional Node Examined',
       'Reginol Node Positive', 'Survival Months', 'Status'],
      dtype='object')

In [89]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4,Positive,Positive,24,1,60,Alive
1,50,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,35,Positive,Positive,14,5,62,Alive
2,58,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63,Positive,Positive,14,7,75,Alive
3,58,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,18,Positive,Positive,2,1,84,Alive
4,47,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,41,Positive,Positive,3,1,50,Alive


## Inspection of the Dataset

In [90]:
# Summary statistics for the numeric columns (the non-numeric columns do not appear in the result).
data.describe()

Unnamed: 0,Age,Tumor Size,Regional Node Examined,Reginol Node Positive,Survival Months
count,4024.0,4024.0,4024.0,4024.0,4024.0
mean,53.972167,30.473658,14.357107,4.158052,71.297962
std,8.963134,21.119696,8.099675,5.109331,22.92143
min,30.0,1.0,1.0,1.0,1.0
25%,47.0,16.0,9.0,1.0,56.0
50%,54.0,25.0,14.0,2.0,73.0
75%,61.0,38.0,19.0,5.0,90.0
max,69.0,140.0,61.0,46.0,107.0


In [91]:
# Count the rows per value of 'Status' to see how balanced the two outcomes are.
data['Status'].value_counts()

Alive    3408
Dead      616
Name: Status, dtype: int64

## Outlier Detection

In [92]:
# Numeric columns on which the local-outlier-factor scores are computed.
lof_features = [
    'Age',
    'Tumor Size',
    'Regional Node Examined',
    'Reginol Node Positive',
    'Survival Months',
]

# fit_predict labels every row with -1 (outlier) or 1 (inlier).
# NOTE(review): the recorded output of this cell (403 of 4024 rows flagged, about 10%)
# does not look like a run with contamination=0.05 -- re-run the cell to refresh it.
local_outlier_factor = LocalOutlierFactor(contamination=0.05, n_neighbors=20)
result = local_outlier_factor.fit_predict(data.loc[:, lof_features])

# Boolean row masks derived from the labels.
no_outliers = (result == 1)
outliers = (result == -1)

# Tally of rows per label.
Counter(result)

Counter({-1: 403, 1: 3621})

In [93]:
# Keep only the rows labelled as inliers. The explicit .copy() makes `data` an
# independent frame, so later column assignments modify it directly and do not
# raise pandas' SettingWithCopyWarning.
data = data[no_outliers].copy()
# Show (rows, columns) after filtering.
data.shape

(3621, 16)

## Normalization

In [94]:
# Numeric columns to rescale; defined once so the selection and the write-back cannot drift apart.
scaled_cols = [
    'Age',
    'Tumor Size',
    'Regional Node Examined',
    'Reginol Node Positive',
    'Survival Months',
]

# Fit the min-max scaler on the current rows and map each listed column onto [0, 1].
min_max_scaler = MinMaxScaler()
scaled_values = min_max_scaler.fit_transform(data[scaled_cols])

# Build a new frame with the scaled columns instead of assigning into `data` in place:
# `data` comes from a boolean-mask selection, and in-place column assignment on such a
# frame triggers SettingWithCopyWarning. assign() keeps the original column order.
data = data.assign(
    **{col: scaled_values[:, pos] for pos, col in enumerate(scaled_cols)}
)

# Summary statistics after scaling (min should be 0 and max 1 for every scaled column).
data.describe()

Unnamed: 0,Age,Tumor Size,Regional Node Examined,Reginol Node Positive,Survival Months
count,3621.0,3621.0,3621.0,3621.0,3621.0
mean,0.620943,0.20571,0.364914,0.089439,0.667555
std,0.219825,0.148545,0.203475,0.139875,0.213465
min,0.0,0.0,0.0,0.0,0.0
25%,0.461538,0.101449,0.235294,0.0,0.528302
50%,0.615385,0.166667,0.352941,0.035714,0.679245
75%,0.794872,0.26087,0.5,0.107143,0.839623
max,1.0,1.0,1.0,1.0,1.0


## One Hot Encoding

In [95]:
# Categorical columns to expand into indicator columns. 'T Stage ' keeps its trailing
# space because that is how the column is named in the loaded file. 'Status' is not
# listed, so it is left as a plain label column.
categorical_cols = [
    'Race',
    'Marital Status',
    'T Stage ',
    'N Stage',
    '6th Stage',
    'differentiate',
    'Grade',
    'A Stage',
    'Estrogen Status',
    'Progesterone Status',
]

# Pin the indicator dtype: newer pandas releases default to boolean indicators, which
# would show up as True/False here and in the saved CSV instead of 0/1.
data = get_dummies(data, columns=categorical_cols, dtype='uint8')
data.head()

Unnamed: 0,Age,Tumor Size,Regional Node Examined,Reginol Node Positive,Survival Months,Status,Race_Black,Race_Other,Race_White,Marital Status_Divorced,...,Grade_ anaplastic; Grade IV,Grade_1,Grade_2,Grade_3,A Stage_Distant,A Stage_Regional,Estrogen Status_Negative,Estrogen Status_Positive,Progesterone Status_Negative,Progesterone Status_Positive
1,0.512821,0.23913,0.382353,0.142857,0.575472,Alive,0,0,1,0,...,0,0,1,0,0,1,0,1,0,1
2,0.717949,0.442029,0.382353,0.214286,0.698113,Alive,0,0,1,1,...,0,0,1,0,0,1,0,1,0,1
3,0.717949,0.115942,0.029412,0.0,0.783019,Alive,0,0,1,0,...,0,0,0,1,0,1,0,1,0,1
4,0.435897,0.282609,0.058824,0.0,0.462264,Alive,0,0,1,0,...,0,0,0,1,0,1,0,1,0,1
5,0.538462,0.130435,0.5,0.035714,0.830189,Alive,0,0,1,0,...,0,0,1,0,0,1,0,1,0,1


## Saving the Data

In [96]:
# Seed NumPy's global generator; DataFrame.sample draws from it when no random_state is passed,
# so the shuffle below is repeatable.
random.seed(144)

# Shuffle all rows, then replace the shuffled index with a fresh 0..n-1 range.
shuffled = data.sample(frac=1)
data = shuffled.reset_index(drop=True)

# NOTE(review): the index is written as the first, unnamed column of the CSV; readers of
# this file need to account for it (or pass index=False here if no consumer relies on it).
data.to_csv('Breast_Cancer_Processed.csv')