In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [10]:
# Read the comma-separated dataset; the first row of the file holds the column names.
df = pd.read_csv("Breast_Cancer.csv", sep=",", header=0)
# Show the first five rows as a quick sanity check of the load.
df.head(5)

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4,Positive,Positive,24,1,60,Alive
1,50,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,35,Positive,Positive,14,5,62,Alive
2,58,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63,Positive,Positive,14,7,75,Alive
3,58,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,18,Positive,Positive,2,1,84,Alive
4,47,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,41,Positive,Positive,3,1,50,Alive


In [11]:
# Every column except the 'Status' label is a feature; 'Status' itself is the target.
is_feature = df.columns != 'Status'
X = df.loc[:, is_feature]
y = df['Status']

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)
# Display the feature names that ended up in the training frame.
X_train.columns

Index(['Age', 'Race', 'Marital Status', 'T Stage ', 'N Stage', '6th Stage',
       'differentiate', 'Grade', 'A Stage', 'Tumor Size', 'Estrogen Status',
       'Progesterone Status', 'Regional Node Examined',
       'Reginol Node Positive', 'Survival Months'],
      dtype='object')

In [12]:
def preprocessing(X, y, scaler=None):
    """Scale the numerical features and one-hot encode the categorical ones.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing every column listed in ``numerical_cols``
        and ``categorical_cols`` below. It is left untouched: the
        transformation is applied to a copy.
    y : object
        Target values. Passed through and returned unchanged.
    scaler : object or None, optional
        When ``None``, a new ``MinMaxScaler`` is created and fitted on the
        numerical columns of ``X`` (use this for the training data).
        Otherwise the given scaler is only applied via ``transform``, so
        held-out data is scaled with the statistics it was fitted on.

    Returns
    -------
    tuple
        ``(X_processed, y, scaler)`` where ``X_processed`` is the new,
        scaled and one-hot encoded frame and ``scaler`` is the scaler that
        was used (newly fitted or the one passed in).
    """
    numerical_cols = ['Age','Tumor Size','Regional Node Examined','Reginol Node Positive','Survival Months']
    # 'T Stage ' keeps its trailing space: that is the column's actual name in the data.
    categorical_cols = ['Race','Marital Status','T Stage ','N Stage','6th Stage','differentiate','Grade','A Stage','Estrogen Status','Progesterone Status']

    # Work on a copy so the caller's frame is never modified in place; this
    # also avoids SettingWithCopyWarning when X is a slice of another frame.
    X = X.copy()

    # Only the numerical columns are scaled.
    if scaler is None:
        # No scaler supplied: fit a fresh one on this (training) data.
        scaler = MinMaxScaler()
        X[numerical_cols] = scaler.fit_transform(X[numerical_cols])
    else:
        # Reuse the already fitted scaler without refitting it.
        X[numerical_cols] = scaler.transform(X[numerical_cols])

    # One-hot encoding. The dummy columns produced depend on the categories
    # present in X, so frames encoded separately can end up with different
    # column sets.
    X = pd.get_dummies(X, columns=categorical_cols)
    return X, y, scaler

# Fit the scaler on the training split only, then reuse it for the test split.
X_train, y_train, scaler = preprocessing(X_train,y_train)
X_test, y_test, _ = preprocessing(X_test,y_test,scaler)
# Each split is one-hot encoded on its own, so a category that is missing from
# one split would give the two frames different columns. Align the test frame
# to the training columns: dummies absent from the test split are filled with
# 0 and dummies never seen in training are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

Unnamed: 0,Age,Tumor Size,Regional Node Examined,Reginol Node Positive,Survival Months,Race_Black,Race_Other,Race_White,Marital Status_Divorced,Marital Status_Married,...,Grade_ anaplastic; Grade IV,Grade_1,Grade_2,Grade_3,A Stage_Distant,A Stage_Regional,Estrogen Status_Negative,Estrogen Status_Positive,Progesterone Status_Negative,Progesterone Status_Positive
962,0.461538,0.208633,0.166667,0.177778,0.990476,0,1,0,0,1,...,0,0,1,0,0,1,0,1,0,1
945,0.692308,0.043165,0.416667,0.044444,0.895238,0,0,1,0,1,...,0,0,1,0,0,1,0,1,0,1
462,0.820513,0.172662,0.283333,0.000000,0.819048,0,0,1,0,1,...,0,0,0,1,0,1,0,1,0,1
1830,0.256410,0.093525,0.000000,0.000000,0.504762,0,0,1,0,1,...,0,0,0,1,0,1,0,1,0,1
328,0.769231,0.316547,0.133333,0.000000,0.828571,1,0,0,0,1,...,0,0,0,1,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,0.846154,0.100719,0.266667,0.000000,0.476190,0,0,1,0,1,...,0,0,1,0,0,1,1,0,1,0
1294,0.410256,0.136691,0.083333,0.022222,0.533333,0,0,1,0,1,...,0,0,1,0,0,1,0,1,0,1
860,0.692308,0.151079,0.500000,0.022222,0.409524,0,0,1,0,0,...,0,1,0,0,0,1,0,1,0,1
3507,0.461538,0.194245,0.433333,0.000000,0.952381,0,1,0,0,1,...,0,0,0,1,0,1,0,1,0,1
