In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder


In [2]:

# names of columns
cols_names = ['Class', 'age', 'menopause', 'tumor-size', 
              'inv-nodes', 'node-caps', 'deg-malig', 'breast', 
              'breast-quad', 'irradiat']

# read the data
df = (pd.read_csv('data/brc/breast-cancer.data', header=None, names=cols_names).replace({'?': 'unknown'}))  # NaN are represented by '?'

In [39]:
df.head(10)

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no
5,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,left,left_low,no
6,no-recurrence-events,50-59,premeno,25-29,0-2,no,2,left,left_low,no
7,no-recurrence-events,60-69,ge40,20-24,0-2,no,1,left,left_low,no
8,no-recurrence-events,40-49,premeno,50-54,0-2,no,2,left,left_low,no
9,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,left_up,no


In [9]:
X = df.drop(['Class'],axis = 1)
y = df['Class'].copy()


In [41]:
X.head(10)

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,menopause_ge40,menopause_lt40,menopause_premeno
0,1,premeno,30-34,0-2,no,3,left,left_low,no,0,0,1
1,2,premeno,20-24,0-2,no,2,right,right_up,no,0,0,1
2,2,premeno,20-24,0-2,no,2,left,left_low,no,0,0,1
3,4,ge40,15-19,0-2,no,2,right,left_up,no,1,0,0
4,2,premeno,0-4,0-2,no,2,right,right_low,no,0,0,1
5,4,ge40,15-19,0-2,no,2,left,left_low,no,1,0,0
6,3,premeno,25-29,0-2,no,2,left,left_low,no,0,0,1
7,4,ge40,20-24,0-2,no,1,left,left_low,no,1,0,0
8,2,premeno,50-54,0-2,no,2,left,left_low,no,0,0,1
9,2,premeno,20-24,0-2,no,2,right,left_up,no,0,0,1


In [43]:
le_age = LabelEncoder()

In [44]:
le_age.fit(df['age'])

LabelEncoder()

In [45]:
list(le_age.classes_)

['20-29', '30-39', '40-49', '50-59', '60-69', '70-79']

In [46]:
le_age.transform(df['age'])

array([1, 2, 2, 4, 2, 4, 3, 4, 2, 2, 2, 3, 4, 3, 2, 4, 2, 3, 4, 3, 3, 4, 1,
       3, 3, 2, 3, 4, 2, 4, 3, 3, 3, 3, 3, 1, 3, 3, 2, 2, 3, 4, 4, 2, 3, 3,
       2, 3, 2, 2, 3, 1, 3, 5, 5, 5, 3, 3, 4, 4, 2, 2, 3, 0, 2, 2, 2, 3, 3,
       4, 4, 2, 4, 3, 1, 3, 3, 1, 3, 2, 3, 4, 4, 3, 2, 3, 4, 5, 3, 2, 1, 3,
       3, 4, 3, 2, 4, 4, 2, 1, 2, 3, 3, 2, 2, 2, 2, 1, 2, 4, 3, 3, 2, 2, 2,
       3, 1, 2, 1, 4, 4, 3, 3, 3, 4, 5, 1, 1, 3, 2, 2, 2, 2, 3, 4, 1, 1, 2,
       1, 2, 3, 3, 4, 2, 4, 2, 4, 3, 1, 3, 3, 4, 3, 4, 1, 4, 3, 3, 3, 2, 2,
       2, 4, 4, 4, 2, 2, 2, 3, 2, 1, 1, 4, 3, 3, 2, 2, 4, 3, 2, 2, 2, 2, 3,
       3, 2, 3, 4, 2, 3, 2, 2, 3, 1, 3, 3, 3, 2, 3, 3, 4, 3, 2, 3, 3, 1, 3,
       3, 3, 2, 2, 3, 2, 3, 4, 2, 3, 2, 4, 1, 2, 1, 4, 4, 1, 2, 2, 3, 4, 4,
       3, 2, 1, 5, 4, 3, 2, 2, 1, 2, 4, 2, 3, 3, 2, 1, 1, 3, 4, 1, 2, 2, 1,
       4, 2, 2, 2, 2, 3, 3, 4, 2, 4, 3, 3, 1, 2, 4, 1, 2, 3, 3, 2, 4, 4, 2,
       1, 4, 3, 3, 3, 1, 1, 4, 2, 3])

In [47]:
X['age'] = le_age.transform(df['age'])

In [21]:
X.head(10)

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,1,premeno,30-34,0-2,no,3,left,left_low,no
1,2,premeno,20-24,0-2,no,2,right,right_up,no
2,2,premeno,20-24,0-2,no,2,left,left_low,no
3,4,ge40,15-19,0-2,no,2,right,left_up,no
4,2,premeno,0-4,0-2,no,2,right,right_low,no
5,4,ge40,15-19,0-2,no,2,left,left_low,no
6,3,premeno,25-29,0-2,no,2,left,left_low,no
7,4,ge40,20-24,0-2,no,1,left,left_low,no
8,2,premeno,50-54,0-2,no,2,left,left_low,no
9,2,premeno,20-24,0-2,no,2,right,left_up,no


In [48]:
list(le_age.inverse_transform(X.age))

['30-39',
 '40-49',
 '40-49',
 '60-69',
 '40-49',
 '60-69',
 '50-59',
 '60-69',
 '40-49',
 '40-49',
 '40-49',
 '50-59',
 '60-69',
 '50-59',
 '40-49',
 '60-69',
 '40-49',
 '50-59',
 '60-69',
 '50-59',
 '50-59',
 '60-69',
 '30-39',
 '50-59',
 '50-59',
 '40-49',
 '50-59',
 '60-69',
 '40-49',
 '60-69',
 '50-59',
 '50-59',
 '50-59',
 '50-59',
 '50-59',
 '30-39',
 '50-59',
 '50-59',
 '40-49',
 '40-49',
 '50-59',
 '60-69',
 '60-69',
 '40-49',
 '50-59',
 '50-59',
 '40-49',
 '50-59',
 '40-49',
 '40-49',
 '50-59',
 '30-39',
 '50-59',
 '70-79',
 '70-79',
 '70-79',
 '50-59',
 '50-59',
 '60-69',
 '60-69',
 '40-49',
 '40-49',
 '50-59',
 '20-29',
 '40-49',
 '40-49',
 '40-49',
 '50-59',
 '50-59',
 '60-69',
 '60-69',
 '40-49',
 '60-69',
 '50-59',
 '30-39',
 '50-59',
 '50-59',
 '30-39',
 '50-59',
 '40-49',
 '50-59',
 '60-69',
 '60-69',
 '50-59',
 '40-49',
 '50-59',
 '60-69',
 '70-79',
 '50-59',
 '40-49',
 '30-39',
 '50-59',
 '50-59',
 '60-69',
 '50-59',
 '40-49',
 '60-69',
 '60-69',
 '40-49',
 '30-39',


In [49]:
X['menopause'] = X['menopause'].astype('category')
X['menopause'] = X['menopause'].cat.reorder_categories(['lt40', 'ge40', 'premeno'], ordered=True)
X['menopause_cat'] = X['menopause'].cat.codes

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,menopause_ge40,menopause_lt40,menopause_premeno,menopause_cat
0,1,premeno,30-34,0-2,no,3,left,left_low,no,0,0,1,2
1,2,premeno,20-24,0-2,no,2,right,right_up,no,0,0,1,2
2,2,premeno,20-24,0-2,no,2,left,left_low,no,0,0,1,2
3,4,ge40,15-19,0-2,no,2,right,left_up,no,1,0,0,1
4,2,premeno,0-4,0-2,no,2,right,right_low,no,0,0,1,2
5,4,ge40,15-19,0-2,no,2,left,left_low,no,1,0,0,1
6,3,premeno,25-29,0-2,no,2,left,left_low,no,0,0,1,2
7,4,ge40,20-24,0-2,no,1,left,left_low,no,1,0,0,1
8,2,premeno,50-54,0-2,no,2,left,left_low,no,0,0,1,2
9,2,premeno,20-24,0-2,no,2,right,left_up,no,0,0,1,2


In [51]:
pd.get_dummies(X['breast'],prefix="breast").head(10)

Unnamed: 0,breast_left,breast_right
0,1,0
1,0,1
2,1,0
3,0,1
4,0,1
5,1,0
6,1,0
7,1,0
8,1,0
9,0,1


In [52]:
X = pd.concat([X,pd.get_dummies(X['breast'],prefix="breast")],axis=1)

In [53]:
X.head(10)

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,menopause_ge40,menopause_lt40,menopause_premeno,menopause_cat,breast_left,breast_right
0,1,premeno,30-34,0-2,no,3,left,left_low,no,0,0,1,2,1,0
1,2,premeno,20-24,0-2,no,2,right,right_up,no,0,0,1,2,0,1
2,2,premeno,20-24,0-2,no,2,left,left_low,no,0,0,1,2,1,0
3,4,ge40,15-19,0-2,no,2,right,left_up,no,1,0,0,1,0,1
4,2,premeno,0-4,0-2,no,2,right,right_low,no,0,0,1,2,0,1
5,4,ge40,15-19,0-2,no,2,left,left_low,no,1,0,0,1,1,0
6,3,premeno,25-29,0-2,no,2,left,left_low,no,0,0,1,2,1,0
7,4,ge40,20-24,0-2,no,1,left,left_low,no,1,0,0,1,1,0
8,2,premeno,50-54,0-2,no,2,left,left_low,no,0,0,1,2,1,0
9,2,premeno,20-24,0-2,no,2,right,left_up,no,0,0,1,2,0,1


In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)