In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer


#### 1 - Count the number of unique values per feature in the train set.

In [6]:
# Load the breast-cancer table and give its ten columns readable names
# (nine feature columns followed by the "Class" label).
column_names = [
    "age", "menopause", "tumor-size", "inv-nodes", "node-caps",
    "deg-malig", "breast", "breast-quad", "irradiat", "Class",
]
df_full = pd.read_csv("breast-cancer.csv", sep=",")
df_full.columns = column_names

# Discard every row that contains a missing value.
df_full = df_full.dropna()

# Separate the label (kept as a one-column DataFrame) from the features.
df_target = df_full[["Class"]]
df_data = df_full.drop(columns="Class")
df_data


Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,50-59,ge40,15-19,0-2,no,1,right,central,no
1,50-59,ge40,35-39,0-2,no,2,left,left_low,no
2,40-49,premeno,35-39,0-2,yes,3,right,left_low,yes
3,40-49,premeno,30-34,3-5,yes,2,left,right_up,no
4,50-59,premeno,25-29,3-5,no,2,right,left_up,yes
...,...,...,...,...,...,...,...,...,...
280,50-59,ge40,30-34,6-8,yes,2,left,left_low,no
281,50-59,premeno,25-29,3-5,yes,2,left,left_low,yes
282,30-39,premeno,30-34,6-8,yes,2,right,right_up,no
283,50-59,premeno,15-19,0-2,no,2,right,left_low,no


#### 2 - Fit the encoders on the train set



In [7]:
# Keep 20% of the rows aside for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_data,
    df_target,
    random_state=43,
    test_size=0.2,
)

# Number of distinct values of each feature, measured on the training rows only.
X_train.nunique()

age             6
menopause       3
tumor-size     11
inv-nodes       6
node-caps       2
deg-malig       3
breast          2
breast-quad     5
irradiat        2
dtype: int64

In [8]:
# Columns grouped by the encoding applied to them; defined once so the
# stand-alone encoders and the column transformer cannot drift apart.
nominal_columns = ['node-caps', 'breast', 'breast-quad', 'irradiat']
ordinal_columns = ["menopause", "age", "tumor-size", "inv-nodes", "deg-malig"]

# Stand-alone one-hot encoder: categories are learned from the training rows
# only, then applied to the test rows. `.toarray()` converts the encoder's
# result into a dense NumPy array.
# NOTE(review): with default settings both encoders below raise on a category
# that was not seen during `fit` -- confirm the test split cannot contain one.
one_hot_encoder = OneHotEncoder()
one_hot_encoder.fit(X_train[nominal_columns])
encoded_features = one_hot_encoder.transform(X_test[nominal_columns]).toarray()

# Stand-alone ordinal encoder, fitted on train and applied to test.
# NOTE(review): no explicit `categories=` is given, so the integer codes follow
# the sorted order of the training values; for string ranges this is presumably
# lexicographic rather than numeric -- confirm this is the intended order.
Ordinal_encoder = OrdinalEncoder()
Ordinal_encoder.fit(X_train[ordinal_columns])
ordinal_features = Ordinal_encoder.transform(X_test[ordinal_columns])

# Same two encodings combined in a single transformer. The one-hot block comes
# first in the output, followed by the ordinal block; any column not listed
# would be passed through unchanged.
column_transformer = make_column_transformer(
    (OneHotEncoder(), nominal_columns),
    (OrdinalEncoder(), ordinal_columns),
    remainder='passthrough'
)

# Fit on the training rows, then show the first two encoded test rows.
column_transformer.fit(X_train)
column_transformer.transform(X_test)[:2]


array([[1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 4., 1., 0., 0.],
       [1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 2., 1., 5., 5., 1.]])