# Feature Engineering II
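Putting things together: preprocess the penguin features column by column with a `ColumnTransformer`, then fit a logistic regression to predict the species.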

In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder


### 1. Load the data

In [2]:
# Read the semicolon-separated penguins file into a DataFrame.
df = pd.read_csv('penguins_simple.csv', sep=';')
# Show (rows, columns) as a quick sanity check on the load.
df.shape

(333, 6)

### 2. Train-Test Split

In [3]:
# Separate the target from the features by column *name* rather than by
# position, so the split stays correct even if the CSV column order changes.
y = df['Species']
X = df.drop(columns='Species')

In [4]:
# Hold out part of the data for evaluation.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    random_state=42,  # fixed seed so the split is reproducible
)
# Display the shape of each of the four pieces.
tuple(part.shape for part in (Xtrain, Xtest, ytrain, ytest))

((249, 5), (84, 5), (249,), (84,))

### 3. Define a ColumnTransformer

In [5]:
# Sub-pipeline for columns that need imputation before scaling.
#
# How a pipeline works:
#   * It chains a sequence of steps; the output of each step is passed on as
#     the input of the next one.
#   * Every step except the last must be a transformer. The last step may be
#     an estimator (e.g. a classifier); here it is another transformer, so the
#     whole pipeline is used as a transformer inside the ColumnTransformer.
#   * Each step is a (name, step) pair; make_pipeline generates the names.
#
# Why use one:
#   * it keeps track of the steps and the order they run in,
#   * it encapsulates them in a single object,
#   * it keeps the complexity of the code manageable.
pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),  # fill missing values first
    MinMaxScaler(),                           # then min-max scale the result
)

In [6]:
# Column-wise preprocessing. Each entry is (name, transformer, columns).
# Columns that are not listed are dropped (ColumnTransformer's default
# `remainder`).
#
# Dense output is requested on the ColumnTransformer (`sparse_threshold=0`)
# instead of on the encoder: OneHotEncoder's `sparse=False` argument was
# deprecated in scikit-learn 1.2 and removed in 1.4, whereas
# `sparse_threshold=0` always yields a dense array on old and new versions.
trans = ColumnTransformer(
    [
        # One-hot encode the categorical column; categories unseen during
        # fit are encoded as all zeros instead of raising.
        ('kristians_onehot', OneHotEncoder(handle_unknown='ignore'), ['Sex']),
        ('kristians_scale', MinMaxScaler(), ['Body Mass (g)', 'Culmen Depth (mm)']),
        # Impute missing values, then scale (see `pipeline`).
        ('impute_then_scale', pipeline, ['Flipper Length (mm)']),
        # Keep this column unchanged.
        ('do_nothing', 'passthrough', ['Culmen Length (mm)']),
    ],
    sparse_threshold=0,
)

### 4. fit + transform training data

In [7]:
# Learn the preprocessing parameters from the training split only and apply
# them in the same pass (avoids running the transformers twice).
Xtrain_transformed = trans.fit_transform(Xtrain)  # result is a single numpy array
Xtrain_transformed.shape

(249, 6)

### 5. fit a LogReg model

In [8]:
# Fit a logistic regression classifier on the preprocessed training features.
# max_iter is raised above scikit-learn's default (100) to give the solver
# more room to converge.
model = LogisticRegression(max_iter=1000)
model.fit(Xtrain_transformed, ytrain)

LogisticRegression(max_iter=1000)

### 6. transform test data

In [9]:
# Apply the already-fitted transformer to the test split; only transform()
# is called here, so nothing is re-fitted on test data.
Xtest_transform = trans.transform(Xtest)
# Show the shape of the transformed test features.
Xtest_transform.shape

(84, 6)

### 7. predict

In [10]:
# Predict a label for every row of the transformed test features.
ypred = model.predict(Xtest_transform)
# Preview the first five predictions.
ypred[:5]

array(['Adelie', 'Gentoo', 'Adelie', 'Chinstrap', 'Adelie'], dtype=object)