In [1]:
import os
# Move the working directory two levels up, to the directory that sits
# alongside the easymlops package.
os.chdir("../../")

### 拆分数据

In [2]:
import pandas as pd
# Load the demo table; the path is relative to the working directory set above.
data=pd.read_csv("./data/demo.csv")
# Preview the first five rows (includes the "Survived" label column).
data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Positional split: the first 500 rows are used for fitting, the rest for testing.
# The label column "Survived" is separated from the features.
#
# `drop(columns=...)` returns new frames instead of deleting a column in place
# from a row slice of `data`, so `data` itself is never modified and the
# feature frames are independent objects.
y_train = data["Survived"].iloc[:500]
y_test = data["Survived"].iloc[500:]
x_train = data.iloc[:500].drop(columns=["Survived"])
x_test = data.iloc[500:].drop(columns=["Survived"])

### pipeline分拆
- 建模的过程通常是逐步迭代的，在每一层可能都要做多次调整，再进行下一步建模，但按照上面的建模方式，每次调整了最后一层，都要将前面的所有层再次运行一次，这样很费时费力；
- 所以最好能将整个pipeline分拆成多个子pipeline或pipe，分别训练后再组合，这样调整某一层时就不必重新运行前面的所有层。

In [4]:
from easymlops import TablePipeLine
from easymlops.table.preprocessing import *
from easymlops.table.encoding import *
from easymlops.table.perfopt import *
from easymlops.table.decomposition import *

In [5]:
# First stage, fitted on its own: FixInput with default settings.
# x_train_new holds the running output that each following cell feeds into
# the next pipe.
pipe_fix_input=FixInput()
x_train_new=pipe_fix_input.fit(x_train).transform(x_train)

In [6]:
# Second stage: FillNa with default settings, fitted on the previous stage's output.
# NOTE(review): the fill values used by default are not visible here — check the library docs.
pipe_fillna=FillNa()
x_train_new=pipe_fillna.fit(x_train_new).transform(x_train_new)
# Show one row to inspect this stage's output.
x_train_new.head(1)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


In [7]:
# One-hot encode Pclass and Sex. With drop_col=False the original columns are
# kept next to the new indicator columns (Pclass_3, Pclass_1, Pclass_2,
# Sex_male, Sex_female in the preview).
pipe_onehot_encoding=OneHotEncoding(cols=["Pclass","Sex"],drop_col=False)
x_train_new=pipe_onehot_encoding.fit(x_train_new).transform(x_train_new)
x_train_new.head(1)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Pclass_3,Pclass_1,Pclass_2,Sex_male,Sex_female
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1,0,0,1,0


In [8]:
# Label-encode the retained Sex and Pclass columns: their values are replaced
# by integer codes (in the preview row, "male" becomes 1 and Pclass 3 becomes 1).
pipe_label_encoding=LabelEncoding(cols=["Sex","Pclass"])
x_train_new=pipe_label_encoding.fit(x_train_new).transform(x_train_new)
x_train_new.head(1)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Pclass_3,Pclass_1,Pclass_2,Sex_male,Sex_female
0,1,1,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S,1,0,0,1,0


In [9]:
# Target-encode the remaining text columns against the training labels; the
# preview shows them replaced by floating-point values.
# NOTE(review): Name and Ticket look close to unique per passenger, so their
# encoded value may simply echo each row's own label (the preview row gets 0.0
# for both, matching Survived=0) — confirm this is acceptable for the demo.
pipe_target_encoding=TargetEncoding(cols=["Name","Ticket","Embarked","Cabin"],y=y_train)
x_train_new=pipe_target_encoding.fit(x_train_new).transform(x_train_new)
x_train_new.head(1)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Pclass_3,Pclass_1,Pclass_2,Sex_male,Sex_female
0,1,1,0.0,1,22.0,1,0,0.0,7.25,0.317829,0.334254,1,0,0,1,0


In [10]:
# Normalizer with default settings, applied to the now fully numeric table.
# NOTE(review): the preview values look standardized (centred near zero) —
# confirm the exact scaling method in the library docs.
pipe_normal=Normalizer()
x_train_new=pipe_normal.fit(x_train_new).transform(x_train_new)
x_train_new.head(1)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Pclass_3,Pclass_1,Pclass_2,Sex_male,Sex_female
0,-1.73,-0.81,-0.79,-0.77,-0.07,0.37,-0.49,-0.84,-0.52,-0.29,-0.6,0.89,-0.55,-0.52,0.77,-0.77


In [11]:
# Reduce the normalized features to 8 components; the output columns are named 0..7.
pipe_pca=PCADecomposition(n_components=8)
x_train_new=pipe_pca.fit(x_train_new).transform(x_train_new)
x_train_new.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7
0,-2.336829,0.709077,0.076081,0.074529,0.056423,1.243462,1.084358,-0.621034


In [12]:
from easymlops.table.extend import Normalization
from easymlops.table.classification import LogisticRegressionClassification

In [13]:
# Build a second pipeline: a logistic-regression model followed by a Normalization step.
pipeline_lr = TablePipeLine()
(
    pipeline_lr
    .pipe(LogisticRegressionClassification(y=y_train))
    .pipe(Normalization())
)

# Fit this sub-pipeline on the PCA output and keep its result as the new running output.
x_train_new = pipeline_lr.fit(x_train_new).transform(x_train_new)
x_train_new.head(1)

Unnamed: 0,0,1
0,50.03,49.97


### pipeline组合  
新建一个pipeline，并将前面已训练好的pipe依次添加进去

In [14]:
# Assemble the individually fitted pipes into one pipeline, in the same order
# in which they were trained above. No fit is called here.
pipeline_combine = TablePipeLine()
(
    pipeline_combine
    .pipe(pipe_fix_input)
    .pipe(pipe_fillna)
    .pipe(pipe_onehot_encoding)
    .pipe(pipe_label_encoding)
    .pipe(pipe_target_encoding)
    .pipe(pipe_normal)
    .pipe(pipe_pca)
)

<easymlops.table.core.pipeline_object.TablePipeLine at 0x224a6f90848>

In [15]:
# Run one held-out row through the combined pipeline; only transform is
# called, so the pipes fitted earlier are reused as they are.
pipeline_combine.transform(x_test[:1])

Unnamed: 0,0,1,2,3,4,5,6,7
500,-2.286689,0.370987,0.023063,-0.609253,-0.039276,-1.774035,-0.591055,-0.276472


pipeline也可以pipe一个pipeline，这样就可以做多层嵌套

In [16]:
# Append the fitted LR sub-pipeline as one nested stage of the combined pipeline.
# NOTE(review): re-running this cell would presumably append pipeline_lr a
# second time — re-create pipeline_combine first if the cell must be re-run.
pipeline_combine.pipe(pipeline_lr)

<easymlops.table.core.pipeline_object.TablePipeLine at 0x224a6f90848>

In [17]:
# Same held-out row again: with the nested LR pipeline attached, the output
# now has two columns (0 and 1) instead of the 8 PCA components.
pipeline_combine.transform(x_test[:1])

Unnamed: 0,0,1
500,50.02,49.98


### 性能、一致性测试

In [18]:
# Run the library's built-in consistency and performance checks on 10 held-out
# rows; the report lists one line per pipe.
pipeline_combine.auto_test(x_test[:10])


###################################################################
 1.一致性测试和性能测试:check_transform_function                      
###################################################################
(<class 'easymlops.table.preprocessing.core.FixInput'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.preprocessing.onevar_operation.FillNa'>) module check [transform] complete,speed:[0.1ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.encoding.OneHotEncoding'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.encoding.LabelEncoding'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.encoding.TargetEncoding'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.preprocessing.onevar_operation.Normalizer'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.tabl

### 持久化测试

In [19]:
# Persist the combined pipeline to "ml.pkl" in the current working directory.
pipeline_combine.save("ml.pkl")

In [20]:
# Rebuild a pipeline with the same sequence of stages, using freshly
# constructed (never fitted) pipes, including the nested LR sub-pipeline.
# This rebinds the name pipeline_combine to the new, empty structure.
pipeline_combine = TablePipeLine()
(
    pipeline_combine
    .pipe(FixInput())
    .pipe(FillNa())
    .pipe(OneHotEncoding())
    .pipe(LabelEncoding())
    .pipe(TargetEncoding())
    .pipe(Normalizer())
    .pipe(PCADecomposition())
    .pipe(
        TablePipeLine()
        .pipe(LogisticRegressionClassification())
        .pipe(Normalization())
    )
)


# Load the state saved earlier into this structure.
pipeline_combine.load("ml.pkl")

In [21]:
# Repeat the consistency and performance checks, this time on the pipeline
# restored from "ml.pkl".
pipeline_combine.auto_test(x_test[:10])


###################################################################
 1.一致性测试和性能测试:check_transform_function                      
###################################################################
(<class 'easymlops.table.preprocessing.core.FixInput'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.preprocessing.onevar_operation.FillNa'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.encoding.OneHotEncoding'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.encoding.LabelEncoding'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.encoding.TargetEncoding'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.preprocessing.onevar_operation.Normalizer'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.tabl

In [22]:
# Score five held-out rows with the restored pipeline; the first row (index
# 500) gives the same values as it did before saving.
pipeline_combine.transform(x_test[:5])

Unnamed: 0,0,1
500,50.02,49.98
501,49.91,50.09
502,49.84,50.16
503,49.95,50.05
504,48.38,51.62
