In [1]:
import os
# Move the working directory two levels up, to the directory that sits at the
# same level as the easymlops package.
# NOTE: the path is relative, so running this cell a second time in the same
# kernel climbs two more levels.
os.chdir("../../")

### 拆分数据

In [2]:
import pandas as pd

# Read the demo table (path is relative to the current working directory)
# and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="./data/demo.csv")
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Hold-out split: the first TRAIN_ROWS rows are used for fitting, the
# remaining rows for evaluation.
TRAIN_ROWS = 500
TARGET_COL = "Survived"

train_part = data[:TRAIN_ROWS]
test_part = data[TRAIN_ROWS:]

# Labels are extracted as Series before the feature frames are built.
y_train = train_part[TARGET_COL]
y_test = test_part[TARGET_COL]

# drop() returns new frames, so neither `data` nor its slices are mutated in
# place (the previous `del frame[col]` modified the slices directly).
x_train = train_part.drop(columns=[TARGET_COL])
x_test = test_part.drop(columns=[TARGET_COL])

### 获取特定pipe模块
有时候我们想获取指定的pipe模块，并调用其函数接口。这里可以通过`下标索引`（从0开始），也可以通过`name`进行索引。

In [4]:
from easymlops import TablePipeLine
from easymlops.table.preprocessing import *
from easymlops.table.encoding import *
from easymlops.table.perfopt import *
from easymlops.table.decomposition import *

In [5]:
# Nested sub-pipeline: a TargetEncoding pipe (registered under the name
# "target_encoding") followed by FillNa.
target_encoding_stage = (
    TablePipeLine()
    .pipe(TargetEncoding(cols=["Embarked", "Cabin"], y=y_train, name="target_encoding"))
    .pipe(FillNa())
)

# Main pipeline; pipes are appended in the order listed, so they can later be
# retrieved by position (counting from 0) or by name.
table = TablePipeLine()
(
    table
    .pipe(FixInput())
    .pipe(FillNa())
    .pipe(OneHotEncoding(cols=["Pclass", "Sex"], drop_col=False))
    .pipe(WOEEncoding(cols=["Sex", "Pclass"], y=y_train))
    .pipe(LabelEncoding(cols=["Name", "Ticket"]))
    .pipe(target_encoding_stage)
    .pipe(FillNa())
    .pipe(Normalizer())
    .pipe(PCADecomposition(n_components=8))
)

# Fit on the training rows, transform the held-out rows, and preview the result.
x_test_new = table.fit(x_train).transform(x_test)
x_test_new.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7
500,-2.14842,0.307872,0.526932,0.088112,0.324999,-0.238535,-0.432583,-0.424289
501,0.212949,2.322017,-1.286037,-0.469809,1.537922,-0.098157,0.775934,0.459946
502,0.053812,2.543002,-1.504793,-0.263976,1.525352,0.729823,0.737169,-0.240717
503,0.089901,2.005645,-1.130925,-0.942879,0.544493,-1.649309,-0.119709,-0.125324
504,2.673423,0.110068,-0.091175,-0.154845,0.265834,-0.959033,1.61703,-2.494035


In [6]:
# Example: fetch the pipe at position 3 (the WOEEncoding pipe) and call its
# show_detail() method.
woe_stage = table[3]
woe_stage.show_detail()

Unnamed: 0,col,bin_value,bad_num,bad_rate,good_num,good_rate,woe,iv
0,Sex,male,54,0.279793,261,0.850163,1.111379,0.633897
1,Sex,female,139,0.720207,46,0.149837,-1.56999,0.895475
2,Pclass,3,78,0.404145,201,0.654723,0.482439,0.120889
3,Pclass,1,66,0.341969,50,0.162866,-0.741789,0.132856
4,Pclass,2,49,0.253886,56,0.18241,-0.330626,0.023632


In [7]:
# Look the pipe up by its name ("target_encoding") and preview the output of
# its show_detail() method.
target_encoder = table["target_encoding"]
target_encoder.show_detail().head()

Unnamed: 0,col,bin_value,target_value
0,Embarked,C,0.521739
1,Embarked,Q,0.511111
2,Embarked,S,0.334254
3,Embarked,,1.0
4,Cabin,A14,0.0


In [10]:
# The same pipe can also be reached with a nested index: position 5 in the
# outer pipeline is the nested pipeline, position 0 inside it is the
# TargetEncoding pipe.
nested_target_encoder = table[5, 0]
nested_target_encoder.show_detail().head()

Unnamed: 0,col,bin_value,target_value
0,Embarked,C,0.521739
1,Embarked,Q,0.511111
2,Embarked,S,0.334254
3,Embarked,,1.0
4,Cabin,A14,0.0


### 性能、一致性测试

In [11]:
# Run the pipeline's auto_test (consistency and performance checks) on the
# first ten held-out rows.
check_rows = x_test[:10]
table.auto_test(check_rows)


###################################################################
 1.一致性测试和性能测试:check_transform_function                      
###################################################################
(<class 'easymlops.table.preprocessing.core.FixInput'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.preprocessing.onevar_operation.FillNa'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.encoding.OneHotEncoding'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.encoding.WOEEncoding'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.encoding.LabelEncoding'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(target_encoding) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.preprocessing.onevar_operation.FillNa'>) module check

### 持久化测试

In [12]:
# Persist the pipeline to "ml.pkl" (relative to the current working directory).
table.save("ml.pkl")

In [13]:
# Rebuild a fresh pipeline with the same structure as the one that was saved.
restored_target_stage = (
    TablePipeLine()
    .pipe(TargetEncoding(cols=["Embarked", "Cabin"], y=y_train, name="target_encoding"))
    .pipe(FillNa())
)

table = TablePipeLine()
(
    table
    .pipe(FixInput())
    .pipe(FillNa())
    .pipe(OneHotEncoding(cols=["Pclass", "Sex"], drop_col=False))
    .pipe(WOEEncoding(cols=["Sex", "Pclass"], y=y_train))
    .pipe(LabelEncoding(cols=["Name", "Ticket"]))
    .pipe(restored_target_stage)
    .pipe(FillNa())
    .pipe(Normalizer())
    .pipe(PCADecomposition(n_components=8))
)

# Restore the saved state into the freshly built pipeline (no fit() call here).
# NOTE(review): the .pkl suffix suggests pickle-based persistence; only load
# files from a trusted source -- confirm against the easymlops documentation.
table.load("ml.pkl")

In [14]:
# Repeat auto_test on the reloaded pipeline, using the same ten held-out rows.
check_rows = x_test[:10]
table.auto_test(check_rows)


###################################################################
 1.一致性测试和性能测试:check_transform_function                      
###################################################################
(<class 'easymlops.table.preprocessing.core.FixInput'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.preprocessing.onevar_operation.FillNa'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.encoding.OneHotEncoding'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.encoding.WOEEncoding'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.encoding.LabelEncoding'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(target_encoding) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.preprocessing.onevar_operation.FillNa'>) module check

In [15]:
# Transform the first five held-out rows with the reloaded pipeline and check
# that the result matches what the originally fitted pipeline produced
# (x_test_new), instead of relying on a visual comparison of two outputs.
reloaded_head = table.transform(x_test[:5])
pd.testing.assert_frame_equal(reloaded_head, x_test_new.head(5))
reloaded_head

Unnamed: 0,0,1,2,3,4,5,6,7
500,-2.14842,0.307872,0.526932,0.088112,0.324999,-0.238535,-0.432583,-0.424289
501,0.212949,2.322017,-1.286037,-0.469809,1.537922,-0.098157,0.775934,0.459946
502,0.053812,2.543002,-1.504793,-0.263976,1.525352,0.729823,0.737169,-0.240717
503,0.089901,2.005645,-1.130925,-0.942879,0.544493,-1.649309,-0.119709,-0.125324
504,2.673423,0.110068,-0.091175,-0.154845,0.265834,-0.959033,1.61703,-2.494035


In [16]:
# Output of the pipeline before it was saved, shown for comparison with the
# reloaded pipeline's output.
x_test_new.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7
500,-2.14842,0.307872,0.526932,0.088112,0.324999,-0.238535,-0.432583,-0.424289
501,0.212949,2.322017,-1.286037,-0.469809,1.537922,-0.098157,0.775934,0.459946
502,0.053812,2.543002,-1.504793,-0.263976,1.525352,0.729823,0.737169,-0.240717
503,0.089901,2.005645,-1.130925,-0.942879,0.544493,-1.649309,-0.119709,-0.125324
504,2.673423,0.110068,-0.091175,-0.154845,0.265834,-0.959033,1.61703,-2.494035
