In [1]:
import os
# Move the working directory up two levels; the relative paths used later in this notebook
# ("./data/demo.csv", "ml.pkl") are resolved from there.
os.chdir("../../")#directory at the same level as easymlops

### 拆分数据

In [2]:
import pandas as pd
# Read the demo dataset; the path is relative to the current working directory.
demo_csv = "./data/demo.csv"
data = pd.read_csv(demo_csv)

# Preview the first five rows.
data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Hold-out split by row position: the first TRAIN_ROWS rows are used for fitting,
# the remaining rows for testing.
TRAIN_ROWS = 500
TARGET_COL = "Survived"

# .iloc makes the positional row slicing explicit.
train_part = data.iloc[:TRAIN_ROWS]
test_part = data.iloc[TRAIN_ROWS:]

# Labels are read from the slices before the target column is removed from the features.
y_train = train_part[TARGET_COL]
y_test = test_part[TARGET_COL]

# drop() returns new frames, so the target column is removed from the features
# without deleting anything in place on a slice of `data`.
x_train = train_part.drop(columns=[TARGET_COL])
x_test = test_part.drop(columns=[TARGET_COL])

### pipe构建

In [4]:
from easymlops import TablePipeLine
from easymlops.table.preprocessing import *
from easymlops.table.encoding import *
from easymlops.table.feature_selection import *
from easymlops.table.ensemble import Parallel

In [5]:
# Assemble the feature pipeline step by step. Steps that are given a name
# ("person", "psi", "chi2", "iv") can be looked up afterwards by that name, e.g. table["psi"].
# The training label y_train is passed to the steps that take a `y` argument, and the
# held-out frame x_test is passed to the PSI step as `oot_x`.
table = TablePipeLine()
(
    table
    .pipe(FixInput())
    .pipe(FillNa())
    .pipe(MissRateFilter(max_threshold=0.1))
    .pipe(VarianceFilter(min_threshold=0.1))
    .pipe(PersonCorrFilter(min_threshold=0.1, y=y_train, name="person"))
    .pipe(PSIFilter(oot_x=x_test, cols=["Pclass", "Sex", "Embarked"], name="psi", max_threshold=0.5))
    .pipe(LabelEncoding(cols=["Sex", "Ticket", "Embarked", "Pclass"]))
    .pipe(TargetEncoding(cols=["Name"], y=y_train))
    .pipe(Chi2Filter(y=y_train, name="chi2"))
    .pipe(MutualInfoFilter(y=y_train))
    .pipe(IVFilter(y=y_train, name="iv", cols=["Sex", "Fare"], min_threshold=0.05))
)

# Fit on the training rows, then apply the fitted pipeline to the held-out rows.
fitted_table = table.fit(x_train)
x_test_new = fitted_table.transform(x_test)

# Preview the transformed held-out rows.
x_test_new.head(5)

Unnamed: 0,Pclass,Name,Sex,Ticket,Fare,Embarked
500,1,0.0,1,0,8.664062,1
501,1,0.0,2,0,7.75,3
502,1,0.0,2,0,7.628906,3
503,1,0.0,2,0,9.585938,1
504,2,0.0,2,231,86.5,1


In [6]:
# Show the details of the PSI computation for the step registered as "psi" (first rows only).
table["psi"].show_detail().head()

Unnamed: 0,col,bin_value,ins_num,ins_rate,oot_num,oot_rate,psi
0,Pclass,3,279,0.558,212,0.542199,0.000454
1,Pclass,1,116,0.232,100,0.255754,0.002316
2,Pclass,2,105,0.21,79,0.202046,0.000307
3,Sex,male,315,0.63,262,0.670077,0.002472
4,Sex,female,185,0.37,129,0.329923,0.004595


### 性能、一致性测试

In [7]:
# Run the pipeline's automatic checks on the first 10 test rows; the report printed below
# covers the transform consistency/performance check of each module (speed, CPU, memory).
table.auto_test(x_test[:10])


###################################################################
 1.一致性测试和性能测试:check_transform_function                      
###################################################################
(<class 'easymlops.table.preprocessing.core.FixInput'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.preprocessing.onevar_operation.FillNa'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.feature_selection.filter.MissRateFilter'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.feature_selection.filter.VarianceFilter'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(person) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(psi) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.encoding.LabelEncoding'>) module check [transform] complete,speed:[0.1ms

### 持久化测试

In [8]:
# Save the fitted pipeline to "ml.pkl" (path relative to the current working directory).
table.save("ml.pkl")

In [9]:
# Rebuild an unfitted pipeline with the same sequence of steps as the one that was saved,
# then load the persisted state from "ml.pkl". No label `y` is passed to any step here.
# TargetEncoding lists only "Name" so that this skeleton matches the pipeline that was
# fitted and saved.
table = TablePipeLine()
(
    table
    .pipe(FixInput())
    .pipe(FillNa())
    .pipe(MissRateFilter(max_threshold=0.1))
    .pipe(VarianceFilter(min_threshold=0.1))
    .pipe(PersonCorrFilter(min_threshold=0.1, name="person"))
    .pipe(PSIFilter(oot_x=x_test, cols=["Pclass", "Sex", "Embarked"], name="psi", max_threshold=0.5))
    .pipe(LabelEncoding(cols=["Sex", "Ticket", "Embarked", "Pclass"]))
    .pipe(TargetEncoding(cols=["Name"]))
    .pipe(Chi2Filter(name="chi2"))
    .pipe(MutualInfoFilter())
    .pipe(IVFilter(name="iv", cols=["Sex", "Fare"], min_threshold=0.05))
)

# NOTE(review): the ".pkl" suffix suggests a pickle file — only load files from a trusted source.
table.load("ml.pkl")

In [10]:
# Repeat the automatic checks on the same 10 test rows, now against the pipeline loaded from "ml.pkl".
table.auto_test(x_test[:10])


###################################################################
 1.一致性测试和性能测试:check_transform_function                      
###################################################################
(<class 'easymlops.table.preprocessing.core.FixInput'>) module check [transform] complete,speed:[4.38ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.preprocessing.onevar_operation.FillNa'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.feature_selection.filter.MissRateFilter'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.feature_selection.filter.VarianceFilter'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(person) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(psi) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.encoding.LabelEncoding'>) module check [transform] complete,speed:[0.0m

In [11]:
# Transform the first five test rows with the reloaded pipeline.
table.transform(x_test[:5])

Unnamed: 0,Pclass,Name,Sex,Ticket,Fare,Embarked
500,1,0.0,1,0,8.664062,1
501,1,0.0,2,0,7.75,3
502,1,0.0,2,0,7.628906,3
503,1,0.0,2,0,9.585938,1
504,2,0.0,2,231,86.5,1


In [12]:
# First five rows produced earlier by the originally fitted pipeline, shown for comparison
# with the reloaded pipeline's output.
x_test_new.head(5)

Unnamed: 0,Pclass,Name,Sex,Ticket,Fare,Embarked
500,1,0.0,1,0,8.664062,1
501,1,0.0,2,0,7.75,3
502,1,0.0,2,0,7.628906,3
503,1,0.0,2,0,9.585938,1
504,2,0.0,2,231,86.5,1
