In [1]:
import os
# Move to the project root, i.e. the directory that sits at the same level as the
# `easymlops` package, so the relative paths used below resolve.
# Guarded so that re-running this cell does not climb two more levels each time.
if not os.path.isdir("easymlops"):
    os.chdir("../../")

### 拆分数据

In [2]:
import pandas as pd
# Load the demo dataset (path is relative to the current working directory) and preview it.
data = pd.read_csv("./data/demo.csv")
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Positional split: the first 500 rows are used for fitting, the rest are held out.
# The "Survived" label column is separated from the features with non-mutating
# operations (`drop` returns a new frame), so no slice of `data` is modified in
# place and the cell yields the same result every time it is run.
x_train = data.iloc[:500].drop(columns=["Survived"])
x_test = data.iloc[500:].drop(columns=["Survived"])
y_train = data["Survived"].iloc[:500]
y_test = data["Survived"].iloc[500:]

### pipe构建

In [4]:
from easymlops import TablePipeLine
from easymlops.table.preprocessing import *
from easymlops.table.encoding import *
from easymlops.table.classification import *
from easymlops.table.decomposition import *
from easymlops.table.ensemble import Parallel

In [5]:
# Feature pipeline: FixInput -> FillNa -> three encoders wrapped in Parallel
# -> PCA and NMF decompositions wrapped in Parallel.
# NOTE(review): the (source, target) tuples given to LabelEncoding presumably name the
# encoded output columns — confirm against the easymlops documentation.
table = TablePipeLine()
(
    table
    .pipe(FixInput())
    .pipe(FillNa())
    .pipe(
        Parallel(
            [
                OneHotEncoding(cols=["Pclass", "Sex"]),
                LabelEncoding(cols=[("Sex", "Sex_label"), ("Pclass", "Pclass_label")]),
                TargetEncoding(cols=["Name", "Ticket", "Embarked", "Cabin", "Sex"], y=y_train),
            ]
        )
    )
    .pipe(
        Parallel(
            [
                PCADecomposition(n_components=2, prefix="pca"),
                NMFDecomposition(n_components=2, prefix="nmf"),
            ]
        )
    )
)

# Fit on the training rows, then transform the held-out rows and preview the result.
x_test_new = table.fit(x_train).transform(x_test)
x_test_new.head()

Unnamed: 0,pca_0,pca_1,nmf_0,nmf_1
500,250.111981,-27.101427,6.209713,0.178597
501,251.117366,-27.787004,6.22505,0.168701
502,252.017571,-29.154928,6.222639,0.096321
503,253.217991,-25.044689,6.260848,0.267427
504,255.220046,50.508226,6.249152,2.144727


In [6]:
# Same preprocessing/encoding/decomposition stages as before, followed by a final
# Parallel stage holding a LightGBM and a logistic-regression classifier, both
# given the training labels.
table = TablePipeLine()
(
    table
    .pipe(FixInput())
    .pipe(FillNa())
    .pipe(
        Parallel(
            [
                OneHotEncoding(cols=["Pclass", "Sex"]),
                LabelEncoding(cols=["Sex", "Pclass"]),
                TargetEncoding(cols=["Name", "Ticket", "Embarked", "Cabin", "Sex"], y=y_train),
            ]
        )
    )
    .pipe(
        Parallel(
            [
                PCADecomposition(n_components=2, prefix="pca"),
                NMFDecomposition(n_components=2, prefix="nmf"),
            ]
        )
    )
    .pipe(
        Parallel(
            [
                LGBMClassification(y=y_train, prefix="lgbm"),
                LogisticRegressionClassification(y=y_train, prefix="lr"),
            ]
        )
    )
)

# Fit on the training rows, then score the held-out rows and preview the result.
x_test_new = table.fit(x_train).transform(x_test)
x_test_new.head()

Unnamed: 0,lgbm_0,lgbm_1,lr_0,lr_1
500,0.965218,0.034782,0.651417,0.348583
501,0.98153,0.01847,0.65506,0.34494
502,0.979139,0.020861,0.647266,0.352734
503,0.808796,0.191204,0.656613,0.343387
504,0.184484,0.815516,0.449149,0.550851


### 性能、一致性测试

In [7]:
# Run the pipeline's built-in checks on the first 10 held-out rows; the report below
# covers the transform consistency/performance check and the null-value check.
table.auto_test(x_test.head(10))


###################################################################
 1.一致性测试和性能测试:check_transform_function                      
###################################################################
(<class 'easymlops.table.preprocessing.core.FixInput'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.preprocessing.onevar_operation.FillNa'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.ensemble.Parallel'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.ensemble.Parallel'>) module check [transform] complete,speed:[7.81ms]/it,cpu:[39%],memory:[320K]
(<class 'easymlops.table.ensemble.Parallel'>) module check [transform] complete,speed:[5.59ms]/it,cpu:[100%],memory:[224K]

#########################################################################################
 2.空值测试:check_null_value                                                          

### 持久化测试

In [8]:
# Persist the fitted pipeline to "ml.pkl" (relative to the current working directory).
table.save("ml.pkl")

In [9]:
# Rebuild the same pipeline structure (the classifiers are created without training
# labels this time) and restore the previously saved pipeline from "ml.pkl".
# NOTE(review): presumably load() replaces the state of these unfitted steps with the
# saved one, so no fit() call is needed — confirm against the easymlops documentation.
table = TablePipeLine()
(
    table
    .pipe(FixInput())
    .pipe(FillNa())
    .pipe(
        Parallel(
            [
                OneHotEncoding(cols=["Pclass", "Sex"]),
                LabelEncoding(cols=["Sex", "Pclass"]),
                TargetEncoding(cols=["Name", "Ticket", "Embarked", "Cabin", "Sex"], y=y_train),
            ]
        )
    )
    .pipe(
        Parallel(
            [
                PCADecomposition(n_components=2, prefix="pca"),
                NMFDecomposition(n_components=2, prefix="nmf"),
            ]
        )
    )
    .pipe(
        Parallel(
            [
                LGBMClassification(prefix="lgbm"),
                LogisticRegressionClassification(prefix="lr"),
            ]
        )
    )
)

table.load("ml.pkl")

In [10]:
# Repeat the built-in checks on the first 10 held-out rows, this time against the
# pipeline restored from "ml.pkl".
table.auto_test(x_test.head(10))


###################################################################
 1.一致性测试和性能测试:check_transform_function                      
###################################################################
(<class 'easymlops.table.preprocessing.core.FixInput'>) module check [transform] complete,speed:[1.56ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.preprocessing.onevar_operation.FillNa'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.ensemble.Parallel'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.ensemble.Parallel'>) module check [transform] complete,speed:[9.38ms]/it,cpu:[22%],memory:[16K]
(<class 'easymlops.table.ensemble.Parallel'>) module check [transform] complete,speed:[4.69ms]/it,cpu:[100%],memory:[0K]

#########################################################################################
 2.空值测试:check_null_value                                                           


In [11]:
# Score the first 5 held-out rows with the reloaded pipeline.
table.transform(x_test.head(5))

Unnamed: 0,lgbm_0,lgbm_1,lr_0,lr_1
500,0.965218,0.034782,0.651417,0.348583
501,0.98153,0.01847,0.65506,0.34494
502,0.979139,0.020861,0.647266,0.352734
503,0.808796,0.191204,0.656613,0.343387
504,0.184484,0.815516,0.449149,0.550851


In [12]:
# Predictions computed before the pipeline was saved, shown for comparison with the
# reloaded pipeline's output.
x_test_new.head()

Unnamed: 0,lgbm_0,lgbm_1,lr_0,lr_1
500,0.965218,0.034782,0.651417,0.348583
501,0.98153,0.01847,0.65506,0.34494
502,0.979139,0.020861,0.647266,0.352734
503,0.808796,0.191204,0.656613,0.343387
504,0.184484,0.815516,0.449149,0.550851
