In [1]:
import os
# Move the working directory up two levels, to the directory that sits
# alongside the easymlops package, so the relative paths used below resolve.
# NOTE(review): the path is relative, so re-running this cell without a kernel
# restart moves up two more levels each time.
os.chdir("../../")  # directory at the same level as easymlops

### 拆分数据

In [2]:
import pandas as pd
# Load the demo dataset; the path is relative to the current working directory.
data=pd.read_csv("./data/demo.csv")
# Preview the first 5 rows.
data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Split the raw frame into train/test features and labels without mutating any
# slice of `data`: `.drop(columns=...)` returns a new frame, so this cell is
# idempotent and `data` itself is left untouched.
TARGET_COL = "Survived"  # label column
N_TRAIN = 500            # the first N_TRAIN rows are the training split

y_train = data[TARGET_COL].iloc[:N_TRAIN]
y_test = data[TARGET_COL].iloc[N_TRAIN:]
x_train = data.iloc[:N_TRAIN].drop(columns=[TARGET_COL])
x_test = data.iloc[N_TRAIN:].drop(columns=[TARGET_COL])

### 简单跑一个模型

In [12]:
from easymlops import TablePipeLine
from easymlops.table.preprocessing import *
from easymlops.table.encoding import *
from easymlops.table.classification import *

In [13]:
# Assemble the pipeline one stage at a time, in the same order as before.
table = TablePipeLine()
table.pipe(FixInput())
table.pipe(FillNa())
table.pipe(OneHotEncoding(cols=["Pclass", "Sex"], drop_col=False))
table.pipe(WOEEncoding(cols=["Ticket", "Embarked", "Cabin", "Sex", "Pclass"], y=y_train))
table.pipe(LabelEncoding(cols=["Name"]))
table.pipe(
    LGBMClassification(
        y=y_train,
        native_init_params={"max_depth": 2},
        native_fit_params={"num_boost_round": 128},
    )
)

# Fit on the training split, then run the held-out rows through the pipeline.
fitted_pipeline = table.fit(x_train)
x_test_new = fitted_pipeline.transform(x_test)
x_test_new.head(5)

Unnamed: 0,0,1
500,0.923326,0.076674
501,0.373652,0.626348
502,0.37838,0.62162
503,0.670166,0.329834
504,0.06847,0.93153


### transform/transform_single一致性测试&性能测试:check_transform_function
部署生产环境之前，我们通常要关注两点：  
- 离线训练模型和在线预测模型的一致性，即transform和transform_single的一致性；  
- transform_single对当条数据的预测性能  

这些可以通过调用如下函数，进行自动化测试：  
- check_transform_function：只要有打印complete，则表示在当前测试数据上transform和transform_single的输出一致，性能测试表示为speed:[*]毫秒/每条数据，以及运行过程中cpu的最大使用率和内存变化(最大内存-最小内存)，如果有异常则会直接抛出，并中断后续pipe模块的测试

In [15]:
from easymlops.table.callback import check_transform_function_pipeline
# Per the notes above: checks transform vs. transform_single consistency and
# reports timing/cpu/memory for each pipe module, using the first 10 test rows.
table.callback(check_transform_function_pipeline,x_test[:10])

(<class 'easymlops.table.preprocessing.core.FixInput'>) module check [transform] complete,speed:[0.19ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.preprocessing.onevar_operation.FillNa'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.encoding.OneHotEncoding'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.encoding.WOEEncoding'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.encoding.LabelEncoding'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.classification.LGBMClassification'>) module check [transform] complete,speed:[1.56ms]/it,cpu:[0%],memory:[0K]


### 空值测试：check_null_value  

- 由于pandas在读取数据时会自动做类型推断，对空会有不同的处理，比如float设置为np.nan，对object设置为None或NaN  
- 而且pandas读取数据默认为批量读取、批量推断，所以同一列中空值的表示并不唯一，np.nan和None可能共存  

所以，这里对逐个column分别设置不同的空进行测试，测试内容：  
- 相同的空情况下，transform和transform_single是否一致  
- 不同的空的transform结果是否一致  

可通过`null_values=[None, np.nan, "null", "NULL", "nan", "NaN", "", "none", "None", " "]`(默认)设置自定义空值

In [16]:
from easymlops.table.callback import check_null_value
# Per the notes above: for each column, substitutes the different null
# representations and checks the outputs agree, using the first 10 test rows.
table.callback(check_null_value,x_test[:10])

column:[PassengerId] check [null value] complete,speed:[2.39ms]/it,cpu:[0%],memory:[0K]
column:[Pclass] check [null value] complete,speed:[2.41ms]/it,cpu:[0%],memory:[0K]
column:[Name] check [null value] complete,speed:[2.45ms]/it,cpu:[0%],memory:[0K]
column:[Sex] check [null value] complete,speed:[2.43ms]/it,cpu:[0%],memory:[0K]
column:[Age] check [null value] complete,speed:[2.3ms]/it,cpu:[0%],memory:[0K]
column:[SibSp] check [null value] complete,speed:[2.35ms]/it,cpu:[0%],memory:[0K]
column:[Parch] check [null value] complete,speed:[2.71ms]/it,cpu:[0%],memory:[0K]
column:[Ticket] check [null value] complete,speed:[2.35ms]/it,cpu:[0%],memory:[0K]
column:[Fare] check [null value] complete,speed:[2.13ms]/it,cpu:[0%],memory:[0K]
column:[Cabin] check [null value] complete,speed:[2.42ms]/it,cpu:[0%],memory:[0K]
column:[Embarked] check [null value] complete,speed:[2.27ms]/it,cpu:[0%],memory:[0K]


### 极端值测试：check_extreme_value  

通常用于训练的数据都是经过筛选的正常数据，但线上难免会有极端值混入，比如你训练的某列数据范围在`0~1`之间，如果传入一个`-1`，也许就会报错。目前的测试方式如下：

- 对两种类型的分别进行极端测试，设置如下：
  - 数值型:设置`number_extreme_values = [np.inf, 0.0, -1, 1, -1e-7, 1e-7, np.finfo(np.float64).min, np.finfo(np.float64).max]`(默认)
  - 离散型:设置`category_extreme_values = ["", "null", None, "1.0", "0.0", "-1.0", "-1", "NaN", "None"]`(默认)  

- 将全部特征设置为如上的极端值进行测试

注意：这里只检测了transform与transform_single的一致性，不要求各极端值输入下的输出一致性(注意和上面的空值检测不一样，空值检测要求所有类型的空的输出也要一致)

In [17]:
from easymlops.table.callback import check_extreme_value
# Per the notes above: injects extreme numeric/categorical values and checks
# only that transform and transform_single agree, using the first 10 test rows.
table.callback(check_extreme_value,x_test[:10])

column:[PassengerId] check [extreme value] complete,speed:[2.57ms]/it,cpu:[100%],memory:[6261596K]
column:[Pclass] check [extreme value] complete,speed:[2.88ms]/it,cpu:[100%],memory:[6263196K]
column:[Name] check [extreme value] complete,speed:[2.69ms]/it,cpu:[100%],memory:[6283028K]
column:[Sex] check [extreme value] complete,speed:[2.41ms]/it,cpu:[100%],memory:[6285492K]
column:[Age] check [extreme value] complete,speed:[2.75ms]/it,cpu:[100%],memory:[6283340K]
column:[SibSp] check [extreme value] complete,speed:[2.47ms]/it,cpu:[100%],memory:[6283276K]
column:[Parch] check [extreme value] complete,speed:[2.82ms]/it,cpu:[100%],memory:[6279116K]
column:[Ticket] check [extreme value] complete,speed:[2.87ms]/it,cpu:[100%],memory:[6279124K]
column:[Fare] check [extreme value] complete,speed:[2.48ms]/it,cpu:[100%],memory:[6279256K]
column:[Cabin] check [extreme value] complete,speed:[2.52ms]/it,cpu:[100%],memory:[6279172K]
column:[Embarked] check [extreme value] complete,speed:[2.65ms]/it,c

### 数据类型反转测试：check_inverse_dtype  

某特征入模时的数据是数值，但上线后传过来的是离散值，也有可能相反。这里就对这种情况做测试：对原是数值的特征替换为离散值，对原是离散值的特征替换为数值，替换规则如下：
- 原数值的，替换为：`number_inverse_values = ["", "null", None, "1.0", "0.0", "-1.0", "-1"]`(默认)  
- 原离散的，替换为：`category_inverse_values = [0.0, -1, 1, -1e-7, 1e-7, np.finfo(np.float64).min, np.finfo(np.float64).max]`(默认)  

同样，数据类型反转测试只对transform和transform_single的一致性有要求

In [18]:
from easymlops.table.callback import check_inverse_dtype
# Per the notes above: swaps numeric columns to categorical values (and vice
# versa) and checks transform vs. transform_single consistency on 10 test rows.
table.callback(check_inverse_dtype,x_test[:10])

column:[PassengerId] check [inverse type] complete,speed:[2.28ms]/it,cpu:[12%],memory:[6266280K]
column:[Pclass] check [inverse type] complete,speed:[2.76ms]/it,cpu:[100%],memory:[6266468K]
column:[Name] check [inverse type] complete,speed:[2.78ms]/it,cpu:[100%],memory:[6266552K]
column:[Sex] check [inverse type] complete,speed:[2.54ms]/it,cpu:[100%],memory:[6266936K]
column:[Age] check [inverse type] complete,speed:[2.14ms]/it,cpu:[100%],memory:[6267224K]
column:[SibSp] check [inverse type] complete,speed:[2.96ms]/it,cpu:[100%],memory:[6267220K]
column:[Parch] check [inverse type] complete,speed:[2.52ms]/it,cpu:[100%],memory:[6267144K]
column:[Ticket] check [inverse type] complete,speed:[2.8ms]/it,cpu:[100%],memory:[6267184K]
column:[Fare] check [inverse type] complete,speed:[2.66ms]/it,cpu:[100%],memory:[6266744K]
column:[Cabin] check [inverse type] complete,speed:[2.45ms]/it,cpu:[100%],memory:[6266788K]
column:[Embarked] check [inverse type] complete,speed:[2.66ms]/it,cpu:[100%],mem

###  int转float测试：check_int_trans_float  
pandas会将某些特征自动推断为int，而线上可能传输的是float，需要做如下测试：  
- 转float后transform和transform_single之间的一致性  
- int和float特征通过transform后的一致性

In [19]:
from easymlops.table.callback import check_int_trans_float
# Per the notes above: casts int-inferred columns to float and checks the
# outputs stay consistent, using the first 10 test rows.
table.callback(check_int_trans_float,x_test[:10])

column:[PassengerId] check [int trans float] complete,speed:[3.13ms]/it,cpu:[0%],memory:[0K]
column:[Pclass] check [int trans float] complete,speed:[1.56ms]/it,cpu:[0%],memory:[0K]
column:[SibSp] check [int trans float] complete,speed:[3.13ms]/it,cpu:[0%],memory:[0K]
column:[Parch] check [int trans float] complete,speed:[3.13ms]/it,cpu:[0%],memory:[0K]


### 自动测试：auto_test
就是把上面的所有测试，整合到auto_test一个函数中

In [21]:
# Per the notes above: runs all of the preceding checks in a single call,
# using the first 10 test rows.
table.auto_test(x_test[:10])


###################################################################
 1.一致性测试和性能测试:check_transform_function                      
###################################################################
(<class 'easymlops.table.preprocessing.core.FixInput'>) module check [transform] complete,speed:[0.6ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.preprocessing.onevar_operation.FillNa'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.encoding.OneHotEncoding'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.encoding.WOEEncoding'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.encoding.LabelEncoding'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.classification.LGBMClassification'>) module check [transform] complete,speed:[1.56ms]/it,cpu:[0%],memory:[0K]

################################