In [1]:
import os
os.chdir("../../")#与easymlops同级目录

### 拆分数据

In [2]:
import pandas as pd
data=pd.read_csv("./data/demo.csv")
data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
x_train=data[:500]
x_test=data[500:]
y_train=x_train["Survived"]
y_test=x_test["Survived"]
del x_train["Survived"]
del x_test["Survived"]

### 自定义pipe模块及其接口扩展

把需求分为如下几个层级：

- 最低需求，只做数据探索工作，只需要实现fit和transform接口
- 模型持久化需求，需要实现set_params和get_params来告诉PipeML,你的模型预测需要保留那些参数
- 生产上线需求，需要实现transform_single接口，实现与transform一样的预测结果，但处理的数据格式不一样，transform是dataframe，而transform_single是字典，而且transform_single的性能要求通常比transform高  
- 自定义扩展函数，可以添加自定义的其他函数方法，比较监测线上数据分布的变化，然后通过`4.4`介绍的方法调用

下面看一下TargetEncoding的简化版实现

In [6]:
from easymlops import TablePipeLine
from easymlops.table.preprocessing import *
from easymlops.table.encoding import *
from easymlops.table.perfopt import *
from easymlops.table.decomposition import *

In [7]:
#注意下面继承的是object
class TargetEncoding(object):
    def __init__(self,name="", y=None,cols=None, error_value=0):
        self.name=name
        self.y=y
        self.cols=cols
        self.error_value = error_value
        self.target_map_detail = dict()

    def show_detail(self):
        data = []
        for col, map_detail in self.target_map_detail.items():
            for bin_value, target_value in map_detail.items():
                data.append([col, bin_value, target_value])
        return pd.DataFrame(data=data, columns=["col", "bin_value", "target_value"])

    def fit(self, s):
        assert self.y is not None and len(self.y) == len(s)
        s["y_"] = self.y
        for col in self.cols:
            tmp_ = s[[col, "y_"]]
            col_map = list(tmp_.groupby([col]).agg({"y_": ["mean"]}).to_dict().values())[0]
            self.target_map_detail[col] = col_map
        del s["y_"]
        return self
    
    def transform(self, s):
        for col in self.cols:
            if col in s.columns:
                s[col] = s[col].apply(lambda x: self._user_defined_function(col, x))
        return s
    
    def transform_single(self, s):
        for col in self.cols:
            if col in s.keys():
                s[col] = self._user_defined_function(col, s[col])
        return s

    def _user_defined_function(self, col, x):
        map_detail_ = self.target_map_detail.get(col, dict())
        return map_detail_.get(x, self.error_value)

    def get_params(self):
        #获取父类的params
        params=super().get_params()
        #加入当前的参数
        params.update({"target_map_detail": self.target_map_detail, "error_value": self.error_value})
        return params

    def set_params(self, params):
        #设置父类的params
        super().set_params(params)
        #再设置当前层的params
        self.target_map_detail = params["target_map_detail"]
        self.error_value = params["error_value"]

In [8]:
table=TablePipeLine()
table.pipe(FixInput())\
  .pipe(SelectCols(cols=["Age","Fare","Embarked"]))\
  .pipe(TargetEncoding(cols=["Embarked"],y=y_train))\
  .pipe(FillNa())

x_test_new=table.fit(x_train).transform(x_test)
x_test_new.head(5)

Unnamed: 0,Age,Fare,Embarked
500,17.0,8.664062,0.334254
501,21.0,7.75,0.511111
502,0.0,7.628906,0.511111
503,37.0,9.585938,0.334254
504,16.0,86.5,0.334254


In [10]:
table.transform_single(x_test.to_dict("record")[0])

{'Age': 17.0, 'Fare': 8.664, 'Embarked': 0.3342541436464088}

In [11]:
table[-2].show_detail()

Unnamed: 0,col,bin_value,target_value
0,Embarked,C,0.521739
1,Embarked,Q,0.511111
2,Embarked,S,0.334254
3,Embarked,,1.0


#### 进阶接口
上面是简化版的，在实现fit\transform\transform_single\get_params\set_params时可能需要考虑更多：  

- 输入输出数据的类型是否需要校验一下？  
- 输入输出数据的顺序是否需要一致？  
- set_params和get_params时搞忘了父类可咋整？
- 当前命名的参数与底层的参数名称的冲突检测？    
- 在transform时候是否需要把数据拷贝一下？  

建议自定义时继承TablePipeObjectBase对象，然后实现udf_fit\ udf_transform\ udf_transform_single\ udf_get_params\ udf_set_params，这样自定义的Pipe模块更稳健，如下，调整后的TargetEncoding，代码几乎一样

In [12]:
from easymlops.table.core import TablePipeObjectBase
class TargetEncoding(TablePipeObjectBase):
    def __init__(self,y=None, cols=None, error_value=0):
        super().__init__()
        self.y=y
        self.cols=cols
        self.error_value = error_value
        self.target_map_detail = dict()

    def show_detail(self):
        data = []
        for col, map_detail in self.target_map_detail.items():
            for bin_value, target_value in map_detail.items():
                data.append([col, bin_value, target_value])
        return pd.DataFrame(data=data, columns=["col", "bin_value", "target_value"])

    def udf_fit(self, s):
        assert self.y is not None and len(self.y) == len(s)
        s["y_"] = self.y
        for col in self.cols:
            tmp_ = s[[col, "y_"]]
            col_map = list(tmp_.groupby([col]).agg({"y_": ["mean"]}).to_dict().values())[0]
            self.target_map_detail[col] = col_map
        del s["y_"]
        return self
    
    def udf_transform(self, s):
        for col in self.cols:
            if col in s.columns:
                s[col] = s[col].apply(lambda x: self._user_defined_function(col, x))
        return s
    
    def udf_transform_single(self, s):
        for col in self.cols:
            if col in s.keys():
                s[col] = self._user_defined_function(col, s[col])
        return s

    def _user_defined_function(self, col, x):
        map_detail_ = self.target_map_detail.get(col, dict())
        return map_detail_.get(x, self.error_value)

    def udf_get_params(self):
        return {"target_map_detail": self.target_map_detail, "error_value": self.error_value}

    def udf_set_params(self, params):
        self.target_map_detail = params["target_map_detail"]
        self.error_value = params["error_value"]

In [13]:
table=TablePipeLine()
table.pipe(FixInput())\
  .pipe(SelectCols(cols=["Age","Fare","Embarked"]))\
  .pipe(TargetEncoding(cols=["Embarked"],y=y_train))\
  .pipe(FillNa())

x_test_new=table.fit(x_train).transform(x_test)
x_test_new.head(5)

Unnamed: 0,Age,Fare,Embarked
500,17.0,8.664062,0.334254
501,21.0,7.75,0.511111
502,0.0,7.628906,0.511111
503,37.0,9.585938,0.334254
504,16.0,86.5,0.334254


In [14]:
table.transform_single(x_test.to_dict("record")[0])

{'Age': 17.0, 'Fare': 8.664, 'Embarked': 0.3342541436464088}

In [13]:
table.transform_single(input_dict)

{0: -2.1484200045451156,
 1: 0.30787183575981053,
 2: 0.5269317361325148,
 3: 0.08811175767317664,
 4: 0.324999350738831,
 5: -0.23853515810666273,
 6: -0.43258341930517075,
 7: -0.4242890049704961}

In [15]:
table[-2].show_detail()

Unnamed: 0,col,bin_value,target_value
0,Embarked,C,0.521739
1,Embarked,Q,0.511111
2,Embarked,S,0.334254
3,Embarked,,1.0
