In [1]:
import os
os.chdir("../../")  # move to the directory that sits at the same level as easymlops

### 拆分数据

In [2]:
import pandas as pd

# Read the demo table and append two constant date columns in one expression.
# The date columns are consumed by the DateDayDiff step of the pipeline below.
data = pd.read_csv("./data/demo.csv").assign(
    date1="2020-03-06",
    date2="2023-01-04",
)
data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,date1,date2
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,2020-03-06,2023-01-04
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2020-03-06,2023-01-04
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,2020-03-06,2023-01-04
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2020-03-06,2023-01-04
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,2020-03-06,2023-01-04


In [3]:
# Split by position: the first 500 rows are used for fitting, the remainder
# is held out for testing.
train_rows = data.iloc[:500]
test_rows = data.iloc[500:]

# The label is the "Survived" column of each part.
y_train = train_rows["Survived"]
y_test = test_rows["Survived"]

# Build the feature frames as new objects with drop() instead of running
# `del` on slices of `data`; this avoids mutating objects derived from the
# parent frame and keeps the cell safe to re-run.
x_train = train_rows.drop(columns=["Survived"])
x_test = test_rows.drop(columns=["Survived"])

### pipe构建

In [4]:
from easymlops import TablePipeLine

In [5]:
from easymlops.table.preprocessing import *
from easymlops.table.ensemble import Parallel
table = TablePipeLine()
(
    table
    # Input handling and null-indicator features (e.g. date1_is_not_null).
    .pipe(FixInput())
    .pipe(Parallel([IsNull(), IsNotNull()]))
    # Missing values: per-column fill modes, an explicit value for Embarked,
    # then FillNa with its defaults for whatever is left.
    .pipe(FillNa(cols=["Cabin", "Ticket", "Parch", "Fare", "Sex"], fill_mode="mode"))
    .pipe(FillNa(cols=["Age"], fill_mode="mean"))
    .pipe(FillNa(fill_detail={"Embarked": "N"}))
    .pipe(FillNa())
    # Type conversion steps.
    .pipe(TransToCategory(cols=["Cabin", "Embarked", "Name"]))
    .pipe(TransToFloat(cols=["Age", "Fare"]))
    .pipe(TransToInt(cols=["Pclass", "PassengerId", "SibSp", "Parch"]))
    # String clean-up and value mapping.
    .pipe(ClipString(cols=["Name"], default_clip_index=(0, 10)))
    .pipe(Replace(cols=["Cabin"], source_values=["nan", "N", "B79"], target_value="nan"))
    .pipe(TransToLower(cols=["Ticket", "Cabin", "Embarked", "Name", "Sex"]))
    .pipe(
        MapValues(
            map_detail={
                "Cabin": [(["nan", "NaN"], "n")],
                "Age": [("(-1,10)", 10), ("[10,20]", 20)],
            }
        )
    )
    # Clipping.
    # NOTE(review): name="clip_name" on the Age clip looks like a typo for
    # "clip_age" (compare "clip_fare"); left unchanged -- TODO confirm.
    .pipe(Clip(cols=["Age"], default_clip=(1, 99), name="clip_name"))
    .pipe(Clip(cols=["Fare"], percent_range=(1, 99), name="clip_fare"))
    # Scaling and binning; each (source, target) tuple names the new output column.
    .pipe(MinMaxScaler(cols=[("Age", "Age_minmax")]))
    .pipe(Normalizer(cols=[("Fare", "Fare_normal")]))
    .pipe(Bins(n_bins=10, strategy="uniform", cols=[("Age", "Age_uni")]))
    .pipe(Bins(n_bins=10, strategy="quantile", cols=[("Age", "Age_quan")]))
    .pipe(Bins(n_bins=10, strategy="kmeans", cols=[("Fare", "Fare_km")]))
    # Two-column arithmetic features (e.g. Pclass_add_SibSp).
    .pipe(
        Parallel(
            [
                Add(left_col_name="Pclass", right_col_name="SibSp"),
                Subtract(left_col_name="Pclass", right_col_name="Fare"),
                Multiply(left_col_name="Fare", right_col_name="Age"),
                Divide(left_col_name="Age_minmax", right_col_name="Fare_normal"),
                DivideExact(left_col_name="Age_minmax", right_col_name="Pclass"),
                Mod(left_col_name="PassengerId", right_col_name="Pclass"),
            ]
        )
    )
    # Two-column comparison / logical features (e.g. Pclass_equal_SibSp).
    .pipe(
        Parallel(
            [
                Equal(left_col_name="Pclass", right_col_name="SibSp"),
                GreaterThan(left_col_name="Pclass", right_col_name="Fare"),
                GreaterEqualThan(left_col_name="Fare", right_col_name="Age"),
                LessThan(left_col_name="Age_minmax", right_col_name="Fare_normal"),
                LessEqualThan(left_col_name="Age_minmax", right_col_name="Pclass"),
                And(left_col_name="PassengerId", right_col_name="Pclass"),
                Or(left_col_name="Pclass", right_col_name="SibSp"),
            ]
        )
    )
    # Day difference between the two date columns (date2_day_diff_date1).
    .pipe(DateDayDiff(left_col_name="date2", right_col_name="date1"))
)

# Fit on the training rows, then transform the held-out rows.
x_test_new = table.fit(x_train).transform(x_test)
x_test_new.head(5)

Unnamed: 0,Pclass_equal_SibSp,Pclass_greater_than_Fare,Fare_greater_equal_than_Age,Age_minmax_less_than_Fare_normal,Age_minmax_less_equal_than_Pclass,PassengerId_and_Pclass,Pclass_add_SibSp,Pclass_subtract_Fare,Fare_multiply_Age,Age_minmax_divide_Fare_normal,...,date1_is_not_null,date2_is_not_null,Age_minmax,Fare_normal,Age_uni,Age_quan,Fare_km,PassengerId_mod_Pclass,Pclass_or_SibSp,date2_day_diff_date1
500,0,0,0,0,1,1,3,-5.664062,173.25,-0.307692,...,1,1,0.16,-0.52,1,1,1,0,1,1034
501,0,0,0,0,1,1,3,-4.75,162.75,-0.333333,...,1,1,0.18,-0.54,1,2,1,1,1,1034
502,0,0,0,0,1,1,3,-4.628906,222.75,-0.574074,...,1,1,0.31,-0.54,3,5,1,2,1,1034
503,0,0,0,0,1,1,3,-6.585938,354.75,-0.88,...,1,1,0.44,-0.5,4,6,1,0,1,1034
504,0,0,1,1,1,1,1,-85.5,1730.0,0.125984,...,1,1,0.16,1.27,1,1,4,0,1,1034


### 性能、一致性测试

In [6]:
# Run the pipeline's built-in consistency and performance checks on the first five test rows.
table.auto_test(x_test[:5])


###################################################################
 1.一致性测试和性能测试:check_transform_function                      
###################################################################
(<class 'easymlops.table.preprocessing.core.FixInput'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.ensemble.Parallel'>) module check [transform] complete,speed:[0.2ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.preprocessing.onevar_operation.FillNa'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.preprocessing.onevar_operation.FillNa'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.preprocessing.onevar_operation.FillNa'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.preprocessing.onevar_operation.FillNa'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<cl

column:[PassengerId] check [int trans float] complete,speed:[15.63ms]/it,cpu:[18%],memory:[0K]
column:[Pclass] check [int trans float] complete,speed:[19.4ms]/it,cpu:[52%],memory:[28K]
column:[SibSp] check [int trans float] complete,speed:[13.1ms]/it,cpu:[25%],memory:[6988K]
column:[Parch] check [int trans float] complete,speed:[15.62ms]/it,cpu:[19%],memory:[0K]


### 持久化测试

In [7]:
# Save the fitted pipeline under the file name ml.pkl.
table.save("ml.pkl")

In [8]:
# Rebuild a fresh, unfitted pipeline with the same steps as the saved one,
# then load the persisted state from ml.pkl into it.
table = TablePipeLine()
(
    table
    # Input handling and null-indicator features.
    .pipe(FixInput())
    .pipe(Parallel([IsNull(), IsNotNull()]))
    # Missing-value handling.
    .pipe(FillNa(cols=["Cabin", "Ticket", "Parch", "Fare", "Sex"], fill_mode="mode"))
    .pipe(FillNa(cols=["Age"], fill_mode="mean"))
    .pipe(FillNa(fill_detail={"Embarked": "N"}))
    .pipe(FillNa())
    # Type conversion steps.
    .pipe(TransToCategory(cols=["Cabin", "Embarked", "Name"]))
    .pipe(TransToFloat(cols=["Age", "Fare"]))
    .pipe(TransToInt(cols=["Pclass", "PassengerId", "SibSp", "Parch"]))
    # String clean-up and value mapping.
    .pipe(ClipString(cols=["Name"], default_clip_index=(0, 10)))
    .pipe(Replace(cols=["Cabin"], source_values=["nan", "N", "B79"], target_value="nan"))
    .pipe(TransToLower(cols=["Ticket", "Cabin", "Embarked", "Name", "Sex"]))
    .pipe(
        MapValues(
            map_detail={
                "Cabin": [(["nan", "NaN"], "n")],
                "Age": [("(-1,10)", 10), ("[10,20]", 20)],
            }
        )
    )
    # Clipping; the step names are kept exactly as in the pipeline that was saved.
    .pipe(Clip(cols=["Age"], default_clip=(1, 99), name="clip_name"))
    .pipe(Clip(cols=["Fare"], percent_range=(1, 99), name="clip_fare"))
    # Scaling and binning; each (source, target) tuple names the new output column.
    .pipe(MinMaxScaler(cols=[("Age", "Age_minmax")]))
    .pipe(Normalizer(cols=[("Fare", "Fare_normal")]))
    .pipe(Bins(n_bins=10, strategy="uniform", cols=[("Age", "Age_uni")]))
    .pipe(Bins(n_bins=10, strategy="quantile", cols=[("Age", "Age_quan")]))
    .pipe(Bins(n_bins=10, strategy="kmeans", cols=[("Fare", "Fare_km")]))
    # Two-column arithmetic features.
    .pipe(
        Parallel(
            [
                Add(left_col_name="Pclass", right_col_name="SibSp"),
                Subtract(left_col_name="Pclass", right_col_name="Fare"),
                Multiply(left_col_name="Fare", right_col_name="Age"),
                Divide(left_col_name="Age_minmax", right_col_name="Fare_normal"),
                DivideExact(left_col_name="Age_minmax", right_col_name="Pclass"),
                Mod(left_col_name="PassengerId", right_col_name="Pclass"),
            ]
        )
    )
    # Two-column comparison / logical features.
    .pipe(
        Parallel(
            [
                Equal(left_col_name="Pclass", right_col_name="SibSp"),
                GreaterThan(left_col_name="Pclass", right_col_name="Fare"),
                GreaterEqualThan(left_col_name="Fare", right_col_name="Age"),
                LessThan(left_col_name="Age_minmax", right_col_name="Fare_normal"),
                LessEqualThan(left_col_name="Age_minmax", right_col_name="Pclass"),
                And(left_col_name="PassengerId", right_col_name="Pclass"),
                Or(left_col_name="Pclass", right_col_name="SibSp"),
            ]
        )
    )
    # Day difference between the two date columns.
    .pipe(DateDayDiff(left_col_name="date2", right_col_name="date1"))
)

# NOTE(review): presumably load() expects the step layout above to match the
# saved pipeline, and the .pkl name suggests pickle -- only load trusted files. TODO confirm.
table.load("ml.pkl")

In [9]:
# Repeat the consistency and performance checks on the reloaded pipeline, this time with ten test rows.
table.auto_test(x_test[:10])


###################################################################
 1.一致性测试和性能测试:check_transform_function                      
###################################################################
(<class 'easymlops.table.preprocessing.core.FixInput'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.ensemble.Parallel'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.preprocessing.onevar_operation.FillNa'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.preprocessing.onevar_operation.FillNa'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.preprocessing.onevar_operation.FillNa'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<class 'easymlops.table.preprocessing.onevar_operation.FillNa'>) module check [transform] complete,speed:[0.0ms]/it,cpu:[0%],memory:[0K]
(<cl

column:[PassengerId] check [int trans float] complete,speed:[18.75ms]/it,cpu:[17%],memory:[316K]
column:[Pclass] check [int trans float] complete,speed:[15.63ms]/it,cpu:[17%],memory:[4K]
column:[SibSp] check [int trans float] complete,speed:[14.06ms]/it,cpu:[20%],memory:[0K]
column:[Parch] check [int trans float] complete,speed:[15.97ms]/it,cpu:[20%],memory:[0K]


In [10]:
# Transform the first five test rows with the reloaded pipeline (no fit call after load);
# the displayed values are meant to be compared with x_test_new.head(5).
table.transform(x_test[:5])

Unnamed: 0,Pclass_equal_SibSp,Pclass_greater_than_Fare,Fare_greater_equal_than_Age,Age_minmax_less_than_Fare_normal,Age_minmax_less_equal_than_Pclass,PassengerId_and_Pclass,Pclass_add_SibSp,Pclass_subtract_Fare,Fare_multiply_Age,Age_minmax_divide_Fare_normal,...,date1_is_not_null,date2_is_not_null,Age_minmax,Fare_normal,Age_uni,Age_quan,Fare_km,PassengerId_mod_Pclass,Pclass_or_SibSp,date2_day_diff_date1
500,0,0,0,0,1,1,3,-5.664062,173.25,-0.307692,...,1,1,0.16,-0.52,1,1,1,0,1,1034
501,0,0,0,0,1,1,3,-4.75,162.75,-0.333333,...,1,1,0.18,-0.54,1,2,1,1,1,1034
502,0,0,0,0,1,1,3,-4.628906,222.75,-0.574074,...,1,1,0.31,-0.54,3,5,1,2,1,1034
503,0,0,0,0,1,1,3,-6.585938,354.75,-0.88,...,1,1,0.44,-0.5,4,6,1,0,1,1034
504,0,0,1,1,1,1,1,-85.5,1730.0,0.125984,...,1,1,0.16,1.27,1,1,4,0,1,1034


In [11]:
# Reference output from the pipeline that was fitted before saving, shown for
# comparison with the reloaded pipeline's transform result.
x_test_new.head(5)

Unnamed: 0,Pclass_equal_SibSp,Pclass_greater_than_Fare,Fare_greater_equal_than_Age,Age_minmax_less_than_Fare_normal,Age_minmax_less_equal_than_Pclass,PassengerId_and_Pclass,Pclass_add_SibSp,Pclass_subtract_Fare,Fare_multiply_Age,Age_minmax_divide_Fare_normal,...,date1_is_not_null,date2_is_not_null,Age_minmax,Fare_normal,Age_uni,Age_quan,Fare_km,PassengerId_mod_Pclass,Pclass_or_SibSp,date2_day_diff_date1
500,0,0,0,0,1,1,3,-5.664062,173.25,-0.307692,...,1,1,0.16,-0.52,1,1,1,0,1,1034
501,0,0,0,0,1,1,3,-4.75,162.75,-0.333333,...,1,1,0.18,-0.54,1,2,1,1,1,1034
502,0,0,0,0,1,1,3,-4.628906,222.75,-0.574074,...,1,1,0.31,-0.54,3,5,1,2,1,1034
503,0,0,0,0,1,1,3,-6.585938,354.75,-0.88,...,1,1,0.44,-0.5,4,6,1,0,1,1034
504,0,0,1,1,1,1,1,-85.5,1730.0,0.125984,...,1,1,0.16,1.27,1,1,4,0,1,1034
