# sklearn_pandas

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [3]:
# frameworks for ML
from sklearn_pandas import DataFrameMapper, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV

# transformers for category variables
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# transformers for numerical variables
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

# transformers for combined variables
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures

# user-defined transformers
from sklearn.preprocessing import FunctionTransformer

# classification models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# evaluation
from sklearn.metrics import scorer 

In [11]:
# Toy dataset for the mapper demos: one categorical column ('pet') and two
# numeric columns ('children' as floats, 'salary' as integers).
data = pd.DataFrame(
    dict(
        pet=['cat', 'dog', 'dog', 'fish', 'cat', 'dog', 'cat', 'fish'],
        children=[4.0, 6.0, 3.0, 3.0, 2.0, 3.0, 5.0, 4.0],
        salary=[90, 24, 44, 27, 32, 59, 36, 27],
    )
)

In [12]:
data

Unnamed: 0,children,pet,salary
0,4.0,cat,90
1,6.0,dog,24
2,3.0,dog,44
3,3.0,fish,27
4,2.0,cat,32
5,3.0,dog,59
6,5.0,cat,36
7,4.0,fish,27


In [16]:
# Build a mapper that showcases the different (columns, transformer[, options])
# feature specifications supported by DataFrameMapper.
mapper = DataFrameMapper(
    [
        ('pet', LabelBinarizer()), # input is chosen as a 1-D or 2-D array depending on what the transformer needs; the output is always a 2-D array
        ('pet', [LabelBinarizer(), PCA(2)]), # transformers can be chained on a column: binarize first, then PCA (the binarizer does not support multi-column input)
        (['children'], StandardScaler(), {'alias': 'children_scaled'}), # maps df[['children']]
        (['children', 'salary'], [MinMaxScaler(), PolynomialFeatures(1)]), # multi-column transform (PCA, LDA, feature crossing)
        # NOTE(review): the recorded output shows `children_salary_1` twice (the
        # second rendered as `children_salary_1.1`), so this entry appears to clash
        # with a column from the PolynomialFeatures entry above — consider adding
        # an 'alias' option here to keep output names unique.
        (['children', 'salary'], FunctionTransformer(np.log1p)), # user-defined transformer: log1p (equivalent to log(x + 1))
       
    ], 
    default=False, # False: drop all remaining columns (the default); None: keep them unchanged; any other transformer: apply it to all remaining columns
    sparse=False, # sparse data — presumably toggles sparse output; TODO confirm
    df_out=True, # whether to output a DataFrame
    input_df=False # whether the input is a DataFrame
)
# The following constraint must be satisfied:
# if (df_out and (sparse or default)):
#     raise ValueError("Can not use df_out with sparse or default")

In [17]:
mapper.fit_transform(data)

Unnamed: 0,pet_cat,pet_dog,pet_fish,pet_0,pet_1,children_scaled,children_salary_1,children_salary_x0,children_salary_x1,children_salary_0,children_salary_1.1
0,1.0,0.0,0.0,-0.7071068,-0.306186,0.208514,1.0,0.5,1.0,1.609438,4.51086
1,0.0,1.0,0.0,0.7071068,-0.306186,1.87663,1.0,1.0,0.0,1.94591,3.218876
2,0.0,1.0,0.0,0.7071068,-0.306186,-0.625543,1.0,0.25,0.30303,1.386294,3.806662
3,0.0,0.0,1.0,-5.134781e-16,0.918559,-0.625543,1.0,0.25,0.045455,1.386294,3.332205
4,1.0,0.0,0.0,-0.7071068,-0.306186,-1.459601,1.0,0.0,0.121212,1.098612,3.496508
5,0.0,1.0,0.0,0.7071068,-0.306186,-0.625543,1.0,0.25,0.530303,1.386294,4.094345
6,1.0,0.0,0.0,-0.7071068,-0.306186,1.042572,1.0,0.75,0.181818,1.791759,3.610918
7,0.0,0.0,1.0,-5.134781e-16,0.918559,0.208514,1.0,0.5,0.045455,1.609438,3.332205


In [8]:
# Passing a transformer as `default`: per the recorded output, `children` (mapped
# to None) comes through unchanged while the unlisted `salary` column is
# standardized by the default StandardScaler.
explicit_features = [
    ('pet', LabelBinarizer()),
    ('children', None),
]
mapper = DataFrameMapper(explicit_features, default=StandardScaler())
mapper.fit_transform(data.copy())
mapper.transformed_names_

array([[ 1.        ,  0.        ,  0.        ,  4.        ,  2.27500192],
       [ 0.        ,  1.        ,  0.        ,  6.        , -0.87775665],
       [ 0.        ,  1.        ,  0.        ,  3.        ,  0.07762474],
       [ 0.        ,  0.        ,  1.        ,  3.        , -0.73444944],
       [ 1.        ,  0.        ,  0.        ,  2.        , -0.49560409],
       [ 0.        ,  1.        ,  0.        ,  3.        ,  0.79416078],
       [ 1.        ,  0.        ,  0.        ,  5.        , -0.30452782],
       [ 0.        ,  0.        ,  1.        ,  4.        , -0.73444944]])

['pet_cat', 'pet_dog', 'pet_fish', 'children', 'salary']

In [9]:
# input_df=True
from sklearn.base import BaseEstimator, TransformerMixin


class DateEncoder(TransformerMixin, BaseEstimator):
    """Stateless transformer that splits datetimes into year, month and day.

    ``BaseEstimator`` is included so the encoder exposes ``get_params`` /
    ``set_params``; without it the object cannot be cloned by scikit-learn's
    model-selection utilities. ``TransformerMixin`` supplies ``fit_transform``.
    """

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` so calls can be chained.

        Parameters
        ----------
        X : ignored
        y : ignored, optional
        """
        return self

    def transform(self, X):
        """Return the year, month and day of ``X`` as three side-by-side columns.

        Parameters
        ----------
        X : object exposing the pandas ``.dt`` accessor
            The code reads ``X.dt``, so a NumPy array will not work here; the
            mapper therefore has to pass pandas input to this transformer.

        Returns
        -------
        The column-wise concatenation (``axis=1``) of ``dt.year``, ``dt.month``
        and ``dt.day``, in that order.
        """
        dt = X.dt
        return pd.concat([dt.year, dt.month, dt.day], axis=1)
    
# Four consecutive days spanning a month boundary.
date_index = pd.date_range(start='2015-10-30', end='2015-11-02')
dates_df = pd.DataFrame({'dates': date_index})

# input_df=True is needed because DateEncoder.transform reads the pandas `.dt`
# accessor of its input.
mapper_dates = DataFrameMapper(
    [('dates', DateEncoder())],
    input_df=True,
    df_out=True,
)
mapper_dates.fit_transform(dates_df)

Unnamed: 0,dates_0,dates_1,dates_2
0,2015,10,30
1,2015,10,31
2,2015,11,1
3,2015,11,2


In [10]:
# Fill missing categorical values with the most frequent category.
from sklearn_pandas import CategoricalImputer

# Use a dedicated name for the sample array: rebinding `data` here would replace
# the DataFrame that the mapper cells read, breaking them on re-run.
categories_raw = np.array(['a', 'b', 'b', np.nan], dtype=object)
imputer = CategoricalImputer()
imputer.fit_transform(categories_raw)

array(['a', 'b', 'b', 'b'], dtype=object)