In [1]:
import pandas as pd

In [18]:
import numpy as np
import pandas as pd
from functools import reduce

from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, RobustScaler
from sklearn.preprocessing import Imputer, MultiLabelBinarizer, LabelEncoder, OneHotEncoder

In [3]:
class ColumnExtractor(TransformerMixin):
    """Transformer that picks a fixed selection of columns out of a DataFrame.

    Parameters
    ----------
    cols : column label or list of column labels
        Used verbatim as the key in ``X[cols]`` when transforming.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return ``X[self.cols]``. ``X`` is expected to be a DataFrame."""
        return X[self.cols]

In [54]:
class DFFeatureUnion(TransformerMixin):
    """FeatureUnion but for pandas DataFrames.

    Every transformer is fitted on the same input, and their outputs are
    concatenated column-wise into a single DataFrame.

    Parameters
    ----------
    transformer_list : list of (name, transformer) tuples
        Each transformer must provide ``fit(X, y)`` and a ``transform(X)``
        whose result ``pd.concat`` can join along ``axis=1``.
    """

    def __init__(self, transformer_list):
        self.transformer_list = transformer_list

    def fit(self, X, y=None):
        """Fit every transformer on ``X`` (and ``y``); returns ``self``."""
        for _, t in self.transformer_list:
            t.fit(X, y)
        return self

    def transform(self, X):
        """Apply every transformer to ``X`` and join the results side by side.

        Returns
        -------
        DataFrame
            The concatenated outputs with a fresh 0..n-1 row index.
        """
        # assumes X is a DataFrame
        Xts = [t.transform(X) for _, t in self.transformer_list]
        # drop=True discards the old index outright.  The earlier
        # reset_index().drop('index', axis=1) relied on the index being
        # unnamed: a named index raised KeyError, and a feature column that
        # was itself called 'index' got dropped instead of the old index.
        Xunion = pd.concat(Xts, axis=1).reset_index(drop=True)
        return Xunion

In [55]:
class DummyTransformer(TransformerMixin):
    """Dummy-encode the string columns of a DataFrame with a DictVectorizer.

    Each row is turned into a ``{column: value}`` dict and vectorized; the
    result is returned as a DataFrame whose columns are the vectorizer's
    feature names (``'column=value'`` for string values) and whose index is
    the input's index.
    """

    def __init__(self):
        # fitted DictVectorizer; stays None until fit() is called
        self.dv = None

    def fit(self, X, y=None):
        """Learn the feature vocabulary from ``X``; returns ``self``."""
        # assumes all columns of X are strings
        Xdict = X.to_dict('records')
        self.dv = DictVectorizer(sparse=False)
        self.dv.fit(Xdict)
        return self

    def transform(self, X):
        """Return the dummy-encoded DataFrame for ``X`` (requires a prior fit)."""
        # assumes X is a DataFrame
        Xdict = X.to_dict('records')
        Xt = self.dv.transform(Xdict)
        # get_feature_names() was deprecated in scikit-learn 1.0 and removed
        # in 1.2 in favour of get_feature_names_out(); prefer the new method
        # and fall back so older installations keep working.
        if hasattr(self.dv, 'get_feature_names_out'):
            cols = list(self.dv.get_feature_names_out())
        else:
            cols = self.dv.get_feature_names()
        Xdum = pd.DataFrame(Xt, index=X.index, columns=cols)
        # drop column indicating NaNs: feature names without '=' do not come
        # from a string value
        nan_cols = [c for c in cols if '=' not in c]
        Xdum = Xdum.drop(nan_cols, axis=1)
        return Xdum

In [56]:
class DFStandardScaler(TransformerMixin):
    """StandardScaler wrapper that hands back labelled pandas objects.

    After ``fit`` the learned statistics are available as Series indexed by
    the training columns in ``mean_`` and ``scale_``.
    """

    def __init__(self):
        self.ss = None
        self.mean_ = None
        self.scale_ = None

    def fit(self, X, y=None):
        """Fit a fresh StandardScaler on ``X`` and record its statistics."""
        self.ss = scaler = StandardScaler()
        scaler.fit(X)
        labels = X.columns
        self.mean_ = pd.Series(scaler.mean_, index=labels)
        self.scale_ = pd.Series(scaler.scale_, index=labels)
        return self

    def transform(self, X):
        """Scale ``X`` (a DataFrame), preserving its index and column labels."""
        scaled_values = self.ss.transform(X)
        return pd.DataFrame(scaled_values, index=X.index, columns=X.columns)

In [57]:
# Toy training frame used to exercise the transformers defined above.
train = pd.DataFrame(
    {
        'id': [1, 2, 3],
        'gender': ['male', 'female', 'female'],
        'browser': ['chrome', 'chrome', 'safari'],
        'source': [1, 2, 2],
    }
)
train

Unnamed: 0,id,gender,browser,source
0,1,male,chrome,1
1,2,female,chrome,2
2,3,female,safari,2


In [58]:
# Toy test frame; note the 'IE' browser and source 3, neither of which
# appears in the training frame.
test = pd.DataFrame(
    {
        'id': [4, 5, 6],
        'gender': ['male', 'male', 'female'],
        'browser': ['chrome', 'safari', 'IE'],
        'source': [1, 2, 3],
    }
)
test

Unnamed: 0,id,gender,browser,source
0,4,male,chrome,1
1,5,male,safari,2
2,6,female,IE,3


In [59]:
# Stack train on top of test.  The concat keys form the outer index level,
# which is moved into a regular column and renamed to 'group'.
data = (
    pd.concat([train, test], axis=0, keys=['train', 'test'], sort=False)
    .reset_index(level=0)
    .rename(columns={'level_0': 'group'})
)
data

Unnamed: 0,group,id,gender,browser,source
0,train,1,male,chrome,1
1,train,2,female,chrome,2
2,train,3,female,safari,2
0,test,4,male,chrome,1
1,test,5,male,safari,2
2,test,6,female,IE,3


In [60]:
# Column groups fed to the pipeline: numeric columns are extracted as-is,
# categorical columns go through the dummy encoder.
num_features = ['source']
cat_features = ['gender', 'browser']

In [61]:
# Feature pipeline: extract the numeric columns unchanged, dummy-encode the
# categorical ones, and join both blocks column-wise.
pipeline = Pipeline(steps=[
    ('features', DFFeatureUnion([
        ('numericals', ColumnExtractor(num_features)),
        ('categoricals', Pipeline(steps=[
            ('extract', ColumnExtractor(cat_features)),
            ('encoder', DummyTransformer()),
        ])),
    ])),
    # scaling step is currently disabled:
    # ('scale', DFStandardScaler()),
])

In [62]:
# The same two branches as the pipeline's feature union, fitted one at a time
# so that each branch's output can be inspected separately.
tlist = [
    ('numericals', ColumnExtractor(num_features)),
    ('categoricals', Pipeline(steps=[
        ('extract', ColumnExtractor(cat_features)),
        ('encoder', DummyTransformer()),
    ])),
]
Xts = [branch.fit_transform(data) for _name, branch in tlist]

In [63]:
# Output of the first branch in tlist ('numericals').
Xts[0]

Unnamed: 0,source
0,1
1,2
2,2
0,1
1,2
2,3


In [64]:
# Output of the second branch in tlist ('categoricals').
Xts[1]

Unnamed: 0,browser=IE,browser=chrome,browser=safari,gender=female,gender=male
0,0.0,1.0,0.0,0.0,1.0
1,0.0,1.0,0.0,1.0,0.0
2,0.0,0.0,1.0,1.0,0.0
0,0.0,1.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0,1.0
2,1.0,0.0,0.0,1.0,0.0


In [65]:
# A bare starred expression is a SyntaxError; unpack into a tuple instead so
# the cell is valid and still shows every frame in Xts.
(*Xts,)

In [66]:
# Join the branch outputs column-wise and renumber the rows 0..n-1.
# reset_index(drop=True) discards the old index directly instead of adding it
# as an 'index' column and dropping it again, which only works while the
# index is unnamed.
Xunion = pd.concat(Xts, axis=1).reset_index(drop=True)
Xunion

Unnamed: 0,source,browser=IE,browser=chrome,browser=safari,gender=female,gender=male
0,1,0.0,1.0,0.0,0.0,1.0
1,2,0.0,1.0,0.0,1.0,0.0
2,2,0.0,0.0,1.0,1.0,0.0
3,1,0.0,1.0,0.0,0.0,1.0
4,2,0.0,0.0,1.0,0.0,1.0
5,3,1.0,0.0,0.0,1.0,0.0


In [67]:
# Run the full pipeline on the stacked frame; the result should match the
# manually built Xunion.
data_pipeline = pipeline.fit_transform(data)
data_pipeline

Unnamed: 0,source,browser=IE,browser=chrome,browser=safari,gender=female,gender=male
0,1,0.0,1.0,0.0,0.0,1.0
1,2,0.0,1.0,0.0,1.0,0.0
2,2,0.0,0.0,1.0,1.0,0.0
3,1,0.0,1.0,0.0,0.0,1.0
4,2,0.0,0.0,1.0,0.0,1.0
5,3,1.0,0.0,0.0,1.0,0.0


In [24]:
# Standardise the single 'browser=chrome' indicator column to see what
# StandardScaler does to a dummy variable.
ss = StandardScaler()
ss.fit_transform(data_pipeline[['browser=chrome']])

array([[ 1.],
       [ 1.],
       [-1.],
       [ 1.],
       [-1.],
       [-1.]])

In [26]:
# Mean and scale learned by the scaler fitted in the previous cell.
ss.mean_, ss.scale_

(array([0.5]), array([0.5]))

In [14]:
# Dummy-encode only the training frame's categorical columns.
# `categoricals` was never defined in this notebook (a leftover from an
# earlier kernel session); the list of categorical columns is `cat_features`.
dt = DummyTransformer()
dt.fit_transform(train[cat_features])

Unnamed: 0,browser=chrome,browser=safari,gender=female,gender=male
0,1.0,0.0,0.0,1.0
1,1.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0
