In [1]:
import pandas as pd

In [9]:
import numpy as np
import pandas as pd
from functools import reduce

from sklearn.base import TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, RobustScaler
from sklearn.preprocessing import Imputer, MultiLabelBinarizer, LabelEncoder, OneHotEncoder

In [5]:
class ColumnExtractor(TransformerMixin):
    """Transformer that picks a fixed selection of columns out of a DataFrame.

    Parameters
    ----------
    cols : column label or list of column labels
        Passed unchanged to ``X[...]`` in :meth:`transform`.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        """Learn nothing (the selection is fixed at construction); return self."""
        return self

    def transform(self, X):
        """Return ``X[self.cols]``.

        X is expected to be a DataFrame; the result is whatever pandas
        returns for that indexing expression.
        """
        return X[self.cols]

In [6]:
class DFFeatureUnion(TransformerMixin):
    """FeatureUnion-style combiner whose inputs and outputs are pandas DataFrames.

    Parameters
    ----------
    transformer_list : list of (name, transformer) pairs
        Each transformer is fitted on the same input, and its ``transform``
        output is joined with the others on the index.
    """

    def __init__(self, transformer_list):
        self.transformer_list = transformer_list

    def fit(self, X, y=None):
        """Fit every transformer, in list order, on the same ``X`` and ``y``."""
        for _, transformer in self.transformer_list:
            transformer.fit(X, y)
        return self

    def transform(self, X):
        """Transform ``X`` with each transformer and join the results on the index.

        X is expected to be a DataFrame, and every transformer is expected to
        return an object that ``pd.merge`` can join on its index.
        """
        def join_on_index(left, right):
            # pd.merge with its default join type, matching rows by index label.
            return pd.merge(left, right, left_index=True, right_index=True)

        outputs = []
        for _, transformer in self.transformer_list:
            outputs.append(transformer.transform(X))
        # Fold left-to-right; a single output is returned as-is.
        return reduce(join_on_index, outputs)

In [13]:
class DummyTransformer(TransformerMixin):
    """One-hot ("dummy") encode the string columns of a DataFrame.

    Each row is converted to a ``{column: value}`` dict and passed through a
    ``DictVectorizer``; the dense result is returned as a DataFrame that keeps
    the index of the input and has one ``column=value`` column per feature.
    """

    def __init__(self):
        # Fitted DictVectorizer; remains None until fit() has been called.
        self.dv = None

    def fit(self, X, y=None):
        """Fit a fresh ``DictVectorizer`` on the rows of ``X`` and return self.

        Assumes X is a DataFrame whose columns all hold strings. ``y`` is
        accepted for API compatibility and ignored.
        """
        Xdict = X.to_dict('records')
        self.dv = DictVectorizer(sparse=False)
        self.dv.fit(Xdict)
        return self

    def _feature_names(self):
        """Return the fitted vectorizer's feature names as a plain list.

        ``get_feature_names`` was deprecated in scikit-learn 1.0 and removed
        in 1.2 in favour of ``get_feature_names_out``. Prefer the new method
        when the installed version has it and fall back to the old one
        otherwise, so the transformer works on both.
        """
        names_out = getattr(self.dv, 'get_feature_names_out', None)
        if names_out is not None:
            return list(names_out())
        return self.dv.get_feature_names()

    def transform(self, X):
        """Encode ``X`` with the fitted vectorizer.

        Assumes X is a DataFrame and that fit() has already been called.
        Returns a DataFrame indexed like X containing only the
        ``column=value`` indicator columns.
        """
        Xdict = X.to_dict('records')
        Xt = self.dv.transform(Xdict)
        cols = self._feature_names()
        Xdum = pd.DataFrame(Xt, index=X.index, columns=cols)
        # String values become "column=value" features; anything else (e.g. a
        # NaN float) becomes a feature named after the bare column. Drop those
        # so only the indicator columns remain.
        nan_cols = [c for c in cols if '=' not in c]
        return Xdum.drop(nan_cols, axis=1)

In [2]:
# Toy training frame: an integer id plus two string-valued categorical columns.
train = pd.DataFrame(
    [
        [1, 'male', 'chrome'],
        [2, 'female', 'chrome'],
        [3, 'female', 'safari'],
    ],
    columns=['id', 'gender', 'browser'],
)
train

Unnamed: 0,id,gender,browser
0,1,male,chrome
1,2,female,chrome
2,3,female,safari


In [3]:
# Toy test frame with the same columns as train; note that it contains a
# browser value ('IE') that does not occur in train.
test = pd.DataFrame(
    [
        [4, 'male', 'chrome'],
        [5, 'male', 'safari'],
        [6, 'female', 'IE'],
    ],
    columns=['id', 'gender', 'browser'],
)
test

Unnamed: 0,id,gender,browser
0,4,male,chrome
1,5,male,safari
2,6,female,IE


In [7]:
# Names of the string-valued columns that get categorically encoded.
categoricals = [
    'gender',
    'browser',
]

In [11]:
# Integer-encode a single categorical column: learn the classes first, then
# map the same column to its codes.
le = LabelEncoder().fit(train['gender'])
le.transform(train['gender'])

array([1, 0, 0], dtype=int64)

In [14]:
# One-hot encode the categorical columns of the training frame.
dt = DummyTransformer()
dt.fit(train[categoricals]).transform(train[categoricals])

Unnamed: 0,browser=chrome,browser=safari,gender=female,gender=male
0,1.0,0.0,0.0,1.0
1,1.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0
