In [1]:
import pandas as pd
import numpy as np
import scipy
from sklearn.pipeline import Pipeline

class DataframeFunctionTransformer:
    """Pipeline step that delegates ``transform`` to a user-supplied callable.

    The step holds no learned state: ``fit`` does nothing, and ``transform``
    returns whatever ``func`` returns for the object it is given.
    """

    def __init__(self, func):
        # Callable invoked with the input on every ``transform`` call.
        self.func = func

    def fit(self, X, y=None, **fit_params):
        """Return this instance unchanged; all arguments are ignored."""
        return self

    def transform(self, input_df, **transform_params):
        """Return ``self.func(input_df)``; ``transform_params`` is accepted but unused."""
        transformed = self.func(input_df)
        return transformed

def process_dataframe(input_df):
    """Return a copy of ``input_df`` with its ``"text"`` column upper-cased.

    The input frame is not modified: the result is a new DataFrame, so the
    cell that calls this function can be re-run and still see the original
    data.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing a ``"text"`` column of strings.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order, where every
        non-missing ``"text"`` value has been replaced by ``value.upper()``.

    Raises
    ------
    KeyError
        If ``input_df`` has no ``"text"`` column.
    AttributeError
        If a non-missing ``"text"`` value has no ``upper`` method.
    """
    # na_action="ignore" passes missing values through instead of calling
    # ``.upper()`` on them.
    uppercased = input_df["text"].map(lambda t: t.upper(), na_action="ignore")
    # ``assign`` builds a new frame, leaving the caller's object untouched.
    return input_df.assign(text=uppercased)

# Small demo frame: four ids paired with mixed-case strings.
df = pd.DataFrame(
    data={
        "id": [1, 2, 3, 4],
        "text": ["foo", "Bar", "BAz", "quux"],
    }
)

# Single-step pipeline that wraps ``process_dataframe`` as a transformer.
pipeline = Pipeline(
    steps=[("uppercase", DataframeFunctionTransformer(process_dataframe))]
)

# Last expression of the cell, so the transformed frame is displayed.
pipeline.fit_transform(df)




Unnamed: 0,id,text
0,1,FOO
1,2,BAR
2,3,BAZ
3,4,QUUX


In [2]:
# Custom Transformer: ToDense

from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

class ToDenseTransformer():
    """Stateless pipeline step that turns its input into a dense ``numpy.ndarray``.

    Intended to sit in front of estimators that need dense input. ``fit``
    learns nothing; ``transform`` only changes the container type.
    """

    def transform(self,X,y=None,**fit_params):
        """Return ``X`` as a dense 2-D ``numpy.ndarray``.

        Parameters
        ----------
        X : scipy sparse matrix or array-like
            Sparse input is densified; input without a ``toarray`` method is
            passed through ``numpy.asarray``.
        y, **fit_params
            Accepted for pipeline compatibility and ignored.

        Returns
        -------
        numpy.ndarray
            Dense array holding the same values as ``X``.
        """
        # Use ``toarray()`` rather than ``todense()``: the latter returns
        # ``np.matrix``, a deprecated type that recent scikit-learn versions
        # reject during input validation.
        if hasattr(X, "toarray"):
            return X.toarray()
        # Already dense (ndarray, np.matrix, nested lists): normalise the type.
        return np.asarray(X)

    def fit(self, X, y=None,**fit_params):
        """Return this instance unchanged; there is nothing to learn."""
        return self

# The sparse input is densified by the first step so that PCA receives a
# dense array.
pipeline = Pipeline([
    ('to_dense',ToDenseTransformer()),
    ('pca',PCA()),
    # Fixed seed: the tree permutes candidate features at every split, so
    # without ``random_state`` the fitted tree can differ between runs.
    ('clf',DecisionTreeClassifier(random_state=0))
])


# Eight samples with six features each; rows 0, 2 and 5 are identical.
data = scipy.sparse.csr_matrix([
    [1.,0.,0.,0.,0.,0.],
    [0.,1.,0.,0.,0.,0.],
    [1.,0.,0.,0.,0.,0.],
    [0.,0.,0.,0.,1.,0.],
    [0.,0.,0.,1.,0.,0.],
    [1.,0.,0.,0.,0.,0.],
    [1.,1.,0.,0.,0.,0.],
    [1.,1.,0.,0.,0.,0.],
])

# One class label per row of ``data``. Rows 0, 2 and 5 carry labels 1, 1, 0,
# so no classifier can fit all three of them.
target = np.array([1,1,1,0,0,0,1,1])

pipeline.fit(data,target)
# Predict on the training rows; the displayed result is the cell's output.
pipeline.predict(data)
# >>> array([1, 1, 1, 0, 0, 1, 1, 1])



array([1, 1, 1, 0, 0, 1, 1, 1])