In [1]:
# Using external data in sklearn pipelines

# Sometimes when I am building a pipeline I want to include in it data which will
# not be available in the holdout dataset but which is still useful for the modeling
# task. For instance, from historical data I might want to build new features that
# describe trends that augment the holdout system data. In this assignment, you can
# imagine that historical trends for individual events, sequences of events, or
# runners might be useful.

# Here's an example, starting with our usual training data
import pandas as pd
import numpy as np
import cloudpickle
import sklearn
df=pd.read_csv("../../assets/assignment/df_train.csv.gz")

# We are going to regress on time. The chip duration is parsed into a timedelta
# and stored as integer nanoseconds. The width is spelled out as int64 because a
# bare `int` follows the platform default, which is not 64-bit everywhere.
df['x_result.duration.chip']=pd.to_timedelta(df['result.duration.chip']).astype('int64')

# For this demo we are going to use the sequence identifiers, let's look at a
# couple I hand picked
demo_sequence_ids=[
    "5e862221-758c-48b1-a7cf-11bcc0a80a41",
    "57bdcd1f-a474-43e0-8e54-5f3a5206f5f9",
]
data=df[df['sequence_id'].isin(demo_sequence_ids)]
# number of rows (runners) recorded for each event within each sequence
data.groupby(['sequence_id','event.id']).size()

  exec(code_obj, self.user_global_ns, self.user_ns)
  df['x_result.duration.chip']=pd.to_timedelta(df['result.duration.chip']).astype(int)


sequence_id                           event.id                            
57bdcd1f-a474-43e0-8e54-5f3a5206f5f9  57bdcd1f-a474-43e0-8e54-5f3a5206f5f9    3259
                                      59a99331-e614-4004-8634-16bc5206f5f9    3772
5e862221-758c-48b1-a7cf-11bcc0a80a41  5e862221-758c-48b1-a7cf-11bcc0a80a41     194
                                      5e8622bc-0ef4-472a-a19b-6f4bc0a80a41     268
                                      5e862439-c108-40be-b3ec-36b6c0a80a41     404
dtype: int64

In [2]:
# These two sequences both have historical event data, with different numbers of
# runners, so let's separate this into a training and validation set. The two
# events listed here go to training; every other event is held out.
train_event_ids=['57bdcd1f-a474-43e0-8e54-5f3a5206f5f9', '5e862221-758c-48b1-a7cf-11bcc0a80a41']
in_train=data['event.id'].isin(train_event_ids)
train=data[in_train]
test=data[~in_train]

In [3]:
# Let's say that I want to include in my pipeline the share of runners in a race
# who are men (men / all runners, not men / non-men). I compute that share for
# every event and then average the event shares within each sequence:
is_male=train['sex'].eq('Male')
event_male_share=is_male.groupby([train['sequence_id'],train['event.id']]).mean()
sequence_stats=event_male_share.groupby(level='sequence_id').mean()
# the grouped result inherits the name 'sex'; clear it so the series is unnamed
sequence_stats.name=None
sequence_stats

sequence_id
57bdcd1f-a474-43e0-8e54-5f3a5206f5f9    0.320344
5e862221-758c-48b1-a7cf-11bcc0a80a41    0.376289
dtype: float64

In [None]:
# To embed this data in my pipeline, I need to add it to one of the steps - either
# a transformer or an estimator - as an object. Then when this gets serialized as
# part of the pickle process it will be there for estimation. Let's create a new
# transformer which does this, our strategy will be that the __init__ function
# will calculate our sequence stats data and store it in the object, while our
# transform function will add that to unseen data as a column
import numpy as np

class SequenceSexRatio(sklearn.base.TransformerMixin, sklearn.base.BaseEstimator):
    """Transformer that appends a historical per-sequence share of male runners.

    The statistic is computed once, in the constructor, from the historical
    ``train`` frame and kept on the instance. Because it lives on the object it
    is serialized together with the pipeline and is still available when the
    pipeline is later loaded to transform data that lacks the history.

    Parameters
    ----------
    train : pandas.DataFrame
        Historical rows; must contain the columns ``sequence_id``, ``event.id``
        and ``sex``.

    Attributes
    ----------
    train : pandas.Series
        Only the first row of the historical frame, kept so the constructor
        parameter has a small value to show when the pipeline is displayed.
    sequence_stats : pandas.Series
        Indexed by ``sequence_id`` and named ``sex_sequence_ratio``: the mean,
        over the events of a sequence, of each event's share of rows whose
        ``sex`` equals ``'Male'``.

    Notes
    -----
    Doing work in ``__init__`` and storing something other than the argument
    departs from the usual scikit-learn estimator convention, so utilities that
    rebuild an estimator from its parameters (``sklearn.base.clone``, and
    therefore cross-validation / grid-search helpers) are not expected to work
    with this class.
    """

    # this will be called when we first make our pipeline, so we can store things
    def __init__(self, train):
        # as we want to visualize this later we have to have something in the train
        # attribute, which is the parameter to this function. Only the first row of
        # the incoming data is kept.
        self.train=train.iloc[0]

        # share of 'Male' rows per (sequence, event), then the mean of those shares
        # per sequence; taking the mean of a boolean series gives the proportion
        is_male=train['sex'].eq('Male')
        event_share=is_male.groupby([train['sequence_id'],train['event.id']]).mean()
        self.sequence_stats=event_share.groupby(level='sequence_id').mean()
        self.sequence_stats.name='sex_sequence_ratio'

    def fit(self, data=None, y=None):
        """Do nothing and return ``self``; the statistics were built in ``__init__``."""
        return self

    # this will be called when we want to predict our data, since it will transform
    def transform(self, data):
        """Return a copy of ``data`` with a ``SequenceSexRatio`` column added.

        Each row receives the stored statistic for its ``sequence_id``; rows whose
        sequence is absent from the historical data receive NaN. The frame passed
        in is left unmodified.
        """
        # diagnostic: how many sequences in the incoming data exist in our
        # historical dataset
        known=len(set(data["sequence_id"].unique()).intersection(self.sequence_stats.index))
        print(f'The number of sequences which are also in our historical data are {known}')
        # look the statistic up by sequence id; this keeps the row index of `data`
        # and works even when that index contains duplicate labels
        ratio=data['sequence_id'].map(self.sequence_stats)
        # hand a new frame to the next stage instead of writing into the caller's
        return data.assign(SequenceSexRatio=ratio)

In [None]:
# Now we can build a little pipeline and use this column as a predictor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Column selection stage: one-hot encode sex, pass age and our new feature through
# untouched, and drop every other column
reduce_columns=ColumnTransformer(
    transformers=[
        ("sex", OneHotEncoder(categories=[['Male','Female']], handle_unknown='ignore'), ['sex']),
        ('cols_to_keep', 'passthrough', ['age','SequenceSexRatio']),
    ],
    remainder='drop',
)

pipeline_steps=[
    # constructing the transformer here is what captures the training data
    ("SequenceSexRatio", SequenceSexRatio(train)),
    # keep only the handful of columns selected above
    ("reduce_columns", reduce_columns),
    # a questionable but simple treatment of missing values: column means everywhere
    ('fill missing', SimpleImputer(missing_values=np.nan, strategy='mean')),
    # the final estimator
    ("regressor", LinearRegression()),
]
pipe=Pipeline(steps=pipeline_steps)

# render the pipeline as a diagram so we can inspect it visually
from sklearn import set_config
set_config(display="diagram")
display(pipe)

In [None]:
# Now we have a pipeline that will first add in our new column of data then
# pass this on to the rest of the pipeline. Importantly, the new data is
# added when we create the object, through the constructor, and is merged
# with the data we are fitting to or predicting on when the transform()
# function is called. Since the object is *not* created when we predict, and
# instead is loaded through the pickle process, it means we can embed historical
# data in the pipeline for use in the future.

# We now have to fit our pipeline. Fitting calls the fit() and transform()
# functions of the objects already in the pipeline; it does not create new objects.
target=train['x_result.duration.chip']
fitted_pipe=pipe.fit(train, target)

# Pull out the final regression step and look at its coefficients to verify
# that we have four: two for sex, one for SequenceSexRatio, and one for age
regressor=fitted_pipe.named_steps['regressor']
regressor.coef_

In [None]:
# Finally, try the fitted pipeline on the events we held out; score() runs the
# transformers and reports the final regressor's score on this unseen data
holdout_target=test['x_result.duration.chip']
fitted_pipe.score(test, holdout_target)