# Advance pipelines

The main idea is to put to the test a complicated, real-world data science problem and solution.    
We want to simulate real-world work on the dirty [titanic](https://www.kaggle.com/c/titanic) data.
* [Inpiried heavily  by this notebook](https://www.kaggle.com/bombatkarvivek/pyspark-ml-pipeline-with-titanic-dataset-eda).   

Pipeline steps:    

 * Cleaned *Cabin* values that have illegal values are unnecessary, but it is crucial to take cleaning data into account, as it significantly affects the pipeline.    
 * Calculating *FamilySize = Parch + SibSp + 1* (self)
 * Get the Initials from the name, and map them to either "Mr", "Miss", "Mrs" and "Other".
 * Calculate the mean *Age* for each Initial and use it to fill missing values for *Age*.
 * Create *AgeGroup* for each male/female and under/over the age of 15.
 * Bin *FamilySize* to the [0,1, 2, 5, 7, 100,1000] bins.
 * Encode *Embarked, Sex, FamilyBin, AgeGroup* with a label/one-hot encoder.
 * Use [LightGBM](https://lightgbm.readthedocs.io/en/latest/) (or Random Forest for PySpark) for modelling.
 * Add the survived/died probability in a consumable way.

 
 


In [16]:
import vaex
df = vaex.open('data/titanic.csv')
df.head(2)

#,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,--,S
1,2,1,1,"'Cumings, Mrs. John Bradley (Florence Briggs Tha...",female,38,1,0,PC 17599,71.2833,C85,C


# Vaex solution

In [17]:
import vaex
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from lightgbm.sklearn import LGBMClassifier
from vaex.ml.lightgbm import LightGBMModel
from sklearn.metrics import accuracy_score
from vaex.ml import LabelEncoder
from goldilox import Pipeline
import vaex.ml
import pyarrow as pa
import re
import warnings
warnings.filterwarnings('ignore')


def fit(df):
  numeric_cols = ['PassengerId','Survived', 'Pclass', 'Age', 'SibSp','Parch','Ticket','Fare'] 
  numeric_features = ['PassengerId','Pclass','Age', 'SibSp','Parch','Fare'] 
  string_features = [ 'Embarked', 'Sex', 'FamilyBin'] 
  features = numeric_features

  df = df[df['Cabin'].str.contains(' ') != True]
  df['FamilySize'] = df['Parch'] + df['SibSp'] + 1
  df['Name'] = df['Name'].fillna('Mr.')
  pattern = re.compile('([A-Za-z]+)\.')

  df['Initial'] = df['Name'].str.extract_regex(r'(?P<initial>[A-Za-z]+)\.').apply(lambda x: x.get('initial','Other'))

  initials_map = {k:v for k,v in (zip(['Other','Miss','Mr','Mrs','Master','Mlle','Mme','Ms','Dr',
                                                'Major','Lady','Countess',
                                                'Jonkheer','Col','Rev',
                                                'Capt','Sir','Don'],
                                              ['Other','Miss','Mr','Mrs','Mrs','Miss','Miss','Miss',
                                                'Mr','Mr','Mrs','Mrs',
                                                'Other','Other','Other',
                                                'Mr','Mr','Mr']))}
  df['Initial'] = df['Initial'].map(initials_map)

  gb = df.groupby(['Initial']).agg({'value':vaex.agg.mean('Age')})
  means = {k:v for k,v in zip(gb['Initial'].tolist(), gb['value'].tolist())}

  for initial, value in means.items():    
      df['Age'] = df.func.where((df.Age.isna() & df.Initial.str.match(initial)), value, df.Age)

  df['AgeGroup'] = df.func.where(((df.Sex.str.match('male')) & (df.Age<=15)), 'boy', '')
  df['AgeGroup'] = df.func.where(((df.Sex.str.match('female')) & (df.Age <= 15)), 'girl', df.AgeGroup)
  df['AgeGroup'] = df.func.where(((df.Sex.str.match('male')) & (df.Age > 15)), 'adult male', df.AgeGroup)
  df['AgeGroup'] = df.func.where(((df.Sex.str.match('female')) & (df.Age > 15)), 'adult female', df.AgeGroup)
  df['FamilyBin'] = df['FamilySize'].digitize(bins= [0,1, 2, 5, 7, 100,1000])


  string_features = [ 'Embarked', 'Sex', 'FamilyBin', 'AgeGroup'] 
  encoder = LabelEncoder(features=string_features, prefix='le_', allow_unseen=True)
  df = encoder.fit_transform(df)

  features = ['PassengerId','Pclass','Age', 'SibSp','Parch','Fare'] +[f"{encoder.prefix}{column}" for column in string_features]
  target = 'Survived'
  model = LightGBMModel(features=features, 
                          target=target,                         
                          prediction_name='lgm_predictions', 
                          num_boost_round=500,params={'verbose': -1,
                                                    'application':'binary'})
  model.fit(df)
  df = model.transform(df)
  df['prediction'] = df.func.where(df['lgm_predictions'] > 0.5, 1,0)
  df['target_label'] = df.func.where(df['lgm_predictions'] > 0.5, 'survived','died')
  return df

train, test = df.ml.train_test_split()
pipeline = Pipeline.from_vaex(df, fit=fit)
pipeline.fit(train)
accuracy = accuracy_score(test['Survived'], pipeline.inference(test)['prediction'])
pipeline.fit(df)
pipeline.set_variable('accuracy',accuracy)
assert pipeline.validate()
print(f"Accuracy: {accuracy}")


ERROR:goldilox.vaex.pipeline:could not sample first: 'float' object is not iterable
ERROR:goldilox.vaex.pipeline:could not sample first: 'float' object is not iterable


Accuracy: 0.7584269662921348


## Sklearn version

In [22]:
import numpy as np
import pandas as pd
import json
import sklearn.pipeline
from goldilox import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.model_selection import train_test_split
from lightgbm.sklearn import LGBMClassifier
from sklearn.metrics import accuracy_score
from joblib import dump
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('data/titanic.csv')

train, test = train_test_split(df)

target = 'Survived'
fetures = list(train.columns)
fetures.remove(target)


dizip_initials = {k:v for k,v in (zip(['Mlle','Mme','Ms','Dr',
                                               'Major','Lady','Countess',
                                               'Jonkheer','Col','Rev',
                                               'Capt','Sir','Don'],
                                             ['Miss','Miss','Miss',
                                              'Mr','Mr','Mrs','Mrs',
                                              'Other','Other','Other',
                                              'Mr','Mr','Mr']))}

class PandasTransformer(TransformerMixin, BaseEstimator):
    
     def fit(self, X, y=None, **fit_params):
        return self


class DropSome(PandasTransformer):
    
    def __init__(self, column):
        self.column = column
    
    def transform(self, df, **transform_params):
        return df[df[self.column].str.contains(' ')!=True]
    
    
class FamilySizeTransformer(PandasTransformer):
    def __init__(self, columns):
        self.columns = columns
        
    def transform(self, df, **transform_params):
        df['FamilySize'] = 1
        for column in self.columns:
            df['FamilySize'] = df['FamilySize']+df[column]
        return df

class InitialsTransformer(PandasTransformer):
    def __init__(self, column):
        self.column = column
        self.initials_map = {k:v for k,v in (zip(['Miss','Mr','Mrs','Mlle','Mme','Ms','Dr',
                                               'Major','Lady','Countess',
                                               'Jonkheer','Col','Rev',
                                               'Capt','Sir','Don'],
                                             ['Miss','Mr','Mrs','Miss','Miss','Miss',
                                              'Mr','Mr','Mrs','Mrs',
                                              'Other','Other','Other',
                                              'Mr','Mr','Mr']))}
        
    def transform(self, df, **transform_params):
        df['Initial'] = df[self.column].str.extract(r'([A-Za-z]+)\.')        
        df['Initial'] = df['Initial'].map(self.initials_map)
        return df   
    

class AgeImputer(PandasTransformer):
    def __init__(self, column):
        self.column = column
        self.means = {}
    
    def fit(self, X, y=None, **fit_params):
        self.means = X.groupby(['Initial'])['Age'].mean().round().astype(int).to_dict() 
        return self


    def transform(self, df, **transform_params):
        for initial, value in self.means.items():
            df['Age'] = np.where((df['Age'].isnull()) & (df['Initial'].str.match(initial)),value, df['Age'])
        return df   
    
class AgeGroupTransformer(PandasTransformer):
    def __init__(self, column):
        self.column = column
    

    def transform(self, df, **transform_params):
        df['AgeGroup'] = None
        df.loc[((df['Sex'] == 'male') & (df['Age'] <= 15)), 'AgeGroup'] = 'boy'
        df.loc[((df['Sex'] == 'female') & (df['Age'] <= 15)), 'AgeGroup'] = 'girl'
        df.loc[((df['Sex'] == 'male') & (df['Age'] > 15)), 'AgeGroup'] = 'adult male'
        df.loc[((df['Sex'] == 'female') & (df['Age'] > 15)), 'AgeGroup'] = 'adult female'
        return df
  
class BinTransformer(PandasTransformer):
    def __init__(self, column,bins=None):
        self.column = column
        self.bins = bins or [0,1, 2, 5, 7, 100,1000]
    

    def transform(self, df, **transform_params):
        df['FamilyBin'] = pd.cut(df[self.column], self.bins).astype(str)
        return df


class MultiColumnLabelEncoder(PandasTransformer):

    def __init__(self, columns = None, prefix='le_', fillna_value=''):
        self.columns = columns 
        self.encoders = {}
        self.prefix = prefix
        self.fillna_value = fillna_value
        
    def _add_prefix(self, col):
        return f"{self.prefix}{col}"
    
    def preprocess_series(self, s):
        return s.fillna(self.fillna_value).values.reshape(-1,1)
        
    def encode(self, column, X):
        return self.encoders[column].transform(self.preprocess_series(X[column])).reshape(-1)
        
    def fit(self,X, y=None):
        for column in self.columns:
            le = OrdinalEncoder(handle_unknown='use_encoded_value',
                                unknown_value=-1)
            self.encoders[column] = le
            le.fit(self.preprocess_series(X[column]))
        return self 

    def transform(self, X):
        output = X.copy()
        if self.columns is not None:
            for column in self.columns:
                output[self._add_prefix(column)] = self.encode(column, X)
        return output

        
class FeatureSelector(PandasTransformer):

    def __init__(self, columns):
        self.columns = columns
    

    def transform(self, df, **transform_params):        
        return df[self.columns]

class LGBMTransformer(PandasTransformer):

    def __init__(self, target, features, output_column='prediction', **params):
        self.features = features
        self.params = params
        self.model = None
        self.target = target
        self.output_column = output_column
        
    def fit(self,X, y):
        self.model = LGBMClassifier(**self.params).fit(X[self.features], X[self.target])
        return self
    
    def predict(self, X):
        if self.model is None:
            raise RuntimeError("Model is not trained")
        return self.model.predict(X[self.features])

    def transform(self, df, **transform_params):        
        if self.model is None:
            raise RuntimeError("Model is not trained")
        missing_features = [feature for feature in self.features if feature not in df]
        if len(missing_features)>0:
            raise RuntimeError(f"Features missing: {missing_features}")
        
        df['prediction'] = self.model.predict(df[self.features])
        probabilities = self.model.predict_proba(df[self.features])        
        df['probabilities'] = [{'died':p[0],'survived':p[1]} for p in probabilities]
        df['label'] = df['prediction'].map({1:'survived',0:'died'})
        return df
    

class CleaningTransformer(PandasTransformer):   
    def __init__(self, column):
        self.column = column

    def transform(self, df, **transform_params):        
        return df[df[self.column].str.contains(' ')!=True]
    
    
sk_pipeline = sklearn.pipeline.Pipeline([
    ('cleaning',CleaningTransformer('Cabin')),
    ('FamilySizeTransformer', FamilySizeTransformer(['Parch','SibSp'])),
    ('InitialsTransformer', InitialsTransformer('Name')),
    ('AgeImputer', AgeImputer('Age')),
    ('AgeGroupTransformer', AgeGroupTransformer('Age')),
    ('BinTransformer', BinTransformer('FamilySize')),
    ('MultiColumnLabelEncoder', MultiColumnLabelEncoder(columns=['Embarked', 'Sex', 'FamilyBin'])),
    ('model', LGBMTransformer(target='Survived', features=['PassengerId','Pclass', 'Age', 'SibSp', 
                                        'Parch', 'Fare', 'le_Embarked','le_Sex', 'le_FamilyBin'],verbose=-1)),
    ])


# train
target = 'Survived'
fetures = list(train.columns)
fetures.remove(target)

X = train[fetures]
y = train['Survived']


trained_pipeline = sk_pipeline.fit(train)
trained_pipeline.steps = trained_pipeline.steps[1:] # IMPORTANT - remove the filtering for inference
accuracy = accuracy_score(test[target], trained_pipeline.predict(test))
print(f"Accuracy: {accuracy}")


pipeline = Pipeline.from_sklearn(sk_pipeline).fit(df)
pipeline.variables['accuracy'] = accuracy

Accuracy: 0.7937219730941704


# Deploy

In [None]:
pipeline.raw.pop('Survived', None)
print(f"Saved to: {pipeline.save('pipeline.pkl')}")
print(f"Check out the docs: http://127.0.0.1:5000/docs\n")

!gl serve pipeline.pkl