# Machine Learning Pipeline

Based on [End-to-End Machine Learning Pipeline with scikit learn](https://www.youtube.com/watch?v=XvnkUg1yVmk) from [CodeEmporium](https://www.youtube.com/channel/UC5_6ZD6s8klmMu9TXEB_1IA) Youtube channel

In [1]:
!pip3 install sklearn_pandas # 2.0.4
!pip3 install catboost # 0.24.4



In [2]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.datasets import make_classification
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn_pandas import DataFrameMapper

## Dataset Preparation

In [3]:
categorical_features = ['feat_5', 'feat_6', 'feat_7', 'feat_8']
numerical_features = ['feat_1', 'feat_2', 'feat_3', 'feat_4']

In [4]:
X, y = make_classification(n_samples=10000,
                          n_features=4,
                          n_redundant=0,
                          random_state=42,
                          weights=[0.5])

Add cathegorical columns

In [5]:
for col in range(4):
    num_classes = np.random.randint(2,10)
    cat_col = np.random.randint(num_classes, size=X.shape[0]).reshape(-1,1)
    X = np.hstack((X, cat_col))

To DataFrame

In [6]:
columns = [f'feat_{i+1}' for i in range(X.shape[1])]
X = pd.DataFrame(X, columns=columns)
y = pd.DataFrame(y, columns=['label'])

Scale regressors, modify categoricals

In [7]:
for col in numerical_features:
    mean = np.random.randint(10, 1000)
    std = np.random.randint(1, 100)
    X[col] = X[col].apply(lambda x: mean + std * x).astype(int)
    
for col in categorical_features:
    X[col] = X[col].apply(lambda x: f'str_{x}' if np.isnan(x) == False else x)

Create Nans in the dataset

In [8]:
for col in categorical_features + numerical_features:
    X[col] = X[col].sample(frac=0.7)
    
df = X.merge(y, left_index=True, right_index=True)

In [9]:
df.sample(3)

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,label
8040,,,738.0,210.0,str_1.0,,,str_5.0,0
2698,,779.0,689.0,260.0,str_0.0,,str_0.0,str_0.0,1
1618,509.0,,770.0,,,str_6.0,str_1.0,str_5.0,0


In [10]:
train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)
X_train, y_train = train_df[categorical_features + numerical_features], train_df['label']
X_test, y_test = test_df[categorical_features + numerical_features], test_df['label']

## Preprocessing + Training

In [11]:
# OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
cat = [([c], [SimpleImputer(strategy='constant', fill_value='UNK'),
             OrdinalEncoder()]) for c in categorical_features]

In [12]:
num = [([n], [SimpleImputer()]) for n in numerical_features]

In [13]:
mapper = DataFrameMapper(num + cat, df_out=True)

In [14]:
clf = CatBoostClassifier(iterations=1000,
                        learning_rate=0.01,
                         metric_period=100)

In [15]:
pipeline = Pipeline([
    ('preprocess', mapper),
    ('clf', clf)
])

In [16]:
pipeline.fit(X_train, y_train)

0:	learn: 0.6869536	total: 49.9ms	remaining: 49.9s
100:	learn: 0.4355847	total: 368ms	remaining: 3.28s
200:	learn: 0.3955449	total: 668ms	remaining: 2.65s
300:	learn: 0.3812713	total: 1.14s	remaining: 2.65s
400:	learn: 0.3738819	total: 1.67s	remaining: 2.5s
500:	learn: 0.3678710	total: 2.22s	remaining: 2.21s
600:	learn: 0.3634466	total: 2.77s	remaining: 1.84s
700:	learn: 0.3595107	total: 3.3s	remaining: 1.41s
800:	learn: 0.3558111	total: 3.85s	remaining: 956ms
900:	learn: 0.3518780	total: 4.31s	remaining: 473ms
999:	learn: 0.3485613	total: 4.85s	remaining: 0us


Pipeline(steps=[('preprocess',
                 DataFrameMapper(df_out=True, drop_cols=[],
                                 features=[(['feat_1'], [SimpleImputer()]),
                                           (['feat_2'], [SimpleImputer()]),
                                           (['feat_3'], [SimpleImputer()]),
                                           (['feat_4'], [SimpleImputer()]),
                                           (['feat_5'],
                                            [SimpleImputer(fill_value='UNK',
                                                           strategy='constant'),
                                             OrdinalEncoder()]),
                                           (['feat_6'],
                                            [SimpleImputer(fill_value='UNK',
                                                           strategy='constant'),
                                             OrdinalEncoder()]),
                                           (['f

In [17]:
preprocessed_X_test = mapper.transform(X_test)

In [18]:
X_test[numerical_features+categorical_features].head()

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8
9000,,765.0,738.0,220.0,str_1.0,str_2.0,str_0.0,str_1.0
9001,508.0,793.0,858.0,,,str_3.0,,str_6.0
9002,511.0,819.0,,250.0,str_1.0,,,str_1.0
9003,510.0,775.0,738.0,216.0,str_0.0,,str_0.0,str_0.0
9004,505.0,809.0,760.0,269.0,str_1.0,str_4.0,str_1.0,


In [19]:
preprocessed_X_test[numerical_features+categorical_features].head()

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8
9000,507.493079,765.0,738.0,220.0,2.0,3.0,1.0,2.0
9001,508.0,793.0,858.0,246.424972,0.0,4.0,0.0,7.0
9002,511.0,819.0,775.584645,250.0,2.0,0.0,0.0,2.0
9003,510.0,775.0,738.0,216.0,1.0,0.0,1.0,1.0
9004,505.0,809.0,760.0,269.0,2.0,5.0,2.0,0.0


In [20]:
from joblib import dump, load
dump(pipeline, 'params/pipeline.joblib')
test_df.to_csv('params/test_df.csv')

In [21]:
def evaluation(pipeline, X, y):
    y_predit_proba = pipeline.predict_proba(X)[:, 1]
    return {
        'auc': roc_auc_score(y, y_predit_proba)
    }

In [22]:
evaluation(pipeline, X_train, y_train)

{'auc': 0.9268142040296534}

In [23]:
evaluation(pipeline, X_test, y_test)

{'auc': 0.9085349236121131}

## Alternative

In [24]:
cat = [([c], [SimpleImputer(strategy='constant', fill_value='UNK'),
             OneHotEncoder()]) for c in categorical_features]

In [25]:
num = [([n], [SimpleImputer(), StandardScaler()]) for n in numerical_features]

In [26]:
mapper = DataFrameMapper(num + cat, df_out=True)

In [27]:
clf = LogisticRegression()

In [28]:
pipeline = Pipeline([
    ('preprocess', mapper),
    ('clf', clf)
])

In [29]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocess',
                 DataFrameMapper(df_out=True, drop_cols=[],
                                 features=[(['feat_1'],
                                            [SimpleImputer(),
                                             StandardScaler()]),
                                           (['feat_2'],
                                            [SimpleImputer(),
                                             StandardScaler()]),
                                           (['feat_3'],
                                            [SimpleImputer(),
                                             StandardScaler()]),
                                           (['feat_4'],
                                            [SimpleImputer(),
                                             StandardScaler()]),
                                           (['feat_5'],
                                            [SimpleImputer(fill_value='UNK',
                                            

In [30]:
preprocessed_X_test = mapper.transform(X_test)

In [31]:
X_test[numerical_features+categorical_features].head().T

Unnamed: 0,9000,9001,9002,9003,9004
feat_1,,508.0,511.0,510.0,505.0
feat_2,765.0,793.0,819.0,775.0,809.0
feat_3,738.0,858.0,,738.0,760.0
feat_4,220.0,,250.0,216.0,269.0
feat_5,str_1.0,,str_1.0,str_0.0,str_1.0
feat_6,str_2.0,str_3.0,,,str_4.0
feat_7,str_0.0,,,str_0.0,str_1.0
feat_8,str_1.0,str_6.0,str_1.0,str_0.0,


In [32]:
preprocessed_X_test.head().T

Unnamed: 0,9000,9001,9002,9003,9004
feat_1,2.66048e-14,0.2372577,1.641368,1.173331,-1.166852
feat_2,-1.104785,0.2743617,1.554998,-0.612233,1.062445
feat_3,-0.8397372,1.84137,0.0,-0.839737,-0.348201
feat_4,-0.9774131,-1.051269e-15,0.132234,-1.125366,0.83501
feat_5_x0_UNK,0.0,1.0,0.0,0.0,0.0
feat_5_x0_str_0.0,0.0,0.0,0.0,1.0,0.0
feat_5_x0_str_1.0,1.0,0.0,1.0,0.0,1.0
feat_6_x0_UNK,0.0,0.0,1.0,1.0,0.0
feat_6_x0_str_0.0,0.0,0.0,0.0,0.0,0.0
feat_6_x0_str_1.0,0.0,0.0,0.0,0.0,0.0


In [33]:
evaluation(pipeline, X_train, y_train)

{'auc': 0.8775138419673723}

In [34]:
evaluation(pipeline, X_test, y_test)

{'auc': 0.8814184260728771}