# Pipeline Classes    

Import modules that may be needed afterwards to test the classes.

In [1]:
# general use modules
import pandas as pd
import numpy  as np

# The classes should help the use of `Pipeline` and `GridSearchCV`
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Sample DataSet and tools to test and debug the classes
from sklearn.datasets import load_breast_cancer
from sklearn.metrics  import confusion_matrix
from sklearn.model_selection import train_test_split

# Sample classes with `transform` method
from sklearn.preprocessing import StandardScaler, Binarizer

# Sample classes with  `predict`  method
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier

    
Preparations for testing & debugging.  
*Note: Computations and tests that have been made on the fly have been deleted*

In [2]:
cancer = load_breast_cancer()
ftrs = pd.DataFrame(cancer.data, columns= cancer.feature_names)
trgt = pd.Series(cancer.target)

# selected columns @ iloc 1,0,3 and 1,0,3,2 accordingly
cols_1 = ['mean texture', 'mean radius', 'mean area']
cols_2 = cols_1 + ['mean perimeter']

# Split the data as pd.DataFrame & pd.Series
X_train, X_test, y_train, y_test = train_test_split(ftrs, trgt,
                                                    test_size=0.3, random_state=42)

# Split the data as np.array
X_train2, X_test2, y_train2, y_test2 = train_test_split(cancer.data, cancer.target,
                                                        test_size=0.3, random_state=42)

# print dimensions
print(f"Train Dimensions: {X_train2.shape}",
      f"Test Dimensions: {X_test2.shape}", sep="\n")

Train Dimensions: (398, 30)
Test Dimensions: (171, 30)


---
### `class ClfSwitcher(BaseEstimator):`

```
ClfSwitcher is responsible for estimating multiple classifiers in a single pipeline.
By default it takes the default parameters for any Classifier using BaseEstimator sklearn class.

Args:
    estimator (Object) : Input Classifier
```
```python
Example:
    In [1]: pipeline = Pipeline([
                ('scaler', StandardScaler()),
                ('clf', ClfSwitcher())
            ])

            parameters = [
                {
                    'clf__estimator': [RandomForestClassifier()],
                    'clf__estimator__n_estimators': [1,3,6,9,12,15]
                },
                {
                    'clf__estimator': [LogisticRegression()],
                    'clf__estimator__solver': ['lbfgs']
                }
            ]

            gscv = GridSearchCV(pipeline, parameters, ...)
            gscv.fit(X_train)
            print(gscv.best_params_['clf__estimator'])
            
    Out[1]: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                               intercept_scaling=1, max_iter=100, multi_class='warn',
                               n_jobs=None, penalty='l2', random_state=None, solver='lbfgs', 
                               tol=0.0001, verbose=0, warm_start=False)
```

In [3]:
# modules needed
from sklearn.base import BaseEstimator

class ClfSwitcher(BaseEstimator):

    def __init__(self, estimator = None):
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)

### `class ColumnSelector(BaseEstimator, TransformerMixin):`    
 

```
ColumnSelector Class transforms the fitted Dataset and selects the desired the columns/features.
It can be used with sklearn `Pipeline` and `GridSearchCV`.

Args:
    subset (None):    Input columns/features to select from the fitted Dataset.
    reference (None): Input the full column/feature list if needed.
    args dtypes:      list, tuple, np.array, pd.Series, pd.DataFrame, None
```
```python
Examples:
    
    In [1]: df = pd.DataFrame({'A': [1,2,3], 'B': [4,5,6], 'C': [7,8,9]})
            cols = ['A','C']
            pipeline_A = Pipeline([
                ('select', ColumnSelector(cols)),
            ])
            print(pipeline_A.fit_transform(df))
            
    Out[1]:    A  C
            0  1  7
            1  2  8
            2  3  9
            
    In [2]: arr = np.array(df)
            lst = [[1,2,3],[4,5,6],[7,8,9]]
            
            pipeline_B = Pipeline([
                ('select', ColumnSelector(cols,df.columns)),
            ])
            
            pipeline_C = Pipeline([
                ('select', ColumnSelector([0,2])),
            ])
            
            # Pipeline A
            A0 = pipeline_A.transform(df)
            # Pipeline B
            B0 = pipeline_B.transform(arr)
            B1 = pipeline_B.transform(lst)
            # Pipeline C
            C0 = pipeline_C.transform(arr)
            C1 = pipeline_C.transform(lst)
            # Compare if all Transforms are the Same
            print(all(A0 & B0 & B1 & C0 & C1))
            
    Out[2]: True     
```

In [4]:
# modules needed
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy  as np

class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, subset=None, reference=None):
        self.subset = subset
        self.ref = reference
    
    def fit(self,*args, **kwargs):
        return self
    
    def transform(self, X, *args, **kwargs):
        if 'subset' in kwargs:
            self.subset = kwargs['subset']
        if 'reference' in kwargs:
            self.ref = kwargs['reference']
            
        if self.subset is None:
            self.data = X
            
        elif isinstance(X, (list,tuple,np.ndarray)):
            subset = np.array(self.subset)

            if isinstance(X[0],(list,tuple)):
                data = np.array(X).T
            else:
                data = np.array(X)
              
            if self.ref is None:
                try:
                    subset = subset.astype(np.int8)
                except:
                    raise ValueError("`reference` param needs to be assigned")
            else:
                ref = np.array(self.ref)
                def subset_idx(x):
                    return np.where(ref == x)[0][0]
                subset = np.vectorize(subset_idx)(subset)
        
            self.data = data[:,subset]

        elif isinstance(X, pd.DataFrame):
            self.data = X.loc[:, self.subset]
        
        else:
            raise ValueError("Does not support this dtype. Consult help()")
        
        return self.data            

---
## Class Testing    
    
1. ***Set a sample `Pipeline` & parameters for a `GridSearchCV`***

In [5]:
pipeline = Pipeline([
    ('select', ColumnSelector()),
    ('scaler', StandardScaler()),
    ('clf', ClfSwitcher()),
])

parameters = [
    {
        'select__subset': [cols_1],
        'clf__estimator': [RandomForestClassifier()],
        'clf__estimator__n_estimators': [1,3,6,9,12,15],
    },
    {
        'select__subset': [cols_2],
        'clf__estimator': [SGDClassifier()], # SVM if hinge loss / logreg if log loss
        'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
        'clf__estimator__max_iter': [50, 80],
        'clf__estimator__tol': [1e-4],
        'clf__estimator__loss': ['hinge', 'log', 'modified_huber'],
    },
    {
        'select__subset': [cols_1[0:2]],
        'clf__estimator': [LogisticRegression()],
        'clf__estimator__solver': ['lbfgs'],
    }
]

2. ***Run `ColumnSelector` in the `Pipeline`***

In [6]:
X_train_short = pipeline.named_steps['select'].transform(X_train, subset=cols_2)

X_train_short.head(3)

Unnamed: 0,mean texture,mean radius,mean area,mean perimeter
149,17.91,13.74,585.0,88.12
124,16.39,13.37,553.5,86.1
421,13.98,14.69,656.1,98.22


3. ***Run the classes in a `GridSearchCV`***

In [7]:
gscv = GridSearchCV(pipeline, parameters, cv=3, n_jobs=-1, return_train_score=True, iid= False, verbose=3)

gscv.fit(X_train, y_train)

print('USED FEATURES:', gscv.best_params_["select__subset"],
      'BEST MODEL & PARAMS:', gscv.best_params_['clf__estimator'],
      'PREDICTED VAVLUES:', gscv.predict(X_test),sep='\n\n')

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    2.0s


USED FEATURES:

['mean texture', 'mean radius', 'mean area', 'mean perimeter']

BEST MODEL & PARAMS:

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=50,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l1',
       power_t=0.5, random_state=None, shuffle=True, tol=0.0001,
       validation_fraction=0.1, verbose=0, warm_start=False)

PREDICTED VAVLUES:

[1 0 0 1 1 0 0 0 1 1 1 0 1 0 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1
 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 0 1 1 1 0 0 1 1 1 0 0 1 1 1 0 1 0
 1 1 1 1 1 1 0 1 1 0 0 0 1 0 1 1 1 1 0 1 1 1 0 0 1 0 1 1 0 0 1 1 1 0 1 1 0
 1 1 0 1 0 1 1 1 0 0 1 1 0 1 0 0 1 1 0 0 0 1 1 1 0 1 1 1 1 1 0 1 1 1 1 0 0
 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1]


[Parallel(n_jobs=-1)]: Done  52 out of  75 | elapsed:    2.3s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:    2.4s finished


4. ***Presenting the results in a Confusion Matrix***

In [8]:
prediction = gscv.predict(X_test)
conf_mtrx  = confusion_matrix(y_test, prediction)

pd.DataFrame(conf_mtrx,
             columns=cancer.target_names, index=cancer.target_names)

Unnamed: 0,malignant,benign
malignant,50,13
benign,2,106
