# Scikit Pipelines

## What is it?
A scikit-learn module (`sklearn.pipeline`) that makes the modelling process more organized.

## How does it work?
The modelling process is divided into:
* Transformers
* Estimator

The pipeline object can then do fit & predict operations.

### Transformers
Transformers alter the input data, e.g. PCA, imputation, scaling, ...

### Estimator
The estimator has the `predict` method and does the response fitting, e.g. linear regression, a neural network, ...

### Illustration
<img src="pipeline_example.png" width="600">

In [2]:
from sklearn.datasets import load_files
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression 
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.neural_network import MLPClassifier

In [3]:
# Raw (non-preprocessed) BBC articles, downloaded from http://mlg.ucd.ie/datasets/bbc.html
dir_path = './data/bbc/'
# Bytes that cannot be decoded as UTF-8 are replaced instead of raising an error.
data = load_files(
    dir_path,
    encoding='utf-8',
    decode_error='replace',
)

In [4]:
# Topic names as reported by the loader, plus the number of documents per label.
topic_names = data['target_names']
topics, topic_counts = np.unique(data['target'], return_counts=True)
# Pair every topic name with its document count (shown as the cell result).
{name: count for name, count in zip(topic_names, topic_counts)}

{'business': 510,
 'entertainment': 386,
 'politics': 417,
 'sport': 511,
 'tech': 401}

In [7]:
# Hold out part of the corpus for evaluation. Neither test_size nor random_state
# is passed, so the library defaults apply.
# NOTE(review): without a fixed random_state the split (and the scores reported
# below) presumably vary between runs — confirm whether reproducibility matters.
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target)

In [8]:
# Baseline: TF-IDF features fed straight into a logistic-regression classifier.
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression(multi_class='auto')),
])
# Train both steps on the training split.
pipeline.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [9]:
# Score the fitted baseline pipeline on the held-out test split.
pipeline.score(X_test, y_test)

0.9712746858168761

In [16]:
# Hand-written sentences used as a quick sanity check of the fitted pipeline.
test_texts = [
    'Football match was excellent!',
    'The prime minister has said the brexit is necessary.',
    'Television box is smarter every day.',
    'Computer science and artificial intelligence industry is going to change every industry.',
    'The hours are 9 to 15.',
]
predictions = pipeline.predict(test_texts)
# Map the predicted class indices back to their topic names.
[topic_names[label] for label in predictions]

['sport', 'politics', 'entertainment', 'business', 'business']

In [17]:
# Same TF-IDF + logistic-regression layout, this time prepared for a grid search.
pipeline2 = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression()),
])
# Keys are written as '<step name>__<parameter name>', matching the step names
# given to the pipeline above.
hyper_params = {
    'vectorizer__stop_words': [None, 'english'],
    'vectorizer__max_features': [None, 1000, 10000],
    'classifier__C': [1, 10],
    # Further options, currently disabled:
    # 'classifier__penalty': ['l1', 'l2'],
    # 'classifier__class_weight': [None, 'balanced'],
}

In [18]:
# Grid search over hyper_params for pipeline2. No cv argument is passed, so the
# library default cross-validation setting is used.
grid_search = GridSearchCV(pipeline2, hyper_params)

In [19]:
# Run the search on the training split only; the test split is not used here.
grid_search.fit(X_train, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'vectorizer__stop_words': [None, 'english'], 'vectorizer__max_features': [None, 1000, 10000], 'classifier__C': [1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [23]:
# Inspect the steps of the best pipeline found by the grid search.
grid_search.best_estimator_.named_steps

{'vectorizer': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
         stop_words='english', strip_accents=None, sublinear_tf=False,
         token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
         vocabulary=None),
 'classifier': LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='warn',
           n_jobs=None, penalty='l2', random_state=None, solver='warn',
           tol=0.0001, verbose=0, warm_start=False)}

In [24]:
# Show the selected value of every hyper-parameter that was part of the search.
{
    name: grid_search.best_estimator_.get_params()[name]
    for name in hyper_params
}

{'vectorizer__stop_words': 'english',
 'vectorizer__max_features': None,
 'classifier__C': 10}

In [25]:
# Score the best pipeline from the grid search on the held-out test split.
grid_search.best_estimator_.score(X_test, y_test)

0.9766606822262118

In [26]:
# TF-IDF -> scaling -> classifier. The grid below lists whole estimators under
# the 'classifier' key, so the search compares model families rather than
# settings of one model; SVC() is only the initial occupant of that step.
pipeline3 = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words='english', max_features=10000)),
    # NOTE(review): with_mean=False is presumably needed because the TF-IDF
    # output is sparse — confirm.
    ('scaler', StandardScaler(with_mean=False)),
    ('classifier', SVC()),
])
hyper_params = {
    'classifier': [
        LogisticRegression(C=1, multi_class='auto'),
        SVC(gamma='scale'),
        GradientBoostingClassifier(min_samples_leaf=5, n_estimators=50),
        SGDClassifier(),
        RandomForestClassifier(min_samples_leaf=5, n_estimators=50),
    ],
}

grid_search = GridSearchCV(pipeline3, hyper_params)
grid_search.fit(X_train, y_train)
# Report the winning pipeline and the classifier that was selected.
print(grid_search.best_estimator_.named_steps)
print({
    name: grid_search.best_estimator_.get_params()[name]
    for name in hyper_params
})
# Score the winner on the held-out test split (shown as the cell result).
grid_search.best_estimator_.score(X_test, y_test)



{'vectorizer': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None), 'classifier': SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)}
{'classifier': SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0



0.9766606822262118

In [27]:
def cv_evaluate_pipeline(pipeline, hyper_params, *, cv=5):
    """Grid-search ``pipeline`` on the training split and score the winner.

    Relies on the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` variables. Prints the steps of the best pipeline and the
    selected value of every searched hyper-parameter.

    Args:
        pipeline: Pipeline handed to ``GridSearchCV``.
        hyper_params: Parameter grid for the search; its keys are also used
            to look up the selected values on the best estimator.
        cv: Cross-validation setting forwarded to ``GridSearchCV``. Defaults
            to 5, the value that was previously hard-coded.

    Returns:
        The result of ``score(X_test, y_test)`` on the best estimator.
    """
    grid_search = GridSearchCV(pipeline, hyper_params, cv=cv)
    grid_search.fit(X_train, y_train)

    best_estimator = grid_search.best_estimator_
    print(best_estimator.named_steps)

    # Fetch the parameter dict once instead of once per searched parameter.
    best_params = best_estimator.get_params()
    print({param: best_params[param] for param in hyper_params})

    return best_estimator.score(X_test, y_test)
    

In [28]:
# NOTE: despite the `_rf` suffix, the classifier in this pipeline is an MLPClassifier.
pipeline_rf = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words='english')),
    ('scaler', StandardScaler(with_mean=False)),
    ('classifier', MLPClassifier(max_iter=5000, alpha=0.01)),
])
# Search over two hidden-layer layouts and two activation functions.
hyper_params_rf = {
    'classifier__hidden_layer_sizes': [(10,), (10, 5)],
    'classifier__activation': ['logistic', 'relu'],
    # Further options, currently disabled:
    # 'vectorizer__max_features': [5000, 10000],
    # 'vectorizer__strip_accents': [None, 'ascii'],
}

# The returned test-split score is shown as the cell result.
cv_evaluate_pipeline(pipeline_rf, hyper_params_rf)



{'vectorizer': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None), 'scaler': StandardScaler(copy=True, with_mean=False, with_std=True), 'classifier': MLPClassifier(activation='logistic', alpha=0.01, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=5000, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start

0.9676840215439856