# Module 3 Pipeline

In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split


# Load the iris dataset and hold out 40% of the samples as a test split.
# random_state is fixed so the split is the same on every run.
iris = datasets.load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,
)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits before training and scoring the classifier.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

clf = SVC()
clf.fit(X_train_std, y_train)
clf.score(X_test_std, y_test)

## Standardizing Data

In [None]:
from sklearn.preprocessing import StandardScaler
# Toy dataset with four rows and two columns used to demonstrate StandardScaler.
data = [
    [1, 0],
    [2, 0],
    [3, 1],
    [4, 1],
]

# Fit on the toy data, then display the standardized values.
scaler = StandardScaler().fit(data)
scaler.transform(data)

## Pipeline

In [None]:
from sklearn.pipeline import make_pipeline
# Chain scaling and the SVC into one estimator, so the raw (unscaled) splits
# can be passed straight to fit() and score().
clf = make_pipeline(StandardScaler(), SVC())
clf.fit(X_train, y_train)

# Alternative evaluation, left disabled (cross_val_score is not imported in this cell):
# cross_val_score(clf, X, y, cv=5)
clf.score(X_test, y_test)

In [None]:
from sklearn.pipeline import Pipeline
# Same scaler + SVC chain as above, but with explicitly named steps.
clf = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('svc', SVC()),
    ]
)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

## Chain Multiple Pipelines

In [None]:
from sklearn.impute import SimpleImputer

# Preprocessing sub-pipeline: median imputation followed by standardization.
preprocessor = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# A pipeline can itself be a step of another pipeline.
clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('svc', SVC()),
    ]
)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

## Evaluate Multiple Classifiers

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Candidate models; each is evaluated behind the same preprocessing steps.
classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    SVC(kernel="rbf"),
    NuSVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
]

# Fit one pipeline per classifier and report its accuracy on the test split.
for classifier in classifiers:
    pipe = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('classifier', classifier),
        ]
    )
    pipe.fit(X_train, y_train)
    print(classifier)
    print("model score: %.3f" % pipe.score(X_test, y_test))


In [1]:
import pandas as pd
# Small labelled text corpus: one row per sentence, with a 1.0 / 0.0 label.
training = pd.DataFrame(
    {
        "id": [0, 1, 2, 3, 4, 5],
        "text": [
            "This movie is nice",
            "The plot is bad",
            "The actors are excellent",
            "The acting is lousy",
            "The plot is good",
            "plot is good",
        ],
        "label": [1.0, 0.0, 1.0, 0.0, 1.0, 1.0],
    }
)


In [3]:
# Inputs are the raw strings of the `text` column; targets are the values of
# the `label` column. Both are taken as arrays via `.values`.
# (The original read `training.labe`, which raises AttributeError.)
X_train = training.text.values
y_train = training.label.values

array(['This movie is nice', 'The plot is bad',
       'The actors are excellent', 'The acting is lousy',
       'The plot is good', 'plot is good'], dtype=object)

## Pipeline for LDA

In [None]:
# Create a pipeline that standardizes the data then creates a model
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# load data
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

# The column names are passed explicitly via `names`.
dataframe = read_csv(url, names=names)
array = dataframe.values
X = array[:, 0:8]  # first eight columns ('preg' .. 'age') are the inputs
Y = array[:, 8]    # ninth column ('class') is the target

# create pipeline: standardize the inputs, then fit LDA on the scaled data
estimators = [
    ('standardize', StandardScaler()),
    ('lda', LinearDiscriminantAnalysis()),
]
model = Pipeline(estimators)

# evaluate pipeline
# KFold only uses random_state when shuffle=True. Current scikit-learn raises
# ValueError if a seed is passed while shuffling is off, so shuffling is
# enabled explicitly to get seeded, reproducible folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())