# About

test notebook to figure out how sklearn pipeslines work

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
# from sklearn.pipeline import Pipeline
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.preprocessing import OneHotEncoder

# Loads Data

In [4]:
# loads data
train_pd = pd.read_csv("../data/raw/train.csv.zip", compression="zip")
test_pd = pd.read_csv("../data/raw/test.csv.zip", compression="zip")

# the usual way

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# shuffles, then splits into train and dev sets
shuffled = train_pd.sample(frac=1)
split = round(train_pd.shape[0] * 0.8)
train_data = shuffled[:split]
dev_data = shuffled[split+1:]

# splits the labels from the features
features = ["X", "Y"]
train_features = train_data[features]
train_labels = train_data["Category"]

dev_features = dev_data[features]
dev_labels = dev_data["Category"]

test_features = test_pd[features]

# does a silly transformation
def silly(df):
    return df.X + df.Y

train_features["Z"] = silly(train_features)
dev_features["Z"] = silly(dev_features)
test_features["Z"] = silly(test_features)

# uses a simple knn
clsfr = KNeighborsClassifier(n_neighbors = 3)
clsfr.fit(train_features, train_labels)

# checks basic accuracy
score = clsfr.score(dev_features, dev_labels)
print(score)


# with pipelines

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsClassifier

# shuffles, then splits into train and dev sets
shuffled = train_pd.sample(frac=1)
split = round(train_pd.shape[0] * 0.8)
train_data = shuffled[:split]
dev_data = shuffled[split+1:]

# splits the labels from the features
features = ["X", "Y"]
train_features = train_data[features]
train_labels = train_data["Category"]

dev_features = dev_data[features]
dev_labels = dev_data["Category"]

test_features = test_pd[features]

# see https://stackoverflow.com/questions/33091376/python-what-is-exactly-sklearn-pipeline-pipeline

class MyTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass # nothing to init
        
    def fit(self, X, y = None):
        return self # no fitting
    
    def transform(self, X, y = None):
        def silly(df):
            return df.X + df.Y
        X["Z"] = silly(X)
        return X
    
    
# use pipeline
pipeline = Pipeline([
    ('silly', MyTransformer()),
    ('clf', KNeighborsClassifier(n_neighbors = 3)),
])
_ = pipeline.fit(train_features, train_labels)
# Now evaluate all steps on test set
score = pipeline.score(dev_features, dev_labels)
print(score)

# with pipelines on both numeric and categorical features  

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier


# subsets data to make next steps faster
train_subset = train_pd.sample(n = 1000, random_state = 0)

# shuffles, then splits into train and dev sets
# TODO replace with sklearn's stratified splitter???
shuffled = train_subset.sample(frac=1)
split = round(train_subset.shape[0] * 0.8)
train_data = shuffled[:split]
dev_data = shuffled[split+1:]

# TODO is this the best way to separate out the labels from the features???
train_labels = train_data["Category"]
dev_labels = dev_data["Category"]

# see Chap 2 transformation pipelines

class DataFrameSelector( BaseEstimator, TransformerMixin): 
    def __init__( self, attribute_names): 
        self.attribute_names = attribute_names 
        
    def fit( self, X, y = None): 
        return self 
    
    def transform( self, X): 
        return X[ self.attribute_names].values

    
num_features = ["X", "Y"]
cat_features = ["DayOfWeek"]

num_pipeline = Pipeline([
    ("selector", DataFrameSelector(num_features))
])

cat_pipeline = Pipeline([
    ("selector", DataFrameSelector(cat_features)),
    ("cat_encoder", OneHotEncoder(sparse = False))
])

full_pipeline = FeatureUnion(transformer_list = [
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline)
])

train_prepared = full_pipeline.fit_transform(train_data)
print(train_prepared[0])
print(train_prepared[400])
print(train_prepared.shape)
print(train_labels.shape)


dev_prepared = full_pipeline.fit_transform(dev_data)
print(dev_prepared[0])
print(dev_prepared[50])
print(dev_prepared.shape)
print(dev_labels.shape)

# TODO how to get the list of learned encodings???
# print(cat_pipeline)
# print(full_pipeline)


# reuse pipeline code from previous section to classify
clf_pipeline = Pipeline([
    ('clf', KNeighborsClassifier(n_neighbors = 3))
])

_ = clf_pipeline.fit(train_prepared, train_labels)
# Now evaluate all steps on test set
score = clf_pipeline.score(dev_prepared, dev_labels)
print(score)

[-122.41967178   37.76505012    0.            0.            0.
    0.            0.            0.            1.        ]
[-122.40763352   37.78418935    0.            0.            0.
    0.            0.            0.            1.        ]
(800, 9)
(800,)
[-122.41765068   37.78801585    0.            1.            0.
    0.            0.            0.            0.        ]
[-122.4191831    37.78309982    0.            1.            0.
    0.            0.            0.            0.        ]
(199, 9)
(199,)
0.12562814070351758


# with pipelines and gscv

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV


# subsets data to make next steps faster
train_subset = train_pd.sample(n = 1000, random_state = 0)

# shuffles, then splits into train and dev sets
# TODO replace with sklearn's stratified splitter???
shuffled = train_subset.sample(frac=1)
split = round(train_subset.shape[0] * 0.8)
train_data = shuffled[:split]
dev_data = shuffled[split+1:]

# TODO is this the best way to separate out the labels from the features???
train_labels = train_data["Category"]
dev_labels = dev_data["Category"]

# see Chap 2 transformation pipelines

class DataFrameSelector( BaseEstimator, TransformerMixin): 
    def __init__( self, attribute_names): 
        self.attribute_names = attribute_names 
        
    def fit( self, X, y = None): 
        return self 
    
    def transform( self, X): 
        return X[ self.attribute_names].values

    
num_features = ["X", "Y"]
cat_features = ["DayOfWeek"]

num_pipeline = Pipeline([
    ("selector", DataFrameSelector(num_features))
])

cat_pipeline = Pipeline([
    ("selector", DataFrameSelector(cat_features)),
    ("cat_encoder", OneHotEncoder(sparse = False))
])

full_pipeline = FeatureUnion(transformer_list = [
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline)
])

train_prepared = full_pipeline.fit_transform(train_data)
print(train_prepared[0])
print(train_prepared[400])
print(train_prepared.shape)
print(train_labels.shape)


dev_prepared = full_pipeline.fit_transform(dev_data)
print(dev_prepared[0])
print(dev_prepared[50])
print(dev_prepared.shape)
print(dev_labels.shape)

# TODO how to get the list of learned encodings???
# print(cat_pipeline)
# print(full_pipeline)


# reuse pipeline code from previous section to classify
knn = KNeighborsClassifier()

clf_pipeline = Pipeline(steps = [
    ('knn', knn)
])

# see https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html
param_grid = {
    'knn__n_neighbors': [3, 5, 9, 17, 23]
}
search = GridSearchCV(clf_pipeline, param_grid, iid = False, cv = 5, return_train_score = False)

_ = search.fit(train_prepared, train_labels)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

# Now evaluate all steps on test set
score = search.score(dev_prepared, dev_labels)
print(score)

[-122.40114294   37.78900115    1.            0.            0.
    0.            0.            0.            0.        ]
[-122.44471296   37.77130219    0.            0.            0.
    0.            0.            0.            1.        ]
(800, 9)
(800,)
[-122.41095525   37.78413995    0.            0.            0.
    1.            0.            0.            0.        ]
[-122.40742204   37.76448809    0.            0.            1.
    0.            0.            0.            0.        ]
(199, 9)
(199,)




Best parameter (CV score=0.165):
{'knn__n_neighbors': 23}
0.19095477386934673


# with pipelines and ensembles

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier


# subsets data to make next steps faster
train_subset = train_pd.sample(n = 1000, random_state = 0)

# shuffles, then splits into train and dev sets
# TODO replace with sklearn's stratified splitter???
shuffled = train_subset.sample(frac=1)
split = round(train_subset.shape[0] * 0.8)
train_data = shuffled[:split]
dev_data = shuffled[split+1:]

# TODO is this the best way to separate out the labels from the features???
train_labels = train_data["Category"]
dev_labels = dev_data["Category"]

# see Chap 2 transformation pipelines

class DataFrameSelector( BaseEstimator, TransformerMixin): 
    def __init__( self, attribute_names): 
        self.attribute_names = attribute_names 
        
    def fit( self, X, y = None): 
        return self 
    
    def transform( self, X): 
        return X[ self.attribute_names].values

    
num_features = ["X", "Y"]
cat_features = ["DayOfWeek"]

num_pipeline = Pipeline([
    ("selector", DataFrameSelector(num_features))
])

cat_pipeline = Pipeline([
    ("selector", DataFrameSelector(cat_features)),
    ("cat_encoder", OneHotEncoder(sparse = False))
])

full_pipeline = FeatureUnion(transformer_list = [
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline)
])

train_prepared = full_pipeline.fit_transform(train_data)
print(train_prepared[0])
print(train_prepared[400])
print(train_prepared.shape)
print(train_labels.shape)


dev_prepared = full_pipeline.fit_transform(dev_data)
print(dev_prepared[0])
print(dev_prepared[50])
print(dev_prepared.shape)
print(dev_labels.shape)

# see Chap 7

log_clf = LogisticRegression()
knn_clf = KNeighborsClassifier()
voting_clf = VotingClassifier( estimators =[("lr", log_clf), ("knn", knn_clf)], voting = "hard")

from sklearn.metrics import accuracy_score
for clf in (log_clf, knn_clf, voting_clf):
    clf.fit(train_prepared, train_labels)
    y_pred = clf.predict(dev_prepared)
    print(clf.__class__.__name__, accuracy_score( dev_labels, y_pred))
