# About

test notebook to figure out how sklearn pipeslines work

In [11]:
%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder

# Loads Data

In [17]:
# loads data
train_pd = pd.read_csv("../data/raw/train.csv.zip", compression="zip")
test_pd = pd.read_csv("../data/raw/test.csv.zip", compression="zip")

# the usual way

In [29]:
# shuffles, then splits into train and dev sets
shuffled = train_pd.sample(frac=1)
split = round(train_pd.shape[0] * 0.8)
train_data = shuffled[:split]
dev_data = shuffled[split+1:]

# splits the labels from the features
features = ["X", "Y"]
train_features = train_data[features]
train_labels = train_data["Category"]

dev_features = dev_data[features]
dev_labels = dev_data["Category"]

test_features = test_pd[features]

# does a silly transformation
def silly(df):
    return df.X + df.Y

train_features["Z"] = silly(train_features)
dev_features["Z"] = silly(dev_features)
test_features["Z"] = silly(test_features)

# uses a simple knn
clsfr = KNeighborsClassifier(n_neighbors = 3)
clsfr.fit(train_features, train_labels)

# checks basic accuracy
score = clsfr.score(dev_features, dev_labels)
print(score)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0.19817321435689514


# with pipelines

In [37]:
from sklearn.base import BaseEstimator, TransformerMixin

# shuffles, then splits into train and dev sets
shuffled = train_pd.sample(frac=1)
split = round(train_pd.shape[0] * 0.8)
train_data = shuffled[:split]
dev_data = shuffled[split+1:]

# splits the labels from the features
features = ["X", "Y"]
train_features = train_data[features]
train_labels = train_data["Category"]

dev_features = dev_data[features]
dev_labels = dev_data["Category"]

test_features = test_pd[features]

# see https://stackoverflow.com/questions/33091376/python-what-is-exactly-sklearn-pipeline-pipeline

class MyTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass # nothing to init
        
    def fit(self, X, y = None):
        return self # no fitting
    
    def transform(self, X, y = None):
        def silly(df):
            return df.X + df.Y
        X["Z"] = silly(X)
        return X
    
    
# use pipeline
pipeline = Pipeline([
    ('silly', MyTransformer()),
    ('clf', KNeighborsClassifier(n_neighbors = 3)),
])
_ = pipeline.fit(train_features, train_labels)
# Now evaluate all steps on test set
score = pipeline.score(dev_features, dev_labels)
print(score)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0.20146461741710278
