# Building SciKit-Learn Compatible Transformers

Using the methods `.fit(self, X, y=None)` and `.transform(self, X)`, we can build transformers to create feature vectors in SciKit-Learn easily.

In [1]:
import csv
import re
import numpy as np
import random

from textblob import TextBlob
from collections import Counter

from sklearn.pipeline import make_pipeline, make_union
from sklearn.base import TransformerMixin
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

We're going to use our SMS spam data because I want to demonstrate getting features from text.

In [2]:
sms_data = []
sms_results = []

with open("SMSSpamCollection") as file:
    reader = csv.reader(file, delimiter="\t")
    for row in reader:
        sms_data.append(row[1])
        sms_results.append(row[0])

This is a really bad featurizer/vectorizer. It just returns the vector `[1]`. It's a good demonstration, though.

Inheriting from `TransformerMixin` adds a `.fit_transform(self, X, y=None)` method. That's all, but it's nice to denote that we're building a transformer by using it.

In [3]:
class DumbFeaturizer(TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return [[1] for _ in X]

In [4]:
pipe = make_pipeline(DumbFeaturizer(), DecisionTreeClassifier())
pipe.fit(sms_data, sms_results)
# Our baseline
pipe.score(sms_data, sms_results)

0.86593682699210339

87% of our data is ham instead of spam, so we'll get a good score just by calling everything ham.

Let's build a better featurizer. We're going to send the text through a series of functions that return a value and put all those results together as a feature vector.

In [5]:
def longest_run_of_capitol_letters_feature(text):
    """Find the longest run of capitol letters and return their length."""
    runs = sorted(re.findall(r"[A-Z]+", text), key=len)
    if runs:
        return len(runs[-1])
    else:
        return 0

def percent_character_feature(char):
    """Return percentage of text that is a particular char compared to total text length."""
    def feature_fn(text):
        periods = text.count(char)
        return periods / len(text)
    return feature_fn

class FunctionFeaturizer(TransformerMixin):
    def __init__(self, *featurizers):
        self.featurizers = featurizers
        
    def fit(self, X, y=None):
        """All SciKit-Learn compatible transformers and classifiers have the
        same interface. `fit` always returns the same object."""
        return self
        
    def transform(self, X):
        """Given a list of original data, return a list of feature vectors."""
        fvs = []
        for datum in X:
            fv = [f(datum) for f in self.featurizers]
            fvs.append(fv)
        return np.array(fvs)

In [16]:
sms_featurizer = FunctionFeaturizer(longest_run_of_capitol_letters_feature,
                                    percent_character_feature("."))
sms_featurizer.transform(sms_data[:10])

X_train, X_test, y_train, y_test = train_test_split(sms_data, sms_results)

pipe = make_pipeline(sms_featurizer, DecisionTreeClassifier())
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.90811198851399855

In [7]:
print(classification_report(pipe.predict(X_test), y_test))

             precision    recall  f1-score   support

        ham       0.95      0.94      0.95      1230
       spam       0.59      0.66      0.62       163

avg / total       0.91      0.91      0.91      1393



Just by looking at capitol letters and periods, we get better results. If we added a bag of words featurizer, I bet we could get them even better.

In [8]:
class BagOfWordsFeaturizer(TransformerMixin):
    def __init__(self, num_words=None):
        self.num_words = num_words
        
    def fit(self, X, y=None):
        words = []
        for x in X:
            x = TextBlob(x.lower())
            words += [word.lemmatize() for word in x.words]
        if self.num_words:
            words = Counter(words)
            self._vocab = [word for word, _ in words.most_common(self.num_words)]
        else:
            self._vocab = list(set(words))
        return self
    
    def transform(self, X):
        vectors = []
        for x in X:
            x = TextBlob(x.lower())
            word_count = Counter(x.words)
            vector = [0] * len(self._vocab)
            for word, count in word_count.items():
                try:
                    idx = self._vocab.index(word)
                    vector[idx] = count
                except ValueError:
                    pass
            vectors.append(vector)
        return vectors
        

In [9]:
bow = BagOfWordsFeaturizer(10)
bow.fit(sms_data[:10])
print(bow._vocab)
bow.transform(sms_data[:10])

['to', 'a', "'s", 'your', 'u', 'free', 'been', 'like', 'for', 'mobile']


[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
 [3, 1, 2, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 0, 0, 2, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [2, 0, 2, 0, 0, 0, 1, 1, 1, 0],
 [1, 0, 0, 0, 0, 0, 0, 2, 0, 0],
 [1, 0, 0, 3, 0, 0, 1, 0, 1, 0],
 [2, 1, 0, 0, 0, 0, 1, 0, 0, 0],
 [2, 0, 0, 1, 1, 2, 0, 0, 1, 2]]

Ok, this will work! How do we combine the bag of words vector with the other vector?

`sklearn.pipeline.make_union` will do it. It takes multiple vectorizers, runs them in parallel, and combines the results.

In [10]:
sms_featurizer = make_union(
    BagOfWordsFeaturizer(20),
    FunctionFeaturizer(longest_run_of_capitol_letters_feature,
                       percent_character_feature(".")))

In [11]:
pipe = make_pipeline(sms_featurizer, DecisionTreeClassifier())
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.93754486719310837

Finally, let's build a debugging transformer. It will just return the data it gets, but will print it out for us.

In [20]:
class PipelineDebugger(TransformerMixin):
    def __init__(self, transformer):
        self.transformer = transformer
        
    def fit(self, X, y=None):
        self.transformer.fit(X, y)
        return self
    
    def transform(self, X):
        print(self.transformer.__class__.__name__)
        idx = random.randrange(0, len(X))
        print("Before", "=" * 40)
        print(X[idx])
        X = self.transformer.transform(X)
        print("After ", "=" * 40)
        print(X[idx])
        return X

In [31]:
pipe = make_pipeline(PipelineDebugger(sms_featurizer), DecisionTreeClassifier())
pipe.fit(X_train, y_train)

FunctionFeaturizer
LOL .. *grins* .. I'm not babe, but thanks for thinking of me!
[ 3.          0.06451613]


Pipeline(steps=[('pipelinedebugger', <__main__.PipelineDebugger object at 0x10eaf4208>), ('decisiontreeclassifier', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best'))])

Note the length of these vectors -- 22. That's the 20 words from Bag of Words, and the 2 features from FunctionFeaturizer.