# Introduction

This notebook creates a framework for future machine learning algorithms on the sample of Bessen and Hunt (2007), which is currently my only tagged patent corpus.

The structure has to be a pipeline, inspired by scikit-learn's pipelines, which goes through all steps from text processing to the prediction. Of course, we implement the design with a randomized grid search to find the best parameters for the prediction model as well as cross-validation to guard against overfitting.

The major problem is the sample size of only 400 patents, of which only about 40 are manually labelled as software patents. We have to alleviate this obstacle with good techniques or additional datasets.

In [1]:
#!/usr/bin/env python

import pickle
import string
import time

from itertools import compress
from nltk import WordNetLemmatizer
from nltk import pos_tag
from nltk import sent_tokenize
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from operator import itemgetter
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report as clsr
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import cohen_kappa_score


# Default random seed; it is the default value of the `seed` argument of
# build_and_evaluate defined further down in this cell.
SEED = 12345


def timeit(func):
    """Decorator that times a call and returns the result with the elapsed time.

    The decorated callable no longer returns its bare result: it returns a
    ``(result, delta)`` tuple, where ``delta`` is the elapsed wall-clock time
    of the call in seconds. Callers therefore have to unpack the return value.
    """
    # Local import: functools is not among the notebook's top-level imports.
    from functools import wraps

    @wraps(func)  # keep __name__ / __doc__ of the decorated function
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measured delta cannot become
        # negative or jump if the system clock is adjusted during the call.
        start = time.perf_counter()
        result = func(*args, **kwargs)
        delta = time.perf_counter() - start
        return result, delta
    return wrapper


def identity(arg):
    """Return ``arg`` unchanged.

    In this notebook it is handed to ``TfidfVectorizer`` as ``tokenizer`` so
    that documents which were already split into tokens by the preprocessor
    step are passed through as they are.
    """
    return arg


class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    """Transformer that turns raw documents into lists of lemmatized tokens.

    Each document is split into sentences, tokenized, part-of-speech tagged,
    filtered (stopwords, punctuation, integers, tokens containing ``'fig'``,
    single-character tokens) and finally lemmatized with WordNet.

    Parameters
    ----------
    stopwords : collection of str, optional
        Tokens to drop. ``None`` (default) uses NLTK's English stopword list.
        An explicitly passed empty collection disables stopword removal.
    punct : collection of str, optional
        Characters regarded as punctuation. ``None`` (default) uses
        ``string.punctuation``.
    lower : bool, default True
        Lower-case every token before filtering.
    strip : bool, default True
        Strip surrounding whitespace, then ``'_'``, then ``'*'`` from tokens.
    """

    def __init__(self, stopwords=None, punct=None,
                 lower=True, strip=True):
        self.lower = lower
        self.strip = strip
        # Compare against None instead of relying on truthiness: with
        # ``stopwords or default`` an explicitly passed empty collection was
        # silently replaced by the default list and could not be used to
        # switch the filter off. Passed objects are stored as they are.
        self.stopwords = (set(sw.words('english'))
                          if stopwords is None else stopwords)
        self.punct = set(string.punctuation) if punct is None else punct
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def inverse_transform(self, X):
        """Join every token list in ``X`` back into one space-separated string."""
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        """Return one list of lemmas per document in ``X``."""
        return [
            list(self.tokenize(doc)) for doc in X
        ]

    def tokenize(self, document):
        """Yield the lemmas of all tokens of ``document`` that pass the filters."""
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token. The strips are applied
                # one after the other, so their order matters.
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                # If stopword, ignore token and continue
                if token in self.stopwords:
                    continue

                # If punctuation, ignore token and continue. all() over an
                # empty token is True, so tokens emptied by the stripping
                # above are dropped here as well.
                if all(char in self.punct for char in token):
                    continue

                # If the token parses as an integer, ignore and continue
                try:
                    int(token)
                except ValueError:
                    pass
                else:
                    continue

                # If token contains fig, ignore and continue.
                # NOTE(review): this is a substring test, so words such as
                # "configuration" or "significant" are dropped too - confirm
                # that this is intended and not only meant for "fig"/"figure".
                if 'fig' in token:
                    continue

                # If token has length one, ignore and continue
                if len(token) == 1:
                    continue

                # Lemmatize the token and yield
                lemma = self.lemmatize(token, tag)
                yield lemma

    def lemmatize(self, token, tag):
        """Lemmatize ``token`` using the WordNet POS derived from ``tag``.

        Only the first character of the tag is inspected; tags that do not
        start with N, V, R or J are treated as nouns.
        """
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)


@timeit
def build_and_evaluate(X, y, classifier=ExtraTreesClassifier,
                       n_splits=2, test_size=0.2, seed=SEED,
                       outpath=None, verbose=True, debug=False):
    """Evaluate a text-classification pipeline on stratified splits and fit it.

    Parameters
    ----------
    X : object supporting ``.iloc`` positional indexing (e.g. a pandas Series)
        The raw documents.
    y : array-like
        The class labels; they are encoded with a ``LabelEncoder``.
    classifier : estimator class or instance
        Final step of the pipeline. A class is instantiated without
        arguments; an instance is used as it is (the same object is placed
        in every pipeline built here and refitted each time).
    n_splits, test_size : passed to ``StratifiedShuffleSplit``.
    seed : int or None
        Random state of the splitter and of the random forest used for
        feature selection.
    outpath : str, optional
        If given and ``debug`` is false, the final model is pickled there.
    verbose : bool
        Print progress, the classification report and the kappa score.
    debug : bool
        If true, skip the final fit on all data (and the pickling); the
        model of the last evaluation split is returned instead.

    Returns
    -------
    tuple
        Because of the ``@timeit`` decorator the caller receives
        ``(model, seconds)`` and not the bare model.
    """

    @timeit
    def build(classifier, X, y=None):
        """Build and fit a single pipeline; yields ``(model, seconds)`` via timeit.
        """
        if isinstance(classifier, type):
            classifier = classifier()

        model = Pipeline([
            ('preprocessor', NLTKPreprocessor()),
            # The preprocessor already emits token lists, hence the identity
            # tokenizer and no further lower-casing in the vectorizer.
            ('vectorizer', TfidfVectorizer(
                tokenizer=identity, preprocessor=None, lowercase=False, ngram_range=(1, 4),
                min_df=0.05
            )),
            # Seed the selector forest as well; without it the selected
            # features change between runs although `seed` is fixed.
            ('feature_selection', SelectFromModel(
                RandomForestClassifier(n_jobs=3, random_state=seed),
                threshold='mean'
            )),
            ('classifier', classifier),
        ])

        model.fit(X, y)

        return model

    # Label encode the targets
    labels = LabelEncoder()
    y = labels.fit_transform(y)

    # Begin evaluation
    if verbose:
        print("Building for evaluation")
    sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size,
                                 random_state=seed)
    for train_index, test_index in sss.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model, secs = build(classifier, X_train, y_train)

        if verbose:
            print("Evaluation model fit in {:0.3f} seconds".format(secs))
            print("Classification Report:\n")

        y_pred = model.predict(X_test)
        if verbose:
            print(clsr(y_test, y_pred, target_names=labels.classes_))
            print('Kappa score: ', str(cohen_kappa_score(y_test, y_pred)))

    if not debug:
        if verbose:
            print("Building complete model and saving ...")
        model, secs = build(classifier, X, y)
        # Keep the encoder on the model so predictions can be decoded later.
        model.labels_ = labels

        if verbose:
            print("Complete model fit in {:0.3f} seconds".format(secs))

        if outpath:
            with open(outpath, 'wb') as f:
                pickle.dump(model, f)

            # Respect `verbose` like every other message of this function.
            if verbose:
                print("Model written out to {}".format(outpath))

    return model


def show_most_informative_features(model, text=None, n=20, outpath=None):
    """Tabulate the highest and lowest weighted features of a fitted pipeline.

    Parameters
    ----------
    model : pipeline
        Must expose ``named_steps`` with the entries ``'vectorizer'``,
        ``'feature_selection'`` and ``'classifier'``.
    text : str, optional
        If given, the weights are taken from ``model.transform([text])``
        and the prediction for ``text`` is prepended to the table.
        Otherwise the classifier's ``coef_`` (preferred) or
        ``feature_importances_`` is used.
    n : int
        Number of rows, i.e. features shown per column.
    outpath : str, optional
        If given, the table is also written to this file.

    Returns
    -------
    str
        One line per row: the left column lists the ``n`` largest weights in
        descending order, the right column the ``n`` smallest in ascending
        order.

    Raises
    ------
    KeyError
        If one of the required pipeline steps is missing.
    TypeError
        If the classifier has neither ``coef_`` nor ``feature_importances_``.
    """
    # Extract the relevant steps from the pipeline. A missing step raises
    # KeyError right here; the former ``except AttributeError: raise``
    # around this lookup had no effect.
    steps = model.named_steps
    vectorizer = steps['vectorizer']
    classifier = steps['classifier']
    feature_selection = steps['feature_selection']

    # Check to make sure that we can perform this computation
    has_coef = hasattr(classifier, 'coef_')
    if not (has_coef or hasattr(classifier, 'feature_importances_')):
        raise TypeError(
            "Cannot compute most informative features on {}.".format(
                classifier.__class__.__name__
            )
        )

    if text is not None:
        # Compute the coefficients for the text
        tvec = model.transform([text]).toarray()
    elif has_coef:
        # Otherwise simply use the coefficients
        tvec = classifier.coef_
    else:
        tvec = classifier.feature_importances_

    # Newer scikit-learn releases only provide get_feature_names_out, older
    # ones only get_feature_names; support both.
    get_names = getattr(vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vectorizer.get_feature_names

    # Determine the names from features which pass feature selection
    names = list(compress(get_names(), feature_selection.get_support()))

    # The weights are either two-dimensional (use the first row) or flat.
    # Only this probe is guarded, so a TypeError raised while sorting is no
    # longer swallowed.
    first = tvec[0]
    try:
        weights = list(first)
    except TypeError:
        weights = list(tvec)

    # Zip the feature names with the coefs and sort
    coefs = sorted(zip(weights, names), key=itemgetter(0), reverse=True)

    # Get the top n and bottom n coef, name pairs
    topn = zip(coefs[:n], coefs[:-(n + 1):-1])

    # Create the output string to return
    output = []

    # If text, add the predicted value to the output.
    if text is not None:
        output.append("\"{}\"".format(text))
        output.append(
            "Classified as: {}".format(model.predict([text]))
        )
        output.append("")

    # Two columns: highest weighted features left, lowest weighted right.
    for (cp, fnp), (cn, fnn) in topn:
        output.append(
            "{:0.4f}{: >20}\t\t{:0.4f}{: >20}".format(
                cp, fnp, cn, fnn
            )
        )

    table = "\n".join(output)

    if outpath:
        with open(outpath, 'w') as f:
            f.write(table)

    return table

In [11]:
# Stand-alone pipeline with the same four steps that build_and_evaluate
# assembles: preprocessing, tf-idf vectorizing, feature selection and the
# classifier (not fitted in this cell).
model = Pipeline(steps=[
    ('preprocessor', NLTKPreprocessor()),
    (
        'vectorizer',
        TfidfVectorizer(
            tokenizer=identity,
            preprocessor=None,
            lowercase=False,
            ngram_range=(1, 4),
            min_df=0.05,
        ),
    ),
    (
        'feature_selection',
        SelectFromModel(RandomForestClassifier(n_jobs=3), threshold='mean'),
    ),
    ('classifier', ExtraTreesClassifier()),
])

In [14]:
# isinstance expects the object first and the type second; the swapped
# arguments raised "arg 2 must be a type or tuple of types".
isinstance(model, Pipeline)

True

In [85]:
import pandas as pd
import os


# Open the prepared HDF store (the path is relative to the working directory),
# show which keys it holds and load the table stored under 'bh2007'.
with pd.HDFStore(
    os.path.join('..', '..', 'bld', 'out', 'data', 'db_data_preparation.hdf')
) as store:
    print(store.keys())
    bh_class = store['bh2007']

['/bh2007', '/patents_catalogue']


In [86]:
# Targets: turn the boolean manual classification into readable class names.
y = bh_class['classification_manual'].map({True: 'Software', False: 'Non-Software'})
# Inputs: the description text of each patent.
x = bh_class['description']

In [172]:
PATH = 'model.pkl'

clf = ExtraTreesClassifier(
    n_estimators=1000,
    max_features=0.3,
    n_jobs=3,
)
# build_and_evaluate is wrapped by @timeit, so `model` is a
# (pipeline, seconds) tuple - the pipeline itself is model[0].
# With debug=True the final fit on all data and the pickling are skipped,
# so nothing is written to PATH here.
model = build_and_evaluate(x, y, classifier=clf, outpath=PATH, debug=True)

Building for evaluation
Evaluation model fit in 217.364 seconds
Classification Report:

              precision    recall  f1-score   support

Non-Software       0.95      1.00      0.97        69
    Software       1.00      0.64      0.78        11

 avg / total       0.95      0.95      0.95        80

Kappa score:  0.751166407465
Evaluation model fit in 205.573 seconds
Classification Report:

              precision    recall  f1-score   support

Non-Software       0.91      1.00      0.95        69
    Software       1.00      0.36      0.53        11

 avg / total       0.92      0.91      0.89        80

Kappa score:  0.496402877698


In [173]:
# Boolean mask of the features kept by the selection step:
# (shape of the mask, number of kept features).
support_mask = model[0].named_steps['feature_selection'].get_support()
support_mask.shape, support_mask.sum()

((3192,), 169)

In [175]:
# model[0] is the fitted pipeline (model is the tuple returned through timeit).
feature_table = show_most_informative_features(model[0], n=20)
print(feature_table)

0.0665            computer		0.0003            elongate
0.0603            software		0.0005     invention apply
0.0552             program		0.0005          extraction
0.0411             command		0.0006          protection
0.0406             whether		0.0006             loading
0.0293           implement		0.0006          downwardly
0.0267     invention block		0.0007             respond
0.0226            data use		0.0008               press
0.0179                time		0.0008               touch
0.0159        store memory		0.0009             defined
0.0153           determine		0.0010                pull
0.0147      microprocessor		0.0010       consideration
0.0139          system use		0.0011        manufacturer
0.0135             display		0.0011                 cap
0.0131              memory		0.0012           fabricate
0.0129              detect		0.0013         engineering
0.0126invention block diagram		0.0013              induce
0.0120          determines		0.0013            energize
0.0113 