# Introduction

This notebook sets up a framework for applying machine learning algorithms to the sample of Bessen and Hunt (2007), which is currently my only tagged patent corpus.

The structure is a pipeline, inspired by scikit-learn's pipelines, which goes through all steps from text processing to prediction. The design is combined with a randomized grid search to find the best parameters for the prediction model, and with cross-validation to guard against overfitting.

The major problem is the sample size of only 400 patents, of which only about 40 are manually labelled as software patents. We have to alleviate this obstacle with suitable techniques or additional datasets.

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import cohen_kappa_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer

# Seed handed to the randomized search as `random_state` so that the sampled
# parameter settings are the same on every run.
SEED = 123

# Cohen's kappa wrapped as a scorer object so it can be used as the `scoring`
# argument of the hyper-parameter search.
kappa_scorer = make_scorer(cohen_kappa_score)

In [2]:
# Location of the analysis database, relative to this notebook.
db_path = os.path.join('..', '..', 'bld', 'out', 'data', 'db_analysis.hdf')

# Open read-only: the default mode ('a') needs write access and silently
# creates an empty file when the path is wrong, which would only show up
# later as a missing key instead of a clear "file not found" error.
with pd.HDFStore(db_path, mode='r') as store:
    # List the tables in the store as a quick sanity check.
    print(store.keys())
    # The manually classified sample of Bessen and Hunt (2007).
    bh_class = store.get('bh2007_classified')

['/bh2007_classified', '/patents_catalogue_classified']


In [3]:
# Features: the `description` column (presumably the patent text - it is fed
# to a text vectorizer further down).
x = bh_class['description']
# Target: the manually assigned classification.
y = bh_class['classification_manual']

In [4]:
# Vectorizer settings that are explored for every candidate model.
vectorizer = {
    'vectorizer__stop_words': ['english', None],
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    # Float-step arange with an exclusive stop: 0.80, 0.84, 0.88, 0.92, 0.96.
    'vectorizer__max_df': np.arange(0.8, 1, 0.04),
#     'vectorizer__min_df': np.arange(0, 0.2, 0.04),
#     'vectorizer__max_features': np.arange(0.8, 1.0, 0.04),
}

# One search space per classifier, already merged with the shared vectorizer
# settings so that nothing has to be mutated inside the search loop.
parameter_candidates = [
    dict(
        vectorizer,
        classifier__alpha=[0.1, 0.5, 1, 2, 3],
    ),
    dict(
        vectorizer,
        classifier__n_estimators=[200, 500, 1000],
        # linspace lands on the end point 1.0 exactly; a float-step arange
        # gives no such guarantee and the fraction must not exceed 1.0.
        classifier__max_features=np.linspace(0.1, 1.0, 10),
    ),
]
naive_bayes_space, random_forest_space = parameter_candidates

# Each model is declared together with its own search space. Pairing them by
# list position instead would hand a model the wrong grid as soon as one of
# the entries is commented in or out.
candidates = [
    (MultinomialNB(), naive_bayes_space),
#     (RandomForestClassifier(random_state=SEED), random_forest_space),
]

# Fitted search with the highest cross-validated kappa seen so far.
best_model = None

for model, search_space in candidates:

    # Raw text -> token counts -> classifier.
    pipeline = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('classifier', model),
    ])

    # Sample 10 parameter settings and score each with 3-fold cross-validation.
    clf = RandomizedSearchCV(pipeline, search_space, cv=3, scoring=kappa_scorer,
                             random_state=SEED, n_jobs=-1, n_iter=10)
    model_fit = clf.fit(x, y)

    print('Best estimator')
    print(model_fit.best_estimator_)
    print('Best score')
    print(model_fit.best_score_)
    print('----------------------------------------------')

    # Keep the first result unconditionally, afterwards only strict improvements.
    if best_model is None or model_fit.best_score_ > best_model.best_score_:
        best_model = model_fit

Best estimator
Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.84000000000000008, max_features=None,
        min_df=1, ngram_range=(1, 2), preprocessor=None,
        stop_words='english', strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)), ('classifier', MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True))])
Best score
0.595276667391
----------------------------------------------
