In [2]:
import os

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.cross_validation import cross_val_score
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from skll.metrics import kappa

In [3]:
# Directory that holds the pickled samples. The default is the original
# author's machine-specific location; set the PATENT_DATA_DIR environment
# variable to run the notebook elsewhere without editing this cell.
DATA_DIR = os.environ.get(
    "PATENT_DATA_DIR",
    r"C:\Users\Tobias\OneDrive\Bachelor Thesis\Empirical Look at Software Patents\test",
)
# Every relative file name in this notebook (the reads below and the
# to_pickle call in the "Creating the DataFrame" cell) resolves against it.
os.chdir(DATA_DIR)

# SECURITY: unpickling executes arbitrary code embedded in the file, so only
# load pickles that you created yourself or otherwise trust.
# The frames keep all their columns (no reduction to label/text) because
# later cells also read `software_man_class` and `classnr`.
dftrain = pd.read_pickle("train_sample.pickle")
dftest = pd.read_pickle("test_sample.pickle")
df = pd.read_pickle("classified_sample_wtext.pickle")
# This file is written by the "Creating the DataFrame" cell further down, so
# that cell must have been run once before this read can succeed.
dfover = pd.read_pickle("real_oversampling_400+luksoft.pickle")

# Wrap skll's kappa metric so it can be passed as `scoring=` to
# cross_val_score and GridSearchCV.
kappa_scorer = make_scorer(kappa)

# Using only 400

## CV with StratifiedKFold

In [4]:
### Pipelining
# Word n-gram counts -> chi2 feature selection -> random forest.
# Steps tried earlier and currently disabled: DenseTransformer,
# TfidfVectorizer(max_df=0.8), StandardScaler(with_mean=False),
# PCA(n_components=300), TfidfTransformer(sublinear_tf=True).
# Forest options tried and disabled: max_depth=10, min_samples_leaf=5,
# oob_score=True, warm_start=True.
text_clf = Pipeline(
    steps=[
        (
            "vect",
            CountVectorizer(analyzer="word", max_df=0.7, ngram_range=(1, 4)),
        ),
        ("sel", SelectKBest(score_func=chi2, k=200)),
        (
            "clf",
            RandomForestClassifier(
                criterion="gini",
                max_features=0.3,
                n_estimators=5000,
            ),
        ),
    ]
)


### Cross-Validation
# Ten shuffled, stratified folds with a fixed seed.
# NOTE(review): the folds are stratified on `label` while the target passed
# to cross_val_score is `software_man_class` -- presumably the same classes
# in two encodings; confirm.
cv = StratifiedKFold(
    dftest.label, n_folds=10, shuffle=True, random_state=6, indices=None
)
# One kappa score per fold, evaluated with three parallel workers.
scores_f = cross_val_score(
    text_clf,
    dftest.text.values,
    dftest.software_man_class.values,
    scoring=kappa_scorer,
    cv=cv,
    n_jobs=3,
)

[ 0.63063063  0.8047619   0.46052632  0.63063063  0.68421053  0.44680851
  1.          0.87459807  0.72340426  0.62379421]
0.687936505704


## Grid-Search

In [4]:
class DenseTransformer:
    """Stateless pipeline step that turns a sparse matrix into a dense array.

    Intended to sit between a sparse-output vectoriser and an estimator that
    needs dense input. Nothing is learned from the data.
    """

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        ``y`` and ``fit_params`` are accepted for signature compatibility and
        ignored.
        """
        # ``toarray()`` yields a plain ndarray, whereas ``todense()`` returns
        # the legacy ``np.matrix`` type.
        if hasattr(X, "toarray"):
            return X.toarray()
        # Input that is already dense is passed through as an ndarray.
        return np.asarray(X)

    def fit_transform(self, X, y=None, **fit_params):
        """Call ``fit`` and then return ``transform(X)``."""
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        """No-op: there is nothing to fit. Returns ``self``."""
        return self


# Search space handed to GridSearchCV. Every candidate grid is currently
# disabled, so the dict is empty; the entries are kept for reference.
parameters = {
    # "vect__max_df": (0.7, 0.8),
    # "vect__min_df": (0),
    # "vect__stop_words": ("english", None),
    # "vect__analyzer": ("word", None),
    # "vect__strip_accents": ("unicode", None),
    # "vect__ngram_range": ((1, 4), (1, 5)),
    # "scale__with_std": (True, False),
    # "sel__score_func": (chi2, f_classif),
    # "sel__k": (150, 200, 250),
    # "pca__n_components": (100, 200, 500),
    # "pca_whiten": (True, False),  # the other keys use "<step>__<param>"
    # "clf__n_estimators": (2000, 5000, 10000),
    # "clf__criterion": ("gini", "entropy"),
    # "clf__max_depth": (10, 11, 12, 15, 20, 25),
    # "clf__min_samples_leaf": (1, 3, 5, 7),
    # "clf__max_features": (2, 2.75, 0.3, 3.25, 3.5, 4),
}

# Word n-gram counts -> chi2 feature selection -> depth-limited random forest.
# Disabled steps: StandardScaler(copy=True, with_mean=False, with_std=True)
# after "vect"; DenseTransformer + PCA(n_components=500) after "sel";
# min_df=0 on the vectoriser.
text_clf = Pipeline(
    steps=[
        (
            "vect",
            CountVectorizer(ngram_range=(1, 4), analyzer="word", max_df=0.7),
        ),
        ("sel", SelectKBest(k=200, score_func=chi2)),
        (
            "clf",
            RandomForestClassifier(
                n_jobs=-1,
                max_depth=10,
                max_features=0.3,
                min_samples_leaf=5,
                criterion="gini",
                n_estimators=5000,
            ),
        ),
    ]
)

# Five shuffled stratified folds; no seed, so the folds differ between runs.
cvg = StratifiedKFold(
    dftest.label, n_folds=5, shuffle=True, random_state=None, indices=None
)
# ATTENTION (jobs): the search asks for 3 workers and the forest inside the
# pipeline asks for all cores (n_jobs=-1).
gs_clf = GridSearchCV(
    text_clf,
    parameters,
    cv=cvg,
    scoring=kappa_scorer,
    error_score=0,
    n_jobs=3,
)
gs_clf = gs_clf.fit(dftest.text.values, dftest.software_man_class.values)

# Keep the grid entry whose element at position 1 is largest (presumably the
# mean validation score -- confirm against the installed scikit-learn).
best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda entry: entry[1])
# The loop body is empty, so nothing is reported for the tuned parameters.
for _param_name in sorted(parameters):
    pass

clf__oob_score: False


## Output Confusion Matrix

In [6]:
class DenseTransformer:
    """Stateless pipeline step that turns a sparse matrix into a dense array.

    Intended to sit between a sparse-output vectoriser and an estimator that
    needs dense input. Nothing is learned from the data.
    """

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        ``y`` and ``fit_params`` are accepted for signature compatibility and
        ignored.
        """
        # ``toarray()`` yields a plain ndarray, whereas ``todense()`` returns
        # the legacy ``np.matrix`` type.
        if hasattr(X, "toarray"):
            return X.toarray()
        # Input that is already dense is passed through as an ndarray.
        return np.asarray(X)

    def fit_transform(self, X, y=None, **fit_params):
        """Call ``fit`` and then return ``transform(X)``."""
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        """No-op: there is nothing to fit. Returns ``self``."""
        return self


def func(a) -> int:
    """Encode a class label as an integer: 0 for "nonsoftware", 1 otherwise."""
    return 0 if a == "nonsoftware" else 1


# Word n-gram counts -> chi2 feature selection -> random forest on all cores.
# Disabled alternatives: DenseTransformer, TfidfVectorizer(max_df=0.8),
# PCA(n_components=300), TfidfTransformer(sublinear_tf=True), and
# max_depth=10 / min_samples_leaf=5 on the forest.
text_clf = Pipeline(
    steps=[
        (
            "vect",
            CountVectorizer(analyzer="word", ngram_range=(1, 4), max_df=0.7),
        ),
        ("sel", SelectKBest(score_func=chi2, k=200)),
        (
            "clf",
            RandomForestClassifier(
                n_jobs=-1,
                max_features=0.3,
                criterion="gini",
                n_estimators=5000,
            ),
        ),
    ]
)


# Ten random stratified 90/10 splits with a fixed seed.
sss = StratifiedShuffleSplit(
    dftest.label.values, test_size=0.1, n_iter=10, random_state=10
)
for train_index, test_index in sss:
    x_train = dftest.text.iloc[train_index]
    x_test = dftest.text.iloc[test_index]
    y_train = dftest.label.iloc[train_index]
    y_test = dftest.label.iloc[test_index]

    # Refit on this split's training part, then label the held-out part.
    classifier = text_clf.fit(x_train, y_train)
    predicted = text_clf.predict(x_test)

    # Recode the string labels as 0 ("nonsoftware") / 1 (anything else).
    vfunc = np.vectorize(func)
    predicted_bool, y_test_bool = vfunc(predicted), vfunc(y_test)

             precision    recall  f1-score   support

nonsoftware       0.97      1.00      0.99        35
   software       1.00      0.80      0.89         5

avg / total       0.98      0.97      0.97        40

[[35  0]
 [ 1  4]]
0.875
------------------------------------------
             precision    recall  f1-score   support

nonsoftware       0.97      1.00      0.99        35
   software       1.00      0.80      0.89         5

avg / total       0.98      0.97      0.97        40

[[35  0]
 [ 1  4]]
0.875
------------------------------------------
             precision    recall  f1-score   support

nonsoftware       0.97      1.00      0.99        35
   software       1.00      0.80      0.89         5

avg / total       0.98      0.97      0.97        40

[[35  0]
 [ 1  4]]
0.875
------------------------------------------
             precision    recall  f1-score   support

nonsoftware       0.97      0.97      0.97        35
   software       0.80      0.80      0.80  

# Using 400 and Lukas

## CV with StratifiedKFold

In [47]:
### Pipelining
# Word n-gram counts -> chi2 feature selection -> depth-limited random forest.
# Disabled alternatives: DenseTransformer, TfidfVectorizer(max_df=0.8),
# PCA(n_components=300), TfidfTransformer(sublinear_tf=True).
text_clf = Pipeline(
    steps=[
        (
            "vect",
            CountVectorizer(analyzer="word", max_df=0.7, ngram_range=(1, 4)),
        ),
        ("sel", SelectKBest(score_func=chi2, k=200)),
        (
            "clf",
            RandomForestClassifier(
                n_jobs=-1,
                min_samples_leaf=5,
                max_features=0.3,
                max_depth=10,
                criterion="gini",
                n_estimators=5000,
            ),
        ),
    ]
)


### Cross-Validation
# Ten shuffled stratified folds over `df`; no seed, so the folds (and the
# scores) differ between runs.
cv = StratifiedKFold(
    df.label, n_folds=10, shuffle=True, random_state=None, indices=None
)
# One kappa score per fold. Three CV workers run while the forest itself
# requests all cores (n_jobs=-1).
scores_f = cross_val_score(
    text_clf,
    df.text.values,
    df.software_man_class.values,
    scoring=kappa_scorer,
    cv=cv,
    n_jobs=3,
)

[ 0.37292162  0.43083004  0.34246575  0.25890736  0.31428571  0.43083004
  0.75460123  0.40344168  0.37292162  0.52473596]
0.42059410146


This does not work. I would attribute all the additional bias to the difference in definitions.

## Output Confusion Matrix

In [None]:
class DenseTransformer:
    """Stateless pipeline step that turns a sparse matrix into a dense array.

    Intended to sit between a sparse-output vectoriser and an estimator that
    needs dense input. Nothing is learned from the data.
    """

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        ``y`` and ``fit_params`` are accepted for signature compatibility and
        ignored.
        """
        # ``toarray()`` yields a plain ndarray, whereas ``todense()`` returns
        # the legacy ``np.matrix`` type.
        if hasattr(X, "toarray"):
            return X.toarray()
        # Input that is already dense is passed through as an ndarray.
        return np.asarray(X)

    def fit_transform(self, X, y=None, **fit_params):
        """Call ``fit`` and then return ``transform(X)``."""
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        """No-op: there is nothing to fit. Returns ``self``."""
        return self


def func(a) -> int:
    """Map the label "nonsoftware" to 0 and every other value to 1."""
    if not (a == "nonsoftware"):
        return 1
    return 0


# Word n-gram counts -> chi2 feature selection -> depth-limited random forest.
# Disabled alternatives: DenseTransformer, TfidfVectorizer(max_df=0.8),
# PCA(n_components=300), TfidfTransformer(sublinear_tf=True).
text_clf = Pipeline(
    [
        # FIX: the analyzer was spelled "words", which CountVectorizer does not
        # accept; "word" is the value used by every other pipeline here.
        ("vect", CountVectorizer(max_df=0.7, ngram_range=(1, 4), analyzer="word")),
        ("sel", SelectKBest(chi2, k=200)),
        (
            "clf",
            RandomForestClassifier(
                n_estimators=1000,
                criterion="gini",
                max_depth=10,
                max_features=0.3,
                min_samples_leaf=5,
                n_jobs=-1,
            ),
        ),
    ]
)


# Ten random stratified 80/20 splits; no seed, so splits differ between runs.
sss = StratifiedShuffleSplit(
    df.label.values, n_iter=10, test_size=0.2, random_state=None
)
# Label recoder (0 for "nonsoftware", 1 otherwise); built once because it does
# not depend on the split.
vfunc = np.vectorize(func)
for train_index, test_index in sss:
    x_train, x_test = df.text.iloc[train_index], df.text.iloc[test_index]
    y_train, y_test = df.label.iloc[train_index], df.label.iloc[test_index]

    # Refit on this split's training part, then label the held-out part.
    classifier = text_clf.fit(x_train, y_train)
    predicted = text_clf.predict(x_test)

    predicted_bool = vfunc(predicted)
    y_test_bool = vfunc(y_test)

# Only Lukas

## CV with StratifiedKFold

In [48]:
### Pipelining
# Word n-gram counts -> chi2 feature selection -> depth-limited random forest.
# Disabled alternatives: DenseTransformer, TfidfVectorizer(max_df=0.8),
# PCA(n_components=300), TfidfTransformer(sublinear_tf=True).
text_clf = Pipeline(
    steps=[
        (
            "vect",
            CountVectorizer(max_df=0.7, analyzer="word", ngram_range=(1, 4)),
        ),
        ("sel", SelectKBest(k=200, score_func=chi2)),
        (
            "clf",
            RandomForestClassifier(
                criterion="gini",
                n_estimators=5000,
                max_features=0.3,
                max_depth=10,
                min_samples_leaf=5,
                n_jobs=-1,
            ),
        ),
    ]
)


### Cross-Validation
# Ten shuffled stratified folds over `dftrain`; unseeded, so not repeatable.
cv = StratifiedKFold(
    dftrain.label, n_folds=10, shuffle=True, random_state=None, indices=None
)
# One kappa score per fold (3 CV workers; the forest uses n_jobs=-1).
scores_f = cross_val_score(
    text_clf,
    dftrain.text.values,
    dftrain.software_man_class.values,
    n_jobs=3,
    scoring=kappa_scorer,
    cv=cv,
)

[-0.0326087   0.45714286 -0.0326087   0.          0.13103448  0.32413793
 -0.08617594  0.46002805 -0.03340292  0.15057915]
0.13381262154


Devastating! Of course, the classifier is not set up for this task, but identifying latent topics such as automation is very difficult.

# REAL OVERSAMPLING - 400 and Lukas Software Patents

## CV with StratifiedKFold

In [101]:
### Pipelining
# Word n-gram counts -> chi2 feature selection -> depth-limited random forest.
# Disabled alternatives: DenseTransformer, TfidfVectorizer(max_df=0.8),
# PCA(n_components=300), TfidfTransformer(sublinear_tf=True).
text_clf = Pipeline(
    steps=[
        (
            "vect",
            CountVectorizer(ngram_range=(1, 4), analyzer="word", max_df=0.7),
        ),
        ("sel", SelectKBest(score_func=chi2, k=200)),
        (
            "clf",
            RandomForestClassifier(
                n_estimators=5000,
                n_jobs=-1,
                criterion="gini",
                min_samples_leaf=5,
                max_depth=10,
                max_features=0.3,
            ),
        ),
    ]
)


### Cross-Validation
# Ten shuffled stratified folds over the oversampled frame `dfover`;
# unseeded, so the folds differ between runs.
cv = StratifiedKFold(
    dfover.label, n_folds=10, shuffle=True, random_state=None, indices=None
)
# One kappa score per fold (3 CV workers; the forest uses n_jobs=-1).
scores_f = cross_val_score(
    text_clf,
    dfover.text.values,
    dfover.software_man_class.values,
    cv=cv,
    n_jobs=3,
    scoring=kappa_scorer,
)

[ 0.6259542   0.47058824  0.671875    0.608       0.47058824  0.77981651
  0.51569507  0.4679803   0.53648069  0.78947368]
0.593645191656


Nice try! This kind of oversampling is not working. Of course, I used the same classifier as above, and maybe some grid search will help. The problem might be that these software patents are very specific: since overfitting is already one of my problems, it is now working in a different direction, towards Lukas' software patents. Given the number of software patents (bh2007: 54, Lukas: ~85), this might be one explanation.

## Creating the DataFrame

In [90]:
# Columns kept in the oversampled frame, in output order.
over_columns = [
    "highly_uncertain",
    "patentnr",
    "software_man_class",
    "classnr",
    "week",
    "year",
    "label",
    "text",
]

# Rows of `dftrain` labelled "software", reduced to the shared columns and
# re-indexed from zero.
dftrain_1 = dftrain[dftrain.label == "software"]
dftrain_1 = dftrain_1[over_columns].reset_index(drop=True)

# Append those rows to all of `dftest`, renumber, then apply the column
# selection to the combined frame.
df_over = pd.concat([dftest, dftrain_1], axis=0, ignore_index=True)
df_over = df_over.reset_index(drop=True)[over_columns]

# This is the file the setup cell at the top reads back as `dfover` (note the
# different variable name there).
df_over.to_pickle("real_oversampling_400+luksoft.pickle")

# Including patent classes

In [None]:
class ItemSelector(BaseEstimator, TransformerMixin):
    """For data grouped by feature, select the subset of data at a given key.

    The data is expected to be stored in a 2D data structure, where the first
    index is over features and the second is over samples, i.e.

    >> len(data[key]) == n_samples

    Please note that this is the opposite convention to sklearn feature
    matrices (where the first index corresponds to sample).

    ItemSelector only requires that the collection implement getitem
    (data[key]). Examples include: a dict of lists, 2D numpy array, Pandas
    DataFrame, numpy record array, etc.

    >> data = {'a': [1, 5, 2, 5, 2, 8],
               'b': [9, 4, 1, 4, 1, 3]}
    >> ds = ItemSelector(key='a')
    >> data['a'] == ds.transform(data)

    ItemSelector is not designed to handle data grouped by sample (e.g. a
    list of dicts). If your data is structured this way, consider a
    transformer along the lines of `sklearn.feature_extraction.DictVectorizer`.

    Parameters
    ----------
    key : hashable, required
        The key corresponding to the desired value in a mappable.
    """

    def __init__(self, key) -> None:
        # Stored under the constructor argument's own name.
        self.key = key

    def fit(self, x, y=None):
        """Nothing is learned; return the selector itself."""
        return self

    def transform(self, data_dict):
        """Return ``data_dict[self.key]`` unchanged."""
        selected = data_dict[self.key]
        return selected


# Random forest on the union of two feature branches:
#   * "classnr": the raw `classnr` column picked out by ItemSelector;
#   * "text":    word n-gram counts of `text`, reduced to 200 features by chi2.
# NOTE(review): the "classnr" branch hands the selected column to FeatureUnion
# without any vectorising or reshaping step -- confirm that it can be stacked
# next to the text features.
# Disabled forest options: oob_score=True, warm_start=True.
patent_clf = Pipeline(
    steps=[
        (
            "union",
            FeatureUnion(
                transformer_list=[
                    (
                        "classnr",
                        Pipeline(steps=[("selector", ItemSelector(key="classnr"))]),
                    ),
                    (
                        "text",
                        Pipeline(
                            steps=[
                                ("selector", ItemSelector(key="text")),
                                (
                                    "vect",
                                    CountVectorizer(
                                        analyzer="word",
                                        max_df=0.7,
                                        ngram_range=(1, 4),
                                    ),
                                ),
                                ("sel", SelectKBest(score_func=chi2, k=200)),
                            ]
                        ),
                    ),
                ],
            ),
        ),
        (
            "clf",
            RandomForestClassifier(
                min_samples_leaf=5,
                max_features=0.3,
                max_depth=10,
                criterion="gini",
                n_estimators=5000,
            ),
        ),
    ]
)

# Fit on the two input columns of `dftest`, then predict on the same rows
# that were used for fitting.
patent_clf = patent_clf.fit(
    dftest[["classnr", "text"]],
    dftest.software_man_class.values,
)
y = patent_clf.predict(dftest[["classnr", "text"]])

### Cross-Validation
##### Define CV
# cv = StratifiedKFold(dftest.label, n_folds=10, indices=None, shuffle=True, random_state=None)
# scores_f = cross_val_score(patent_clf, dftest[['text', 'classnr']], dftest.software_man_class.values, cv=cv, scoring=kappa_scorer, n_jobs=1)
# print(scores_f)
# print(scores_f.mean())

# PCA analysis

In [9]:
# Unigram count matrix of the `dftest` texts (terms occurring in more than
# 70% of the documents are dropped via max_df).
data = CountVectorizer(
    analyzer="word", max_df=0.7, ngram_range=(1, 1)
).fit_transform(dftest.text.values)

In [10]:
# Densify the count matrix for the PCA fit in the next cell. ``toarray()``
# gives a plain ndarray (``todense()`` returned the legacy ``np.matrix``), and
# the guard keeps this cell safe to re-run once ``data`` is already dense.
data = data.toarray() if hasattr(data, "toarray") else np.asarray(data)

(399, 39623)


In [11]:
# Fit a 200-component PCA on the dense count matrix.
pca = PCA(n_components=200)
pca = pca.fit(data)

In [13]:
# Display the explained-variance ratio of each fitted component as the cell
# output (one value per component of the 200-component PCA fitted above).
pca.explained_variance_ratio_

array([ 0.10582409,  0.10481709,  0.04807325,  0.0396829 ,  0.03566621,
        0.0258221 ,  0.01914194,  0.01700291,  0.01588821,  0.01379683,
        0.01341883,  0.012811  ,  0.01217448,  0.01171879,  0.01158666,
        0.01095529,  0.01070691,  0.00968206,  0.00931362,  0.00904757,
        0.00845281,  0.00822351,  0.00803343,  0.00783608,  0.00752051,
        0.00708226,  0.00690281,  0.00685378,  0.00669231,  0.00662687,
        0.00637655,  0.00634484,  0.00620333,  0.00601109,  0.00586248,
        0.00584506,  0.00568548,  0.00547855,  0.00536456,  0.00515166,
        0.00493291,  0.0048548 ,  0.00483365,  0.0048139 ,  0.00471311,
        0.00458722,  0.0045315 ,  0.00438892,  0.00426935,  0.00416489,
        0.00402159,  0.00398604,  0.0039372 ,  0.00386506,  0.00372854,
        0.00368302,  0.00359796,  0.00354931,  0.00347091,  0.00343143,
        0.00338491,  0.00330203,  0.00327289,  0.00311969,  0.00305027,
        0.00298252,  0.00292361,  0.00288909,  0.00285151,  0.00

In [6]:
class DenseTransformer:
    """Stateless pipeline step that turns a sparse matrix into a dense array.

    Used below between the count vectoriser and the scaler/PCA steps.
    Nothing is learned from the data.
    """

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        ``y`` and ``fit_params`` are accepted for signature compatibility and
        ignored.
        """
        # ``toarray()`` yields a plain ndarray, whereas ``todense()`` returns
        # the legacy ``np.matrix`` type.
        if hasattr(X, "toarray"):
            return X.toarray()
        # Input that is already dense is passed through as an ndarray.
        return np.asarray(X)

    def fit_transform(self, X, y=None, **fit_params):
        """Call ``fit`` and then return ``transform(X)``."""
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        """No-op: there is nothing to fit. Returns ``self``."""
        return self


### Pipelining
# Unigram counts -> dense array -> variance scaling (no centring) ->
# 175 principal components -> depth-limited random forest.
# Disabled alternatives: TfidfVectorizer(max_df=0.8),
# SelectKBest(chi2, k=200), TfidfTransformer(sublinear_tf=True).
text_clf = Pipeline(
    steps=[
        (
            "vect",
            CountVectorizer(analyzer="word", max_df=0.7, ngram_range=(1, 1)),
        ),
        ("to_dense", DenseTransformer()),
        ("scale", StandardScaler(with_std=True, with_mean=False, copy=True)),
        ("pca", PCA(n_components=175)),
        (
            "clf",
            RandomForestClassifier(
                n_jobs=-1,
                min_samples_leaf=5,
                max_features=0.3,
                max_depth=10,
                criterion="gini",
                n_estimators=5000,
            ),
        ),
    ]
)


### Cross-Validation
# Ten shuffled stratified folds; unseeded, so the folds differ between runs.
cv = StratifiedKFold(
    dftest.label, n_folds=10, shuffle=True, random_state=None, indices=None
)
# Run single-process (n_jobs=1): the author noted problems with
# multiprocessing for this pipeline.
scores_f = cross_val_score(
    text_clf,
    dftest.text.values,
    dftest.software_man_class.values,
    scoring=kappa_scorer,
    cv=cv,
    n_jobs=1,
)

  "got %s" % (estimator, X.dtype))
  "got %s" % (estimator, X.dtype))
  "got %s" % (estimator, X.dtype))
  "got %s" % (estimator, X.dtype))
  "got %s" % (estimator, X.dtype))
  "got %s" % (estimator, X.dtype))
  "got %s" % (estimator, X.dtype))


KeyboardInterrupt: 

In [None]:
class DenseTransformer:
    """Stateless pipeline step that turns a sparse matrix into a dense array.

    Used below between the count vectoriser and the PCA step. Nothing is
    learned from the data.
    """

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        ``y`` and ``fit_params`` are accepted for signature compatibility and
        ignored.
        """
        # ``toarray()`` yields a plain ndarray, whereas ``todense()`` returns
        # the legacy ``np.matrix`` type.
        if hasattr(X, "toarray"):
            return X.toarray()
        # Input that is already dense is passed through as an ndarray.
        return np.asarray(X)

    def fit_transform(self, X, y=None, **fit_params):
        """Call ``fit`` and then return ``transform(X)``."""
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        """No-op: there is nothing to fit. Returns ``self``."""
        return self


# Search space handed to GridSearchCV. Every candidate grid is currently
# disabled, so the dict is empty; the entries are kept for reference.
parameters = {
    # "vect__max_df": (0.7, 0.8),
    # "vect__min_df": (0),
    # "vect__stop_words": ("english", None),
    # "vect__analyzer": ("word", None),
    # "vect__strip_accents": ("unicode", None),
    # "vect__ngram_range": ((1, 4), (1, 5)),
    # "sel__score_func": (chi2, f_classif),
    # "sel__k": (150, 200, 250),
    # "pca__n_components": (100, 200, 500),
    # "pca_whiten": (True, False),  # the other keys use "<step>__<param>"
    # "clf__n_estimators": (2000, 5000, 10000),
    # "clf__criterion": ("gini", "entropy"),
    # "clf__max_depth": (10, 11, 12, 15, 20, 25),
    # "clf__min_samples_leaf": (1, 3, 5, 7),
    # "clf__max_features": (2, 2.75, 0.3, 3.25, 3.5, 4),
}

# Word n-gram counts -> dense array -> 175 principal components ->
# depth-limited random forest.
# Disabled: min_df=0 on the vectoriser and a SelectKBest(chi2, k=200) step
# ahead of the densification.
text_clf = Pipeline(
    steps=[
        (
            "vect",
            CountVectorizer(ngram_range=(1, 4), analyzer="word", max_df=0.7),
        ),
        ("to_dense", DenseTransformer()),
        ("pca", PCA(n_components=175)),
        (
            "clf",
            RandomForestClassifier(
                n_jobs=-1,
                max_depth=10,
                max_features=0.3,
                min_samples_leaf=5,
                criterion="gini",
                n_estimators=1000,
            ),
        ),
    ]
)

# Ten shuffled stratified folds; no seed, so the folds differ between runs.
cvg = StratifiedKFold(
    dftest.label, n_folds=10, shuffle=True, random_state=None, indices=None
)
# ATTENTION (jobs): the search asks for 3 workers and the forest inside the
# pipeline asks for all cores (n_jobs=-1).
gs_clf = GridSearchCV(
    text_clf,
    parameters,
    cv=cvg,
    scoring=kappa_scorer,
    error_score=0,
    n_jobs=3,
)
gs_clf = gs_clf.fit(dftest.text.values, dftest.software_man_class.values)

# Keep the grid entry whose element at position 1 is largest (presumably the
# mean validation score -- confirm against the installed scikit-learn).
best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda entry: entry[1])
# The loop body is empty, so nothing is reported for the tuned parameters.
for _param_name in sorted(parameters):
    pass