# Introduction

This notebook replicates the final approach in my bachelor thesis. I cannot exactly replicate the findings but it more or less the same.

In [31]:
import os
import numpy as np
import numpy.testing as npt
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import cohen_kappa_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import metrics

ORIGINAL_RESULTS = [0.875, 0.875, 0.875, 0.771, 0.895,
                    0.875, 0.543, 0.875, 0.448, 0.314]

In [8]:
with pd.HDFStore(os.path.join('..', '..', 'bld', 'out', 'data', 'db_analysis.hdf')) as store:
    print(store.keys())
    bh_class = store.get('bh2007_classified')

['/bh2007_classified', '/patents_catalogue_classified']


In [9]:
y = bh_class.classification_manual
x = bh_class.title.str.cat([bh_class.abstract, bh_class.description], sep='\n')

In [16]:
text_clf = Pipeline([
                    ('vect', CountVectorizer(max_df=0.7, ngram_range=(1,4), analyzer='word')),
                    ('sel', SelectKBest(chi2, k=200)),
                    ('clf', RandomForestClassifier(n_estimators=5000, 
                            criterion='gini', max_features=0.3, n_jobs=-1)),
                    ])
    
scores = []

sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=10)
for train_index, test_index in sss.split(x, y):
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    classifier = text_clf.fit(x_train, y_train)
    predicted = text_clf.predict(x_test)
    
    # Append scores to list
    scores.append(cohen_kappa_score(y_test, predicted))

#     print(metrics.classification_report(y_test, predicted, target_names=['nonsoftware', 'software']))
#     print(metrics.confusion_matrix(y_test, predicted))
#     print(cohen_kappa_score(y_test, predicted))
#     print('------------------------------------------')


             precision    recall  f1-score   support

nonsoftware       0.95      1.00      0.97        35
   software       1.00      0.60      0.75         5

avg / total       0.95      0.95      0.94        40

[[35  0]
 [ 2  3]]
0.724137931034
------------------------------------------
             precision    recall  f1-score   support

nonsoftware       0.97      1.00      0.99        35
   software       1.00      0.80      0.89         5

avg / total       0.98      0.97      0.97        40

[[35  0]
 [ 1  4]]
0.875
------------------------------------------
             precision    recall  f1-score   support

nonsoftware       0.97      1.00      0.99        35
   software       1.00      0.80      0.89         5

avg / total       0.98      0.97      0.97        40

[[35  0]
 [ 1  4]]
0.875
------------------------------------------
             precision    recall  f1-score   support

nonsoftware       0.97      0.97      0.97        35
   software       0.80      0.80   

In [32]:
npt.assert_array_almost_equal(ORIGINAL_RESULTS, sorted(scores), decimal=3)

AssertionError: 
Arrays are not almost equal to 3 decimals

(mismatch 50.0%)
 x: array([ 0.314,  0.448,  0.543,  0.771,  0.875,  0.875,  0.875,  0.875,
        0.875,  0.895])
 y: array([ 0.314,  0.543,  0.625,  0.724,  0.771,  0.771,  0.875,  0.875,
        0.875,  0.895])

In [30]:
npt.assert_almost_equal(np.mean(ORIGINAL_RESULTS), np.mean(scores), decimal=2)

In [19]:
np.mean(scores)

0.72688747731397452