# Passive learning

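This notebook searches for the best-performing text classifiers under passive learning: each model is trained on the complete labelled dataset, and its hyperparameters are tuned with a cross-validated grid search.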

In [19]:
import sys
import os
import warnings

sys.path.insert(0, os.path.join('..', 'examples'))

import numpy as np

# sklearn functions
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (accuracy_score,
                             confusion_matrix,
                             precision_score,
                             recall_score, 
                             classification_report, 
                             f1_score, 
                             make_scorer)
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
from sklearn.exceptions import UndefinedMetricWarning

# project specific functions
from utils import load_ptsd_data

In [7]:
# Silence UndefinedMetricWarning for the rest of the session so the metric
# reports below are not interleaved with warning text.
warnings.simplefilter('ignore', category=UndefinedMetricWarning)

In [8]:
# Fetch the data through the project helper and unpack it into the texts and
# their matching labels.
ptsd_dataset = load_ptsd_data()
texts, labels = ptsd_dataset

## Naive Bayes

### Grid Search

In [21]:
# Text-classification pipeline: token counts -> TF-IDF step -> Naive Bayes.
estimators = [
    ('count_vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB())]
pipe = Pipeline(estimators)

# The fractional document-frequency grids are written out explicitly instead
# of being generated with np.arange. With a float step, rounding error makes
# np.arange(0.6, 0.8, 0.05) overshoot its half-open stop and emit a fifth
# value, 0.8000000000000002. The lists below keep the same five candidates
# that were effectively searched, but with exact, readable values.
param_grid = [
    {'count_vect__max_features': np.arange(4000, 6000, 500),  # integer step: safe
     'count_vect__min_df': [0.0, 0.05],
     'count_vect__max_df': [0.6, 0.65, 0.7, 0.75, 0.8],
     # None is searched as an alternative to the TF-IDF transformer.
     'tfidf': [None, TfidfTransformer()]}
]

# NOTE(review): weighted F1 averages per-class F1 by class support, so on
# imbalanced labels the score is dominated by the majority class.
clf_nb = GridSearchCV(
    pipe,
    param_grid=param_grid,
    scoring=make_scorer(f1_score, average='weighted'),
    verbose=1,
    n_jobs=-1
)
clf_nb.fit(texts, labels)

# NOTE(review): these predictions are made on the same texts the search was
# fitted on, so every metric computed from `prediction` is an in-sample score.
prediction = clf_nb.predict(texts)


Fitting 3 folds for each of 80 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   43.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  4.2min finished


In [None]:
# One line per grid candidate: mean CV score, twice its standard deviation
# across folds, and the parameter combination that produced it.
nb_cv_results = clf_nb.cv_results_
means = nb_cv_results['mean_test_score']
stds = nb_cv_results['std_test_score']
for mean, std, params in zip(means, stds, nb_cv_results['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [22]:
# Print the selected parameter combination, then the best estimator itself.
for best_attr in ('best_params_', 'best_estimator_'):
    print(getattr(clf_nb, best_attr))

{'count_vect__max_df': 0.8000000000000002, 'count_vect__max_features': 5500, 'count_vect__min_df': 0.0, 'tfidf': None}
Pipeline(memory=None,
     steps=[('count_vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.8000000000000002, max_features=5500,
        min_df=0.0, ngram_range=(1, 1), preprocessor=None, stop_wo...bulary=None)), ('tfidf', None), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])


In [23]:
# Weighted F1 followed by the per-class precision/recall/F1 breakdown for the
# Naive Bayes predictions.
print("results")
nb_weighted_f1 = f1_score(labels, prediction, average='weighted')
print("F1 score: ", nb_weighted_f1)
print(classification_report(labels, prediction))

results
F1 score:  0.9890778216977931
             precision    recall  f1-score   support

          0       1.00      0.99      0.99      5037
          1       0.35      0.97      0.52        40

avg / total       0.99      0.99      0.99      5077



In [24]:
# Confusion matrix first, then support-weighted recall and precision.
print(confusion_matrix(labels, prediction))
for weighted_metric in (recall_score, precision_score):
    print(weighted_metric(labels, prediction, average='weighted'))

[[4966   71]
 [   1   39]]
0.9858183966909593
0.9947149350340608


## SVC

### Grid Search

In [29]:
# Text-classification pipeline: token counts -> TF-IDF step -> support vector
# classifier.
estimators = [
    ('count_vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC())]
pipe = Pipeline(estimators)

# The fractional document-frequency grids are written out explicitly instead
# of being generated with np.arange. With a float step, rounding error makes
# np.arange(0.6, 0.8, 0.05) overshoot its half-open stop and emit a fifth
# value, 0.8000000000000002. The lists below keep the same five candidates
# that were effectively searched, but with exact, readable values.
param_grid = [
    {'count_vect__max_features': np.arange(4000, 6000, 500),  # integer step: safe
     'count_vect__min_df': [0.0, 0.05],
     'count_vect__max_df': [0.6, 0.65, 0.7, 0.75, 0.8],
     # None is searched as an alternative to the TF-IDF transformer.
     'tfidf': [None, TfidfTransformer()],
     # Regularisation strengths tried for the SVC.
     'clf__C': [0.8, 1.0, 1.2]
    }
]

# NOTE(review): weighted F1 averages per-class F1 by class support, so on
# imbalanced labels a model that never predicts the minority class can still
# score highly. Consider scoring on the positive class instead.
clf_svc = GridSearchCV(
    pipe,
    param_grid=param_grid,
    scoring=make_scorer(f1_score, average='weighted'),
    verbose=1,
    n_jobs=-1
)
clf_svc.fit(texts, labels)

# NOTE(review): these predictions are made on the same texts the search was
# fitted on, so every metric computed from `prediction_svc` is an in-sample
# score.
prediction_svc = clf_svc.predict(texts)


Fitting 3 folds for each of 240 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   51.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 13.5min finished


In [30]:
# One line per grid candidate: mean CV score, twice its standard deviation
# across folds, and the parameter combination that produced it.
svc_cv_results = clf_svc.cv_results_
means = svc_cv_results['mean_test_score']
stds = svc_cv_results['std_test_score']
for mean, std, params in zip(means, stds, svc_cv_results['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

0.988 (+/-0.001) for {'clf__C': 0.8, 'count_vect__max_df': 0.6, 'count_vect__max_features': 4000, 'count_vect__min_df': 0.0, 'tfidf': None}
0.988 (+/-0.001) for {'clf__C': 0.8, 'count_vect__max_df': 0.6, 'count_vect__max_features': 4000, 'count_vect__min_df': 0.0, 'tfidf': TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)}
0.988 (+/-0.001) for {'clf__C': 0.8, 'count_vect__max_df': 0.6, 'count_vect__max_features': 4000, 'count_vect__min_df': 0.05, 'tfidf': None}
0.988 (+/-0.001) for {'clf__C': 0.8, 'count_vect__max_df': 0.6, 'count_vect__max_features': 4000, 'count_vect__min_df': 0.05, 'tfidf': TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)}
0.988 (+/-0.001) for {'clf__C': 0.8, 'count_vect__max_df': 0.6, 'count_vect__max_features': 4500, 'count_vect__min_df': 0.0, 'tfidf': None}
0.988 (+/-0.001) for {'clf__C': 0.8, 'count_vect__max_df': 0.6, 'count_vect__max_features': 4500, 'count_vect__min_df': 0.0, 'tfidf': TfidfTransformer

In [31]:
# Print the selected parameter combination, then the best estimator itself.
for best_attr in ('best_params_', 'best_estimator_'):
    print(getattr(clf_svc, best_attr))

{'clf__C': 0.8, 'count_vect__max_df': 0.6, 'count_vect__max_features': 4000, 'count_vect__min_df': 0.0, 'tfidf': None}
Pipeline(memory=None,
     steps=[('count_vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.6, max_features=4000, min_df=0.0,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
     ...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])


In [32]:
# Weighted F1 followed by the per-class precision/recall/F1 breakdown for the
# SVC predictions.
print("results")
svc_weighted_f1 = f1_score(labels, prediction_svc, average='weighted')
print("F1 score: ", svc_weighted_f1)
print(classification_report(labels, prediction_svc))

results
F1 score:  0.9881975769705756
             precision    recall  f1-score   support

          0       0.99      1.00      1.00      5037
          1       0.00      0.00      0.00        40

avg / total       0.98      0.99      0.99      5077



In [33]:
# Confusion matrix first, then support-weighted recall and precision.
print(confusion_matrix(labels, prediction_svc))
for weighted_metric in (recall_score, precision_score):
    print(weighted_metric(labels, prediction_svc, average='weighted'))

[[5037    0]
 [  40    0]]
0.9921213314949774
0.9843047364073668
