In [58]:
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV    
from preprocessor import Preprocessor
from util import Loader
import pandas as pd
import logging
import sys
from sklearn.metrics import classification_report

# Location of the SyskillWebert dataset handed to the loader below.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
s_dataset = '/exp/datasets/docs_rotulados/SyskillWebert-Parsed'        
# Loader is imported from the project-local `util` module.
l = Loader()
# Later cells read d['corpus'], d['class_index'] and d['class_names'].
d = l.from_files(s_dataset)

In [28]:
# Sanity check: number of documents, number of labels and the class names.
print("{} documents".format(len(d['corpus'])))
print("{} labelled documents".format(len(d['class_index'])))
print("class names: {}".format(' '.join(d['class_names'])))

334 documents
334 labelled documents
class names: Goats BioMedical Sheep Bands


In [30]:
# Creating Pipeline

def pipe(ops):
    """Build a scikit-learn ``Pipeline`` from an ordered collection of step names.

    Parameters
    ----------
    ops : iterable of str
        Step names in the order they should run. Recognised names are
        'my_preproc' (Preprocessor), 'count_vec' (CountVectorizer),
        'hash_vec' (HashingVectorizer), 'tfidf_vec' (TfidfVectorizer),
        'tfidf_trans' (TfidfTransformer) and 'mnb_cls' (MultinomialNB).

    Returns
    -------
    Pipeline
        One ``(name, estimator)`` step per entry of ``ops``; every estimator
        is created with its default arguments.

    Raises
    ------
    KeyError
        If ``ops`` contains a name that is not listed above.
    """
    # Materialise once so one-shot iterables (e.g. generators) can be both
    # validated and iterated.
    ops = list(ops)

    # name -> zero-argument factory. Lambdas keep construction lazy: only the
    # requested estimators are ever looked up and instantiated.
    step_factories = {
        # --- pre-processing ---
        'my_preproc': lambda: Preprocessor(),
        'count_vec': lambda: CountVectorizer(),
        # n_features=2**20 (default value)
        'hash_vec': lambda: HashingVectorizer(),
        'tfidf_vec': lambda: TfidfVectorizer(),
        # --- transformer ---
        'tfidf_trans': lambda: TfidfTransformer(),
        # --- classification algorithms ---
        'mnb_cls': lambda: MultinomialNB(),
    }

    # Fail early, and say which names are valid, instead of a bare KeyError.
    unknown = [name for name in ops if name not in step_factories]
    if unknown:
        raise KeyError("unknown pipeline step(s) %r; expected one of %r"
                       % (unknown, sorted(step_factories)))

    return Pipeline([(name, step_factories[name]()) for name in ops])

# Pipeline used below: project Preprocessor, then TF-IDF vectorizer, then multinomial Naive Bayes.
text_clf = pipe(ops=[
    'my_preproc',
    'tfidf_vec',
    'mnb_cls',
])

In [56]:

# Empty grid: the search evaluates the pipeline once, with its default settings.
parameters = dict()
# Not passed to GridSearchCV below; only 'accuracy' is scored.
scoring = {'accuracy', 'f1_micro'}

gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
# fit() returns the estimator itself, so re-binding the name is unnecessary.
gs_clf.fit(d['corpus'], d['class_index'])

# One row per parameter combination, with per-fold scores and timings.
df = pd.DataFrame(gs_clf.cv_results_)
df

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,...,split7_test_score,split7_train_score,split8_test_score,split8_train_score,split9_test_score,split9_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,3.412583,0.381076,0.649701,0.83467,{},1,0.583333,0.842282,0.676471,0.833333,...,0.625,0.837748,0.65625,0.831126,0.625,0.831126,0.264827,0.108892,0.048012,0.005151


In [61]:
# --- Cross-validated scores for every parameter combination -----------------
print("Best parameters set found on development set:")
print()
print(gs_clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = gs_clf.cv_results_['mean_test_score']
stds = gs_clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, gs_clf.cv_results_['params']):
    # Reported as mean (+/- two standard deviations across the CV folds).
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
print()

# --- Per-class report --------------------------------------------------------
# gs_clf.predict() uses the estimator that GridSearchCV refits on the data it
# was given, and it is applied here to that very same data. There is no
# held-out split in this notebook, so the messages must not claim one.
print("Detailed classification report:")
print()
print("The model is trained on the full data set.")
print("The scores are computed on that same data (no held-out evaluation set),")
print("so they are optimistic compared with the cross-validated grid scores.")
print()
y_true, y_pred = d['class_index'], gs_clf.predict(d['corpus'])
print(classification_report(y_true, y_pred))
print()


Best parameters set found on development set:

{}

Grid scores on development set:

0.650 (+/-0.096) for {}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

             precision    recall  f1-score   support

          0       1.00      0.65      0.79        71
          1       0.72      1.00      0.84       137
          2       0.97      0.55      0.71        65
          3       1.00      0.98      0.99        61

avg / total       0.88      0.84      0.83       334


