In [6]:
from __future__ import print_function

from pprint import pprint
from time import time
import logging
import os
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Display progress logs on stdout
# logging.basicConfig(level=logging.INFO,
#                     format='%(asctime)s %(levelname)s %(message)s')

In [7]:
# Load the corpus: each sub-directory of ./vnexpress is treated as one
# category, and each file inside it as one document of that category.
dir_path = os.path.join(os.getcwd(), 'vnexpress')
categories = list()  # not populated in this cell; kept so the name stays defined

SAMPLE_SIZE = 10000  # upper bound on the number of documents used for the search
RANDOM_STATE = 42    # fixed seed so the sample and the split are reproducible

data = list()
for directory in os.listdir(dir_path):
    # Entries whose name contains a dot are skipped; only dot-free names are
    # treated as category folders.
    if '.' not in directory:
        list_file_path = os.path.join(dir_path, directory)
        for file_name in os.listdir(list_file_path):
            file_path = os.path.join(list_file_path, file_name)
            # The context manager closes the handle even if read() raises;
            # the original left every file open.
            # NOTE(review): the platform default encoding is used, as before --
            # confirm it matches the corpus encoding.
            with open(file_path, 'r') as handle:
                text = handle.read()
            data.append({
                'file_name': file_name,
                'category': directory,
                'data': text,
            })
data_df = pd.DataFrame(data)

# DataFrame.sample raises if asked for more rows than exist (without
# replacement), so cap the request at the corpus size.
sample_df = data_df.sample(min(SAMPLE_SIZE, len(data_df)),
                           random_state=RANDOM_STATE)
train, test = train_test_split(sample_df, test_size=0.3,
                               random_state=RANDOM_STATE)

In [16]:
# Two-stage text classifier: TF-IDF features ('tfidf') feeding a logistic
# regression ('clf'). The step names are the prefixes used in the parameter
# grid (e.g. 'tfidf__norm', 'clf__C').
pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LogisticRegression()),
    ]
)

# Search space for the grid search. Keys follow the '<step>__<param>'
# convention, where <step> is a pipeline step name ('tfidf' or 'clf').
#
# Uncommenting more parameters gives more exploring power but increases
# processing time combinatorially: the active grid below already has
# 3 * 2 * 2 * 2 * 2 * 3 = 144 combinations.
parameters = {
    'tfidf__max_df': (0.5, 0.75, 1.0),
    'tfidf__smooth_idf': (True, False),
    'tfidf__sublinear_tf': (True, False),
    'tfidf__binary': (True, False),
    # The pipeline has no 'vect' step, so the optional vectorizer settings
    # must use the 'tfidf__' prefix as well.
    # 'tfidf__max_features': (None, 5000, 10000, 50000),
    # 'tfidf__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams

    'tfidf__norm': ('l1', 'l2'),
    # Listed once: a repeated dict key silently overrides the earlier entry.
    'clf__C': (1e4, 1e5, 1e6,),
    # NOTE(review): which penalties are accepted depends on the solver --
    # confirm before enabling.
    # 'clf__penalty': ('l2', 'elasticnet'),
    # The classifier exposes 'max_iter' (there is no 'n_iter' parameter).
    # 'clf__max_iter': (10, 50, 80),
}
if __name__ == "__main__":
    # The guard comes from the script form of this example: multiprocessing
    # needs the fork to happen inside a __main__-protected block.

    # Jointly tune the feature-extraction and classifier settings.
    # Parallel/verbose variant, left disabled:
    #     grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
    grid_search = GridSearchCV(pipeline, parameters)

    step_names = [step[0] for step in pipeline.steps]
    print("Performing grid search...")
    print("pipeline:", step_names)
    print("parameters:")
    pprint(parameters)

    # Time the whole search (wall clock).
    start_time = time()
    grid_search.fit(train['data'], train['category'])
    elapsed = time() - start_time
    print("done in %0.3fs" % elapsed)

    # List every tunable parameter name of the (unfitted) pipeline.
    print(grid_search.estimator.get_params().keys())

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    # Report only the parameters that were part of the search, in name order.
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['tfidf', 'clf']
parameters:
{'clf__C': (1e-05, 1e-06),
 'tfidf__max_df': (0.5, 0.75, 1.0),
 'tfidf__norm': ('l1', 'l2'),
 'tfidf__smooth_idf': (True, False)}
done in 300.256s
dict_keys(['tfidf', 'clf__verbose', 'tfidf__dtype', 'clf__penalty', 'tfidf__norm', 'tfidf__max_df', 'tfidf__stop_words', 'clf__class_weight', 'tfidf__encoding', 'steps', 'clf__fit_intercept', 'tfidf__min_df', 'tfidf__lowercase', 'clf', 'tfidf__tokenizer', 'tfidf__decode_error', 'tfidf__vocabulary', 'tfidf__use_idf', 'tfidf__sublinear_tf', 'tfidf__ngram_range', 'clf__solver', 'tfidf__preprocessor', 'tfidf__smooth_idf', 'tfidf__analyzer', 'clf__random_state', 'tfidf__strip_accents', 'clf__warm_start', 'tfidf__input', 'clf__dual', 'clf__intercept_scaling', 'clf__max_iter', 'tfidf__binary', 'clf__n_jobs', 'clf__C', 'clf__tol', 'tfidf__max_features', 'tfidf__token_pattern', 'clf__multi_class'])
Best score: 0.271
Best parameters set:
	clf__C: 1e-05
	tfidf__max_df: 0.5
	tfidf__norm: '