In [15]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups

# Load the training split of the 20 Newsgroups corpus.
newsgroups_train = fetch_20newsgroups(subset='train')
# dtype=object stores references to the existing str objects. Without it NumPy
# builds a fixed-width unicode array in which every entry is padded to the
# length of the longest post, so memory grows with n_docs * max_len.
X_train = np.asarray(newsgroups_train.data, dtype=object)
y_train = newsgroups_train.target
# Human-readable class names, indexed by the integer labels in y_train.
targets = newsgroups_train.target_names
print(targets)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# TF-IDF features over word unigrams and bigrams. Terms that occur in fewer
# than 3 documents are dropped (min_df=3) and English stop words are removed.
vectorizer = TfidfVectorizer(
    min_df=3,
    max_features=None,
    token_pattern=r'\w+',
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 2),
    # These three options are documented as booleans. Passing the int 1 is
    # rejected by scikit-learn releases that validate parameter types, so use
    # real True values.
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)

# C is the inverse regularization strength, so C=1000 means weak regularization.
clf = LogisticRegression(C=1000)
pipeline = make_pipeline(vectorizer, clf)
pipeline.fit(X_train, y_train)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_i...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [17]:
from sklearn.metrics import accuracy_score, classification_report, f1_score
# Score the fitted pipeline on the held-out test split.
newsgroups_test = fetch_20newsgroups(subset='test')
X_test = newsgroups_test.data
y_test = newsgroups_test.target
y_pred = pipeline.predict(X_test)

# scikit-learn metrics are documented as metric(y_true, y_pred). Keep that
# order so that adding an asymmetric report later (e.g. per-class
# precision/recall via classification_report) does not silently swap them.
print('ACCURACY :', accuracy_score(y_test, y_pred))
print('F1       :', f1_score(y_test, y_pred, average='macro'))

ACCURACY : 0.862719065321
F1       : 0.857925443834


In [18]:
def predict(text):
    """Classify a single document and return its newsgroup name.

    The text is wrapped in a one-element list because ``pipeline.predict``
    is called on a batch; the single predicted label index is then looked up
    in ``targets``.
    """
    predicted_labels = pipeline.predict([text])
    return targets[predicted_labels[0]]

In [19]:
# Spot-check: classify a hand-written sentence about an operating system.
predict('Windows is an operating system')

'comp.os.ms-windows.misc'

In [20]:
# Spot-check: classify a short hand-written phrase containing the word "sell".
predict('I sell my soul')

'misc.forsale'

In [21]:
# Spot-check: classify a hand-written question about cars.
predict('Which is the fastest car?')

'rec.autos'

In [22]:
# Spot-check: same question as the car example with "bike" substituted.
predict('Which is the fastest bike?')

'rec.motorcycles'

In [23]:
import os
import shutil

try:
    # Standalone joblib; the copy vendored under sklearn.externals was removed
    # from scikit-learn.
    import joblib
except ImportError:
    # Fallback for old scikit-learn installs that only ship the vendored copy.
    from sklearn.externals import joblib

# Start from an empty models/ directory so artifacts from a previous run do
# not linger. Done in Python rather than `!rm` / `!mkdir` so the cell also
# runs on platforms without a POSIX shell.
shutil.rmtree('models', ignore_errors=True)
os.makedirs('models', exist_ok=True)
joblib.dump(pipeline, 'models/pipeline.pickle')

['models/pipeline.pickle',
 'models/pipeline.pickle_01.npy',
 'models/pipeline.pickle_02.npy',
 'models/pipeline.pickle_03.npy',
 'models/pipeline.pickle_04.npy',
 'models/pipeline.pickle_05.npy',
 'models/pipeline.pickle_06.npy']

In [24]:
# Round-trip check: reload the pipeline from the path it was dumped to above.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
joblib.load('models/pipeline.pickle')

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_i...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])