In [None]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups

# Load the training split of the 20 Newsgroups corpus.
newsgroups_train = fetch_20newsgroups(subset='train')

# Documents and their class indices, plus the index -> newsgroup-name lookup
# that later cells use to turn a predicted index into a readable label.
X_train, y_train = newsgroups_train.data, newsgroups_train.target
targets = newsgroups_train.target_names
print(targets)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# TF-IDF features over word unigrams and bigrams.
# The on/off switches are passed as real booleans: recent scikit-learn releases
# validate constructor arguments and reject integers such as 1 for them.
vectorizer = TfidfVectorizer(
    min_df=3,                 # drop terms that occur in fewer than 3 documents
    max_features=None,        # no cap on vocabulary size
    token_pattern=r'\w+',
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 2),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)

# Large C means weak regularisation.
# NOTE(review): `multi_class` is deprecated in newer scikit-learn releases; it is
# kept here so older releases (whose default differs) behave the same as before.
clf = LogisticRegression(C=1000, multi_class="auto")

# Chain vectoriser and classifier so raw strings can be passed to fit/predict.
pipeline = make_pipeline(vectorizer, clf)
pipeline.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, f1_score
# Score the fitted pipeline on the test split of the same corpus.
newsgroups_test = fetch_20newsgroups(subset='test')
X_test = newsgroups_test.data
y_test = newsgroups_test.target
y_pred = pipeline.predict(X_test)

# scikit-learn metrics take the ground truth first: (y_true, y_pred).
print('ACCURACY :', accuracy_score(y_test, y_pred))
print('F1       :', f1_score(y_test, y_pred, average='macro'))

In [None]:
def predict(text):
    """Return the newsgroup name predicted for a single document.

    Args:
        text: The raw document string to classify.

    Returns:
        The entry of the notebook-level ``targets`` list at the class index
        that the notebook-level ``pipeline`` predicts for ``text``.
    """
    # The pipeline works on batches, so wrap the text in a one-element list
    # and take the single prediction back out.
    predicted = pipeline.predict([text])
    label_index = predicted[0]
    return targets[label_index]

In [None]:
# Spot-check on a hand-written sentence; the cell displays the predicted newsgroup name.
predict('Windows is an operating system')

In [None]:
# Another hand-written sentence; the cell displays the predicted newsgroup name.
predict('I sell my soul')

In [None]:
# Differs from the next cell by one word ("car" vs "bike"), to compare the two predictions.
predict('Which is the fastest car?')

In [None]:
# Differs from the previous cell by one word ("bike" vs "car"), to compare the two predictions.
predict('Which is the fastest bike?')

In [None]:
import shutil
from pathlib import Path

try:
    # joblib is a standalone package; scikit-learn no longer re-exports it.
    import joblib
except ImportError:
    # Fall back to the copy that older scikit-learn releases bundled.
    from sklearn.externals import joblib

# Start from an empty models/ directory so stale artefacts from a previous run
# never linger. Pure-Python calls replace the `!rm -rf` / `!mkdir` shell escapes
# so the cell does not depend on a POSIX shell.
shutil.rmtree('models', ignore_errors=True)
Path('models').mkdir()

# Persist the fitted pipeline and the label names it maps onto.
joblib.dump(pipeline, 'models/pipeline.pickle')
joblib.dump(targets, 'models/targets.pickle')

In [None]:
# Read the pipeline back from the path it was dumped to. The loaded object is
# only displayed as the cell's output; it is not bound to a name.
# Unpickling can execute arbitrary code, so only load files you trust.
joblib.load('models/pipeline.pickle')