In [34]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import cross_val_score, KFold, train_test_split
from sklearn import metrics

In [8]:
# stop_words: this vectorizer argument takes a list of words that should be ignored,
# such as words that occur too frequently or words that we do not expect, a priori,
# to carry information about the particular topic.
def get_stop_words():
    """Load the stop-word list from ``stopwords_en.txt``.

    The file is opened relative to the current working directory and is read
    one entry per line; leading and trailing whitespace is stripped from every
    line. Blank lines therefore contribute the empty string to the result.

    Returns:
        set: the distinct stripped lines of the file.

    Raises:
        IOError/OSError: if ``stopwords_en.txt`` cannot be opened.
    """
    # A context manager guarantees the file handle is closed, even on error;
    # the previous version left the handle open until garbage collection.
    with open('stopwords_en.txt', 'r') as stop_words_file:
        return {line.strip() for line in stop_words_file}

In [25]:
def evaluate_cross_validation(clf,X,y,K):
    """Run shuffled K-fold cross validation and print the per-fold accuracy.

    Args:
        clf: estimator (or pipeline) to evaluate.
        X: training samples.
        y: training labels; ``len(y)`` is used as the sample count by the
            legacy ``KFold`` signature.
        K: number of folds.

    Returns:
        None. The scores returned by ``cross_val_score`` are printed.
    """
    # ``sklearn.cross_validation`` was removed in scikit-learn 0.20. Prefer the
    # ``model_selection`` API and fall back to the legacy module only when the
    # installed release predates it. Both names are imported from the same
    # module so the splitter and the scorer always agree on the cv protocol.
    try:
        from sklearn.model_selection import KFold, cross_val_score
    except ImportError:
        from sklearn.cross_validation import KFold, cross_val_score
        # Legacy signature: KFold(n_samples, n_folds, ...)
        cv = KFold(len(y), K, shuffle=True, random_state=0)
    else:
        # Current signature: the sample count is inferred at split time.
        cv = KFold(n_splits=K, shuffle=True, random_state=0)
    # by default the score used is the one returned by score method of the estimator (accuracy)
    scores = cross_val_score(clf,X,y,cv=cv,scoring='accuracy')
    print(scores)

In [32]:
def train_and_evaluate(clf, X_train,X_test,y_train,y_test):
    """Fit ``clf`` on the training split and print an evaluation summary.

    Printed, in order: the estimator's score on the training split, its score
    on the testing split, the classification report and the confusion matrix
    (both computed from predictions on ``X_test``).

    Args:
        clf: estimator exposing ``fit``, ``score`` and ``predict``.
        X_train, y_train: samples and labels used for fitting.
        X_test, y_test: held-out samples and labels used for the report.

    Returns:
        None.
    """
    clf.fit(X_train, y_train)

    # Score each split in turn: training first, then testing.
    splits = (
        ("Accuracy on training set:", X_train, y_train),
        ("Accuracy on testing set:", X_test, y_test),
    )
    for heading, samples, labels in splits:
        print(heading)
        print(clf.score(samples, labels))

    # The detailed reports are derived from the held-out predictions.
    predicted = clf.predict(X_test)

    print("Classification Report:")
    report = metrics.classification_report(y_test, predicted)
    print(report)

    print("Confusion Matrix:")
    matrix = metrics.confusion_matrix(y_test, predicted)
    print(matrix)

In [27]:
# MultinomialNB is used below but is not imported by the notebook's import cell,
# so import it here to keep this cell runnable on a fresh kernel.
from sklearn.naive_bayes import MultinomialNB

# Text-classification pipeline: TF-IDF features fed into multinomial naive Bayes.
clf = Pipeline([
    ('vect',TfidfVectorizer(
            # Pass a list rather than a set: recent scikit-learn releases validate
            # this parameter and only accept 'english', a list, or None.
            # Sorting makes the order deterministic.
            stop_words=sorted(get_stop_words()),
            # Tokens are runs of [a-z0-9_.-] at least three characters long that
            # contain a letter somewhere other than the first or last position.
            token_pattern=r"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b")),
    ('clf',MultinomialNB(alpha=0.01))
])

In [28]:
# evaluate the model performance using cross validation
from sklearn.datasets import fetch_20newsgroups

# Fetch the 20 newsgroups corpus (subset='all'); this may download the data
# on first use.
news = fetch_20newsgroups(subset='all')

# Hold out 25% of the documents for the final test; the fixed random_state
# makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    news.data, news.target, test_size=0.25, random_state=0)

# 5-fold cross validation on the training split only; the test split stays
# untouched until the final evaluation.
evaluate_cross_validation(clf,X_train,y_train,5)

[0.91192076 0.91864167 0.91333569 0.9101521  0.91153574]


In [35]:
# Now fit the model on the training split, evaluate it on the held-out test
# split, and print the resulting report.
train_and_evaluate(clf,X_train,X_test,y_train,y_test)

Accuracy on training set:
0.9967454365360124
Accuracy on testing set:
0.9178692699490663
Classification Report:
             precision    recall  f1-score   support

          0       0.93      0.93      0.93       205
          1       0.81      0.88      0.84       245
          2       0.88      0.82      0.85       250
          3       0.76      0.84      0.80       243
          4       0.89      0.89      0.89       255
          5       0.90      0.90      0.90       240
          6       0.89      0.82      0.86       249
          7       0.92      0.92      0.92       219
          8       0.99      0.96      0.97       246
          9       0.97      0.98      0.97       227
         10       0.98      0.99      0.98       287
         11       0.96      0.98      0.97       234
         12       0.91      0.87      0.89       247
         13       0.96      0.95      0.95       250
         14       0.94      0.96      0.95       240
         15       0.92      0.94      0