In [4]:
from sklearn.datasets import fetch_20newsgroups

### 20 Newsgroups dataset

http://qwone.com/~jason/20Newsgroups/

Tutorial on text classification with scikit-learn:

http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html


In [5]:
# Fetch the 'train' subset; no category filter is passed here.
newsgroup_train = fetch_20newsgroups(subset='train')



In [7]:
# Fetch the 'test' subset; no category filter is passed here.
newsgroup_test = fetch_20newsgroups(subset='test')

In [9]:
from pprint import pprint

# List the label names; a document's integer target indexes into this list.
pprint(list(newsgroup_train.target_names))

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [14]:
# Number of entries in the training subset's filenames array.
newsgroup_train.filenames.shape


(11314,)

In [16]:
# Number of labels in the training subset; should match the filenames count.
newsgroup_train.target.shape

(11314,)

In [85]:
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS, CountVectorizer

# Restrict the experiment to four newsgroups.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

# Reload the training split for those groups only, asking the loader to
# remove headers, footers and quotes from each post.
newsgroup_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

In [86]:
# Preview the first ten lines of the first training document.
print("\n".join(newsgroup_train.data[0].split("\n")[:10]))

Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?


In [87]:
# Translate the first document's integer target into its newsgroup name.
print(newsgroup_train.target_names[newsgroup_train.target[0]])

comp.graphics


In [88]:
# Alternative kept for comparison: TF-IDF weighting with English stop words removed.
# vectorizer = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)
# Active choice: plain term counts with CountVectorizer's default settings.
vectorizer = CountVectorizer()

In [89]:
# Fit the vectorizer on the training documents and encode them as a matrix.
vectors = vectorizer.fit_transform(newsgroup_train.data)

In [90]:
# Matrix dimensions: one row per training document (see recorded output below).
vectors.shape

(2034, 26879)

In [91]:
# Average number of stored non-zero entries per row, i.e. per document.
vectors.nnz/float(vectors.shape[0])

96.70599803343165

In [92]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Held-out split, restricted and stripped exactly like the training data.
newsgroup_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# Encode the test documents with the already-fitted vectorizer (transform only, no refit).
vectors_test = vectorizer.transform(newsgroup_test.data)

# Multinomial naive Bayes with a small smoothing constant.
classifier = MultinomialNB(alpha=0.01)
classifier.fit(vectors, newsgroup_train.target)
predictions = classifier.predict(vectors_test)

# Weighted F1 over the test split; shown as the cell's output.
metrics.f1_score(newsgroup_test.target, predictions, average='weighted')



0.7784841608710783

In [93]:
import numpy as np

def show_top10(classifier, vectorizer, categories):
    """Print the ten highest-weighted features for each category.

    Parameters
    ----------
    classifier : fitted estimator
        Must expose per-class feature weights as ``feature_log_prob_``
        (naive Bayes models) or, failing that, ``coef_``. Row ``i`` is
        taken to belong to ``categories[i]``.
    vectorizer : fitted vectorizer
        Supplies the feature names via ``get_feature_names_out()`` or, on
        older scikit-learn releases, ``get_feature_names()``.
    categories : sequence of str
        Category labels, in the same order as the classifier's classes.

    One line is printed per category. Features are listed in ascending
    weight, so the last name on each line is the strongest. If there are
    fewer than ten features, all of them are printed.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only for older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = np.asarray(vectorizer.get_feature_names_out())
    else:
        feature_names = np.asarray(vectorizer.get_feature_names())

    # MultinomialNB.coef_ was deprecated in 0.24 and later removed, and it
    # held a single row for two-class problems. feature_log_prob_ always has
    # one row per class. Estimators without it still use coef_.
    weights = getattr(classifier, "feature_log_prob_", None)
    if weights is None:
        weights = classifier.coef_

    for i, category in enumerate(categories):
        top10 = np.argsort(weights[i])[-10:]
        print("%s: %s" % (category, " ".join(feature_names[top10])))
     

In [95]:
# Pass target_names (not `categories`): labels index into target_names, whose order differs from `categories`.
show_top10(classifier, vectorizer, newsgroup_train.target_names)

alt.atheism: not you it in and that is to of the
comp.graphics: that you it in for is of and to the
sci.space: on it that for is in and to of the
talk.religion.misc: not it you in is that and to of the


In [98]:
# Per-class precision, recall, F1 and support on the test split.
print(metrics.classification_report(newsgroup_test.target, predictions, target_names=newsgroup_test.target_names))

                    precision    recall  f1-score   support

       alt.atheism       0.66      0.71      0.68       319
     comp.graphics       0.92      0.90      0.91       389
         sci.space       0.82      0.87      0.84       394
talk.religion.misc       0.66      0.55      0.60       251

       avg / total       0.78      0.78      0.78      1353



In [102]:
# Confusion matrix on the test split: rows are true classes, columns are predicted classes.
metrics.confusion_matrix(newsgroup_test.target, predictions)

array([[227,   3,  29,  60],
       [ 15, 349,  23,   2],
       [ 20,  21, 343,  10],
       [ 84,   6,  24, 137]])

Pipelines

Grid search