In [1]:
from sklearn.datasets import fetch_20newsgroups

In [3]:
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

In [4]:
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)

In [5]:
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [6]:
len(twenty_train.data)

2257

In [7]:
len(twenty_train.filenames)

2257

In [9]:
print(twenty_train.data[0].split("\n")[:3])

['From: sd345@city.ac.uk (Michael Collier)', 'Subject: Converting images to HP LaserJet III?', 'Nntp-Posting-Host: hampton']


In [11]:
print(twenty_train.target_names[twenty_train.target[0]])

comp.graphics


In [12]:
twenty_train.target[:100]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2, 3, 1, 0, 0, 1, 1, 2, 0, 3, 0, 3, 0,
       3, 1, 1, 1, 3, 3, 2, 2, 2, 3, 2, 3, 2, 3, 0, 0, 0, 1, 3, 0, 1, 1,
       2, 0, 3, 3, 1, 2, 1, 2, 0, 0, 2, 1, 2, 3, 0, 1, 0, 3, 1, 2, 1, 1,
       2, 0, 3, 1, 3, 2, 0, 3, 0, 1, 1, 2, 0, 1, 2, 2, 2, 2, 1, 1, 0, 2,
       1, 2, 0, 1, 1, 3, 1, 0, 1, 2, 1, 0])

In [13]:
for t in twenty_train.target[:100]:
    print(twenty_train.target_names[t])
    print('\n')

comp.graphics


comp.graphics


soc.religion.christian


soc.religion.christian


soc.religion.christian


soc.religion.christian


soc.religion.christian


sci.med


sci.med


sci.med


soc.religion.christian


comp.graphics


alt.atheism


alt.atheism


comp.graphics


comp.graphics


sci.med


alt.atheism


soc.religion.christian


alt.atheism


soc.religion.christian


alt.atheism


soc.religion.christian


comp.graphics


comp.graphics


comp.graphics


soc.religion.christian


soc.religion.christian


sci.med


sci.med


sci.med


soc.religion.christian


sci.med


soc.religion.christian


sci.med


soc.religion.christian


alt.atheism


alt.atheism


alt.atheism


comp.graphics


soc.religion.christian


alt.atheism


comp.graphics


comp.graphics


sci.med


alt.atheism


soc.religion.christian


soc.religion.christian


comp.graphics


sci.med


comp.graphics


sci.med


alt.atheism


alt.atheism


sci.med


comp.graphics


sci.med


soc.religion.christian


alt.atheism


comp

In [14]:
for t in twenty_train.target[:10]:
    print(twenty_train.data[t])
    print('\n')

From: ani@ms.uky.edu (Aniruddha B. Deglurkar)
Subject: help: Splitting a trimming region along a mesh 
Organization: University Of Kentucky, Dept. of Math Sciences
Lines: 28



	Hi,

	I have a problem, I hope some of the 'gurus' can help me solve.

	Background of the problem:
	I have a rectangular mesh in the uv domain, i.e  the mesh is a 
	mapping of a 3d Bezier patch into 2d. The area in this domain
	which is inside a trimming loop had to be rendered. The trimming
	loop is a set of 2d Bezier curve segments.
	For the sake of notation: the mesh is made up of cells.

	My problem is this :
	The trimming area has to be split up into individual smaller
	cells bounded by the trimming curve segments. If a cell
	is wholly inside the area...then it is output as a whole ,
	else it is trivially rejected. 

	Does any body know how thiss can be done, or is there any algo. 
	somewhere for doing this.

	Any help would be appreciated.

	Thanks, 
	Ani.
-- 
To get irritated is human, to stay cool, divi

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
count_vect = CountVectorizer()

In [17]:
X_train_counts = count_vect.fit_transform(twenty_train.data)

In [18]:
X_train_counts.shape

(2257, 35788)

In [19]:
count_vect.vocabulary_.get(u'algorithm')

4690

In [20]:
from sklearn.feature_extraction.text import TfidfTransformer

In [24]:
tf_transformer = TfidfTransformer()

In [25]:
X_train_tfidf = tf_transformer.fit_transform(X_train_counts)

In [29]:
X_train_tfidf.shape

(2257, 35788)

In [27]:
from sklearn.naive_bayes import MultinomialNB

In [28]:
bayes = MultinomialNB()

In [30]:
bayes.fit(X_train_tfidf, twenty_train.target)

MultinomialNB()

In [31]:
docs_new = ['God is love', 'OpenGL on the GPU is fast']

In [32]:
X_new_counts = count_vect.transform(docs_new)

In [33]:
X_new_tfidf = tf_transformer.transform(X_new_counts)

In [34]:
prediction = bayes.predict(X_new_tfidf)

In [36]:
for doc, category in zip(docs_new, prediction):
    print('%r => %s' % (doc, twenty_train.target_names[category]))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


In [37]:
from sklearn.pipeline import Pipeline

In [38]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('bayes', MultinomialNB())
])

In [40]:
pipeline.fit(twenty_train.data, twenty_train.target)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('bayes', MultinomialNB())])

In [41]:
import numpy as np
twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42 )

In [42]:
docs_test = twenty_test.data

In [45]:
predictions = pipeline.predict(docs_test)

In [46]:
np.mean(predictions==twenty_test.target)

0.8348868175765646

In [47]:
from sklearn.linear_model import SGDClassifier

In [50]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', alpha=1e-3, random_state=42, max_iter=5, tol=None))
])

In [51]:
pipeline.fit(twenty_train.data, twenty_train.target)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf',
                 SGDClassifier(alpha=0.001, max_iter=5, random_state=42,
                               tol=None))])

In [52]:
predicted = pipeline.predict(docs_test)

In [53]:
np.mean(predicted==twenty_test.target)

0.9101198402130493

In [54]:
from sklearn import metrics

In [None]:
from sklear